Update to 3.4-final.

author Jeff Mahoney <jeffm@suse.com>

Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)

committer Jeff Mahoney <jeffm@suse.com>

Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)
author Jeff Mahoney <jeffm@suse.com>
Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)
committer Jeff Mahoney <jeffm@suse.com>
Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index c1601e5..0688880 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -520,6 +520,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         Range: 0 - 8192
                         Default: 64
  
+       hpet64          [X86-64,HPET] enable 64-bit mode of the HPET timer (bnc#456700)
+
         com20020=       [HW,NET] ARCnet - COM20020 chipset
                         Format:
                         <io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
@@ -887,6 +889,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
         gpt             [EFI] Forces disk with valid GPT signature but
                         invalid Protective MBR to be treated as GPT.
  
+       guestdev=       [PCI,ACPI,XEN]
+                       Format: {<device path>|<sbdf>}[,{<device path>|<sbdf>}[,...]]
+                       Format of device path: <hid>[:<uid>]-<dev>.<func>[-<dev>.<func>[,...]][+iomul]
+                       Format of sbdf: [<segment>:]<bus>:<dev>.<func>[+iomul]
+                       Specifies PCI device for guest domain.
+                       If PCI-PCI bridge is specified, all PCI devices
+                       behind PCI-PCI bridge are reserved.
+                       +iomul means that this PCI function will share
+                       IO ports with other +iomul functions under same
+                       switch. NOTE: if +iomul is specified, all the functions
+                       of the device will share IO ports.
+
+       guestiomuldev=  [PCI,ACPI,XEN]
+                       Format: [<sbd>][,<sbd>][,...]
+                       Format of sbd: [<segment>:]<bus>:<dev>
+                       Note: function shouldn't be specified.
+                       Specifies PCI device for IO port multiplexing driver.
+
         hashdist=       [KNL,NUMA] Large hashes allocated during boot
                         are distributed across NUMA nodes.  Defaults on
                         for 64-bit NUMA, off otherwise.
@@ -2162,6 +2182,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                 realloc         same as realloc=on
                 noari           do not use PCIe ARI.
  
+       pci_reserve=    [PCI]
+                       Format: [<sbdf>[+IO<size>][+MEM<size>]][,<sbdf>...]
+                       Format of sbdf: [<segment>:]<bus>:<dev>.<func>
+                       Specifies the least reserved io size or memory size
+                       which is assigned to PCI bridge even when no child
+                       pci device exists. This is useful with PCI hotplug.
+
         pcie_aspm=      [PCIE] Forcibly enable or disable PCIe Active State Power
                         Management.
                 off     Disable ASPM.
@@ -2347,6 +2374,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                         Run specified binary instead of /init from the ramdisk,
                         used for early userspace startup. See initrd.
  
+       reassign_resources      [PCI,ACPI,XEN]
+                       Use guestdev= parameter to reassign device's
+                       resources, or specify =all here.
+
         reboot=         [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
                         Format: <reboot_mode>[,<reboot_mode2>[,...]]
                         See arch/*/kernel/reboot.c or arch/*/kernel/process.c
@@ -2733,6 +2764,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
         unknown_nmi_panic
                         [X86] Cause panic on unknown NMI.
  
+       unsupported     Allow loading of unsupported kernel modules:
+                       0 = only allow supported modules,
+                       1 = warn when loading unsupported modules,
+                       2 = don't warn.
+
+                       CONFIG_ENTERPRISE_SUPPORT must be enabled for this
+                       to have any effect.
+
         usbcore.authorized_default=
                         [USB] Default USB device authorization:
                         (default -1 = authorized except for wireless USB,
diff --git a/Documentation/kmsg/s390/aes_s390 b/Documentation/kmsg/s390/aes_s390

new file mode 100644 (file)

index 0000000..277abae
--- /dev/null
+++ b/Documentation/kmsg/s390/aes_s390
@@ -0,0 +1,30 @@
+/*?
+ * Text: "Allocating AES fallback algorithm %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: algorithm name
+ * Description:
+ * The advanced encryption standard (AES) algorithm includes three modes with
+ * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides
+ * hardware acceleration for the 128-bit mode. The aes_s390 module failed to
+ * allocate a software fallback for the AES modes that are not supported by the
+ * hardware. A possible reason for this problem is that the aes_generic module
+ * that provides the fallback algorithms is not available.
+ * User action:
+ * Use the 128-bit mode only or ensure that the aes_generic module is available
+ * and loaded and reload the aes_s390 module.
+ */
+
+/*?
+ * Text: "AES hardware acceleration is only available for 128-bit keys\n"
+ * Severity: Informational
+ * Description:
+ * The advanced encryption standard (AES) algorithm includes three modes with
+ * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides
+ * hardware acceleration for the 128-bit key mode. The aes_s390 module
+ * will use the less performant software fallback algorithm for the 192-bit
+ * and 256-bit key modes.
+ * User action:
+ * None.
+ */
+
diff --git a/Documentation/kmsg/s390/af_iucv b/Documentation/kmsg/s390/af_iucv

new file mode 100644 (file)

index 0000000..f799df0
--- /dev/null
+++ b/Documentation/kmsg/s390/af_iucv
@@ -0,0 +1,33 @@
+/*?
+ * Text: "Application %s on z/VM guest %s exceeds message limit\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: application name
+ *   @2: z/VM user ID
+ * Description:
+ * Messages or packets destined for the application have accumulated and
+ * reached the maximum value. The default for the message limit is 65535.
+ * You can specify a different limit as the value for MSGLIMIT within
+ * the IUCV statement of the z/VM virtual machine on which the application
+ * runs.
+ * User action:
+ * Ensure that you do not send data faster than the application retrieves
+ * them. Ensure that the message limit on the z/VM guest virtual machine
+ * on which the application runs is high enough.
+ */
+
+/*?
+ * Text: "The af_iucv module cannot be loaded without z/VM\n"
+ * Severity: Error
+ * Description:
+ * The AF_IUCV protocol connects socket applications running in Linux
+ * kernels on different z/VM virtual machines, or it connects a Linux
+ * application to another sockets application running in a z/VM virtual
+ * machine. On Linux instances that run in environments other than the
+ * z/VM hypervisor, the AF_IUCV protocol does not provide any useful
+ * function and the corresponding af_iucv module cannot be loaded.
+ * User action:
+ * Load the af_iucv module only on Linux instances that run as guest
+ * operating systems of the z/VM hypervisor. If the module has been
+ * compiled into the kernel, ignore this message.
+ */
diff --git a/Documentation/kmsg/s390/ap b/Documentation/kmsg/s390/ap

new file mode 100644 (file)

index 0000000..dc19578
--- /dev/null
+++ b/Documentation/kmsg/s390/ap
@@ -0,0 +1,47 @@
+/*?
+ * Text: "%d is not a valid cryptographic domain\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: AP domain index
+ * Description:
+ * The cryptographic domain specified for the 'domain=' module or kernel
+ * parameter must be an integer in the range 0 to 15.
+ * User action:
+ * Reload the cryptographic device driver with a correct module parameter.
+ * If the device driver has been compiled into the kernel, correct the value
+ * in the kernel parameter line and reboot Linux.
+ */
+
+/*?
+ * Text: "The hardware system does not support AP instructions\n"
+ * Severity: Warning
+ * Description:
+ * The ap module addresses AP adapters through AP instructions. The hardware
+ * system on which the Linux instance runs does not support AP instructions.
+ * The ap module cannot detect any AP adapters.
+ * User action:
+ * Load the ap module only if your Linux instance runs on hardware that
+ * supports AP instructions. If the ap module has been compiled into the kernel,
+ * ignore this message.
+ */
+
+/*?
+ * Text: "Registering adapter interrupts for AP %d failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: AP device ID
+ * Description:
+ * The hardware system supports AP adapter interrupts but failed to enable
+ * an adapter for interrupts. Possible causes for this error are:
+ * i)   The AP adapter firmware does not support AP interrupts.
+ * ii)  An AP adapter firmware update to a firmware level that supports AP
+ *      adapter interrupts failed.
+ * iii) The AP adapter firmware has been successfully updated to a level that
+ *      supports AP interrupts but the new firmware has not been activated.
+ * User action:
+ * Ensure that the firmware on your AP adapters supports AP interrupts and that
+ * any firmware updates have completed successfully. If necessary, deconfigure
+ * your cryptographic adapters and reconfigure them to ensure that any firmware
+ * updates become active, then reload the ap module. If the ap module has been
+ * compiled into the kernel, reboot Linux.
+ */
diff --git a/Documentation/kmsg/s390/appldata b/Documentation/kmsg/s390/appldata

new file mode 100644 (file)

index 0000000..7c2001c
--- /dev/null
+++ b/Documentation/kmsg/s390/appldata
@@ -0,0 +1,88 @@
+/*?
+ * Text: "Starting the data collection for %s failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: appldata module
+ *   @2: return code
+ * Description:
+ * The specified data collection module used the z/VM diagnose call
+ * DIAG 0xDC to start writing data. z/VM returned an error and the data
+ * collection could not start. If the return code is 5, your z/VM guest
+ * virtual machine is not authorized to write data records.
+ * User action:
+ * If the return code is 5, ensure that your z/VM guest virtual machine's
+ * entry in the z/VM directory includes the OPTION APPLMON statement.
+ * For other return codes see the section about DIAGNOSE Code X'DC'
+ * in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Stopping the data collection for %s failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: appldata module
+ *   @2: return code
+ * Description:
+ * The specified data collection module used the z/VM diagnose call DIAG 0xDC
+ * to stop writing data. z/VM returned an error and the data collection
+ * continues.
+ * User action:
+ * See the section about DIAGNOSE Code X'DC' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Starting a new OS data collection failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * After a CPU hotplug event, the record size for the running operating
+ * system data collection is no longer correct. The appldata_os module tried
+ * to start a new data collection with the correct record size but received
+ * an error from the z/VM diagnose call DIAG 0xDC. Any data collected with
+ * the current record size might be faulty.
+ * User action:
+ * Start a new data collection with the appldata_os module. For information
+ * about starting data collections see "Device Drivers, Features, and
+ * Commands". For information about the return codes see the section about
+ * DIAGNOSE Code X'DC' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Stopping a faulty OS data collection failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * After a CPU hotplug event, the record size for the running operating
+ * system data collection is no longer correct. The appldata_os module tried
+ * to stop the faulty data collection but received an error from the z/VM
+ * diagnose call DIAG 0xDC. Any data collected with the current record size
+ * might be faulty.
+ * User action:
+ * Try to restart appldata_os monitoring. For information about stopping
+ * and starting data collections see "Device Drivers, Features, and
+ * Commands". For information about the return codes see the section about
+ * DIAGNOSE Code X'DC' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Maximum OS record size %i exceeds the maximum record size %i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: no of bytes
+ *   @2: no of bytes
+ * Description:
+ * The OS record size grows with the number of CPUs and is adjusted by the
+ * appldata_os module in response to CPU hotplug events. For more than 110
+ * CPUs the record size would exceed the maximum record size of 4024 bytes
+ * that is supported by the z/VM hypervisor. To prevent the maximum supported
+ * record size from being exceeded while data collection is in progress,
+ * you cannot load the appldata_os module on Linux instances that are
+ * configured for a maximum of more than 110 CPUs.
+ * User action:
+ * If you do not want to collect operating system data, you can ignore this
+ * message. If you want to collect operating system data, reconfigure your
+ * Linux instance to support no more than 110 CPUs.
+ */
+
diff --git a/Documentation/kmsg/s390/cio b/Documentation/kmsg/s390/cio

new file mode 100644 (file)

index 0000000..6d209ba
--- /dev/null
+++ b/Documentation/kmsg/s390/cio
@@ -0,0 +1,145 @@
+/*?
+ * Text: "%s is not a valid device for the cio_ignore kernel parameter\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: device bus-ID
+ * Description:
+ * The device specification for the cio_ignore kernel parameter is
+ * syntactically incorrect or specifies an unknown device. This device is not
+ * excluded from being sensed and analyzed.
+ * User action:
+ * Correct your device specification in the kernel parameter line to have the
+ * device excluded when you next reboot Linux. You can write the correct
+ * device specification to /proc/cio_ignore to add the device to the list of
+ * devices to be excluded. This does not immediately make the device
+ * inaccessible but the device is ignored if it disappears and later reappears.
+ */
+
+/*?
+ * Text: "0.%x.%04x to 0.%x.%04x is not a valid range for cio_ignore\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: from subchannel set ID
+ *   @2: from device number
+ *   @3: to subchannel set ID
+ *   @4: to device number
+ * Description:
+ * The device range specified for the cio_ignore kernel parameter is
+ * syntactically incorrect. No devices specified with this range are
+ * excluded from being sensed and analyzed.
+ * User action:
+ * Correct your range specification in the kernel parameter line to have the
+ * range of devices excluded when you next reboot Linux. You can write the
+ * correct range specification to /proc/cio_ignore to add the range of devices
+ * to the list of devices to be excluded. This does not immediately make the
+ * devices in the range inaccessible but any of these devices are ignored if
+ * they disappear and later reappear.
+ */
+
+/*?
+ * Text: "Processing %s for channel path %x.%02x\n"
+ * Severity: Notice
+ * Parameter:
+ *   @1: configuration change
+ *   @2: channel subsystem ID
+ *   @3: CHPID
+ * Description:
+ * A configuration change is in progress for the given channel path.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "No CCW console was found\n"
+ * Severity: Warning
+ * Description:
+ * Linux did not find the expected CCW console and tries to use an alternative
+ * console. A possible reason why the console was not found is that the console
+ * has been specified in the cio_ignore list.
+ * User action:
+ * None, if an appropriate alternative console has been found, and you want
+ * to use this alternative console. If you want to use the CCW console, ensure
+ * that it is not specified in the cio_ignore list, explicitly specify the
+ * console with the 'condev=' kernel parameter, and reboot Linux.
+ */
+
+/*?
+ * Text: "Channel measurement facility initialized using format %s (mode %s)\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: format
+ *   @2: mode
+ * Description:
+ * The channel measurement facility has been initialized successfully.
+ * Format 'extended' should be used for z990 and later mainframe systems.
+ * Format 'basic' is intended for earlier mainframes. Mode 'autodetected' means
+ * that the format has been set automatically. Mode 'parameter' means that the
+ * format has been set according to the 'format=' kernel parameter.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The CSS device driver initialization failed with errno=%d\n"
+ * Severity: Alert
+ * Parameter:
+ *   @1: Return code
+ * Description:
+ * The channel subsystem bus could not be established.
+ * User action:
+ * See the errno man page to find out what caused the problem.
+ */
+ /*? Text: "%s: Got subchannel machine check but no sch_event handler provided.\n" */
+
+/*?
+ * Text: "%s: Setting the device online failed because it is boxed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: Device bus-ID
+ * Description:
+ * Initialization of a device did not complete because it did not respond in
+ * time or it was reserved by another operating system.
+ * User action:
+ * Make sure that the device is working correctly, then try again to set it
+ * online. For devices that support the reserve/release mechanism (for example
+ * DASDs), you can try to override the reservation of the other system by
+ * writing 'force' to the 'online' sysfs attribute of the affected device.
+ */
+
+/*?
+ * Text: "%s: Setting the device online failed because it is not operational\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: Device bus-ID
+ * Description:
+ * Initialization of a device did not complete because it is not present or
+ * not operational.
+ * User action:
+ * Make sure that the device is present and working correctly, then try again
+ * to set it online.
+ */
+
+/*?
+ * Text: "%s: The device stopped operating while being set offline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: Device bus-ID
+ * Description:
+ * While the device was set offline, it was not present or not operational.
+ * The device is now inactive, but setting it online again might fail.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The device entered boxed state while being set offline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: Device bus-ID
+ * Description:
+ * While the device was set offline, it did not respond in time or it was
+ * reserved by another operating system. The device is now inactive, but
+ * setting it online again might fail.
+ * User action:
+ * None.
+ */
diff --git a/Documentation/kmsg/s390/claw b/Documentation/kmsg/s390/claw

new file mode 100644 (file)

index 0000000..30ac6f4
--- /dev/null
+++ b/Documentation/kmsg/s390/claw
@@ -0,0 +1,731 @@
+/*?
+ * Text: "%s: Creating the /proc files for a new CLAW device failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the failed CLAW device
+ * Description:
+ * For each Common Link Access to Workstation (CLAW) device the CLAW device
+ * driver maintains files in the proc file system. The CLAW device driver
+ * failed to create a new CLAW device because it could not create these /proc
+ * files for the new device. You cannot create CLAW devices for Linux kernels
+ * that do not include a proc file system.
+ * User action:
+ * Ensure that your Linux kernel provides a proc file system. Reboot Linux.
+ * If your kernel provides a proc file system and the problem persists, contact
+ * your support organization.
+ */
+
+/*?
+ * Text: "%s: An uninitialized CLAW device received an IRQ, c-%02x d-%02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: subchannel status
+ *   @3: device status
+ * Description:
+ * A Common Link Access to Workstation (CLAW) device was not initialized when
+ * it received a channel interrupt (IRQ). The IRQ is ignored. This might be a
+ * temporary condition while the device comes online or is taken offline.
+ * User action:
+ * If this problem occurs frequently, use the status information from the
+ * message and the channel and device traces to analyze the problem. See
+ * "Principles of Operation" for details about the status information.
+ */
+
+/*?
+ * Text: "%s: The device is not a CLAW device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the device
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a
+ * channel interrupt (IRQ) for a subchannel that is not a CLAW read or write
+ * subchannel. A CLAW subchannel must be configured for a 3088 device of
+ * type x'61' and have an even bus ID.
+ * User action:
+ * Ensure that the subchannels have been defined correctly to the real or
+ * virtual hardware, for example, in your IOCDS or in your z/VM configuration.
+ */
+
+/*?
+ * Text: "%s: The CLAW device received an unexpected IRQ, c-%02x d-%02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: subchannel status
+ *   @3: device status
+ * Description:
+ * A Common Link Access to Workstation (CLAW) device received a channel
+ * interrupt (IRQ) while it was in a state in which it cannot process IRQs.
+ * The IRQ is ignored. This might be a temporary condition.
+ * User action:
+ * If this problem occurs frequently, use the status information from the
+ * message and the channel and device traces to analyze the problem. See
+ * "Principles of Operation" for details about the status information.
+ */
+
+/*?
+ * Text: "%s: The CLAW device for %s received an unexpected IRQ\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * A Common Link Access to Workstation (CLAW) device received a channel
+ * interrupt (IRQ) while the CLAW device driver had assigned a status to the
+ * device in which it cannot process IRQs. The IRQ is ignored.
+ * User action:
+ * Restart the remote channel adapter. If the problem persists, use s390dbf
+ * traces and CCW traces to diagnose the problem.
+ */
+
+/*?
+ * Text: "%s: Deactivating %s completed with incorrect subchannel status (read %02x, write %02x)\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ *   @3: read subchannel status
+ *   @4: write subchannel status
+ * Description:
+ * When the Common Link Access to Workstation (CLAW) device driver closes a
+ * CLAW device, the device driver frees all storage that is used for the
+ * device. A successful closing operation results in status DEVICE END and
+ * CHANNEL END for both the read and write subchannel. At least one of these
+ * statuses is missing for a subchannel. Data might have been lost and there
+ * might be problems when the network interface is activated again.
+ * User action:
+ * If the network interface cannot be activated, vary the subchannels for the
+ * device offline and back online, for example, with chchp. If this does not
+ * resolve the problem, reset the remote channel adapter.
+ */
+
+/*?
+ * Text: "%s: The remote channel adapter is not available\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * During an operation, the Common Link Access to Workstation (CLAW) device
+ * driver received errno ENODEV from the common I/O layer. This means that
+ * the remote channel adapter was not operational or offline.
+ * User action:
+ * Check the remote channel adapter and, if necessary, restart it.
+ */
+
+/*?
+ * Text: "%s: The status of the remote channel adapter is not valid\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * During an operation, the Common Link Access to Workstation (CLAW) device
+ * driver received errno EINVAL from the common I/O layer. This indicates
+ * that the remote channel adapter was offline or not operational.
+ * User action:
+ * Check for related error messages to find the cause of the problem. If
+ * necessary, restart the remote channel adapter.
+ */
+
+/*?
+ * Text: "%s: The common device layer returned error code %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: errno
+ * Description:
+ * During an I/O operation, the Common Link Access to Workstation (CLAW) device
+ * driver received an errno from the common I/O layer. This indicates a problem
+ * with the remote channel adapter.
+ * User action:
+ * See the errno man page to find out what the error code means. Check for
+ * related messages. Restart the remote channel adapter. If the problem
+ * persists, examine the subchannel trace for further diagnostic information.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s disconnected\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x41. This indicates that the
+ * remote network interface is no longer available.
+ * User action:
+ * Ensure that the remote channel adapter is operational and activate the
+ * remote interface. For information about the sense code see
+ * /Documentation/s390/cds.txt in the Linux source tree. Search for 'SNS0' to
+ * locate the information.
+ */
+
+/*?
+ * Text: "%s: The remote channel adapter for %s has been reset\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x40. This indicates that the
+ * remote channel adapter has been reset.
+ * User action:
+ * When the remote channel adapter is operational again, activate the remote
+ * interface. For information about the sense code see
+ * /Documentation/s390/cds.txt in the Linux source tree. Search for 'SNS0' to
+ * locate the information.
+ */
+
+/*?
+ * Text: "%s: A data streaming timeout occurred for %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x24. This indicates a data
+ * streaming timeout. The remote channel adapter or the channel might be
+ * faulty.
+ * User action:
+ * Restart the remote channel adapter and activate the remote interface. If the
+ * problem persists, examine the subchannel trace for further diagnostic
+ * information. For information about the sense code see
+ * /Documentation/s390/cds.txt in the Linux source tree. Search for 'SNS0' to
+ * locate the information.
+ */
+
+/*?
+ * Text: "%s: A data transfer parity error occurred for %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x20. This indicates a data
+ * parity error. The remote channel adapter or the channel might be faulty.
+ * User action:
+ * Ensure that all cables are securely plugged. Restart the remote channel
+ * adapter and activate the remote interface. If the problem persists, examine
+ * the subchannel trace for further diagnostic information. For information
+ * about the sense code see /Documentation/s390/cds.txt in the Linux source
+ * tree. Search for 'SNS0' to locate the information.
+ */
+
+/*?
+ * Text: "%s: The remote channel adapter for %s is faulty\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x30. This indicates that the
+ * remote channel adapter is faulty.
+ * User action:
+ * Check and restart the remote channel adapter and activate the remote
+ * interface. If the problem persists, perform device diagnosis for the remote
+ * channel adapter and examine the subchannel trace for further diagnostic
+ * information. For information about the sense code see
+ * /Documentation/s390/cds.txt in the Linux source tree. Search for 'SNS0' to
+ * locate the information.
+ */
+
+/*?
+ * Text: "%s: A read data parity error occurred for %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a device
+ * status word DEV_STAT_UNIT_CHECK and sense code 0x10. This indicates a read
+ * data parity error. The remote channel adapter might be faulty.
+ * User action:
+ * Ensure that all cables are securely plugged. Check and restart the remote
+ * channel adapter and activate the remote interface. If the problem persists,
+ * perform device diagnosis for the remote channel adapter and examine the
+ * subchannel trace for further diagnostic information. For information about
+ * the sense code see /Documentation/s390/cds.txt in the Linux source tree.
+ * Search for 'SNS0' to locate the information.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s uses an incorrect API version %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ *   @3: CLAW API version
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver received a
+ * SYSTEM_VALIDATE_REQUEST packet from the remote channel adapter. The packet
+ * included an unexpected version ID for the CLAW API. The version ID must
+ * be '2' for all packets.
+ * User action:
+ * Ensure that the remote channel adapter is at the latest firmware level.
+ * Restart the remote channel adapter and activate the remote interface. If the
+ * problem persists, examine the subchannel trace for further diagnostic
+ * information.
+ */
+
+/*?
+ * Text: "%s: Host name %s for %s does not match the remote adapter name %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: host name in the local CLAW device settings
+ *   @3: network interface name
+ *   @4: adapter name in the remote CLAW device settings
+ * Description:
+ * The host name in the local Common Link Access to Workstation (CLAW) device
+ * settings must match the adapter name in the CLAW device settings of the
+ * communication peer. The CLAW device driver discovered a mismatch between
+ * these settings. The connection cannot be established.
+ * User action:
+ * Check the configuration of the CLAW device and of its communication peer.
+ * Correct the erroneous setting and restart the CLAW device, local or remote,
+ * for which you have made corrections.
+ */
+
+/*?
+ * Text: "%s: Adapter name %s for %s does not match the remote host name %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: adapter name in the local CLAW device settings
+ *   @3: network interface name
+ *   @4: host name in the remote CLAW device settings
+ * Description:
+ * The adapter name in the local Common Link Access to Workstation (CLAW) device
+ * settings must match the host name in the CLAW device settings of the
+ * communication peer. The CLAW device driver discovered a mismatch between
+ * these settings. The connection cannot be established.
+ * User action:
+ * Check the configuration of the CLAW device and of its communication peer.
+ * Correct the erroneous setting and restart the CLAW device, local or remote,
+ * for which you have made corrections.
+ */
+
+/*?
+ * Text: "%s: The local write buffer is smaller than the remote read buffer\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * You set the buffer size for the local Common Link Access to Workstation
+ * (CLAW) device implicitly by setting the connection type. For connection
+ * type 'packed' the buffer size is 32 KB, for the other connection types the
+ * buffer size is 4 KB. The connection cannot be established because the
+ * write buffer size of the local CLAW device does not match the read buffer
+ * size of the communication peer.
+ * User action:
+ * Confirm that you are using the correct connection type for the local CLAW
+ * device. Ensure that the read buffer size of the remote CLAW device is set
+ * accordingly. Restart the CLAW device, local or remote, for which you have
+ * made corrections.
+ */
+
+/*?
+ * Text: "%s: The local read buffer is smaller than the remote write buffer\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * You set the buffer size for the local Common Link Access to Workstation
+ * (CLAW) device implicitly by setting the connection type. For connection
+ * type 'packed' the buffer size is 32 KB, for the other connection types the
+ * buffer size is 4 KB. The connection cannot be established because the
+ * read buffer size of the local CLAW device does not match the write buffer
+ * size of the communication peer.
+ * User action:
+ * Confirm that you are using the correct connection type for the local CLAW
+ * device. Ensure that the write buffer size of the remote CLAW device is set
+ * accordingly. Restart the CLAW device, local or remote, for which you have
+ * made corrections.
+ */
+
+/*?
+ * Text: "%s: Settings for %s validated (version=%d, remote device=%d, rc=%d, adapter name=%.8s, host name=%.8s)\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ *   @3: CLAW API version
+ *   @4: identifier for the remote CLAW device
+ *   @5: return code received from the remote CLAW device
+ *   @6: adapter name
+ *   @7: host name
+ * Description:
+ * The settings of the local Common Link Access to Workstation (CLAW) device
+ * have been validated by the communication peer. The message summarizes the
+ * content of the response. If the return code is zero, the validation was
+ * successful and the connection is activated.
+ * User action:
+ * If the return code is not equal to zero, look for related warning messages.
+ */
+
+/*?
+ * Text: "%s: Validating %s failed because of a host or adapter name mismatch\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) network interface cannot be
+ * activated because there is a mismatch between a host name and the
+ * corresponding adapter name. The local host name must match the remote
+ * adapter name and the local adapter name must match the remote host name.
+ * User action:
+ * Correct the erroneous setting and restart the CLAW device, local or remote,
+ * for which you have made corrections.
+ */
+
+/*?
+ * Text: "%s: Validating %s failed because of a version conflict\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) network interface cannot be
+ * activated because the remote CLAW device does not support CLAW version 2.
+ * The CLAW device driver requires CLAW version 2.
+ * User action:
+ * Ensure that the remote channel adapter supports CLAW version 2 and that the
+ * remote CLAW device is configured for CLAW version 2.
+ */
+
+/*?
+ * Text: "%s: Validating %s failed because of a frame size conflict\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * You set the frame size for the local Common Link Access to Workstation
+ * (CLAW) device implicitly by setting the connection type. For connection
+ * type 'packed' the frame size is 32 KB, for the other connection types the
+ * frame size is 4 KB. The connection cannot be activated because the
+ * frame size of the local CLAW device does not match the frame size of the
+ * communication peer.
+ * User action:
+ * Confirm that you are using the correct connection type for the local CLAW
+ * device. Ensure that the frame size of the remote CLAW device is set
+ * accordingly. Restart the CLAW device, local or remote, for which you have
+ * made corrections.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s rejected the connection\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The remote CLAW device rejected the connection because of a mismatch between
+ * the settings of the local CLAW device and the remote CLAW device.
+ * User action:
+ * Check the settings of both the local and the remote CLAW device and ensure
+ * that the settings are consistent. Restart the CLAW device, local or remote,
+ * for which you have made the correction.
+ */
+
+/*?
+ * Text: "%s: %s rejected a connection request because it is already active\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device rejected a connection
+ * request by its communication peer because the connection is already active.
+ * The CLAW device driver only supports a single connection for each CLAW
+ * device. This might be a runtime problem.
+ * User action:
+ * None if there is an active connection. If no connection can be established,
+ * restart the remote channel adapter.
+ */
+
+/*?
+ * Text: "%s: %s rejected a request to open multiple connections\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device rejected a request by
+ * its communication peer to open more than one connection. The CLAW device
+ * driver only supports a single connection for each CLAW device.
+ * User action:
+ * Reconfigure the remote CLAW device to only use one connection. Restart the
+ * remote CLAW device.
+ */
+
+/*?
+ * Text: "%s: %s rejected a connection request because of a type mismatch\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device rejected a request by
+ * its communication peer to open a connection. A connection can only be opened
+ * if the same connection type has been set for both the local and the remote
+ * CLAW device.
+ * User action:
+ * Ensure that the connection types for the local and remote CLAW device match.
+ * Restart the CLAW device, local or remote, for which you have changed the
+ * connection type.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s rejected a connection request\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The remote CLAW device detected an inconsistency in the configurations of the
+ * local and the remote CLAW device and rejected a connection request.
+ * User action:
+ * Examine the settings of your local and remote CLAW device. Correct the
+ * erroneous setting and restart the CLAW device, local or remote, for which
+ * you have made corrections.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s rejected a connection request because of a type mismatch\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The remote Common Link Access to Workstation (CLAW) device rejected a
+ * request to open a connection. A connection can only be opened if the same
+ * connection type has been set for both the local and the remote CLAW device.
+ * The connection cannot be started.
+ * User action:
+ * Ensure that the connection types for the local and remote CLAW device match.
+ * Restart the CLAW device, local or remote, for which you have changed the
+ * connection type.
+ */
+
+/*?
+ * Text: "%s: Activating %s failed because of an incorrect link ID=%d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ *   @3: link ID returned from the remote CLAW device
+ * Description:
+ * The remote Common Link Access to Workstation (CLAW) device accepted a
+ * connection request but returned an incorrect link ID. The CLAW device driver
+ * only supports a single connection at a time (link ID=1) for each network
+ * interface.
+ * User action:
+ * Restart the remote CLAW device and try again to activate the network
+ * interface.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The remote Common Link Access to Workstation (CLAW) device reported an
+ * error condition that cannot be recovered automatically.
+ * User action:
+ * Restart the remote CLAW device. If this does not resolve the error, gather
+ * logs and traces from the remote CLAW device to obtain further
+ * diagnostic data.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s sent an unknown command code\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ * Description:
+ * The remote Common Link Access to Workstation (CLAW) device sent a command
+ * code that is not defined. This might indicate that the remote CLAW device is
+ * malfunctioning. The connection remains operational.
+ * User action:
+ * If this problem occurs frequently, restart the remote CLAW device. If this
+ * does not resolve the error, gather logs and traces from the remote CLAW
+ * device to obtain further diagnostic data.
+ */
+
+/*?
+ * Text: "%s: The communication peer of %s sent a faulty frame of length %02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: network interface name
+ *   @3: incorrect frame length value
+ * Description:
+ * The remote Common Link Access to Workstation (CLAW) device sent a frame
+ * with an incorrect value in the length field. This problem might result from
+ * data errors or incorrect packing. The connection remains operational.
+ * User action:
+ * If this problem occurs frequently, restart the remote CLAW device. If this
+ * does not resolve the error, gather logs and traces from the remote CLAW
+ * device to obtain further diagnostic data.
+ */
+
+/*?
+ * Text: "%s: Allocating a buffer for incoming data failed\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * A Common Link Access to Workstation (CLAW) data packet was received but
+ * the CLAW device driver could not allocate a receive buffer. A possible cause
+ * of this problem is memory constraints. The data packet is dropped but the
+ * connection remains operational.
+ * User action:
+ * Ensure that sufficient memory is available. If this problem occurs
+ * frequently, restart the remote CLAW device. If this does not resolve the
+ * error, gather logs and traces from the remote CLAW device to obtain further
+ * diagnostic data.
+ */
+
+/*?
+ * Text: "%s: Creating a CLAW group device failed with error code %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: errno
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver failed to create
+ * a CLAW group device. A possible cause of this problem is memory constraints.
+ * User action:
+ * Ensure that there is sufficient free memory. See the errno man page and look
+ * for related messages to find out what caused the problem. If you cannot
+ * resolve the problem, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting the read subchannel online failed with error code %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: errno
+ * Description:
+ * Setting the Common Link Access to Workstation (CLAW) device online failed
+ * with an error for the read subchannel. This problem occurs, for example, if
+ * the read subchannel used to create the CLAW group device is not defined as a
+ * CLAW read subchannel in the hardware definitions. The CLAW read subchannel
+ * must be for a 3088 device of type x'61' and have an even bus ID. The bus ID
+ * of the read subchannel matches the bus ID of the CLAW device.
+ * User action:
+ * Confirm that you are using the correct bus ID for the read subchannel. If
+ * necessary, ungroup the device and recreate it with the correct bus ID.
+ * Ensure that the read subchannel has been defined correctly to the real or
+ * virtual hardware, for example, in your IOCDS or in your z/VM configuration.
+ * Ensure that a valid number of read buffers has been assigned to the device.
+ * See 'Device Drivers, Features, and Commands' for details about the read
+ * buffers. See the errno man page for information about the error code.
+ */
+
+/*?
+ * Text: "%s: Setting the write subchannel online failed with error code %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ *   @2: errno
+ * Description:
+ * Setting the Common Link Access to Workstation (CLAW) device online failed
+ * with an error for the write subchannel. This problem occurs, for example, if
+ * the write subchannel used to create the CLAW group device is not defined as a
+ * CLAW write subchannel in the hardware definitions. The CLAW write subchannel
+ * must be for a 3088 device of type x'61' and have an uneven bus ID. The
+ * bus ID of the write subchannel can be found from the symbolic link
+ * /sys/bus/ccwgroup/drivers/claw/<device-bus-ID>/cdev1 where <device-bus-ID>
+ * is the bus ID of the CLAW device.
+ * User action:
+ * Confirm that you are using the correct bus ID for the write subchannel. If
+ * necessary, ungroup the device and recreate it with the correct bus ID.
+ * Ensure that the write subchannel has been defined correctly to the real or
+ * virtual hardware, for example, in your IOCDS or in your z/VM configuration.
+ * Ensure that a valid number of write buffers has been assigned to the device.
+ * See 'Device Drivers, Features, and Commands' for details about the write
+ * buffers. See the errno man page for information about the error code.
+ */
+
+/*?
+ * Text: "%s: Activating the CLAW device failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CLAW device
+ * Description:
+ * Activating the Common Link Access to Workstation (CLAW) device failed. A
+ * possible cause of this problem is memory constraints.
+ * User action:
+ * Free some memory and try again to activate the CLAW device. If the problem
+ * persists, contact your support organization.
+ */
+
+/*?
+ * Text: "Registering with the S/390 debug feature failed with error code %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: errno
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver failed to register
+ * with the S/390 debug feature. No debug traces will be available for CLAW.
+ * User action:
+ * Enter 'lsmod | grep dbf' or an equivalent command to check if the S/390 debug
+ * feature is loaded. If the output does not show the dbf module, the S/390 debug
+ * feature has not been loaded. Unload the CLAW device driver, load the debug
+ * feature, then reload the CLAW device driver. See the errno man page for
+ * information about the error code.
+ */
+
+/*?
+ * Text: "Registering with the cu3088 device driver failed with error code %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: errno
+ * Description:
+ * The Common Link Access to Workstation (CLAW) device driver failed to register
+ * with the cu3088 channel subsystem device driver. The CLAW device driver
+ * requires the cu3088 device driver.
+ * User action:
+ * Enter 'lsmod | grep cu3088' or an equivalent command to check if the cu3088
+ * device driver is loaded. If the output does not show the cu3088 module,
+ * unload the CLAW device driver, load the cu3088 device driver, then reload
+ * the CLAW device driver. See the errno man page for information about the
+ * error code.
+ */
+
+/*? Text: "%s: %s: CLAW device %.8s: Received Control Packet\n" */
+/*? Text: "%s: %s: CLAW device %.8s: System validate completed.\n" */
+/*? Text: "%s: %s: CLAW device %.8s: Connection completed link_id=%d.\n" */
+/*? Text: "%s: %s: remote side is not ready\n" */
+/*? Text: "%s: %s: write connection restarting\n" */
+/*? Text: "%s: %s: subchannel check for device: %04x - Sch Stat %02x  Dev Stat %02x CPA - %04x\n" */
+/*? Text: "%s: %s: Unit Exception occurred in write channel\n" */
+/*? Text: "%s: %s: Resetting Event occurred:\n" */
+/*? Text: "%s: %s: Recv Conn Confirm:Vers=%d,link_id=%d,Corr=%d,Host appl=%.8s,WS appl=%.8s\n" */
+/*? Text: "%s: %s: Recv Conn Req: Vers=%d,link_id=%d,Corr=%d,HOST appl=%.8s,WS appl=%.8s\n" */
+/*? Text: "%s: %s: Recv Sys Validate Request: Vers=%d,link_id=%d,Corr=%d,WS name=%.8s,Host name=%.8s\n" */
+/*? Text: "%s: %s: Confirmed Now packing\n" */
+/*? Text: "%s: %s: Unit Check Occured in write channel\n" */
+/*? Text: "%s: %s: Restart is required after remote side recovers \n" */
+/*? Text: "%s: %s: sys Validate Rsize:%d Wsize:%d\n" */
+/*? Text: "%s: %s:readsize=%d  writesize=%d readbuffer=%d writebuffer=%d read=0x%04x write=0x%04x\n" */
+/*? Text: "%s: %s:host_name:%.8s, adapter_name :%.8s api_type: %.8s\n" */
+/*? Text: "Driver unloaded\n" */
+/*? Text: "Loading %s\n" */
+/*? Text: "%s:  will be removed.\n" */
+/*? Text: "%s: add for %s\n" */
+/*? Text: "%s: %s: shutting down \n" */
+/*? Text: "%s: CLAW device %.8s: System validate completed.\n" */
+/*? Text: "%s: %s: Disconnect: Vers=%d,link_id=%d,Corr=%d\n" */
+/*? Text: "%s: %s: Recv Conn Resp: Vers=%d,link_id=%d,Corr=%d,RC=%d,Host appl=%.8s, WS appl=%.8s\n" */
diff --git a/Documentation/kmsg/s390/cpcmd b/Documentation/kmsg/s390/cpcmd

new file mode 100644 (file)

index 0000000..6f6b0c4
--- /dev/null
+++ b/Documentation/kmsg/s390/cpcmd
@@ -0,0 +1,17 @@
+/*?
+ * Text: "The cpcmd kernel function failed to allocate a response buffer\n"
+ * Severity: Warning
+ * Description:
+ * IPL code, console detection, and device drivers like vmcp or vmlogrdr use
+ * the cpcmd kernel function to send commands to the z/VM control program (CP).
+ * If a program that uses the cpcmd function does not allocate a contiguous
+ * response buffer below 2 GB guest real storage, cpcmd creates a bounce buffer
+ * to be used as the response buffer. Because of low memory or memory
+ * fragmentation, cpcmd could not create the bounce buffer.
+ * User action:
+ * Look for related page allocation failure messages and at the stack trace to
+ * find out which program or operation failed. Free some memory and retry the
+ * failed operation. Consider allocating more memory to your z/VM guest virtual
+ * machine.
+ */
+
diff --git a/Documentation/kmsg/s390/cpu b/Documentation/kmsg/s390/cpu

new file mode 100644 (file)

index 0000000..fba2eca
--- /dev/null
+++ b/Documentation/kmsg/s390/cpu
@@ -0,0 +1,69 @@
+/*?
+ * Text: "Processor %d started, address %d, identification %06X\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: logical CPU number
+ *   @2: CPU address
+ *   @3: CPU identification number
+ * Description:
+ * The kernel detected a CPU with the given characteristics.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Processor %d stopped\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: logical CPU number
+ * Description:
+ * A logical CPU has been set offline.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%d configured CPUs, %d standby CPUs\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: number of configured CPUs
+ *   @2: number of standby CPUs
+ * Description:
+ * The kernel detected the given number of configured and standby CPUs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The CPU configuration topology of the machine is:"
+ * Severity: Informational
+ * Description:
+ * The first six values of the topology information represent fields Mag6 to
+ * Mag1 of system-information block (SYSIB) 15.1.2. These fields specify the
+ * maximum numbers of topology-list entries (TLE) at successive topology nesting
+ * levels. The last value represents the MNest value of SYSIB 15.1.2 which
+ * specifies the maximum possible nesting that can be configured through
+ * dynamic changes. For details see the SYSIB 15.1.2 information in the
+ * "Principles of Operation."
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "CPU %i exceeds the maximum %i and is excluded from the dump\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: CPU number
+ *   @2: maximum CPU number
+ * Description:
+ * The Linux kernel is used as a system dumper but it runs on more CPUs than
+ * it has been compiled for with the CONFIG_NR_CPUS kernel configuration
+ * option. The system dump will be created but information on one or more
+ * CPUs will be missing.
+ * User action:
+ * Update the system dump kernel to a newer version that supports more
+ * CPUs or reduce the number of installed CPUs and reproduce the problem
+ * that should be analyzed. If you send the system dump that prompted this
+ * message to a support organization, be sure to communicate that the dump
+ * does not include all CPU information.
+ */
diff --git a/Documentation/kmsg/s390/ctcm b/Documentation/kmsg/s390/ctcm

new file mode 100644 (file)

index 0000000..1f30140
--- /dev/null
+++ b/Documentation/kmsg/s390/ctcm
@@ -0,0 +1,199 @@
+/*?
+ * Text: "%s: An I/O-error occurred on the CTCM device\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ * Description:
+ * An I/O error was detected on one of the subchannels of the CTCM device.
+ * Depending on the error, the CTCM device driver might attempt an automatic
+ * recovery.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ */
+
+/*?
+ * Text: "%s: An adapter hardware operation timed out\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ * Description:
+ * The CTCM device uses an adapter to physically connect to its communication
+ * peer. An operation on this adapter timed out.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ */
+
+/*?
+ * Text: "%s: An error occurred on the adapter hardware\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ * Description:
+ * The CTCM device uses an adapter to physically connect to its communication
+ * peer. An operation on this adapter returned an error.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ */
+
+/*?
+ * Text: "%s: The communication peer has disconnected\n"
+ * Severity: Notice
+ * Parameter:
+ *   @1: channel ID
+ * Description:
+ * The remote device has disconnected. Possible reasons are that the remote
+ * interface has been closed or that the operating system instance with the
+ * communication peer has been rebooted or shut down.
+ * User action:
+ * Check the status of the peer device. Ensure that the peer operating system
+ * instance is running and that the peer interface is operational.
+ */
+
+/*?
+ * Text: "%s: The remote operating system is not available\n"
+ * Severity: Notice
+ * Parameter:
+ *   @1: channel ID
+ * Description:
+ * The operating system instance with the communication peer has disconnected.
+ * Possible reasons are that the operating system instance has been rebooted
+ * or shut down.
+ * User action:
+ * Ensure that the peer operating system instance is running and that the peer
+ * interface is operational.
+ */
+
+/*?
+ * Text: "%s: The adapter received a non-specific IRQ\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ * Description:
+ * The adapter hardware used by the CTCM device received an IRQ that cannot
+ * be mapped to a particular device. This is a hardware problem.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. Check if
+ * the connection to the remote device still works. If the CTCM device is not
+ * operational, set it offline and back online. If this does not resolve the
+ * problem, perform a manual recovery. See "Device Drivers, Features, and
+ * Commands" for details about how to recover a CTCM device. If this problem
+ * persists, gather Linux debug data, collect the hardware logs, and report the
+ * problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: A check occurred on the subchannel\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ * Description:
+ * A check condition has been detected on the subchannel.
+ * User action:
+ * Check if the connection to the remote device still works. If the CTCM device
+ * is not operational, set it offline and back online. If this does not resolve
+ * the problem, perform a manual recovery. See "Device Drivers, Features, and
+ * Commands" for details about how to recover a CTCM device. If this problem
+ * persists, gather Linux debug data and report the problem to your support
+ * organization.
+ */
+
+/*?
+ * Text: "%s: The communication peer is busy\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: channel ID
+ * Description:
+ * A busy target device was reported. This might be a temporary problem.
+ * User action:
+ * If this problem persists or is reported frequently, ensure that the target
+ * device is working properly.
+ */
+
+/*?
+ * Text: "%s: The specified target device is not valid\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: channel ID
+ * Description:
+ * A target device was called with a faulty device specification. This is an
+ * adapter hardware problem.
+ * User action:
+ * Gather Linux debug data, collect the hardware logs, and contact IBM support.
+ */
+
+/*?
+ * Text: "An I/O operation resulted in error %04x\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: channel ID
+ *   @2: error information
+ * Description:
+ * A hardware operation ended with an error.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ * If this problem persists, gather Linux debug data, collect the hardware logs,
+ * and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Initialization failed with RX/TX init handshake error %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ *   @2: error information
+ * Description:
+ * A problem occurred during the initialization of the connection. If the
+ * connection can be established after an automatic recovery, a success message
+ * is issued.
+ * User action:
+ * If the problem is not resolved by the automatic recovery process, check the
+ * local and remote device. If this problem persists, gather Linux debug data
+ * and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The network backlog for %s is exceeded, package dropped\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ *   @2: calling function
+ * Description:
+ * There is more network traffic than can be handled by the device. The device
+ * is closed and some data has not been transmitted. The device might be
+ * recovered automatically.
+ * User action:
+ * Investigate and resolve the congestion. If necessary, set the device
+ * online to make it operational.
+ */
+
+/*?
+ * Text: "%s: The XID used in the MPC protocol is not valid, rc = %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the CTCM device
+ *   @2: return code
+ * Description:
+ * The exchange identification (XID) used by the CTCM device driver when
+ * in MPC mode is not valid.
+ * User action:
+ * Note the error information provided with this message and contact your
+ * support organization.
+ */
+
+/*? Text: "CTCM driver unloaded\n" */
+/*? Text: "%s: %s Internal error: net_device is NULL, ch = 0x%p\n" */
+/*? Text: "%s / register_cu3088_discipline failed, ret = %d\n" */
+/*? Text: "%s: %s: Internal error: Can't determine channel for interrupt device %s\n" */
+/*? Text: "CTCM driver initialized\n" */
+/*? Text: "%s: setup OK : r/w = %s/%s, protocol : %d\n" */
+/*? Text: "%s: Connected with remote side\n" */
+/*? Text: "%s: Restarting device\n" */
+
diff --git a/Documentation/kmsg/s390/dasd b/Documentation/kmsg/s390/dasd

new file mode 100644 (file)

index 0000000..d3c5342
--- /dev/null
+++ b/Documentation/kmsg/s390/dasd
@@ -0,0 +1,466 @@
+/* dasd_ioctl */
+
+/*?
+ * Text: "%s: The DASD has been put in the quiesce state\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * No I/O operation is possible on this device.
+ * User action:
+ * Resume the DASD to enable I/O operations.
+ */
+
+/*?
+ * Text: "%s: I/O operations have been resumed on the DASD\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD is no longer in state quiesce and I/O operations can be performed
+ * on the device.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be formatted while it is enabled\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD you try to format is enabled. Enabled devices cannot be formatted.
+ * User action:
+ * Contact the owner of the formatting tool.
+ */
+
+/*?
+ * Text: "%s: The specified DASD is a partition and cannot be formatted\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD you try to format is a partition. Partitions cannot be formatted
+ * separately. You can only format a complete DASD including all its partitions.
+ * User action:
+ * Format the complete DASD.
+ * ATTENTION: Formatting irreversibly destroys all data on all partitions
+ * of the DASD.
+ */
+
+/*?
+ * Text: "%s: Formatting unit %d failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: start track
+ *   @3: return code
+ * Description:
+ * The formatting process might have been interrupted by a signal, for example,
+ * CTRL+C. If the process was not interrupted intentionally, an I/O error
+ * might have occurred.
+ * User action:
+ * Try again to format the device. If the error persists, check the log file for
+ * related error messages. If you cannot resolve the error, note the return
+ * code and contact your support organization.
+ */
+
+
+/* dasd */
+
+/*?
+ * Text: "%s: start_IO run out of retries and failed with request %s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ * Description:
+ * The start IO function tried to start an I/O request but the number
+ * of retries for the I/O was exceeded before the request could be started.
+ * User action:
+ * Check for related previous error messages.
+ */
+
+/*?
+ * Text: "%s: Cancelling request %p failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ *   @3: return code of previous function
+ * Description:
+ * In response to a user action, the DASD device driver tried but failed to
+ * cancel a previously started I/O operation.
+ * User action:
+ * Try the action again.
+ */
+
+/*?
+ * Text: "%s: Flushing the DASD request queue failed for request %p\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ * Description:
+ * As part of the unloading process, the DASD device driver flushes the
+ * request queue. This failed because a previously started I/O operation
+ * could not be canceled.
+ * User action:
+ * Try again to unload the DASD device driver or to shut down Linux.
+ */
+
+/*?
+ * Text: "The DASD device driver could not be initialized\n"
+ * Severity: Informational
+ * Description:
+ * The initialization of the DASD device driver failed because of previous
+ * errors.
+ * User action:
+ * Check for related previous error messages.
+ */
+
+/*?
+ * Text: "%s: Accessing the DASD failed because it is in probeonly mode\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The dasd= module or kernel parameter specified the probeonly attribute for
+ * the DASD you are trying to access. The DASD device driver cannot access
+ * DASDs that are in probeonly mode.
+ * User action:
+ * Change the dasd= parameter so as to omit probeonly for the DASD and reload
+ * the DASD device driver. If the DASD device driver has been compiled into
+ * the kernel, reboot Linux.
+ */
+
+/*?
+ * Text: "%s: cqr %p timed out (%is), %i retries remaining\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ *   @3: timeout value
+ *   @4: number of retries left
+ * Description:
+ * One try of the error recovery procedure (ERP) for the channel queued request
+ * (cqr) timed out and failed to recover the error. ERP continues for the DASD.
+ * User action:
+ * Ignore this message if it occurs infrequently and if the recovery succeeds
+ * during one of the retries. If this error persists, check for related
+ * previous error messages and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: cqr %p timed out (%is) but cannot be ended, retrying in 5 s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ *   @3: timeout value
+ * Description:
+ * A try of the error recovery procedure (ERP) for the channel queued request
+ * (cqr) timed out and failed to recover the error. The I/O request submitted
+ * during the try could not be canceled. The ERP waits for 5 seconds before
+ * trying again.
+ * User action:
+ * Ignore this message if it occurs infrequently and if the recovery succeeds
+ * during one of the retries. If this error persists, check for related
+ * previous error messages and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be set offline while it is in use\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD cannot be set offline because it is in use by an internal process.
+ * An action to free the DASD might not have completed yet.
+ * User action:
+ * Wait some time and set the DASD offline later.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be set offline with open count %i\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: count
+ * Description:
+ * The DASD is being used by one or more processes and cannot be set offline.
+ * User action:
+ * Ensure that the DASD is not in use anymore, for example, unmount all
+ * partitions. Then try again to set the DASD offline.
+ */
+
+/*?
+ * Text: "%s: Setting the DASD online failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: return code
+ * Description:
+ * The DASD could not be set online because of previous errors.
+ * User action:
+ * Look for previous error messages. If you cannot resolve the error, note
+ * the return code and contact your support organization.
+ */
+
+/*?
+ * Text: "%s Setting the DASD online with discipline %s failed with rc=%i\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: discipline
+ *   @3: return code
+ * Description:
+ * The DASD could not be set online because of previous errors.
+ * User action:
+ * Look for previous error messages. If you cannot resolve the error, note the
+ * return code and contact your support organization.
+ */
+
+/*?
+ * Text: "%s Setting the DASD online failed because of missing DIAG discipline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD was to be set online with discipline DIAG but this discipline of
+ * the DASD device driver is not available.
+ * User action:
+ * Ensure that the dasd_diag_mod module is loaded. If your Linux system does
+ * not include this module, you cannot set DASDs online with the DIAG
+ * discipline.
+ */
+
+/*?
+ * Text: "%s Setting the DASD online failed because of a missing discipline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD was to be set online with a DASD device driver discipline that
+ * is not available.
+ * User action:
+ * Ensure that all DASD modules are loaded correctly.
+ */
+
+---------------------------
+
+/*?
+ * Text: "The statistics feature has been switched off\n"
+ * Severity: Informational
+ * Description:
+ * The statistics feature of the DASD device driver has been switched off.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The statistics feature has been switched on\n"
+ * Severity: Informational
+ * Description:
+ * The statistics feature of the DASD device driver has been switched on.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The statistics have been reset\n"
+ * Severity: Informational
+ * Description:
+ * The DASD statistics data have been reset.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s is not a supported value for /proc/dasd/statistics\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: value
+ * Description:
+ * An incorrect value has been written to /proc/dasd/statistics.
+ * The supported values are: 'set on', 'set off', and 'reset'.
+ * User action:
+ * Write a supported value to /proc/dasd/statistics.
+ */
+
+/*?
+ * Text: "%s is not a valid device range\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: range
+ * Description:
+ * A device range specified with the dasd= parameter is not valid.
+ * User action:
+ * Examine the dasd= parameter and correct the device range.
+ */
+
+/*?
+ * Text: "The probeonly mode has been activated\n"
+ * Severity: Informational
+ * Description:
+ * The probeonly mode of the DASD device driver has been activated. In this
+ * mode the device driver rejects any 'open' syscalls with EPERM.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The IPL device is not a CCW device\n"
+ * Severity: Error
+ * Description:
+ * The value for the dasd= parameter contains the 'ipldev' keyword. During
+ * the boot process this keyword is replaced with the device from which the
+ * IPL was performed. The 'ipldev' keyword is not valid if the IPL device is
+ * not a CCW device.
+ * User action:
+ * Do not specify the 'ipldev' keyword when performing an IPL from a device
+ * other than a CCW device.
+ */
+
+/*?
+ * Text: "A closing parenthesis ')' is missing in the dasd= parameter\n"
+ * Severity: Warning
+ * Description:
+ * The specification for the dasd= kernel or module parameter has an opening
+ * parenthesis '(' without a matching closing parenthesis ')'.
+ * User action:
+ * Correct the parameter value.
+ */
+
+/*?
+ * Text: "The autodetection mode has been activated\n"
+ * Severity: Informational
+ * Description:
+ * The autodetection mode of the DASD device driver has been activated. In
+ * this mode the DASD device driver sets all detected DASDs online.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%*s is not a supported device option\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: length of option code
+ *   @2: option code
+ * Description:
+ * The dasd= parameter includes an unknown option for a DASD or a device range.
+ * Options are specified in parentheses and immediately follow a device or
+ * device range.
+ * User action:
+ * Check the dasd= syntax and remove any unsupported options from the dasd=
+ * parameter specification.
+ */
+
+/*?
+ * Text: "PAV support has be deactivated\n"
+ * Severity: Informational
+ * Description:
+ * The 'nopav' keyword has been specified with the dasd= kernel or module
+ * parameter. The Parallel Access Volume (PAV) support of the DASD device
+ * driver has been deactivated.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "'nopav' is not supported on z/VM\n"
+ * Severity: Informational
+ * Description:
+ * For Linux instances that run as guest operating systems of the z/VM
+ * hypervisor, Parallel Access Volume (PAV) support is controlled by z/VM,
+ * not by Linux.
+ * User action:
+ * Remove 'nopav' from the dasd= module or kernel parameter specification.
+ */
+
+/*?
+ * Text: "High Performance FICON support has been deactivated\n"
+ * Severity: Informational
+ * Description:
+ * The 'nofcx' keyword has been specified with the dasd= kernel or module
+ * parameter. The High Performance FICON (transport mode) support of the DASD
+ * device driver has been deactivated.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The dasd= parameter value %s has an invalid ending\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: parameter value
+ * Description:
+ * The specified value for the dasd= kernel or module parameter is not correct.
+ * User action:
+ * Check the module or the kernel parameter.
+ */
+
+/*?
+ * Text: "Registering the device driver with major number %d failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: DASD major
+ * Description:
+ * Major number 94 is reserved for the DASD device driver. The DASD device
+ * driver failed to register with this major number. Another device driver
+ * might have used major number 94.
+ * User action:
+ * Determine which device driver uses major number 94 instead of the DASD
+ * device driver and unload this device driver. Then try again to load the
+ * DASD device driver.
+ */
+
+/*?
+  * Text: "%s: default ERP has run out of retries and failed\n"
+  * Severity: Error
+  * Parameter:
+  *   @1: bus ID of the DASD
+  * Description:
+  * The error recovery procedure (ERP) tried to recover an error but the number
+  * of retries for the I/O was exceeded before the error could be resolved.
+  * User action:
+  * Check for related previous error messages.
+  */
+
+/*?
+ * Text: "%s: Unable to terminate request %p on suspend\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to request
+ * Description:
+ * As part of the suspend process, the DASD device driver terminates requests
+ * on the request queue. This failed because a previously started I/O operation
+ * could not be canceled. The suspend process will be stopped.
+ * User action:
+ * Try again to suspend the system.
+ */
+
+/*?
+ * Text: "%s: ERP failed for the DASD\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An error recovery procedure (ERP) was performed for the DASD but failed.
+ * User action:
+ * Check the message log for previous related error messages.
+ */
+
+/*?
+ * Text: "%s: An error occurred in the DASD device driver, reason=%s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: reason code
+ * Description:
+ * This problem indicates a program error in the DASD device driver.
+ * User action:
+ * Note the reason code and contact your support organization.
+*/
diff --git a/Documentation/kmsg/s390/dasd-diag b/Documentation/kmsg/s390/dasd-diag

new file mode 100644 (file)

index 0000000..d276860
--- /dev/null
+++ b/Documentation/kmsg/s390/dasd-diag
@@ -0,0 +1,118 @@
+/* dasd_diag */
+
+/*?
+ * Text: "%s: A 64-bit DIAG call failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * 64-bit DIAG calls require a 64-bit z/VM version.
+ * User action:
+ * Use z/VM 5.2 or later or set the sysfs 'use_diag' attribute of the DASD to 0
+ * to switch off DIAG.
+ */
+
+/*?
+ * Text: "%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: return code
+ * Description:
+ * The format of the DASD is not correct.
+ * User action:
+ * Check the device format. For details about the return code see the
+ * section about the INITIALIZE function for DIAGNOSE Code X'250'
+ * in "z/VM CP Programming Services". If you cannot resolve the error, note
+ * the return code and contact your support organization.
+ */
+
+/*?
+ * Text: "%s: New DASD with %ld byte/block, total size %ld KB%s\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: bytes per block
+ *   @3: size
+ *   @4: access mode
+ * Description:
+ * A DASD with the indicated block size and total size has been set online.
+ * If the DASD is configured as read-only to the real or virtual hardware,
+ * the message includes an indication of this hardware access mode. The
+ * hardware access mode is independent from the 'readonly' attribute of
+ * the device in sysfs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: DIAG ERP failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: return code
+ * Description:
+ * An error in the DIAG processing could not be recovered by the error
+ * recovery procedure (ERP) of the DIAG discipline.
+ * User action:
+ * Note the return code, check for related I/O errors, and report this problem
+ * to your support organization.
+ */
+
+/*?
+ * Text: "%s: DIAG initialization failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: return code
+ * Description:
+ * Initializing the DASD with the DIAG discipline failed. Possible reasons for
+ * this problem are that the device has a device type other than FBA or ECKD,
+ * or has a block size other than one of the supported sizes:
+ * 512 byte, 1024 byte, 2048 byte, or 4096 byte.
+ * User action:
+ * Ensure that the device can be written to and has a supported device type
+ * and block size. For details about the return code see the section about
+ * the INITIALIZE function for DIAGNOSE Code X'250' in "z/VM CP Programming
+ * Services". If you cannot resolve the error, note the return code and contact
+ * your support organization.
+ */
+
+/*?
+ * Text: "%s: Device type %d is not supported in DIAG mode\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: device type
+ * Description:
+ * Only DASDs of type FBA and ECKD are supported in DIAG mode.
+ * User action:
+ * Set the sysfs 'use_diag' attribute of the DASD to 0 and try again to access
+ * the DASD.
+ */
+
+/*?
+ * Text: "Discipline %s cannot be used without z/VM\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: discipline name
+ * Description:
+ * The discipline that is specified with the dasd= kernel or module parameter
+ * is only available for Linux instances that run as guest operating
+ * systems of the z/VM hypervisor.
+ * User action:
+ * Remove the unsupported discipline from the parameter string.
+ */
+
+/*?
+ * Text: "%s: The access mode of a DIAG device changed to read-only"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A device changed its access mode from writeable to
+ * read-only while in use.
+ * User action:
+ * Set the device offline, ensure that the device is configured correctly in
+ * z/VM, then set the device online again.
+ */
diff --git a/Documentation/kmsg/s390/dasd-eckd b/Documentation/kmsg/s390/dasd-eckd

new file mode 100644 (file)

index 0000000..39149af
--- /dev/null
+++ b/Documentation/kmsg/s390/dasd-eckd
@@ -0,0 +1,1901 @@
+/* dasd_eckd */
+
+/*?
+ * Text: "%s: ERP failed for the DASD\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An error recovery procedure (ERP) was performed for the DASD but failed.
+ * User action:
+ * Check the message log for previous related error messages.
+ */
+
+/*?
+ * Text: "%s: An error occurred in the DASD device driver, reason=%s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: reason code
+ * Description:
+ * This problem indicates a program error in the DASD device driver.
+ * User action:
+ * Note the reason code and contact your support organization.
+*/
+
+/*?
+ * Text: "%s: Allocating memory for private DASD data failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD device driver maintains data structures for each DASD it manages.
+ * There is not enough memory to allocate these data structures for one or
+ * more DASD.
+ * User action:
+ * Free some memory and try the operation again.
+ */
+
+/*?
+ * Text: "%s: DASD with %d KB/block, %d KB total size, %d KB/track, %s\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: block size
+ *   @3: DASD size
+ *   @4: track size
+ *   @5: disk layout
+ * Description:
+ * A DASD with the shown characteristics has been set online.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Start track number %d used in formatting is too big\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: track number
+ * Description:
+ * The DASD format I/O control was used incorrectly by a formatting tool.
+ * User action:
+ * Contact the owner of the formatting tool.
+ */
+
+/*?
+ * Text: "%s: The DASD is not formatted\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A DASD has been set online but it has not been formatted yet. You must
+ * format the DASD before you can use it.
+ * User action:
+ * Format the DASD, for example, with dasdfmt.
+ */
+
+/*?
+ * Text: "%s: 0x%x is not a known command\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: command
+ * Description:
+ * This problem is likely to be caused by a programming error.
+ * User action:
+ * Contact your support organization.
+ */
+
+/*?
+ * Text: "%s: Track 0 has no records following the VTOC\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * Linux has identified a volume table of contents (VTOC) on the DASD but
+ * cannot read any data records following the VTOC. A possible cause of this
+ * problem is that the DASD has been used with another System z operating
+ * system.
+ * User action:
+ * Format the DASD for usage with Linux, for example, with dasdfmt.
+ * ATTENTION: Formatting irreversibly destroys all data on the DASD.
+ */
+
+/*?
+ * Text: "%s: An I/O control call used incorrect flags 0x%x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: flags
+ * Description:
+ * The DASD format I/O control was used incorrectly.
+ * User action:
+ * Contact the owner of the formatting tool.
+ */
+
+/*?
+ * Text: "%s: New DASD %04X/%02X (CU %04X/%02X) with %d cylinders, %d heads, %d sectors\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: device type
+ *   @3: device model
+ *   @4: control unit type
+ *   @5: control unit model
+ *   @6: number of cylinders
+ *   @7: tracks per cylinder
+ *   @8: sectors per track
+ * Description:
+ * A DASD with the shown characteristics has been set online.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The disk layout of the DASD is not supported\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD device driver only supports the following disk layouts: CDL, LDL,
+ * FBA, CMS, and CMS RESERVED.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Start track %d used in formatting exceeds end track\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: track number
+ * Description:
+ * The DASD format I/O control was used incorrectly by a formatting tool.
+ * User action:
+ * Contact the owner of the formatting tool.
+ */
+
+/*?
+ * Text: "%s: The DASD cache mode was set to %x (%i cylinder prestage)\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: operation mode
+ *   @3: number of cylinders
+ * Description:
+ * The DASD cache mode has been changed. See the storage system documentation
+ * for information about the different cache operation modes.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be formatted with block size %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: block size
+ * Description:
+ * The block size specified for a format instruction is not valid. The block
+ * size must be between 512 and 4096 bytes and must be a power of 2.
+ * User action:
+ * Call the format command with a supported block size.
+ */
+
+/*?
+ * Text: "%s: The UID of the DASD has changed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The Unique Identifier (UID) of a DASD that is currently in use has changed.
+ * This indicates that the physical disk has been replaced.
+ * User action:
+ * None if the replacement was intentional.
+ * If the disk change is not expected, stop using the disk to prevent possible
+ * data loss.
+*/
+
+
+/* dasd_3990_erp */
+
+/*?
+ * Text: "%s: is offline or not installed - INTERVENTION REQUIRED!!\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD to be accessed is not in an accessible state. The I/O operation
+ * will wait until the device is operational again. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * Make the DASD accessible again. For details see the storage system
+ * documentation.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be reached on any path (lpum=%x/opm=%x)\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: last path used mask
+ *   @3: online path mask
+ * Description:
+ * After a path to the DASD failed, the error recovery procedure of the DASD
+ * device driver tried but failed to reconnect the DASD through an alternative
+ * path.
+ * User action:
+ * Ensure that the cabling between the storage server and the mainframe
+ * system is securely in place. Check the file systems on the DASD when it is
+ * accessible again.
+ */
+
+/*?
+ * Text: "%s: Unable to allocate DCTL-CQR\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an internal error.
+ * User action:
+ * Contact your support organization.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Invalid Parameter\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A data argument of a command is not valid. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - DPS Installation Check\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This operating system independent message is issued by the storage system
+ * for one of the following reasons:
+ * - A 3380 Model D or E DASD does not have the Dynamic Path Selection (DPS)
+ * feature in the DASD A-unit.
+ * - The device type of an attached DASD is not supported by the firmware.
+ * - A type 3390 DASD is attached to a 3 MB channel.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 2 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Drive motor switch is off\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - CCW Count less than required\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The CCW count of a command is less than required. This is an operating
+ * system independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Channel requested ... %02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: reason code
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system. The possible reason codes indicate the following problems:
+ * 00 No Message.
+ * 01 The channel has requested unit check sense data.
+ * 02 The channel has requested retry and retry is exhausted.
+ * 03 A SA Check-2 error has occurred. This sense is presented with
+ *    Equipment Check.
+ * 04 The channel has requested retry and retry is not possible.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Status Not As Required: reason %02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: reason code
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system. There are several potential reasons for this message;
+ * byte 8 contains the reason code.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Device status 1 not valid\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Storage Path Restart\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An operation for an active channel program was queued in a Storage Control
+ * when a warm start was received by the path. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Reset Notification\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A system reset or its equivalent was received on an interface. The Unit
+ * Check that generates this sense is posted to the next channel initiated
+ * selection following the resetting event. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Invalid Command Sequence\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An incorrect sequence of commands has occurred. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Missing device address bit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Subsystem Processing Error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A firmware logic error has been detected. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Seek incomplete\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Invalid Command\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A command was issued that is not in the 2107/1750 command set.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Command Invalid on Secondary Address\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A command or order not allowed on a PPRC secondary device has been received
+ * by the secondary device. This is an operating system independent message
+ * that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Invalid Defective/Alternate Track Pointer\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A defective track has been accessed. The subsystem generates an invalid
+ * Defective/Alternate Track Pointer as a part of RAID Recovery.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Channel Returned with Incorrect retry CCW\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The command portion of the CCW returned after a command retry sequence does
+ * not match the command for which retry was signaled. This is an operating
+ * system independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Diagnostic of Special Command Violates File Mask\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A command is not allowed under the Access Authorization specified by the
+ * File Mask. This is an operating system independent message that is issued
+ * by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Head address does not compare\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Device did not respond to selection\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Device check-2 error or Set Sector is not complete\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Device Error Source\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The device has completed soft error logging. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Data Pinned for Device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * Modified data in cache or in persistent storage exists for the DASD. The
+ * data cannot be destaged to the device. This track is the first track pinned
+ * for this device. This is an operating system independent message that is
+ * issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel C\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Device Status 1 not as expected\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 0 - Device Fenced - device = %02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: sense data byte 4
+ * Description:
+ * The device shown in sense byte 4 has been fenced. This is an operating
+ * system independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Interruption cannot be reset\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Index missing\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - DASD Fast Write inhibited\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * DASD Fast Write is not allowed because of a nonvolatile storage battery
+ * check condition. This is an operating system independent message that is
+ * issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Invalid tag-in for an extended command sequence\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Key area error; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Count area error; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Track physical address did not compare\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 2 - 3990 check-2 error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Offset active cannot be reset\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - RCC 1 and RCC 2 sequences not successful\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No syn byte in count address area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Data area error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel A\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in count address area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the key area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Caching status reset to default\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The storage director has assigned two new subsystem status devices and
+ * has reset the status to its default value. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the data area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Device not ready\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in key area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - DASD controller failed to set or reset the long busy latch\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 1 - Cylinder address did not compare\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 3 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No syn byte in data area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 2 - Support facility errors\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Key area error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - End operation with transfer count not zero\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 2 - Microcode detected error %02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: error code
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the count area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 3 - Allegiance terminated\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * Allegiance terminated because of a Reset Allegiance or an Unconditional
+ * Reserve command on another channel. This is an operating system independent
+ * message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Home address area error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Count area error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Invalid tag-in during selection sequence\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in data area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in home address area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Home address area error; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - Data area error; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in home address area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the home address area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the home address area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the count area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 4 - No sync byte in key area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Invalid DCC selection response or timeout\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the data area\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Operation Terminated\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The storage system ends an operation related to an active channel program
+ * when termination and redrive are required and logging is not desired.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel B\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 5 - Data Check in the key area; offset active\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Volume is suspended duplex\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The duplex pair volume has entered the suspended duplex state because of a
+ * failure. This is an operating system independent message that is issued by
+ * the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel D\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - RCC 1 sequence not successful\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel E\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - 3990 microcode time out when stopping selection\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel F\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - RCC initiated by a connection check alert\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel G\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - extra RCC required\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 6 - Overrun on channel H\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - Unexpected end operation response code\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Permanent path error (DASD controller not available)\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Missing end operation; device transfer incomplete\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT D - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Cache or nonvolatile storage equipment failure\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An equipment failure has occurred in the cache storage or nonvolatile
+ * storage of the storage system. This is an operating system independent
+ * message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - DPS cannot be filled\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - Error correction code hardware fault\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Missing end operation; device transfer complete\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - DASD controller not available on disconnected command chain\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - No interruption from device during a command chain\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - No response to selection after a poll interruption\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 9 - Track physical address did not compare while oriented\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 9 - Head address did not compare\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Invalid tag-in for an immediate command sequence\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 9 - Cylinder address did not compare\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - DPS checks after a system reset or selective reset\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Caching reinitiated\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * Caching has been automatically reinitiated following an error.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - End operation with transfer count zero\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 7 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 9 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - Short busy time-out during device selection\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Caching terminated\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The storage system was unable to initiate caching or had to suspend caching
+ * for a 3990 control unit. If this problem is caused by a failure condition,
+ * an additional message will provide more information about the failure.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * Check for additional messages that point out possible failures. For more
+ * information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Subsystem status cannot be determined\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The status of a DASD Fast Write or PPRC volume cannot be determined.
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Nonvolatile storage terminated\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The storage director has stopped using nonvolatile storage or cannot
+ * initiate nonvolatile storage. If this problem is caused by a failure, an
+ * additional message will provide more information about the failure. This is
+ * an operating system independent message that is issued by the storage system.
+ * User action:
+ * Check for additional messages that point out possible failures. For more
+ * information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT 8 - Reserved\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: Write inhibited path encountered\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an informational message.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: FORMAT 9 - Device check-2 error\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This is an operating system independent message that is issued by the
+ * storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Track format incorrect\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A track format error occurred while data was being written to the DASD or
+ * while a duplex pair was being established. This is an operating system
+ * independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: FORMAT F - Cache fast write access not authorized\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A request for Cache Fast Write Data access cannot be satisfied because
+ * of missing access authorization for the storage system. This is an operating
+ * system independent message that is issued by the storage system.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: Data recovered during retry with PCI fetch mode active\n"
+ * Severity: Emerg
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * A data error has been recovered on the storage system but the Linux file
+ * system cannot be informed about the data mismatch. To prevent Linux from
+ * running with incorrect data, the DASD device driver will trigger a kernel
+ * panic.
+ * User action:
+ * Reset your real or virtual hardware and reboot Linux.
+ */
+
+/*?
+ * Text: "%s: The specified record was not found\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The record to be accessed does not exist. The DASD might be unformatted
+ * or defective.
+ * User action:
+ * Try to format the DASD or replace it.
+ * ATTENTION: Formatting irreversibly destroys all data on the DASD.
+ */
+
+/*?
+ * Text: "%s: ERP %p (%02x) refers to %p\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: pointer to ERP
+ *   @3: ERP status
+ *   @4: cqr
+ * Description:
+ * This message provides debug information for the enhanced error recovery
+ * procedure (ERP).
+ * User action:
+ * If you do not need this information, you can suppress this message by
+ * switching off ERP logging, for example, by writing '0' to the 'erplog'
+ * sysfs attribute of the DASD.
+ */
+
+/*?
+ * Text: "%s: ERP chain at END of ERP-ACTION\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This message provides debug information for the enhanced error recovery
+ * procedure (ERP).
+ * User action:
+ * If you do not need this information, you can suppress this message by
+ * switching off ERP logging, for example, by writing '0' to the 'erplog'
+ * sysfs attribute of the DASD.
+ */
+
+/*?
+ * Text: "%s: The cylinder data for accessing the DASD is inconsistent\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An error occurred in the storage system hardware.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: Accessing the DASD failed because of a hardware error\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * An error occurred in the storage system hardware.
+ * User action:
+ * For more information see the documentation of your storage system.
+ */
+
+/*?
+ * Text: "%s: ERP chain at BEGINNING of ERP-ACTION\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * This message provides debug information for the enhanced error recovery
+ * procedure (ERP).
+ * User action:
+ * If you do not need this information, you can suppress this message by
+ * switching off ERP logging, for example, by writing '0' to the 'erplog'
+ * sysfs attribute of the DASD.
+ */
+
+/*?
+ * Text: "%s: ERP %p has run out of retries and failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: ERP pointer
+ * Description:
+ * The error recovery procedure (ERP) tried to recover an error but the number
+ * of retries for the I/O was exceeded before the error could be resolved.
+ * User action:
+ * Check for related previous error messages.
+ */
+
+/*?
+ * Text: "%s: ERP failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The error recovery procedure (ERP) tried to recover an error but has
+ * failed. A retry is not recommended. The I/O will also fail.
+ * User action:
+ * Check for related previous error messages.
+ */
+
+/*?
+ * Text: "%s: SIM - SRC: %02x%02x%02x%02x\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: sense byte
+ *   @3: sense byte
+ *   @4: sense byte
+ *   @5: sense byte
+ * Description:
+ * This error message is a System Information Message (SIM) generated by the
+ * storage system. The System Reference Code (SRC) defines the error in detail.
+ * User action:
+ * Look up the SRC in the storage server documentation.
+ */
+
+/*?
+ * Text: "%s: log SIM - SRC: %02x%02x%02x%02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: sense byte
+ *   @3: sense byte
+ *   @4: sense byte
+ *   @5: sense byte
+ * Description:
+ * This System Information Message (SIM) is generated by the storage system.
+ * The System Reference Code (SRC) defines the error in detail.
+ * User action:
+ * Look up the SRC in the storage server documentation.
+ */
+
+/*?
+ * Text: "%s: Reading device feature codes failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: return code
+ * Description:
+ * The device feature codes state which advanced features are supported by a
+ * device.
+ * Examples for advanced features are PAV or high performance FICON.
+ * Some early devices do not provide feature codes and no advanced features are
+ * available on these devices.
+ * User action:
+ * None, if the DASD does not provide feature codes. If the DASD provides
+ * feature codes, make sure that it is working correctly, then set it offline
+ * and back online.
+ */
+
+/*?
+ * Text: "%s: A channel path group could not be established\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * Initialization of a DASD did not complete because a channel path group
+ * could not be established.
+ * User action:
+ * Make sure that the DASD is working correctly, then try again to set it
+ * online. If initialization still fails, reboot.
+ */
+
+/*?
+ * Text: "%s: The DASD is not operating in multipath mode\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD channel path group could not be configured to use multipath mode.
+ * This might negatively affect I/O performance on this DASD.
+ * User action:
+ * Make sure that the DASD is working correctly, then try again to set it
+ * online. If initialization still fails, reboot.
+ */
+
+/*?
+ * Text: "%s: Detecting the DASD disk layout failed because of an I/O error\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The disk layout of the DASD could not be detected because of an unexpected
+ * I/O error. The DASD device driver treats the device like an unformatted DASD,
+ * and partitions on the device are not accessible.
+ * User action:
+ * If the DASD is formatted, make sure that the DASD is working correctly,
+ * then set it offline and back online. If the DASD is unformatted, format the
+ * DASD, for example, with dasdfmt.
+ * ATTENTION: Formatting irreversibly destroys all data on the DASD.
+ */
diff --git a/Documentation/kmsg/s390/dasd-fba b/Documentation/kmsg/s390/dasd-fba

new file mode 100644 (file)

index 0000000..4739a46
--- /dev/null
+++ b/Documentation/kmsg/s390/dasd-fba
@@ -0,0 +1,30 @@
+
+/*?
+ * Text: "%s: New FBA DASD %04X/%02X (CU %04X/%02X) with %d MB and %d B/blk\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the DASD
+ *   @2: device type
+ *   @3: device model
+ *   @4: control unit type
+ *   @5: control unit model
+ *   @6: size
+ *   @7: bytes per block
+ * Description:
+ * A DASD with the shown characteristics has been set online.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Allocating memory for private DASD data failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the DASD
+ * Description:
+ * The DASD device driver maintains data structures for each DASD it manages.
+ * There is not enough memory to allocate these data structures for one or
+ * more DASD.
+ * User action:
+ * Free some memory and try the operation again.
+ */
diff --git a/Documentation/kmsg/s390/dcssblk b/Documentation/kmsg/s390/dcssblk

new file mode 100644 (file)

index 0000000..70bfcef
--- /dev/null
+++ b/Documentation/kmsg/s390/dcssblk
@@ -0,0 +1,192 @@
+/*?
+ * Text: "Adjacent DCSSs %s and %s are not contiguous\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: name 1
+ *   @2: name 2
+ * Description:
+ * You can only map a set of two or more DCSSs to a single DCSS device if the
+ * DCSSs in the set form a contiguous memory space. The DCSS device cannot be
+ * created because there is a memory gap between two adjacent DCSSs.
+ * User action:
+ * Ensure that you have specified all DCSSs that belong to the set. Check the
+ * definitions of the DCSSs on the z/VM hypervisor to verify that they form
+ * a contiguous memory space.
+ */
+
+/*?
+ * Text: "DCSS %s and DCSS %s have incompatible types\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: name 1
+ *   @2: name 2
+ * Description:
+ * You can only map a set of two or more DCSSs to a single DCSS device if
+ * either all DCSSs in the set have the same type or if the set contains DCSSs
+ * of the two types EW and EN but no other type. The DCSS device cannot be
+ * created because at least two of the specified DCSSs are not compatible.
+ * User action:
+ * Check the definitions of the DCSSs on the z/VM hypervisor to verify that
+ * their types are compatible.
+ */
+
+/*?
+ * Text: "DCSS %s is of type SC and cannot be loaded as exclusive-writable\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * You cannot load a DCSS device in exclusive-writable access mode if the DCSS
+ * device maps to one or more DCSSs of type SC.
+ * User action:
+ * Load the DCSS in shared access mode.
+ */
+
+/*?
+ * Text: "DCSS device %s is removed after a failed access mode change\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * To change the access mode of a DCSS device, all DCSSs that map to the device
+ * were unloaded. Reloading the DCSSs for the new access mode failed and the
+ * device is removed.
+ * User action:
+ * Look for related messages to find out why the DCSSs could not be reloaded.
+ * If necessary, add the device again.
+ */
+
+/*?
+ * Text: "All DCSSs that map to device %s are saved\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * A save request has been submitted for the DCSS device. Changes to all DCSSs
+ * that map to the device are saved permanently.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Device %s is in use, its DCSSs will be saved when it becomes idle\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * A save request for the device has been deferred until the device becomes
+ * idle. Then changes to all DCSSs that the device maps to will be saved
+ * permanently.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "A pending save request for device %s has been canceled\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * A save request for the DCSSs that map to a DCSS device has been pending
+ * while the device was in use. This save request has been canceled. Changes to
+ * the DCSSs will not be saved permanently.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Loaded %s with total size %lu bytes and capacity %lu sectors\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: DCSS names
+ *   @2: total size in bytes
+ *   @3: total size in 512 byte sectors
+ * Description:
+ * The listed DCSSs have been verified as contiguous and successfully loaded.
+ * The displayed sizes are the sums of all DCSSs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Device %s cannot be removed because it is not a known device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * The DCSS device you are trying to remove is not known to the DCSS device
+ * driver.
+ * User action:
+ * List the entries under /sys/devices/dcssblk/ to see the names of the
+ * existing DCSS devices.
+ */
+
+/*?
+ * Text: "Device %s cannot be removed while it is in use\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * You are trying to remove a device that is in use.
+ * User action:
+ * Make sure that all users of the device close the device before you try to
+ * remove it.
+ */
+
+/*?
+ * Text: "Device %s has become idle and is being saved now\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * A save request for the DCSSs that map to a DCSS device has been pending
+ * while the device was in use. The device has become idle and all changes
+ * to the DCSSs are now saved permanently.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Writing to %s failed because it is a read-only device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * The DCSS device is in shared access mode and cannot be written to. Depending
+ * on the type of the DCSSs that the device maps to, you might be able to
+ * change the access mode to exclusive-writable.
+ * User action:
+ * If the DCSSs of the device are of type SC, do not attempt to write to the
+ * device. If the DCSSs of the device are of type ER or SR, change the access
+ * mode to exclusive-writable before writing to the device.
+ */
+
+/*?
+ * Text: "The address range of DCSS %s changed while the system was suspended\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * After resuming the system, the start address or end address of a DCSS does
+ * not match the address when the system was suspended. DCSSs must not be
+ * changed after the system was suspended.
+ * This error cannot be recovered. The system is stopped with a kernel panic.
+ * User action:
+ * Reboot Linux.
+ */
+
+/*?
+ * Text: "Suspending the system failed because DCSS device %s is writable\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * A system cannot be suspended if one or more DCSSs are accessed in exclusive-
+ * writable mode. DCSS segment types EW, SW, and EN are always writable and
+ * must be removed before a system is suspended.
+ * User action:
+ * Remove all DCSSs of segment types EW, SW, and EN by writing the DCSS name to
+ * the sysfs 'remove' attribute. Set the access mode for all DCSSs of segment
+ * types SR and ER to read-only by writing 1 to the sysfs 'shared' attribute of
+ * the DCSS. Then try again to suspend the system.
+ */
diff --git a/Documentation/kmsg/s390/extmem b/Documentation/kmsg/s390/extmem

new file mode 100644 (file)

index 0000000..96b3654
--- /dev/null
+++ b/Documentation/kmsg/s390/extmem
@@ -0,0 +1,290 @@
+/*?
+ * Text: "Querying a DCSS type failed with rc=%ld\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * The DCSS kernel interface used z/VM diagnose call X'64' to query the
+ * type of a DCSS. z/VM failed to determine the type and returned an error.
+ * User action:
+ * Look for related messages to find out which DCSS is affected.
+ * For details about the return codes see the section about DIAGNOSE Code
+ * X'64' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Loading DCSS %s failed with rc=%ld\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: DCSS name
+ *   @2: return code
+ * Description:
+ * The DCSS kernel interface used diagnose call X'64' to load a DCSS. z/VM
+ * failed to load the DCSS and returned an error.
+ * User action:
+ * For details about the return codes see the section about DIAGNOSE Code
+ * X'64' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "DCSS %s of range %p to %p and type %s loaded as exclusive-writable\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: DCSS name
+ *   @2: starting page address
+ *   @3: ending page address
+ *   @4: DCSS type
+ * Description:
+ * The DCSS was loaded successfully in exclusive-writable access mode.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "DCSS %s of range %p to %p and type %s loaded in shared access mode\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: DCSS name
+ *   @2: starting page address
+ *   @3: ending page address
+ *   @4: DCSS type
+ * Description:
+ * The DCSS was loaded successfully in shared access mode.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "DCSS %s is already in the requested access mode\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * A request to reload a DCSS with a new access mode has been rejected
+ * because the new access mode is the same as the current access mode.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "DCSS %s is in use and cannot be reloaded\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * Reloading a DCSS in a different access mode has failed because the DCSS is
+ * being used by one or more device drivers. The DCSS remains loaded with the
+ * current access mode.
+ * User action:
+ * Ensure that the DCSS is not used by any device driver, then try again to
+ * load the DCSS with the new access mode.
+ */
+
+/*?
+ * Text: "DCSS %s overlaps with used memory resources and cannot be reloaded\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The DCSS has been unloaded and cannot be reloaded because it overlaps with
+ * another loaded DCSS or with the memory of the z/VM guest virtual machine
+ * (guest storage).
+ * User action:
+ * Ensure that no DCSS is loaded that has overlapping memory resources
+ * with the DCSS you want to reload. If the DCSS overlaps with guest storage,
+ * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap
+ * for the DCSS. For details, see the section about the DCSS device driver in
+ * "Device Drivers, Features, and Commands".
+ */
+
+/*?
+ * Text: "Reloading DCSS %s failed with rc=%ld\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: DCSS name
+ *   @2: return code
+ * Description:
+ * The DCSS kernel interface used z/VM diagnose call X'64' to reload a DCSS
+ * in a different access mode. The DCSS was unloaded but z/VM failed to reload
+ * the DCSS.
+ * User action:
+ * For details about the return codes see the section about DIAGNOSE Code
+ * X'64' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "Unloading unknown DCSS %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The specified DCSS cannot be unloaded. The DCSS is known to the DCSS device
+ * driver but not to the DCSS kernel interface. This problem indicates a
+ * program error in extmem.c.
+ * User action:
+ * Report this problem to your support organization.
+ */
+
+/*?
+ * Text: "Saving unknown DCSS %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The specified DCSS cannot be saved. The DCSS is known to the DCSS device
+ * driver but not to the DCSS kernel interface. This problem indicates a
+ * program error in extmem.c.
+ * User action:
+ * Report this problem to your support organization.
+ */
+
+/*?
+ * Text: "Saving a DCSS failed with DEFSEG response code %i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: response-code
+ * Description:
+ * The DEFSEG z/VM CP command failed to permanently save changes to a DCSS.
+ * User action:
+ * Look for related messages to find the cause of this error. See also message
+ * HCP<response-code>E in the DEFSEG section of the "z/VM CP Command and
+ * Utility Reference".
+ */
+
+/*?
+ * Text: "Saving a DCSS failed with SAVESEG response code %i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: response-code
+ * Description:
+ * The SAVESEG z/VM CP command failed to permanently save changes to a DCSS.
+ * User action:
+ * Look for related messages to find the cause of this error. See also message
+ * HCP<response-code>E in the SAVESEG section of the "z/VM CP Command and
+ * Utility Reference".
+ */
+
+/*?
+ * Text: "DCSS %s cannot be loaded or queried\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * You cannot load or query the specified DCSS because it either is not defined
+ * in the z/VM hypervisor, or it is a class S DCSS, or it is above 2047 MB
+ * and the Linux system is a 31-bit system.
+ * User action:
+ * Use the CP command "QUERY NSS" to find out if the DCSS is a valid
+ * DCSS that can be loaded.
+ */
+
+/*?
+ * Text: "DCSS %s cannot be loaded or queried without z/VM\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * A DCSS is a z/VM resource. Your Linux instance is not running as a z/VM
+ * guest operating system and, therefore, cannot load DCSSs.
+ * User action:
+ * Load DCSSs only on Linux instances that run as z/VM guest operating systems.
+ */
+
+/*?
+ * Text: "Loading or querying DCSS %s resulted in a hardware error\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * Either the z/VM DIAGNOSE X'64' query or load call issued for the DCSS
+ * returned with an error.
+ * User action:
+ * Look for a previous extmem message to find the return code from the
+ * DIAGNOSE X'64' query or load call. For details about the return codes see
+ * the section about DIAGNOSE Code X'64' in "z/VM CP Programming Services".
+ */
+
+/*?
+ * Text: "DCSS %s has multiple page ranges and cannot be loaded or queried\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * You can only load or query a DCSS with multiple page ranges if:
+ * - The DCSS has 6 or fewer page ranges
+ * - The page ranges form a contiguous address space
+ * - The page ranges are of type EW or EN
+ * User action:
+ * Check the definition of the DCSS to make sure that the conditions for
+ * DCSSs with multiple page ranges are met.
+ */
+
+/*?
+ * Text: "%s needs used memory resources and cannot be loaded or queried\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * You cannot load or query the DCSS because it overlaps with an already
+ * loaded DCSS or with the memory of the z/VM guest virtual machine
+ * (guest storage).
+ * User action:
+ * Ensure that no DCSS is loaded that has overlapping memory resources
+ * with the DCSS you want to load or query. If the DCSS overlaps with guest
+ * storage, use the DEF STORE CONFIG z/VM CP command to create a sufficient
+ * storage gap for the DCSS. For details, see the section about the DCSS
+ * device driver in "Device Drivers, Features, and Commands".
+ */
+
+/*?
+ * Text: "DCSS %s is already loaded in a different access mode\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The DCSS you are trying to load has already been loaded in a different
+ * access mode. You cannot simultaneously load the DCSS in different modes.
+ * User action:
+ * Reload the DCSS in a different mode or load it with the same mode in which
+ * it has already been loaded.
+ */
+
+/*?
+ * Text: "There is not enough memory to load or query DCSS %s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The available memory is not enough to load or query the DCSS.
+ * User action:
+ * Free some memory and repeat the failed operation.
+ */
+
+/*?
+ * Text: "DCSS %s overlaps with used storage and cannot be loaded\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * You cannot load the DCSS because it overlaps with an already loaded DCSS
+ * or with the memory of the z/VM guest virtual machine (guest storage).
+ * User action:
+ * Ensure that no DCSS is loaded that has overlapping memory resources
+ * with the DCSS you want to load. If the DCSS overlaps with guest storage,
+ * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap
+ * for the DCSS. For details, see the section about the DCSS device driver in
+ * "Device Drivers, Features, and Commands".
+ */
+
+/*?
+ * Text: "DCSS %s exceeds the kernel mapping range (%lu) and cannot be loaded\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ *   @2: kernel mapping range in bytes
+ * Description:
+ * You cannot load the DCSS because it exceeds the kernel mapping range limit.
+ * User action:
+ * Ensure that the DCSS range is defined below the kernel mapping range.
+ */
+
diff --git a/Documentation/kmsg/s390/hvc_iucv b/Documentation/kmsg/s390/hvc_iucv

new file mode 100644 (file)

index 0000000..3a2972c
--- /dev/null
+++ b/Documentation/kmsg/s390/hvc_iucv
@@ -0,0 +1,122 @@
+/*?
+ * Text: "The z/VM IUCV HVC device driver cannot be used without z/VM\n"
+ * Severity: Notice
+ * Description:
+ * The z/VM IUCV hypervisor console (HVC) device driver requires the
+ * z/VM inter-user communication vehicle (IUCV).
+ * User action:
+ * Set "hvc_iucv=" to zero in the kernel parameter line and reboot Linux.
+ */
+
+/*?
+ * Text: "%lu is not a valid value for the hvc_iucv= kernel parameter\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: hvc_iucv_devices
+ * Description:
+ * The "hvc_iucv=" kernel parameter specifies the number of z/VM IUCV
+ * hypervisor console (HVC) terminal devices.
+ * The parameter value ranges from 0 to 8.
+ * If zero is specified, the z/VM IUCV HVC device driver is disabled
+ * and no IUCV-based terminal access is available.
+ * User action:
+ * Correct the "hvc_iucv=" setting in the kernel parameter line and
+ * reboot Linux.
+ */
+
+/*?
+ * Text: "Creating a new HVC terminal device failed with error code=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: errno
+ * Description:
+ * The device driver initialization failed to allocate a new
+ * HVC terminal device.
+ * A possible cause of this problem is memory constraints.
+ * User action:
+ * If the error code is -12 (ENOMEM), consider assigning more memory
+ * to your z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "Registering HVC terminal device as Linux console failed\n"
+ * Severity: Error
+ * Description:
+ * The device driver initialization failed to set up the first HVC terminal
+ * device for use as Linux console.
+ * User action:
+ * If the error code is -12 (ENOMEM), consider assigning more memory
+ * to your z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "Registering IUCV handlers failed with error code=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: errno
+ * Description:
+ * The device driver initialization failed to register with z/VM IUCV to
+ * handle IUCV connections, as well as sending and receiving of IUCV messages.
+ * User action:
+ * Check for related IUCV error messages and see the errno manual page
+ * to find out what caused the problem.
+ */
+
+/*?
+ * Text: "Allocating memory failed with reason code=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: reason
+ * Description:
+ * The z/VM IUCV hypervisor console (HVC) device driver initialization failed,
+ * because of a general memory allocation failure. The reason code indicates
+ * the memory operation that has failed:
+ *     kmem_cache (reason code=1),
+ *     mempool (reason code=2), or
+ *     hvc_iucv_allow= (reason code=3)
+ * User action:
+ * Consider assigning more memory to your z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "hvc_iucv_allow= does not specify a valid z/VM user ID list\n"
+ * Severity: Error
+ * Description:
+ * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list
+ * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor
+ * device driver.
+ * The z/VM user IDs in the list must not exceed eight characters and must
+ * not contain spaces.
+ * User action:
+ * Correct the "hvc_iucv_allow=" setting in the kernel parameter line and reboot
+ * Linux.
+ */
+
+/*?
+ * Text: "hvc_iucv_allow= specifies too many z/VM user IDs\n"
+ * Severity: Error
+ * Description:
+ * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list
+ * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor
+ * device driver.
+ * The number of z/VM user IDs that are specified with the "hvc_iucv_allow="
+ * kernel parameter exceeds the maximum of 500.
+ * User action:
+ * Correct the "hvc_iucv_allow=" setting by reducing the number of z/VM user
+ * IDs in the list and reboot Linux.
+ */
+
+/*?
+ * Text: "A connection request from z/VM user ID %s was refused\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: ID
+ * Description:
+ * An IUCV connection request from another z/VM guest virtual machine has been
+ * refused. The request was from a z/VM guest virtual machine that is not
+ * listed by the "hvc_iucv_allow=" kernel parameter.
+ * User action:
+ * Check the "hvc_iucv_allow=" kernel parameter setting.
+ * Consider adding the z/VM user ID to the "hvc_iucv_allow=" list in the kernel
+ * parameter line and reboot Linux.
+ */
diff --git a/Documentation/kmsg/s390/hypfs b/Documentation/kmsg/s390/hypfs

new file mode 100644 (file)

index 0000000..c84582b
--- /dev/null
+++ b/Documentation/kmsg/s390/hypfs
@@ -0,0 +1,56 @@
+/*?
+ * Text: "The hardware system does not support hypfs\n"
+ * Severity: Error
+ * Description:
+ * hypfs requires DIAGNOSE Code X'204' but this diagnose code is not available
+ * on your hardware. You need more recent hardware to use hypfs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The hardware system does not provide all functions required by hypfs\n"
+ * Severity: Error
+ * Description:
+ * hypfs requires DIAGNOSE Code X'224' but this diagnose code is not available
+ * on your hardware. You need more recent hardware to use hypfs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Updating the hypfs tree failed\n"
+ * Severity: Error
+ * Description:
+ * There was not enough memory available to update the hypfs tree.
+ * User action:
+ * Free some memory and try again to update the hypfs tree. Consider assigning
+ * more memory to your LPAR or z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "%s is not a valid mount option\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: mount option
+ * Description:
+ * hypfs has detected mount options that are not valid.
+ * User action:
+ * See "Device Drivers Features and Commands" for information about valid
+ * mount options for hypfs.
+ */
+
+/*?
+ * Text: "Initialization of hypfs failed with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: error code
+ * Description:
+ * Initialization of hypfs failed because of resource or hardware constraints.
+ * Possible reasons for this problem are insufficient free memory or missing
+ * hardware interfaces.
+ * User action:
+ * See errno.h for information about the error codes.
+ */
+
+/*? Text: "Hypervisor filesystem mounted\n" */
diff --git a/Documentation/kmsg/s390/iucv b/Documentation/kmsg/s390/iucv

new file mode 100644 (file)

index 0000000..dab8e4e
--- /dev/null
+++ b/Documentation/kmsg/s390/iucv
@@ -0,0 +1,33 @@
+/*?
+ * Text: "Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: CPU number
+ *   @2: hexadecimal error value
+ *   @3: short error code explanation
+ * Description:
+ * Defining an interrupt buffer for external interrupts failed. Error
+ * value 0x03 indicates a problem with the z/VM directory entry of the
+ * z/VM guest virtual machine. This problem can also be caused by a
+ * program error.
+ * User action:
+ * If the error value is 0x03, examine the z/VM directory entry of your
+ * z/VM guest virtual machine. If the directory entry is correct or if the
+ * error value is not 0x03, report this problem to your support organization.
+ */
+
+/*?
+ * Text: "Suspending Linux did not completely close all IUCV connections\n"
+ * Severity: Warning
+ * Description:
+ * When resuming a suspended Linux instance, the IUCV base code found
+ * data structures from one or more IUCV connections that existed before the
+ * Linux instance was suspended. Modules that use IUCV connections must close
+ * these connections when a Linux instance is suspended. This problem
+ * indicates an error in a program that used an IUCV connection.
+ * User action:
+ * Report this problem to your support organization.
+ */
+
+/*? Text: "iucv_external_interrupt: out of memory\n" */
+
diff --git a/Documentation/kmsg/s390/lcs b/Documentation/kmsg/s390/lcs

new file mode 100644 (file)

index 0000000..c3fbbb7
--- /dev/null
+++ b/Documentation/kmsg/s390/lcs
@@ -0,0 +1,161 @@
+/*?
+ * Text: "%s:  Allocating a socket buffer to interface %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: network interface
+ * Description:
+ * LAN channel station (LCS) devices require a socket buffer (SKB) structure
+ * for storing incoming data. The LCS device driver failed to allocate an SKB
+ * structure to the LCS device. A likely cause of this problem is memory
+ * constraints.
+ * User action:
+ * Free some memory and repeat the failed operation.
+ */
+
+/*?
+ * Text: "%s:  Shutting down the LCS device failed\n "
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ * Description:
+ * A request to shut down a LAN channel station (LCS) device resulted in an
+ * error. The error is logged in the LCS trace at trace level 4.
+ * User action:
+ * Try again to shut down the device. If the error persists, see the LCS trace
+ * to find out what causes the error.
+ */
+
+/*?
+ * Text: "%s: Detecting a network adapter for LCS devices failed with rc=%d (0x%x)\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: lcs_detect return code in decimal notation
+ *   @3: lcs_detect return code in hexadecimal notation
+ * Description:
+ * The LCS device driver could not initialize a network adapter.
+ * User action:
+ * Note the return codes from the error message and contact IBM support.
+ */
+
+/*?
+ * Text: "%s: A recovery process has been started for the LCS device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ * Description:
+ * The LAN channel station (LCS) device is shut down and restarted. The recovery
+ * process might have been initiated by a user or started automatically as a
+ * response to a device problem.
+ * User action:
+ * Wait until a message indicates the completion of the recovery process.
+ */
+
+/*?
+ * Text: "%s: An I/O-error occurred on the LCS device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ * Description:
+ * The LAN channel station (LCS) device reported a problem that can be recovered
+ * by the LCS device driver. Repeated occurrences of this problem indicate a
+ * malfunctioning device.
+ * User action:
+ * If this problem occurs frequently, initiate a recovery process for the
+ * device, for example, by writing '1' to the 'recover' sysfs attribute of the
+ * device.
+ */
+
+/*?
+ * Text: "%s: A command timed out on the LCS device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ * Description:
+ * The LAN channel station (LCS) device reported a problem that can be recovered
+ * by the LCS device driver. Repeated occurrences of this problem indicate a
+ * malfunctioning device.
+ * User action:
+ * If this problem occurs frequently, initiate a recovery process for the
+ * device, for example, by writing '1' to the 'recover' sysfs attribute of the
+ * device.
+ */
+
+/*?
+ * Text: "%s: An error occurred on the LCS device, rc=%ld\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: return code
+ * Description:
+ * The LAN channel station (LCS) device reported a problem that can be recovered
+ * by the LCS device driver. Repeated occurrences of this problem indicate a
+ * malfunctioning device.
+ * User action:
+ * If this problem occurs frequently, initiate a recovery process for the
+ * device, for example, by writing '1' to the 'recover' sysfs attribute of the
+ * device.
+ */
+
+/*?
+ * Text: "%s: The LCS device stopped because of an error, dstat=0x%X, cstat=0x%X \n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: device status
+ *   @3: subchannel status
+ * Description:
+ * The LAN channel station (LCS) device reported an error. The LCS device driver
+ * might start a device recovery process.
+ * User action:
+ * If the device driver does not start a recovery process, initiate a recovery
+ * process, for example, by writing '1' to the 'recover' sysfs attribute of the
+ * device. If the problem persists, note the status information provided with
+ * the message and contact IBM support.
+ */
+
+/*?
+ * Text: "%s: Starting an LCS device resulted in an error, rc=%d!\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: ccw_device_start return code in decimal notation
+ * Description:
+ * The LAN channel station (LCS) device driver failed to initialize an LCS
+ * device. The device is not operational.
+ * User action:
+ * Initiate a recovery process, for example, by writing '1' to the 'recover'
+ * sysfs attribute of the device. If the problem persists, contact IBM support.
+ */
+
+/*?
+ * Text: "%s: Sending data from the LCS device to the LAN failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the LCS device
+ *   @2: ccw_device_resume return code in decimal notation
+ * Description:
+ * The LAN channel station (LCS) device driver could not send data to the LAN
+ * using the LCS device. This might be a temporary problem. Operations continue
+ * on the LCS device.
+ * User action:
+ * If this problem occurs frequently, initiate a recovery process, for example,
+ * by writing '1' to the 'recover' sysfs attribute of the device. If the
+ * problem persists, contact IBM support.
+ */
+
+/*? Text: "Query IPAssist failed. Assuming unsupported!\n" */
+/*? Text: "Stoplan for %s initiated by LGW.\n" */
+/*? Text: "Not enough memory to add new multicast entry!\n" */
+/*? Text: "Not enough memory for debug facility.\n" */
+/*? Text: "Adding multicast address failed. Table possibly full!\n" */
+/*? Text: "Error in opening device!\n" */
+/*? Text: "LCS device %s %s IPv6 support\n" */
+/*? Text: "Device %s successfully recovered!\n" */
+/*? Text: "LCS device %s %s Multicast support\n" */
+/*? Text: " Initialization failed\n" */
+/*? Text: "Loading %s\n" */
+/*? Text: "Initialization failed\n" */
+/*? Text: "Terminating lcs module.\n" */
+/*? Text: "Device %s could not be recovered!\n" */
diff --git a/Documentation/kmsg/s390/monreader b/Documentation/kmsg/s390/monreader

new file mode 100644 (file)

index 0000000..2f60396
--- /dev/null
+++ b/Documentation/kmsg/s390/monreader
@@ -0,0 +1,127 @@
+/*?
+ * Text: "Reading monitor data failed with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * The z/VM *MONITOR record device driver failed to read monitor data
+ * because the IUCV REPLY function failed. The read function against
+ * the monitor record device returns EIO. All monitor data that has been read
+ * since the last read with 0 size is incorrect.
+ * User action:
+ * Disregard all monitor data that has been read since the last read with
+ * 0 size. If the device driver has been compiled as a separate module, unload
+ * and reload the monreader module. If the device driver has been compiled
+ * into the kernel, reboot Linux. For more information about possible causes
+ * of the error see the IUCV section in "z/VM CP Programming Services" and
+ * the *MONITOR section in "z/VM Performance".
+ */
+
+/*?
+ * Text: "z/VM *MONITOR system service disconnected with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: IPUSER SEVER return code
+ * Description:
+ * The z/VM *MONITOR record device driver receives monitor records through
+ * an IUCV connection to the z/VM *MONITOR system service. This connection
+ * has been severed and the read function of the z/VM *MONITOR device driver
+ * returns EIO. All data received since the last read with 0 size is incorrect.
+ * User action:
+ * Disregard all monitor data read since the last read with 0 size. Close and
+ * reopen the monitor record device. For information about the IPUSER SEVER
+ * return codes see "z/VM Performance".
+ */
+
+/*?
+ * Text: "The read queue for monitor data is full\n"
+ * Severity: Warning
+ * Description:
+ * The read function of the z/VM *MONITOR device driver returns EOVERFLOW
+ * because not enough monitor data has been read since the monitor device
+ * has been opened. Monitor data already read are valid and subsequent reads
+ * return valid data but some intermediate data might be missing.
+ * User action:
+ * Be aware that monitor data might be missing. Ensure that you regularly
+ * read monitor data after opening the monitor record device.
+ */
+
+/*?
+ * Text: "Connecting to the z/VM *MONITOR system service failed with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: IUCV CONNECT return code
+ * Description:
+ * The z/VM *MONITOR record device driver receives monitor records through
+ * an IUCV connection to the z/VM *MONITOR system service. This connection
+ * could not be established when the monitor record device was opened. If
+ * the return code is 15, your z/VM guest virtual machine is not authorized
+ * to connect to the *MONITOR system service.
+ * User action:
+ * If the return code is 15, ensure that the IUCV *MONITOR statement is
+ * included in the z/VM directory entry for your z/VM guest virtual machine.
+ * For other IUCV CONNECT return codes see the IUCV section in "z/VM CP
+ * Programming Services" and the *MONITOR section in "z/VM Performance".
+ */
+
+/*?
+ * Text: "Disconnecting the z/VM *MONITOR system service failed with rc=%i\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: IUCV SEVER return code
+ * Description:
+ * The z/VM *MONITOR record device driver receives monitor data through an
+ * IUCV connection to the z/VM *MONITOR system service. This connection
+ * could not be closed when the monitor record device was closed. You might
+ * not be able to resume monitoring.
+ * User action:
+ * No immediate action is necessary. If you cannot open the monitor record
+ * device in the future, reboot Linux. For information about the IUCV SEVER
+ * return codes see the IUCV section in "z/VM CP Programming Services" and the
+ * *MONITOR section in "z/VM Performance".
+ */
+
+/*?
+ * Text: "The z/VM *MONITOR record device driver cannot be loaded without z/VM\n"
+ * Severity: Error
+ * Description:
+ * The z/VM *MONITOR record device driver uses z/VM system services to provide
+ * monitor data about z/VM guest operating systems to applications on Linux.
+ * On Linux instances that run in environments other than the z/VM hypervisor,
+ * the z/VM *MONITOR record device driver does not provide any useful
+ * function and the corresponding monreader module cannot be loaded.
+ * User action:
+ * Load the z/VM *MONITOR record device driver only on Linux instances that run
+ * as guest operating systems of the z/VM hypervisor. If the z/VM *MONITOR
+ * record device driver has been compiled into the kernel, ignore this message.
+ */
+
+/*?
+ * Text: "The z/VM *MONITOR record device driver failed to register with IUCV\n"
+ * Severity: Error
+ * Description:
+ * The z/VM *MONITOR record device driver receives monitor data through an IUCV
+ * connection and needs to register with the IUCV device driver. This
+ * registration failed and the z/VM *MONITOR record device driver was not
+ * loaded. A possible cause of this problem is insufficient memory.
+ * User action:
+ * Free some memory and try again to load the module. If the z/VM *MONITOR
+ * record device driver has been compiled into the kernel, you might have to
+ * configure more memory and reboot Linux. If you do not want to read monitor
+ * data, ignore this message.
+ */
+
+/*?
+ * Text: "The specified *MONITOR DCSS %s does not have the required type SC\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: DCSS name
+ * Description:
+ * The DCSS that was specified with the monreader.mondcss kernel parameter or
+ * with the mondcss module parameter cannot be a *MONITOR DCSS because it is
+ * not of type SC.
+ * User action:
+ * Confirm that you are using the name of the DCSS that has been configured as
+ * the *MONITOR DCSS on the z/VM hypervisor. If the default name, MONDCSS, is
+ * used, omit the monreader.mondcss or mondcss parameter.
+ */
diff --git a/Documentation/kmsg/s390/monwriter b/Documentation/kmsg/s390/monwriter

new file mode 100644 (file)

index 0000000..5947ecb
--- /dev/null
+++ b/Documentation/kmsg/s390/monwriter
@@ -0,0 +1,16 @@
+/*?
+ * Text: "Writing monitor data failed with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * The monitor stream application device driver used the z/VM diagnose call
+ * DIAG X'DC' to start writing monitor data. z/VM returned an error and the
+ * monitor data cannot be written. If the return code is 5, your z/VM guest
+ * virtual machine is not authorized to write monitor data.
+ * User action:
+ * If the return code is 5, ensure that your z/VM guest virtual machine's
+ * entry in the z/VM directory includes the OPTION APPLMON statement.
+ * For other return codes see the section about DIAGNOSE Code X'DC'
+ * in "z/VM CP Programming Services".
+ */
diff --git a/Documentation/kmsg/s390/netiucv b/Documentation/kmsg/s390/netiucv

new file mode 100644 (file)

index 0000000..2368588
--- /dev/null
+++ b/Documentation/kmsg/s390/netiucv
@@ -0,0 +1,139 @@
+/*?
+ * Text: "%s: The peer interface of the IUCV device has closed the connection\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ * Description:
+ * The peer interface on the remote z/VM guest virtual machine has closed the
+ * connection. Do not expect further packets on this interface. Any packets
+ * you send to this interface will be dropped.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The IUCV device failed to connect to z/VM guest %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: z/VM user ID
+ * Description:
+ * The connection cannot be established because the z/VM guest virtual
+ * machine with the peer interface is not running.
+ * User action:
+ * Ensure that the z/VM guest virtual machine with the peer interface is
+ * running; then try again to establish the connection.
+ */
+
+/*?
+ * Text: "%s: The IUCV device failed to connect to the peer on z/VM guest %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: z/VM user ID
+ * Description:
+ * The connection cannot be established because the z/VM guest virtual machine
+ * with the peer interface is not configured for IUCV connections.
+ * User action:
+ * Configure the z/VM guest virtual machine with the peer interface for IUCV
+ * connections; then try again to establish the connection.
+ */
+
+/*?
+ * Text: "%s: Connecting the IUCV device would exceed the maximum number of IUCV connections\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ * Description:
+ * The connection cannot be established because the maximum number of IUCV
+ * connections has been reached on the local z/VM guest virtual machine.
+ * User action:
+ * Close some of the established IUCV connections on the local z/VM guest
+ * virtual machine; then try again to establish the connection.
+ */
+
+/*?
+ * Text: "%s: z/VM guest %s has too many IUCV connections to connect with the IUCV device\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: remote z/VM user ID
+ * Description:
+ * Connecting to the remote z/VM guest virtual machine failed because the
+ * maximum number of IUCV connections for the remote z/VM guest virtual
+ * machine has been reached.
+ * User action:
+ * Close some of the established IUCV connections on the remote z/VM guest
+ * virtual machine; then try again to establish the connection.
+ */
+
+/*?
+ * Text: "%s: The IUCV device cannot connect to a z/VM guest with no IUCV authorization\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ * Description:
+ * Because the remote z/VM guest virtual machine is not authorized for IUCV
+ * connections, the connection cannot be established.
+ * User action:
+ * Add the statements 'IUCV ALLOW' and 'IUCV ANY' to the z/VM directory
+ * entry of the remote z/VM guest virtual machine; then try again to
+ * establish the connection. See "z/VM CP Planning and Administration"
+ * for details about the IUCV statements.
+ */
+
+/*?
+ * Text: "%s: Connecting the IUCV device failed with error %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: error code
+ * Description:
+ * The connection cannot be established because of an IUCV CONNECT error.
+ * User action:
+ * Report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The IUCV device has been connected successfully to %s\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: remote z/VM user ID
+ * Description:
+ * The connection has been established and the interface is ready to
+ * transmit communication packages.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The IUCV interface to %s has been established successfully\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: remote z/VM user ID
+ * Description:
+ * The IUCV interface to the remote z/VM guest virtual machine has been
+ * established and can be activated with "ifconfig up" or an equivalent
+ * command.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The IUCV device is connected to %s and cannot be removed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the IUCV device
+ *   @2: remote z/VM user ID
+ * Description:
+ * Removing a connection failed because the interface is active with a peer
+ * interface on a remote z/VM guest virtual machine.
+ * User action:
+ * Deactivate the interface with "ifconfig down" or an equivalent command;
+ * then try again to remove the interface.
+ */
+
+/*? Text: "driver unloaded\n" */
+/*? Text: "driver initialized\n" */
diff --git a/Documentation/kmsg/s390/qeth b/Documentation/kmsg/s390/qeth

new file mode 100644 (file)

index 0000000..bc8b662
--- /dev/null
+++ b/Documentation/kmsg/s390/qeth
@@ -0,0 +1,606 @@
+/*?
+ * Text: "%s: The LAN is offline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * A start LAN command was sent by the qeth device driver but the physical or
+ * virtual adapter has not started the LAN. The LAN might take a few seconds
+ * to become available.
+ * User action:
+ * Check the status of the qeth device, for example, with the lsqeth command.
+ * If the device does not become operational within a few seconds, initiate a
+ * recovery process, for example, by writing '1' to the 'recover' sysfs
+ * attribute of the device.
+ */
+
+/*?
+ * Text: "%s: The user canceled setting the qeth device offline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * A user initiated setting the device offline but subsequently canceled the
+ * operation, for example, with CTRL+C.
+ * User action:
+ * Check the status of the qeth device, for example, with the lsqeth command.
+ * If necessary, repeat the operation to set the device offline.
+ */
+
+/*?
+ * Text: "%s: A recovery process has been started for the device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * A recovery process was started either by the qeth device driver or through
+ * a user command.
+ * User action:
+ * Wait until a message indicates the completion of the recovery process.
+ */
+
+/*?
+ * Text: "%s: The qeth device driver failed to recover an error on the device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver performed an automatic recovery operation to recover
+ * an error on a qeth device. The recovery operation failed.
+ * User action:
+ * Try the following actions in the given order: i) Check the status of the
+ * qeth device, for example, with the lsqeth command. ii) Initiate a recovery
+ * process by writing '1' to the 'recover' sysfs attribute of the device.
+ * iii) Ungroup and regroup the subchannel triplet of the device. iv) Reboot
+ * Linux. v) If the problem persists, gather Linux debug data and report the
+ * problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The link for interface %s on CHPID 0x%X failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ *   @3: CHPID
+ * Description:
+ * A network link failed. A possible reason for this error is that a physical
+ * network cable has been disconnected.
+ * User action:
+ * Ensure that the network cable on the adapter hardware is connected properly.
+ * If the connection is to a guest LAN, ensure that the device is still coupled
+ * to the guest LAN.
+ */
+
+/*?
+ * Text: "%s: The link for %s on CHPID 0x%X has been restored\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ *   @3: CHPID
+ * Description:
+ * A failed network link has been re-established. A device recovery is in
+ * progress.
+ * User action:
+ * Wait until a message indicates the completion of the recovery process.
+ */
+
+/*?
+ * Text: "%s: A hardware operation timed out on the device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * A hardware operation timed out on the qeth device.
+ * User action:
+ * Check the status of the qeth device, for example, with the lsqeth command.
+ * If the device is not operational, initiate a recovery process, for example,
+ * by writing '1' to the 'recover' sysfs attribute of the device.
+ */
+
+/*?
+ * Text: "%s: The adapter hardware is of an unknown type\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver does not recognize the adapter hardware. The cause
+ * of this problem could be a hardware error or a Linux level that does not
+ * support your adapter hardware.
+ * User action:
+ * i) Investigate if your adapter hardware is supported by your Linux level.
+ * Consider using hardware that is supported by your Linux level or upgrading
+ * to a Linux level that supports your hardware. ii) Install the latest
+ * firmware on your adapter hardware. iii) If the problem persists and is not
+ * caused by a version mismatch, contact IBM support.
+ */
+
+/*?
+ * Text: "%s: The adapter is used exclusively by another host\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The qeth adapter is exclusively used by another host.
+ * User action:
+ * Use another qeth adapter or reconfigure this adapter so that it is not
+ * used exclusively by a particular host.
+ */
+
+/*?
+ * Text: "%s: QDIO reported an error, rc=%i\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: return code
+ * Description:
+ * The QDIO subsystem reported an error.
+ * User action:
+ * Check for related QDIO errors. Check the status of the qeth device, for
+ * example, with the lsqeth command. If the device is not operational, initiate
+ * a recovery process, for example, by writing '1' to the 'recover' sysfs
+ * attribute of the device.
+ */
+
+/*?
+ * Text: "%s: There is no kernel module to support discipline %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: discipline
+ * Description:
+ * The qeth device driver or a user command requested a kernel module for a
+ * particular qeth discipline. Either the discipline is not supported by the
+ * qeth device driver or the requested module is not available to your Linux
+ * system.
+ * User action:
+ * Check if the requested discipline module has been compiled into the kernel
+ * or is present in /lib/modules/<version>/kernel/drivers/s390/net.
+ */
+
+/*?
+ * Text: "Initializing the qeth device driver failed\n"
+ * Severity: Error
+ * Parameter:
+ * Description:
+ * The base module of the qeth device driver could not be initialized.
+ * User action:
+ * See errno.h to determine the reason for the error.
+ * i) Reboot Linux. ii) If the problem persists, gather Linux debug data and
+ * report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Registering IP address %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: IP address
+ * Description:
+ * An IP address could not be registered with the network adapter.
+ * User action:
+ * Check if another operating system instance has already registered the
+ * IP address with the same network adapter or on the same logical IP subnet.
+ */
+
+/*?
+ * Text: "%s: Reading the adapter MAC address failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver could not read the MAC address from the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting ARP processing support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start ARP support on the network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting IP fragmentation support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start IP fragmentation support on the
+ * network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting proxy ARP support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start proxy ARP support on the network
+ * adapter.
+ * User action:
+ * None if you do not require proxy ARP support. If you need proxy ARP,
+ * ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting VLAN support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start VLAN support on the network adapter.
+ * User action:
+ * None if you do not require VLAN support. If you need VLAN support,
+ * ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting multicast support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start multicast support on the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Activating IPv6 support for %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not activate IPv6 support on the network
+ * adapter.
+ * User action:
+ * None if you do not require IPv6 communication. If you need IPv6 support,
+ * ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Enabling the passthrough mode for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not enable the passthrough mode on the
+ * network adapter. The passthrough mode is required for all network traffic
+ * other than IPv4. In particular, the passthrough mode is required for IPv6
+ * traffic.
+ * User action:
+ * None if all you want to support is IPv4 communication. If you want to support
+ * IPv6 or other network traffic apart from IPv4, ungroup and regroup the
+ * subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Enabling broadcast filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not enable broadcast filtering on the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up broadcast filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not set up broadcast filtering on the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up broadcast echo filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not set up broadcast echo filtering on the
+ * network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting HW checksumming for %s failed, using SW checksumming\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The network adapter supports hardware checksumming for incoming IP packets
+ * but the qeth device driver could not start hardware checksumming on the
+ * adapter. The qeth device driver continues to use software checksumming for
+ * incoming IP packets.
+ * User action:
+ * None if you do not require hardware checksumming for incoming network
+ * traffic. If you want to enable hardware checksumming, ungroup and regroup
+ * the subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Enabling HW checksumming for %s failed, using SW checksumming\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The network adapter supports hardware checksumming for incoming IP packets
+ * but the qeth device driver could not enable hardware checksumming on the
+ * adapter. The qeth device driver continues to use software checksumming for
+ * incoming IP packets.
+ * User action:
+ * None if you do not require hardware checksumming for incoming network
+ * traffic. If you want to enable hardware checksumming, ungroup and regroup
+ * the subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting outbound TCP segmentation offload for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The network adapter supports TCP segmentation offload, but the qeth device
+ * driver could not start this support on the adapter.
+ * User action:
+ * None if you do not require TCP segmentation offload. If you want to
+ * enable TCP segmentation offload, ungroup and regroup the subchannel triplet
+ * of the device. If this does not resolve the problem, reboot Linux. If the
+ * problem persists, gather Linux debug data and report the problem to your
+ * support organization.
+ */
+
+/*?
+ * Text: "%s: The network adapter failed to generate a unique ID\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * In IBM mainframe environments, network interfaces are not identified by
+ * a specific MAC address. Therefore, the network adapters provide the network
+ * interfaces with unique IDs to be used in their IPv6 link local addresses.
+ * Without such a unique ID, duplicate addresses might be assigned in other
+ * LPARs.
+ * User action:
+ * Install the latest firmware on the adapter hardware. Manually configure
+ * an IPv6 link local address for this device.
+ */
+
+/*?
+ * Text: "There is no IPv6 support for the layer 3 discipline\n"
+ * Severity: Warning
+ * Description:
+ * If you want to use IPv6 with the layer 3 discipline, you need a Linux kernel
+ * with IPv6 support. Because your Linux kernel has not been compiled with
+ * IPv6 support, you cannot use IPv6 with the layer 3 discipline, even if your
+ * adapter supports IPv6.
+ * User action:
+ * Use a Linux kernel that has been compiled to include IPv6 support if you
+ * want to use IPv6 with layer 3 qeth devices.
+ */
+
+/*?
+ * Text: "%s: The qeth device is not configured for the OSI layer required by z/VM\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * A qeth device that connects to a virtual network on z/VM must be configured for the
+ * same Open Systems Interconnection (OSI) layer as the virtual network. An ETHERNET
+ * guest LAN or VSWITCH uses the data link layer (layer 2) while an IP guest LAN
+ * or VSWITCH uses the network layer (layer 3).
+ * User action:
+ * If you are connecting to an ETHERNET guest LAN or VSWITCH, set the layer2 sysfs
+ * attribute of the qeth device to 1. If you are connecting to an IP guest LAN or
+ * VSWITCH, set the layer2 sysfs attribute of the qeth device to 0.
+ */
+
+/*?
+ * Text: "%s: Starting source MAC-address support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: network interface name
+ * Description:
+ * The qeth device driver could not start source MAC-address support on the
+ * network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: MAC address %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x already exists\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: first token of the MAC-address
+ *   @3: second token of the MAC-address
+ *   @4: third token of the MAC-address
+ *   @5: fourth token of the MAC-address
+ *   @6: fifth token of the MAC-address
+ *   @7: sixth token of the MAC-address
+ * Description:
+ * Setting the MAC address for the qeth device fails, because this
+ * MAC address is already defined on the OSA CHPID.
+ * User action:
+ * Use a different MAC address for this qeth device.
+ */
+
+/*?
+ * Text: "%s: MAC address %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x is not authorized\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ *   @2: first token of the MAC-address
+ *   @3: second token of the MAC-address
+ *   @4: third token of the MAC-address
+ *   @5: fourth token of the MAC-address
+ *   @6: fifth token of the MAC-address
+ *   @7: sixth token of the MAC-address
+ * Description:
+ * This qeth device is a virtual network interface card (NIC), to which z/VM
+ * has already assigned a MAC address. z/VM MAC address verification does
+ * not allow you to change this predefined address.
+ * User action:
+ * None; use the MAC address that has been assigned by z/VM.
+ */
+
+/*?
+ * Text: "%s: The HiperSockets network traffic analyzer is activated\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'.
+ * The corresponding HiperSockets interface has been switched into promiscuous mode.
+ * As a result, the HiperSockets network traffic analyzer is started on the device.
+ * User action:
+ * None.
+ */
+
+ /*?
+  * Text: "%s: The HiperSockets network traffic analyzer is deactivated\n"
+  * Severity: Informational
+  * Parameter:
+  *   @1: bus ID of the qeth device
+  * Description:
+  * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'.
+  * Promiscuous mode has been switched off for the corresponding HiperSockets interface.
+  * As a result, the HiperSockets network traffic analyzer is stopped on the device.
+  * User action:
+  * None.
+  */
+
+/*?
+ * Text: "%s: The device is not authorized to run as a HiperSockets network traffic analyzer\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'.
+ * The corresponding HiperSockets interface is switched into promiscuous mode
+ * but the network traffic analyzer (NTA) rules configured at the Support Element (SE)
+ * do not allow tracing. Possible reasons are:
+ * - Tracing is not authorized for all HiperSockets channels in the mainframe system
+ * - Tracing is not authorized for this HiperSockets channel
+ * - LPAR is not authorized to enable an NTA
+ * User action:
+ * Configure appropriate HiperSockets NTA rules at the SE.
+ */
+
+/*?
+ * Text: "%s: A HiperSockets network traffic analyzer is already active in the HiperSockets LAN\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the qeth device
+ * Description:
+ * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'.
+ * The HiperSockets interface is switched into promiscuous mode but another
+ * HiperSockets device on the same HiperSockets channel is already running as
+ * a network traffic analyzer.
+ * A HiperSockets channel can only have one active network traffic analyzer.
+ * User action:
+ * Do not configure multiple HiperSockets devices in the same HiperSockets channel as
+ * tracing devices.
+ */
+
+
+/*? Text: "core functions removed\n" */
+/*? Text: "%s: Device is a%s card%s%s%s\nwith link type %s.\n" */
+/*? Text: "%s: Device is a%s card%s%s%s\nwith link type %s (no portname needed by interface).\n" */
+/*? Text: "%s: Device is a%s card%s%s%s\nwith link type %s (portname: %s)\n" */
+/*? Text: "%s: issue_next_read failed: no iob available!\n" */
+/*? Text: "%s: Priority Queueing not supported\n" */
+/*? Text: "%s: sense data available. cstat 0x%X dstat 0x%X\n" */
+/*? Text: "loading core functions\n" */
+/*? Text: "%s: MAC address %2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x successfully registered on device %s\n" */
+/*? Text: "%s: Device successfully recovered!\n" */
+/*? Text: "register layer 2 discipline\n" */
+/*? Text: "unregister layer 2 discipline\n" */
+/*? Text: "%s: Hardware IP fragmentation not supported on %s\n" */
+/*? Text: "%s: IPv6 not supported on %s\n" */
+/*? Text: "%s: VLAN not supported on %s\n" */
+/*? Text: "%s: Inbound source MAC-address not supported on %s\n" */
+/*? Text: "%s: IPV6 enabled\n" */
+/*? Text: "%s: ARP processing not supported on %s!\n" */
+/*? Text: "%s: Hardware IP fragmentation enabled \n" */
+/*? Text: "%s: set adapter parameters not supported.\n" */
+/*? Text: "%s: VLAN enabled\n" */
+/*? Text: "register layer 3 discipline\n" */
+/*? Text: "%s: Outbound TSO enabled\n" */
+/*? Text: "%s: Broadcast not supported on %s\n" */
+/*? Text: "%s: Outbound TSO not supported on %s\n" */
+/*? Text: "%s: Inbound HW Checksumming not supported on %s,\ncontinuing using Inbound SW Checksumming\n" */
+/*? Text: "%s: Using no checksumming on %s.\n" */
+/*? Text: "%s: Broadcast enabled\n" */
+/*? Text: "%s: Multicast not supported on %s\n" */
+/*? Text: "%s: Using SW checksumming on %s.\n" */
+/*? Text: "%s: HW Checksumming (inbound) enabled\n" */
+/*? Text: "unregister layer 3 discipline\n" */
+/*? Text: "%s: Multicast enabled\n" */
+/*? Text: "%s: QDIO data connection isolation is deactivated\n" */
+/*? Text: "%s: QDIO data connection isolation is activated\n" */
+/*? Text: "%s: Adapter does not support QDIO data connection isolation\n" */
+/*? Text: "%s: Adapter is dedicated. QDIO data connection isolation not supported\n" */
+/*? Text: "%s: TSO does not permit QDIO data connection isolation\n" */
+
diff --git a/Documentation/kmsg/s390/s390dbf b/Documentation/kmsg/s390/s390dbf

new file mode 100644 (file)

index 0000000..b9286cf
--- /dev/null
+++ b/Documentation/kmsg/s390/s390dbf
@@ -0,0 +1,83 @@
+/*?
+ * Text: "Root becomes the owner of all s390dbf files in sysfs\n"
+ * Severity: Warning
+ * Description:
+ * The S/390 debug feature you are using only supports uid/gid = 0.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Registering debug feature %s failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: feature name
+ * Description:
+ * The initialization of an S/390 debug feature failed. A likely cause of this
+ * problem is memory constraints. The system keeps running, but the debug
+ * data for this feature will not be available in sysfs.
+ * User action:
+ * Consider assigning more memory to your LPAR or z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "Registering view %s/%s would exceed the maximum number of views %i\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: feature name
+ *   @2: view name
+ *   @3: maximum
+ * Description:
+ * The maximum number of allowed debug feature views has been reached. The
+ * view has not been registered. The system keeps running but the new view
+ * will not be available in sysfs. This is a program error.
+ * User action:
+ * Report this problem to your support partner.
+ */
+
+/*?
+ * Text: "%s is not a valid level for a debug feature\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: level
+ * Description:
+ * Setting a new level for a debug feature by using the 'level' sysfs attribute
+ * failed. Valid levels are the minus sign (-) and the integers in the
+ * range 0 to 6. The minus sign switches off the feature. The numbers switch
+ * the feature on, where higher numbers produce more debug output.
+ * User action:
+ * Write a valid value to the 'level' sysfs attribute.
+ */
+
+/*?
+ * Text: "Flushing debug data failed because %c is not a valid area\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: debug area number
+ * Description:
+ * Flushing a debug area by using the 'flush' sysfs attribute failed. Valid
+ * values are the minus sign (-) for flushing all areas, or the number of the
+ * respective area for flushing a single area.
+ * User action:
+ * Write a valid area number or the minus sign (-) to the 'flush' sysfs
+ * attribute.
+ */
+
+/*?
+ * Text: "Allocating memory for %i pages failed\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: number of pages
+ * Description:
+ * Setting the debug feature size by using the 'pages' sysfs attribute failed.
+ * Linux did not have enough memory for expanding the debug feature to the
+ * requested size.
+ * User action:
+ * Use a smaller number of pages for the debug feature or allocate more
+ * memory to your LPAR or z/VM guest virtual machine.
+ */
+
+/*? Text: "%s: set new size (%i pages)\n" */
+/*? Text: "%s: switched off\n" */
+/*? Text: "%s: level %i is out of range (%i - %i)\n" */
+/*? Text: "Registering view %s/%s failed due to out of memory\n" */
diff --git a/Documentation/kmsg/s390/sclp_cmd b/Documentation/kmsg/s390/sclp_cmd

new file mode 100644 (file)

index 0000000..5fff1fd
--- /dev/null
+++ b/Documentation/kmsg/s390/sclp_cmd
@@ -0,0 +1,16 @@
+/*? Text: "sync request failed (cmd=0x%08x, status=0x%02x)\n" */
+/*? Text: "readcpuinfo failed (response=0x%04x)\n" */
+/*? Text: "configure cpu failed (cmd=0x%08x, response=0x%04x)\n" */
+/*? Text: "configure channel-path failed (cmd=0x%08x, response=0x%04x)\n" */
+/*? Text: "read channel-path info failed (response=0x%04x)\n" */
+/*? Text: "assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n" */
+
+/*?
+ * Text: "Memory hotplug state changed, suspend refused.\n"
+ * Severity: Error
+ * Description:
+ * Suspend is refused after a memory hotplug operation was performed.
+ * User action:
+ * Restart the system and do not perform any memory hotplug operations
+ * afterwards; suspend is then allowed again.
+ */
diff --git a/Documentation/kmsg/s390/sclp_config b/Documentation/kmsg/s390/sclp_config

new file mode 100644 (file)

index 0000000..483242f
--- /dev/null
+++ b/Documentation/kmsg/s390/sclp_config
@@ -0,0 +1,3 @@
+/*? Text: "cpu capability changed.\n" */
+/*? Text: "no configuration management.\n" */
+
diff --git a/Documentation/kmsg/s390/sclp_cpi b/Documentation/kmsg/s390/sclp_cpi

new file mode 100644 (file)

index 0000000..0965ad5
--- /dev/null
+++ b/Documentation/kmsg/s390/sclp_cpi
@@ -0,0 +1,2 @@
+/*? Text: "request failed (status=0x%02x)\n" */
+/*? Text: "request failed with response code 0x%x\n" */
diff --git a/Documentation/kmsg/s390/sclp_sdias b/Documentation/kmsg/s390/sclp_sdias

new file mode 100644 (file)

index 0000000..081046c
--- /dev/null
+++ b/Documentation/kmsg/s390/sclp_sdias
@@ -0,0 +1,4 @@
+/*? Text: "sclp_send failed for get_nr_blocks\n" */
+/*? Text: "SCLP error: %x\n" */
+/*? Text: "sclp_send failed: %x\n" */
+/*? Text: "Error from SCLP while copying hsa. Event status = %x\n" */
diff --git a/Documentation/kmsg/s390/setup b/Documentation/kmsg/s390/setup

new file mode 100644 (file)

index 0000000..77158b9
--- /dev/null
+++ b/Documentation/kmsg/s390/setup
@@ -0,0 +1,181 @@
+/*?
+ * Text: "Execute protection active, mvcos available\n"
+ * Severity: Informational
+ * Description:
+ * The kernel parameter 'noexec' has been specified. The kernel will
+ * honor the execute bit of mappings and will use the mvcos instruction
+ * to copy between the user and kernel address space.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Execute protection active, mvcos not available\n"
+ * Severity: Informational
+ * Description:
+ * The kernel parameter 'noexec' has been specified. The kernel will
+ * honor the execute bit of mappings. The mvcos instruction is not
+ * available and the kernel will use the slower page table walk method
+ * to copy between the user and kernel address space.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Address spaces switched, mvcos available\n"
+ * Severity: Informational
+ * Description:
+ * The kernel parameter 'switch_amode' has been specified. The kernel
+ * will use the primary address space for user space processes and the
+ * home address space for the kernel. The mvcos instruction is used to
+ * copy between the user and kernel address space.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Address spaces switched, mvcos not available\n"
+ * Severity: Informational
+ * Description:
+ * The kernel parameter 'switch_amode' has been specified. The kernel
+ * will use the primary address space for user space processes and the
+ * home address space for the kernel. The mvcos instruction is not
+ * available and the kernel will use the slower page table walk method
+ * to copy between the user and kernel address space.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "initrd extends beyond end of memory (0x%08lx > 0x%08lx) disabling initrd\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: end address of the initial RAM disk
+ *   @2: memory end address
+ * Description:
+ * The load address and the size of the initial RAM disk result in an end
+ * address of the initial RAM disk that is beyond the end of the system
+ * memory.
+ * User action:
+ * Lower the load address of the initial RAM disk, reduce the size of the
+ * initial RAM disk, or increase the size of the system memory to make the
+ * initial RAM disk fit into the memory.
+ */
+
+/*?
+ * Text: "Moving initrd (0x%08lx -> 0x%08lx, size: %ld)\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: old start address of the initial RAM disk
+ *   @2: new start address of the initial RAM disk
+ *   @3: size of the initial RAM disk
+ * Description:
+ * The location of the initial RAM disk conflicted with the boot memory bitmap.
+ * To resolve the conflict the initial RAM disk has been moved to a new
+ * location.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Linux is running as a z/VM guest operating system in 31-bit mode\n"
+ * Severity: Informational
+ * Description:
+ * The 31-bit Linux kernel detected that it is running as a guest operating
+ * system of the z/VM hypervisor.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Linux is running natively in 31-bit mode\n"
+ * Severity: Informational
+ * Description:
+ * The 31-bit Linux kernel detected that it is running on an IBM mainframe,
+ * either as the sole operating system in an LPAR or as the sole operating
+ * system on the entire mainframe. The Linux kernel is not running as a
+ * guest operating system of the z/VM hypervisor.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The hardware system has IEEE compatible floating point units\n"
+ * Severity: Informational
+ * Description:
+ * The Linux kernel detected that it is running on a hardware system with
+ * CPUs that have IEEE compatible floating point units.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The hardware system has no IEEE compatible floating point units\n"
+ * Severity: Informational
+ * Description:
+ * The Linux kernel detected that it is running on a hardware system with
+ * CPUs that do not have IEEE compatible floating point units.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Linux is running as a z/VM guest operating system in 64-bit mode\n"
+ * Severity: Informational
+ * Description:
+ * The 64-bit Linux kernel detected that it is running as a guest operating
+ * system of the z/VM hypervisor.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Linux is running natively in 64-bit mode\n"
+ * Severity: Informational
+ * Description:
+ * The 64-bit Linux kernel detected that it is running on an IBM mainframe,
+ * either as the sole operating system in an LPAR or as the sole operating
+ * system on the entire mainframe. The Linux kernel is not running as a
+ * guest operating system of the z/VM hypervisor.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Defining the Linux kernel NSS failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * The Linux kernel could not define the named saved system (NSS) with
+ * the z/VM CP DEFSYS command. The return code represents the numeric
+ * portion of the CP DEFSYS error message.
+ * User action:
+ * For return code 1, the z/VM guest virtual machine is not authorized
+ * to define named saved systems.
+ * Ensure that the z/VM guest virtual machine is authorized to issue
+ * the CP DEFSYS command (typically privilege class E).
+ * For other return codes, see the help and message documentation for
+ * the CP DEFSYS command.
+ */
+
+/*?
+ * Text: "Saving the Linux kernel NSS failed with rc=%d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: return code
+ * Description:
+ * The Linux kernel could not save the named saved system (NSS) with
+ * the z/VM CP SAVESYS command. The return code represents the numeric
+ * portion of the CP SAVESYS error message.
+ * User action:
+ * For return code 1, the z/VM guest virtual machine is not authorized
+ * to save named saved systems.
+ * Ensure that the z/VM guest virtual machine is authorized to issue
+ * the CP SAVESYS command (typically privilege class E).
+ * For other return codes, see the help and message documentation for
+ * the CP SAVESYS command.
+ */
+
+/*? Text: "Linux is running under KVM in 64-bit mode\n" */
+
diff --git a/Documentation/kmsg/s390/tape b/Documentation/kmsg/s390/tape

new file mode 100644 (file)

index 0000000..42e2c01
--- /dev/null
+++ b/Documentation/kmsg/s390/tape
@@ -0,0 +1,104 @@
+/*?
+ * Text: "%s: A tape unit was detached while in use\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A tape unit has been detached from the I/O configuration while a tape
+ * was being accessed. This typically results in I/O error messages and
+ * potentially in damaged data on the tape.
+ * User action:
+ * Check the output of the application that accesses the tape device.
+ * If this problem occurred during a write-type operation, consider repeating
+ * the operation after bringing the tape device back online.
+ */
+
+/*?
+ * Text: "%s: A tape cartridge has been mounted\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A tape cartridge has been inserted into the tape unit. The tape in the
+ * tape unit is ready to be accessed.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The tape cartridge has been successfully unloaded\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape cartridge has been unloaded from the tape unit. Insert a tape
+ * cartridge before accessing the tape device.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Determining the size of the recorded area...\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape block device driver is currently determining the size of the
+ * recorded area on the tape medium. This operation typically takes a
+ * few minutes.
+ * User action:
+ * Wait until the size is shown in a completion message.
+ */
+
+/*?
+ * Text: "%s: Opening the tape failed because of missing end-of-file marks\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape block device driver requires end-of-file marks at the end of
+ * the recorded area on a tape. If the tape device was to be opened in
+ * response to a mount command, the mount command will fail.
+ * User action:
+ * Insert a tape cartridge that has been prepared for use with the tape
+ * block device driver and try the operation again.
+ */
+
+/*?
+ * Text: "%s: The size of the recorded area is %i blocks\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: number of blocks
+ * Description:
+ * The tape block device driver has successfully determined the size of the
+ * recorded area on the tape medium. The tape device can now be used as
+ * a block device. See the mount(8) man page for details on how to access
+ * block devices.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "A cartridge is loaded in tape device %s, refusing to suspend\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A request to suspend a tape device currently loaded with a cartridge is
+ * rejected.
+ * User action:
+ * Unload the tape device. Then try to suspend the system again.
+ */
+
+/*?
+ * Text: "Tape device %s is busy, refusing to suspend\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A request to suspend a tape device that is currently in use is rejected.
+ * User action:
+ * Terminate applications performing tape operations
+ * and then try to suspend the system again.
+ */
diff --git a/Documentation/kmsg/s390/tape_34xx b/Documentation/kmsg/s390/tape_34xx

new file mode 100644 (file)

index 0000000..5927b2f
--- /dev/null
+++ b/Documentation/kmsg/s390/tape_34xx
@@ -0,0 +1,418 @@
+/*?
+ * Text: "%s: An unexpected condition %d occurred in tape error recovery\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: number
+ * Description:
+ * The control unit has reported an error condition that is not recognized by
+ * the error recovery process of the tape device driver.
+ * User action:
+ * Report this problem and the condition number from the message to your
+ * support organization.
+ */
+
+/*?
+ * Text: "%s: A data overrun occurred between the control unit and tape unit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A data overrun error has occurred on the connection between the control
+ * unit and the tape unit. If this problem occurred during a write-type
+ * operation, the integrity of the data on the tape might be compromised.
+ * User action:
+ * Use a faster connection. If this problem occurred during a write-type
+ * operation, consider repositioning the tape and repeating the operation.
+ */
+
+/*?
+ * Text: "%s: The block ID sequence on the tape is incorrect\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The control unit has detected an incorrect block ID sequence on the tape.
+ * This problem typically indicates that the data on the tape is damaged.
+ * User action:
+ * If this problem occurred during a write-type operation, reposition the tape
+ * and repeat the operation.
+ */
+
+/*?
+ * Text: "%s: A read error occurred that cannot be recovered\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A read error has occurred that cannot be recovered. The current tape might
+ * be damaged.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: A write error on the tape cannot be recovered\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A write error has occurred that could not be recovered by the automatic
+ * error recovery process.
+ * User action:
+ * Use a different tape cartridge.
+ */
+
+/*?
+ * Text: "%s: Writing the ID-mark failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The ID-mark at the beginning of tape could not be written. The tape medium
+ * might be write-protected.
+ * User action:
+ * Try a different tape cartridge. Ensure that the write-protection on the
+ * cartridge is switched off.
+ */
+
+/*?
+ * Text: "%s: Reading the tape beyond the end of the recorded area failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A read-type operation failed because it extended beyond the end of the
+ * recorded area on the tape medium.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The tape contains an incorrect block ID sequence\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The control unit has detected an incorrect block ID sequence on the tape.
+ * This problem typically indicates that the data on the tape is damaged.
+ * User action:
+ * If this problem occurred during a write-type operation, reposition the tape
+ * and repeat the operation.
+ */
+
+/*?
+ * Text: "%s: A path equipment check occurred for the tape device\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A path equipment check has occurred. This check indicates problems with the
+ * connection between the mainframe system and the tape control unit.
+ * User action:
+ * Ensure that the cable connections between the mainframe system and the
+ * control unit are securely in place and not damaged.
+ */
+
+/*?
+ * Text: "%s: The tape unit cannot process the tape format\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * Either the tape unit is not able to read the format ID mark, or the
+ * specified format is not supported by the tape unit.
+ * User action:
+ * If you do not need the data recorded on the current tape, use a different
+ * tape or write a new format ID mark at the beginning of the tape. Be aware
+ * that writing a new ID mark leads to a loss of all data that has been
+ * recorded on the tape. If you need the data on the current tape, use a tape
+ * unit that supports the tape format.
+ */
+
+/*?
+ * Text: "%s: The tape medium is write-protected\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A write-type operation failed because the tape medium is write-protected.
+ * User action:
+ * Eject the tape cartridge, switch off the write protection on the cartridge,
+ * insert the cartridge, and try the operation again.
+ */
+
+/*?
+ * Text: "%s: The tape does not have the required tape tension\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape does not have the required tape tension.
+ * User action:
+ * Rewind and reposition the tape, then repeat the operation.
+ */
+
+/*?
+ * Text: "%s: The tape unit failed to load the cartridge\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * An error has occurred while loading the tape cartridge.
+ * User action:
+ * Unload the cartridge and load it again.
+ */
+
+/*?
+ * Text: "%s: Automatic unloading of the tape cartridge failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit failed to unload the cartridge.
+ * User action:
+ * Unload the cartridge manually by using the eject button on the tape unit.
+ */
+
+/*?
+ * Text: "%s: An equipment check has occurred on the tape unit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * Possible reasons for the check condition are a unit adapter error, a buffer
+ * error on the lower interface, an unusable internal path, or an error that
+ * has occurred while loading the cartridge.
+ * User action:
+ * Examine the tape unit and the cartridge loader. Consult the tape unit
+ * documentation for details.
+ */
+
+/*?
+ * Text: "%s: The tape information states an incorrect length\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape is shorter than stated at the beginning of the tape data. A
+ * possible reason for this problem is that the tape might have been physically
+ * truncated. Data written to the tape might be incomplete or damaged.
+ * User action:
+ * If this problem occurred during a write-type operation, consider repeating
+ * the operation with a different tape cartridge.
+ */
+
+/*?
+ * Text: "%s: The tape unit is not ready\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit is online but not ready.
+ * User action:
+ * Turn the ready switch on the tape unit to the ready position and try the
+ * operation again.
+ */
+
+/*?
+ * Text: "%s: The tape medium has been rewound or unloaded manually\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit rewind button, unload button, or both have been used to
+ * rewind or unload the tape cartridge. A tape cartridge other than the
+ * intended cartridge might have been inserted or the tape medium might not
+ * be at the expected position.
+ * User action:
+ * Verify that the correct tape cartridge has been inserted and that the tape
+ * medium is at the required position before continuing to work with the tape.
+ */
+
+/*?
+ * Text: "%s: The tape subsystem is running in degraded mode\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape subsystem is not operating at its maximum performance.
+ * User action:
+ * Contact your service representative for the tape unit and report this
+ * problem.
+ */
+
+/*?
+ * Text: "%s: The tape unit is already assigned\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit is already assigned to another channel path.
+ * User action:
+ * Free the tape unit from the operating system instance to which it is
+ * currently assigned, then try again.
+ */
+
+/*?
+ * Text: "%s: The tape unit is not online\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit is not online to the tape device driver.
+ * User action:
+ * Ensure that the tape unit is operational and that the cable connections
+ * between the control unit and the tape unit are securely in place and not
+ * damaged.
+ */
+
+/*?
+ * Text: "%s: The control unit has fenced access to the tape volume\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The control unit fences further access to the current tape volume. The data
+ * integrity on the tape volume might have been compromised.
+ * User action:
+ * Rewind and unload the tape cartridge.
+ */
+
+/*?
+ * Text: "%s: A parity error occurred on the tape bus\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * A data parity check error occurred on the bus. Data that was read or written
+ * while the error occurred is not valid.
+ * User action:
+ * Reposition the tape and repeat the read-type or write-type operation.
+ */
+
+/*?
+ * Text: "%s: I/O error recovery failed on the tape control unit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * An I/O error occurred that cannot be recovered by the automatic error
+ * recovery process of the tape control unit. The application that operates
+ * the tape unit will receive a return value of -EIO which indicates an
+ * I/O error. The data on the tape might be damaged.
+ * User action:
+ * If this problem occurred during a write-type operation, consider
+ * repositioning the tape and repeating the operation.
+ */
+
+/*?
+ * Text: "%s: The tape unit requires a firmware update\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit requires firmware patches from the tape control unit but the
+ * required patches are not available on the control unit.
+ * User action:
+ * Make the required patches available on the control unit then reposition the
+ * tape and retry the operation. For details about obtaining and installing
+ * firmware updates see the control unit documentation.
+ */
+
+/*?
+ * Text: "%s: The maximum block size for buffered mode is exceeded\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The block to be written is larger than allowed for the buffered mode.
+ * User action:
+ * Use a smaller block size.
+ */
+
+/*?
+ * Text: "%s: A channel interface error cannot be recovered\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * An error has occurred on the channel interface. This error cannot
+ * be recovered by the control unit error recovery process.
+ * User action:
+ * See the documentation of the control unit.
+ */
+
+/*?
+ * Text: "%s: A channel protocol error occurred\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * An error was detected in the channel protocol.
+ * User action:
+ * Reposition the tape and try the operation again.
+ */
+
+/*?
+ * Text: "%s: The tape unit does not support the compaction algorithm\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit cannot read the current tape. The data on the tape has been
+ * compressed with an algorithm that is not supported by the tape unit.
+ * User action:
+ * Use a tape unit that supports the compaction algorithm used for the
+ * current tape.
+ */
+
+/*?
+ * Text: "%s: The tape unit does not support tape format 3480-2 XF\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit does not support tapes recorded in the 3480-2 XF format.
+ * User action:
+ * If you do not need the data recorded on the current tape, rewind the tape
+ * and overwrite it with a supported format. If you need the data on the
+ * current tape, use a tape unit that supports the tape format.
+ */
+
+/*?
+ * Text: "%s: The tape unit does not support format 3480 XF\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit does not support tapes recorded in the 3480 XF format.
+ * User action:
+ * If you do not need the data recorded on the current tape, rewind the tape
+ * and overwrite it with a supported format. If you need the data on the
+ * current tape, use a tape unit that supports the tape format.
+ */
+
+/*?
+ * Text: "%s: The tape unit does not support the current tape length\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The length of the tape in the cartridge is incompatible with the tape unit.
+ * User action:
+ * Either use a different tape unit or use a tape with a supported length.
+ */
+
+/*?
+ * Text: "%s: The tape unit does not support the tape length\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The length of the tape in the cartridge is incompatible with the tape
+ * unit.
+ * User action:
+ * Either use a different tape unit or use a tape with a supported length.
+ */
+
diff --git a/Documentation/kmsg/s390/tape_3590 b/Documentation/kmsg/s390/tape_3590

new file mode 100644 (file)

index 0000000..f4ac446
--- /dev/null
+++ b/Documentation/kmsg/s390/tape_3590
@@ -0,0 +1,184 @@
+/*?
+ * Text: "%s: The tape medium must be loaded into a different tape unit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape device has indicated an error condition that requires loading
+ * the tape cartridge into a different tape unit to recover.
+ * User action:
+ * Unload the cartridge and use a different tape unit to retry the operation.
+ */
+
+/*?
+ * Text: "%s: Tape media information: exception %s, service %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: exception
+ *   @3: service
+ * Description:
+ * This is an operating system independent tape medium information message
+ * that was issued by the tape unit. The information in the message is
+ * intended for the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: Device subsystem information: exception %s, service %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: exception
+ *   @3: required service action
+ * Description:
+ * This is an operating system independent device subsystem information message
+ * that was issued by the tape unit. The information in the message is
+ * intended for the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: I/O subsystem information: exception %s, service %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: exception
+ *   @3: required service action
+ * Description:
+ * This is an operating system independent I/O subsystem information message
+ * that was issued by the tape unit. The information in the message is
+ * intended for the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: The tape unit has issued sense message %s\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: sense message code
+ * Description:
+ * The tape unit has issued an operating system independent sense message.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: The tape unit has issued an unknown sense message code 0x%x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: code
+ * Description:
+ * The tape device driver has received a sense message from the tape
+ * unit. The code of this sense message is unknown to the device
+ * driver.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: MIM SEV=%i, MC=%02x, ES=%x/%x, RC=%02x-%04x-%02x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: SEV
+ *   @3: message code
+ *   @4: exception
+ *   @5: required service action
+ *   @6: refcode
+ *   @7: mid
+ *   @8: fid
+ * Description:
+ * This is an operating system independent information message that was
+ * issued by the tape unit. The information in the message is intended for
+ * the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: IOSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: SEV
+ *   @3: model
+ *   @4: message code
+ *   @5: exception
+ *   @6: required service action
+ *   @7: refcode1
+ *   @8: refcode2
+ *   @9: refcode3
+ * Description:
+ * This is an operating system independent I/O subsystem information message
+ * that was issued by the tape unit. The information in the message is
+ * intended for the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: DEVSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: SEV
+ *   @3: model
+ *   @4: message code
+ *   @5: exception
+ *   @6: required service action
+ *   @7: refcode1
+ *   @8: refcode2
+ *   @9: refcode3
+ * Description:
+ * This is an operating system independent device subsystem information message
+ * issued by the tape unit. The information in the message is intended for
+ * the IBM customer engineer.
+ * User action:
+ * See the documentation for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: The tape unit has issued an unknown sense message code %x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ *   @2: code
+ * Description:
+ * The tape device has issued a sense message, that is unknown to the device
+ * driver.
+ * User action:
+ * Use the message code printed as hexadecimal value and see the documentation
+ * for the tape unit for further information.
+ */
+
+/*?
+ * Text: "%s: The tape unit failed to obtain the encryption key from EKM\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * The tape unit was unable to retrieve the encryption key required to decode
+ * the data on the tape from the enterprise key manager (EKM).
+ * User action:
+ * See the EKM and tape unit documentation for information about how to enable
+ * the tape unit to retrieve the encryption key.
+ */
+
+/*?
+ * Text: "%s: A different host has privileged access to the tape unit\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the tape device
+ * Description:
+ * You cannot access the tape unit because a different operating system
+ * instance has privileged access to the unit.
+ * User action:
+ * Unload the current cartridge to solve this problem.
+ */
+
diff --git a/Documentation/kmsg/s390/time b/Documentation/kmsg/s390/time

new file mode 100644 (file)

index 0000000..6058acb
--- /dev/null
+++ b/Documentation/kmsg/s390/time
@@ -0,0 +1,36 @@
+/*?
+ * Text: "The ETR interface has adjusted the clock by %li microseconds\n"
+ * Severity: Notice
+ * Parameter:
+ *   @1: number of microseconds
+ * Description:
+ * The external time reference (ETR) interface has synchronized the system
+ * clock with the external reference and set it to a new value. The time
+ * difference between the old and new clock value has been passed to the
+ * network time protocol (NTP) as a single shot adjustment.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The real or virtual hardware system does not provide an ETR interface\n"
+ * Severity: Warning
+ * Description:
+ * The 'etr=' parameter has been passed on the kernel parameter line for
+ * a Linux instance that does not have access to the external time reference
+ * (ETR) facility.
+ * User action:
+ * To avoid this warning remove the 'etr=' kernel parameter.
+ */
+
+/*?
+ * Text: "The real or virtual hardware system does not provide an STP interface\n"
+ * Severity: Warning
+ * Description:
+ * The 'stp=' parameter has been passed on the kernel parameter line for
+ * a Linux instance that does not have access to the server time protocol
+ * (STP) facility.
+ * User action:
+ * To avoid this warning remove the 'stp=' kernel parameter.
+ */
+
diff --git a/Documentation/kmsg/s390/vmcp b/Documentation/kmsg/s390/vmcp

new file mode 100644 (file)

index 0000000..a938fb2
--- /dev/null
+++ b/Documentation/kmsg/s390/vmcp
@@ -0,0 +1,13 @@
+/*?
+ * Text: "The z/VM CP interface device driver cannot be loaded without z/VM\n"
+ * Severity: Warning
+ * Description:
+ * With the z/VM CP interface you can issue z/VM CP commands from a Linux
+ * terminal session. On Linux instances that run in environments other than
+ * the z/VM hypervisor, the z/VM CP interface does not provide any useful
+ * function and the corresponding vmcp device driver cannot be loaded.
+ * User action:
+ * Load the vmcp device driver only on Linux instances that run as guest
+ * operating systems of the z/VM hypervisor. If the device driver has been
+ * compiled into the kernel, ignore this message.
+ */
diff --git a/Documentation/kmsg/s390/vmlogrdr b/Documentation/kmsg/s390/vmlogrdr

new file mode 100644 (file)

index 0000000..0dc7def
--- /dev/null
+++ b/Documentation/kmsg/s390/vmlogrdr
@@ -0,0 +1,18 @@
+/*? Text: "vmlogrdr: failed to start recording automatically\n" */
+/*? Text: "vmlogrdr: connection severed with reason %i\n" */
+/*? Text: "vmlogrdr: iucv connection to %s failed with rc %i \n" */
+/*? Text: "vmlogrdr: failed to stop recording automatically\n" */
+/*? Text: "not running under VM, driver not loaded.\n" */
+
+/*?
+ * Text: "vmlogrdr: device %s is busy. Refuse to suspend.\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device name
+ * Description:
+ * Suspending vmlogrdr devices that are in use is not supported.
+ * A request to suspend such a device is refused.
+ * User action:
+ * Close all applications that use any of the vmlogrdr devices
+ * and then try to suspend the system again.
+ */
diff --git a/Documentation/kmsg/s390/vmur b/Documentation/kmsg/s390/vmur

new file mode 100644 (file)

index 0000000..52e7db2
--- /dev/null
+++ b/Documentation/kmsg/s390/vmur
@@ -0,0 +1,47 @@
+/*?
+ * Text: "The %s cannot be loaded without z/VM\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: z/VM virtual unit record device driver
+ * Description:
+ * The z/VM virtual unit record device driver provides Linux with access to
+ * z/VM virtual unit record devices like punch card readers, card punches, and
+ * line printers. On Linux instances that run in environments other than the
+ * z/VM hypervisor, the device driver does not provide any useful function and
+ * the corresponding vmur module cannot be loaded.
+ * User action:
+ * Load the vmur module only on Linux instances that run as guest operating
+ * systems of the z/VM hypervisor. If the z/VM virtual unit record device
+ * driver has been compiled into the kernel, ignore this message.
+ */
+
+/*?
+ * Text: "Kernel function alloc_chrdev_region failed with error code %d\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: error code according to errno definitions
+ * Description:
+ * The z/VM virtual unit record device driver (vmur) needs to register a range
+ * of character device minor numbers from 0x0000 to 0xffff.
+ * This registration failed, probably because of memory constraints.
+ * User action:
+ * Free some memory and reload the vmur module. If the z/VM virtual unit
+ * record device driver has been compiled into the kernel, reboot Linux.
+ * Consider assigning more memory to your LPAR or z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "Unit record device %s is busy, %s refusing to suspend.\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the unit record device
+ *   @2: z/VM virtual unit record device driver
+ * Description:
+ * Linux cannot be suspended while a unit record device is in use.
+ * User action:
+ * Stop all applications that work on z/VM spool file queues, for example, the
+ * vmur tool. Then try again to suspend Linux.
+ */
+
+/*? Text: "%s loaded.\n" */
+/*? Text: "%s unloaded.\n" */
diff --git a/Documentation/kmsg/s390/vmwatchdog b/Documentation/kmsg/s390/vmwatchdog

new file mode 100644 (file)

index 0000000..99d741a
--- /dev/null
+++ b/Documentation/kmsg/s390/vmwatchdog
@@ -0,0 +1,26 @@
+/*?
+ * Text: "The system cannot be suspended while the watchdog is in use\n"
+ * Severity: Error
+ * Description:
+ * A program is currently using the vmwatchdog device node. The watchdog
+ * device driver prevents the system from being suspended while the watchdog
+ * device is in use.
+ * User action:
+ * If you want to suspend the system, find out which program uses the watchdog
+ * device. Stop the program or reconfigure it to not use the watchdog.
+ */
+
+
+/*?
+ * Text: "The system cannot be suspended while the watchdog is running\n"
+ * Severity: Error
+ * Description:
+ * The watchdog must not time out during hibernation. The watchdog
+ * device driver prevents the system from being suspended while the watchdog
+ * timer is running.
+ * User action:
+ * If you want to suspend the system, stop the watchdog, for example, by entering
+ * the command: 'echo V > /dev/vmwatchdog'. Alternatively, stop the program that
+ * uses the watchdog or reconfigure the program to not use the watchdog.
+ */
+
diff --git a/Documentation/kmsg/s390/xpram b/Documentation/kmsg/s390/xpram

new file mode 100644 (file)

index 0000000..5188731
--- /dev/null
+++ b/Documentation/kmsg/s390/xpram
@@ -0,0 +1,73 @@
+/*?
+ * Text: "%d is not a valid number of XPRAM devices\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: number of partitions
+ * Description:
+ * The number of XPRAM partitions specified for the 'devs' module parameter
+ * or with the 'xpram.parts' kernel parameter must be an integer in the
+ * range 1 to 32. The XPRAM device driver created a maximum of 32 partitions
+ * that are probably not configured as intended.
+ * User action:
+ * If the XPRAM device driver has been compiled as a separate module,
+ * unload the module and load it again with a correct value for the 'devs'
+ * module parameter. If the XPRAM device driver has been compiled
+ * into the kernel, correct the 'xpram.parts' parameter in the kernel
+ * command line and restart Linux.
+ */
+
+/*?
+ * Text: "Not enough expanded memory available\n"
+ * Severity: Error
+ * Description:
+ * The amount of expanded memory required to set up your XPRAM partitions
+ * depends on the 'sizes' parameter specified for the xpram module or on
+ * the specifications for the 'xpram.parts' parameter if the XPRAM device
+ * driver has been compiled into the kernel. Your
+ * current specification exceeds the amount of available expanded memory.
+ * Your XPRAM partitions are probably not configured as intended.
+ * User action:
+ * If the XPRAM device driver has been compiled as a separate module,
+ * unload the xpram module and load it again with an appropriate value
+ * for the 'sizes' module parameter. If the XPRAM device driver has been
+ * compiled into the kernel, adjust the 'xpram.parts' parameter in the
+ * kernel command line and restart Linux. If you need more than the
+ * available expanded memory, increase the expanded memory allocation for
+ * your virtual hardware or LPAR.
+ */
+
+/*?
+ * Text: "No expanded memory available\n"
+ * Severity: Error
+ * Description:
+ * The XPRAM device driver has been loaded in a Linux instance that runs
+ * in an LPAR or virtual hardware without expanded memory.
+ * No XPRAM partitions are created.
+ * User action:
+ * Allocate expanded memory for your LPAR or virtual hardware or do not
+ * load the xpram module. You can ignore this message, if you do not want
+ * to create XPRAM partitions.
+ */
+
+/*?
+ * Text: "Resuming the system failed: %s\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: cause of the failure
+ * Description:
+ * A system cannot be resumed if the expanded memory setup changes
+ * after hibernation. Possible reasons for the failure are:
+ * - Expanded memory was removed after hibernation.
+ * - Size of the expanded memory changed after hibernation.
+ * The system is stopped with a kernel panic.
+ * User action:
+ * Reboot Linux.
+ */
+
+/*? Text: "  number of devices (partitions): %d \n" */
+/*? Text: "  size of partition %d: %u kB\n" */
+/*? Text: "  size of partition %d to be set automatically\n" */
+/*? Text: "  memory needed (for sized partitions): %lu kB\n" */
+/*? Text: "  partitions to be sized automatically: %d\n" */
+/*? Text: "  automatically determined partition size: %lu kB\n" */
+/*? Text: "  %u pages expanded memory found (%lu KB).\n" */
diff --git a/Documentation/kmsg/s390/zdump b/Documentation/kmsg/s390/zdump

new file mode 100644 (file)

index 0000000..fc61c71
--- /dev/null
+++ b/Documentation/kmsg/s390/zdump
@@ -0,0 +1,12 @@
+/*?
+ * Text: "The 32-bit dump tool cannot be used for a 64-bit system\n"
+ * Severity: Alert
+ * Description:
+ * The dump process ends without creating a system dump.
+ * User action:
+ * Use a 64-bit dump tool to obtain a system dump for a 64-bit Linux instance.
+ */
+
+/*? Text: "DETECTED 'S390 (32 bit) OS'\n" */
+/*? Text: "0x%x is an unknown architecture.\n" */
+/*? Text: "DETECTED 'S390X (64 bit) OS'\n" */
diff --git a/Documentation/kmsg/s390/zfcp b/Documentation/kmsg/s390/zfcp

new file mode 100644 (file)

index 0000000..b29f504
--- /dev/null
+++ b/Documentation/kmsg/s390/zfcp
@@ -0,0 +1,865 @@
+/*?
+ * Text: "%s is not a valid SCSI device\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: device specification
+ * Description:
+ * The specification for an initial SCSI device provided with the 'zfcp.device'
+ * kernel parameter or with the 'device' module parameter is syntactically
+ * incorrect. The specified SCSI device could not be attached to the Linux
+ * system.
+ * User action:
+ * Correct the value for the 'zfcp.device' or 'device' parameter and reboot
+ * Linux. See "Device Drivers, Features, and Commands" for information about
+ * the syntax.
+ */
+
+/*?
+ * Text: "Registering the misc device zfcp_cfdc failed\n"
+ * Severity: Error
+ * Description:
+ * The zfcp device driver failed to register the device that provides access to
+ * the adapter access control file (ACL tables). The device driver
+ * initialization failed. A possible cause for this problem is memory
+ * constraints.
+ * User action:
+ * Free some memory and try again to load the zfcp device driver. If the zfcp
+ * device driver has been compiled into the kernel, reboot Linux. Consider
+ * assigning more memory to your LPAR or z/VM guest virtual machine. If the
+ * problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "The zfcp device driver could not register with the common I/O layer\n"
+ * Severity: Error
+ * Description:
+ * The device driver initialization failed. A possible cause of this problem is
+ * memory constraints.
+ * User action:
+ * Free some memory and try again to load the zfcp device driver. If the zfcp
+ * device driver has been compiled into the kernel, reboot Linux. Consider
+ * assigning more memory to your LPAR or z/VM guest virtual machine. If the
+ * problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up data structures for the FCP adapter failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver could not allocate data structures for an FCP adapter.
+ * A possible reason for this problem is memory constraints.
+ * User action:
+ * Set the FCP adapter offline or detach it from the Linux system, free some
+ * memory and set the FCP adapter online again or attach it again. If this
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The FCP device is operational again\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * An FCP device has been unavailable because it had been detached from the
+ * Linux system or because the corresponding CHPID was offline. The FCP device
+ * is now available again and the zfcp device driver resumes all operations to
+ * the FCP device.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The CHPID for the FCP device is offline\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The CHPID for an FCP device has been set offline, either logically in Linux
+ * or on the hardware.
+ * User action:
+ * Find out which CHPID corresponds to the FCP device, for example, with the
+ * lscss command. Check if the CHPID has been set logically offline in sysfs.
+ * Write 'on' to the CHPID's status attribute to set it online. If the CHPID is
+ * online in sysfs, find out if it has been varied offline through a hardware
+ * management interface, for example the service element (SE).
+ */
+
+/*?
+ * Text: "%s: The FCP device has been detached\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * An FCP device is no longer available to Linux.
+ * User action:
+ * Ensure that the FCP adapter is operational and attached to the LPAR or z/VM
+ * virtual machine.
+ */
+
+/*?
+ * Text: "%s: The FCP device did not respond within the specified time\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The common I/O layer waited for a response from the FCP adapter but
+ * no response was received within the specified time limit. This might
+ * indicate a hardware problem.
+ * User action:
+ * Consult your hardware administrator. If this problem persists,
+ * gather Linux debug data, collect the FCP adapter hardware logs, and
+ * report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Registering the FCP device with the SCSI stack failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter could not be registered with the Linux SCSI
+ * stack. A possible reason for this problem is memory constraints.
+ * User action:
+ * Set the FCP adapter offline or detach it from the Linux system, free some
+ * memory and set the FCP adapter online again or attach it again. If this
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: ERP cannot recover an error on the FCP device\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * An error occurred on an FCP device. The error recovery procedure (ERP)
+ * could not resolve the error. The FCP device driver cannot use the FCP device.
+ * User action:
+ * Check for previous error messages for the same FCP device to find the
+ * cause of the problem.
+ */
+
+/*?
+ * Text: "%s: Creating an ERP thread for the FCP device failed.\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver could not set up error recovery procedure (ERP)
+ * processing for the FCP device. The FCP device is not available for use
+ * in Linux.
+ * User action:
+ * Free some memory and try again to load the zfcp device driver. If the zfcp
+ * device driver has been compiled into the kernel, reboot Linux. Consider
+ * assigning more memory to your LPAR or z/VM guest virtual machine. If the
+ * problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: ERP failed for unit 0x%016Lx on port 0x%016Lx\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * An error occurred on the SCSI device at the specified LUN. The error recovery
+ * procedure (ERP) could not resolve the error. The SCSI device is not
+ * available.
+ * User action:
+ * Verify that the LUN is correct. Check the fibre channel fabric for errors
+ * related to the specified WWPN and LUN, the storage server, and Linux.
+ */
+
+/*?
+ * Text: "%s: ERP failed for remote port 0x%016Lx\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: WWPN
+ * Description:
+ * An error occurred on a remote port. The error recovery procedure (ERP)
+ * could not resolve the error. The port is not available.
+ * User action:
+ * Verify that the WWPN is correct and check the fibre channel fabric for
+ * errors related to the WWPN.
+ */
+
+/*?
+ * Text: "%s: Attaching the name server port to the FCP device failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver could not attach the name server port of the fibre
+ * channel fabric to an FCP device. A possible cause of this problem is
+ * memory constraints.
+ * User action:
+ * Set the FCP device offline, free some memory, then set the FCP device online
+ * again. If this does not resolve the problem, reboot Linux and try again to
+ * set the FCP device online.
+ */
+
+/*?
+ * Text: "%s: Registering unit 0x%016Lx on port 0x%016Lx failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The Linux kernel could not allocate enough memory to register the SCSI
+ * device at the indicated LUN with the SCSI stack. The SCSI device is not
+ * available.
+ * User action:
+ * Free some memory then detach the LUN and attach it again.
+ */
+
+/*?
+ * Text: "%s: Registering port 0x%016Lx failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: WWPN
+ * Description:
+ * The Linux kernel could not allocate enough memory to register the
+ * remote port with the indicated WWPN with the SCSI stack. The remote
+ * port is not available.
+ * User action:
+ * Free some memory and trigger the rescan for ports.
+ */
+
+/*?
+ * Text: "%s: A QDIO problem occurred\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * QDIO reported a problem to the zfcp device driver. The zfcp device driver
+ * tries to recover from this problem.
+ * User action:
+ * Check for related error messages. If this problem occurs frequently, gather
+ * Linux debug data and contact your support organization.
+ */
+
+/*?
+ * Text: "%s: A QDIO protocol error occurred, operations continue\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver detected a missing flag in a QDIO queue. The device
+ * driver tries to keep the FCP device operational.
+ * User action:
+ * Check for related error messages. If this problem occurs frequently, gather
+ * Linux debug data, collect the FCP adapter hardware logs, and report the
+ * problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up the QDIO connection to the FCP adapter failed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver failed to establish a QDIO connection with the FCP
+ * adapter.
+ * User action:
+ * Set the FCP adapter offline or detach it from the Linux system, free some
+ * memory and set the FCP adapter online again or attach it again. If this
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter reported a problem that cannot be recovered\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter has a problem that cannot be recovered by the zfcp device
+ * driver. The zfcp device driver stopped using the FCP device.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: There is a wrap plug instead of a fibre channel cable\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter is not physically connected to the fibre channel fabric.
+ * User action:
+ * Remove the wrap plug from the FCP adapter and connect the adapter with the
+ * fibre channel fabric.
+ */
+
+/*?
+ * Text: "%s: Access denied to unit 0x%016Lx on port 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The Linux system is not allowed to access the SCSI device at the indicated
+ * LUN.
+ * User action:
+ * Update the access control table of the FCP device to grant the Linux
+ * system access to the LUN or remove the LUN from the Linux system.
+ */
+
+/*?
+ * Text: "%s: FCP device not operational because of an unsupported FC class\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter hardware does not support the fibre channel service class
+ * requested by the zfcp device driver. This problem indicates a program error
+ * in the zfcp device driver.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: 0x%Lx is an ambiguous request identifier\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: request ID
+ * Description:
+ * The FCP adapter reported that it received the same request ID twice. This is
+ * an error. The zfcp device driver stopped using the FCP device.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: QTCB version 0x%x not supported by FCP adapter (0x%x to 0x%x)\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: requested version
+ *   @3: lowest supported version
+ *   @4: highest supported version
+ * Description:
+ * See message text.
+ * The queue transfer control block (QTCB) version requested by the zfcp device
+ * driver is not supported by the FCP adapter hardware.
+ * User action:
+ * If the requested version is higher than the highest version supported by the
+ * hardware, install more recent firmware on the FCP adapter. If the requested
+ * version is lower than the lowest version supported by the hardware, upgrade
+ * to a Linux level with a more recent zfcp device driver.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter could not log in to the fibre channel fabric\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The fibre channel switch rejected the login request from the FCP adapter.
+ * User action:
+ * Check the fibre channel fabric or switch logs for possible errors.
+ */
+
+/*?
+ * Text: "%s: The FCP device is suspended because of a firmware update\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP device is not available while a firmware update is in progress. This
+ * problem is temporary. The FCP device will resume operations when the
+ * firmware update is completed.
+ * User action:
+ * Wait 10 seconds and try the operation again.
+ */
+
+/*?
+ * Text: "%s: All NPIV ports on the FCP adapter have been assigned\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The number of N_Port ID Virtualization (NPIV) ports that can be assigned
+ * on an FCP adapter is limited. Once assigned, NPIV ports are not released
+ * automatically but have to be released explicitly through the support
+ * element (SE).
+ * User action:
+ * Identify NPIV ports that have been assigned but are no longer in use and
+ * release them from the SE.
+ */
+
+/*?
+ * Text: "%s: The link between the FCP adapter and the FC fabric is down\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter is not usable. Specific error information is not available.
+ * User action:
+ * Check the cabling and the fibre channel fabric configuration. If this
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Access denied to port 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: WWPN
+ * Description:
+ * The Linux system is not allowed to access the remote port with the specified
+ * WWPN.
+ * User action:
+ * Update the access control table of the FCP device to grant the Linux
+ * system access to the WWPN or remove the WWPN from the Linux system.
+ */
+
+/*?
+ * Text: "%s: The QTCB type is not supported by the FCP adapter\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The queue transfer control block (QTCB) type requested by the zfcp device
+ * driver is not supported by the FCP adapter hardware.
+ * User action:
+ * Install the latest firmware on your FCP adapter hardware. If this does not
+ * resolve the problem, upgrade to a Linux level with a more recent zfcp device
+ * driver. If the problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: The error threshold for checksum statistics has been exceeded\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter has reported a large number of bit errors. This might
+ * indicate a problem with the physical components of the fibre channel fabric.
+ * Details about the errors have been written to the HBA trace for the FCP
+ * adapter.
+ * User action:
+ * Check for problems in the fibre channel fabric and ensure that all cables
+ * are properly plugged.
+ */
+
+/*?
+ * Text: "%s: The local link has been restored\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * A problem with the connection between the FCP adapter and the adjacent node
+ * on the fibre channel fabric has been resolved. The FCP adapter is now
+ * available again.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Access denied according to ACT rule type %s, rule %d\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: access rule type
+ *   @3: access rule
+ * Description:
+ * A rule in the access control table (ACT) for the FCP device denies access
+ * to a remote port or a LUN.
+ * User action:
+ * Examine the access control tables for the FCP device to see if the
+ * specified rule is correct.
+ */
+
+/*?
+ * Text: "%s: The mode table on the FCP adapter has been damaged\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * This is an FCP adapter hardware problem.
+ * User action:
+ * Report this problem with FCP hardware logs to IBM support.
+ */
+
+/*?
+ * Text: "%s: The adjacent fibre channel node does not support FCP\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The fibre channel switch or storage system that is connected to the FCP
+ * channel does not support the fibre channel protocol (FCP). The zfcp
+ * device driver stopped using the FCP device.
+ * User action:
+ * Check the adjacent fibre channel node.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter does not recognize the command 0x%x\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: command
+ * Description:
+ * A command code that was sent from the zfcp device driver to the FCP adapter
+ * is not valid. The zfcp device driver stopped using the FCP device.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: There is no light signal from the local fibre channel cable\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * There is no signal on the fibre channel cable that connects the FCP adapter
+ * to the fibre channel fabric.
+ * User action:
+ * Ensure that the cable is in place and connected properly to the FCP adapter
+ * and to the adjacent fibre channel switch or storage system.
+ */
+
+/*?
+ * Text: "%s: The WWPN assignment file on the FCP adapter has been damaged\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * This is an FCP adapter hardware problem.
+ * User action:
+ * Report this problem with FCP hardware logs to IBM support.
+ */
+
+/*?
+ * Text: "%s: The FCP device detected a WWPN that is duplicate or not valid\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * This condition indicates an error in the FCP adapter hardware or in the z/VM
+ * hypervisor.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to IBM support.
+ */
+
+/*?
+ * Text: "%s: The fibre channel fabric does not support NPIV\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP adapter requires N_Port ID Virtualization (NPIV) from the adjacent
+ * fibre channel node. Either the FCP adapter is connected to a fibre channel
+ * switch that does not support NPIV or the FCP adapter tries to use NPIV in a
+ * point-to-point setup. The connection is not operational.
+ * User action:
+ * Verify that NPIV is correctly used for this connection. Check the FCP adapter
+ * configuration and the fibre channel switch configuration. If necessary,
+ * update the fibre channel switch firmware.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter cannot support more NPIV ports\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * N_Port ID Virtualization (NPIV) ports consume physical resources on the FCP
+ * adapter. The FCP adapter resources are exhausted. The connection is not
+ * operational.
+ * User action:
+ * Analyze the number of available NPIV ports and which operating system
+ * instances use them. If necessary, reconfigure your setup to move some
+ * NPIV ports to an FCP adapter with free resources.
+ */
+
+/*?
+ * Text: "%s: The adjacent switch cannot support more NPIV ports\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * N_Port ID Virtualization (NPIV) ports consume physical resources. The
+ * resources of the fibre channel switch that is connected to the FCP adapter
+ * are exhausted. The connection is not operational.
+ * User action:
+ * Analyze the number of available NPIV ports on the adjacent fibre channel
+ * switch and how they are used. If necessary, reconfigure your fibre channel
+ * fabric to accommodate the required NPIV ports.
+ */
+
+/*?
+ * Text: "%s: 0x%x is not a valid transfer protocol status\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: status information
+ * Description:
+ * The transfer protocol status information reported by the FCP adapter is not
+ * a valid status for the zfcp device driver. The zfcp device driver stopped
+ * using the FCP device.
+ * User action:
+ * Gather Linux debug data, collect the FCP adapter hardware logs, and report
+ * this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Unknown or unsupported arbitrated loop fibre channel topology detected\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The FCP device is connected to a fibre channel arbitrated loop or the FCP adapter
+ * reported an unknown fibre channel topology. The zfcp device driver supports
+ * point-to-point connections and switched fibre channel fabrics but not arbitrated
+ * loop topologies. The FCP device cannot be used.
+ * User action:
+ * Check the fibre channel setup and ensure that only supported topologies are
+ * connected to the FCP adapter.
+ */
+
+/*?
+ * Text: "%s: FCP adapter maximum QTCB size (%d bytes) is too small\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: maximum supported size
+ *   @3: requested QTCB size
+ * Description:
+ * The queue transfer control block (QTCB) size requested by the zfcp
+ * device driver is not supported by the FCP adapter hardware.
+ * User action:
+ * Update the firmware on your FCP adapter hardware to the latest
+ * available level and update the Linux kernel to the latest supported
+ * level. If the problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter only supports newer control block versions\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The protocol supported by the FCP adapter is not compatible with the zfcp
+ * device driver.
+ * User action:
+ * Upgrade your Linux kernel to a level that includes a zfcp device driver
+ * with support for the control block version required by your FCP adapter.
+ */
+
+/*?
+ * Text: "%s: The FCP adapter only supports older control block versions\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ * Description:
+ * The protocol supported by the FCP adapter is not compatible with the zfcp
+ * device driver.
+ * User action:
+ * Install the latest firmware on your FCP adapter.
+ */
+
+/*?
+ * Text: "%s: Not enough FCP adapter resources to open remote port 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: WWPN
+ * Description:
+ * Each port that is opened consumes physical resources of the FCP adapter to
+ * which it is attached. These resources are exhausted and the specified port
+ * cannot be opened.
+ * User action:
+ * Reduce the total number of remote ports that are attached to the
+ * FCP adapter.
+ */
+
+/*?
+ * Text: "%s: LUN 0x%Lx on port 0x%Lx is already in use by CSS%d, MIF Image ID %x\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: remote port WWPN
+ *   @4: channel subsystem ID
+ *   @5: MIF Image ID of the LPAR
+ * Description:
+ * The SCSI device at the indicated LUN is already in use by another system.
+ * Only one system at a time can use the SCSI device.
+ * User action:
+ * Ensure that the other system stops using the device before trying to use it.
+ */
+
+/*?
+ * Text: "%s: No handle is available for LUN 0x%016Lx on port 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The FCP adapter can only open a limited number of SCSI devices. This limit
+ * has been reached and the SCSI device at the indicated LUN cannot be opened.
+ * User action:
+ * Check all SCSI devices opened through the FCP adapter and close some of them.
+ */
+
+/*?
+ * Text: "%s: SCSI device at LUN 0x%016Lx on port 0x%016Lx opened read-only\n"
+ * Severity: Informational
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The access control tables in the FCP adapter allow read-only access for the
+ * LUN. Write access is not permitted for your Linux instance. The SCSI
+ * device has been opened successfully in read-only access mode.
+ * User action:
+ * None if read-only access is sufficient. If you require write access, change
+ * the access control tables in the FCP adapter.
+ */
+
+/*?
+ * Text: "%s: Exclusive read-only access not supported (unit 0x%016Lx, port 0x%016Lx)\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The access configuration specified in the access control tables of the FCP
+ * adapter is not valid. The SCSI device at the indicated LUN cannot be
+ * accessed.
+ * User action:
+ * Change the access control tables in the FCP adapter.
+ */
+
+/*?
+ * Text: "%s: Shared read-write access not supported (unit 0x%016Lx, port 0x%016Lx)\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * The access configuration specified in the access control tables of the FCP
+ * adapter is not valid. The SCSI device at the indicated LUN cannot be
+ * accessed.
+ * User action:
+ * Change the access control tables in the FCP adapter.
+ */
+
+/*?
+ * Text: "%s: Incorrect direction %d, unit 0x%016Lx on port 0x%016Lx closed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: value in direction field
+ *   @3: LUN
+ *   @4: WWPN
+ * Description:
+ * The direction field in a SCSI request contains an incorrect value. The zfcp
+ * device driver closed down the SCSI device at the indicated LUN.
+ * User action:
+ * Gather Linux debug data and report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Incorrect CDB length %d, unit 0x%016Lx on port 0x%016Lx closed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: value in length field
+ *   @3: LUN
+ *   @4: WWPN
+ * Description:
+ * The control-data-block (CDB) length field in a SCSI request is not valid or
+ * too large for the FCP adapter. The zfcp device driver closed down the SCSI
+ * device at the indicated LUN.
+ * User action:
+ * Gather Linux debug data and report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Oversize data package, unit 0x%016Lx on port 0x%016Lx closed\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: LUN
+ *   @3: WWPN
+ * Description:
+ * A SCSI request with too much data has been sent to the SCSI device at the
+ * indicated LUN. The FCP adapter cannot handle data packets of this size and
+ * the SCSI device driver closed down the SCSI device.
+ * User action:
+ * Gather Linux debug data and report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Opening WKA port 0x%x failed\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: destination ID of the WKA port
+ * Description:
+ * The FCP adapter rejected a request to open the specified
+ * well-known address (WKA) port. No retry is possible.
+ * User action:
+ * Verify the setup and check if the maximum number of remote ports
+ * used through this adapter is below the maximum allowed. If the
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The name server reported %d words residual data\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: number of words in residual data
+ * Description:
+ * The fibre channel name server sent too much information about remote ports.
+ * The zfcp device driver did not receive sufficient information to attach all
+ * available remote ports in the SAN.
+ * User action:
+ * Verify that you are running the latest firmware level on the FCP
+ * adapter. Check your SAN setup and consider reducing the number of ports
+ * visible to the FCP adapter by using more restrictive zoning in the SAN.
+ */
+
+/*?
+ * Text: "%s: A port opened with WWPN 0x%016Lx returned data that identifies it as WWPN 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: bus ID of the zfcp device
+ *   @2: expected WWPN
+ *   @3: reported WWPN
+ * Description:
+ * A remote port was opened successfully, but it reported an
+ * unexpected WWPN in the returned port login (PLOGI) data. This
+ * condition might have been caused by a change applied to the SAN
+ * configuration while the port was being opened.
+ * User action:
+ * If this condition is only temporary and access to the remote port
+ * is possible, no action is required. If the condition persists,
+ * identify the storage system with the specified WWPN and contact the
+ * support organization of the storage system.
+ */
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt

index 6d78841..c85756e 100644 (file)
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -603,6 +603,18 @@ can be ORed together:
  1024 - A module from drivers/staging was loaded.
  2048 - The system is working around a severe firmware bug.
  4096 - An out-of-tree module has been loaded.
+ 0x40000000 - An unsupported kernel module was loaded.
+ 0x80000000 - A kernel module with external support was loaded.
+
+==============================================================
+
+unsupported:
+
+Controls whether unsupported kernel modules may be loaded:
+
+  0 - refuse to load unsupported modules,
+  1 - warn when loading unsupported modules,
+  2 - don't warn.
  
  ==============================================================
  
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt

new file mode 100644 (file)

index 0000000..5a1a00c
--- /dev/null
+++ b/Documentation/vm/frontswap.txt
@@ -0,0 +1,210 @@
+Frontswap provides a "transcendent memory" interface for swap pages.
+In some environments, dramatic performance savings may be obtained because
+swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk.
+
+Frontswap is so named because it can be thought of as the opposite of
+a "backing" store for a swap device.  The storage is assumed to be
+a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming
+to the requirements of transcendent memory (such as Xen's "tmem", or
+in-kernel compressed memory, aka "zcache", or future RAM-like devices);
+this pseudo-RAM device is not directly accessible or addressable by the
+kernel and is of unknown and possibly time-varying size.  The driver
+links itself to frontswap by calling frontswap_register_ops to set the
+frontswap_ops funcs appropriately and the functions it provides must
+conform to certain policies as follows:
+
+An "init" prepares the device to receive frontswap pages associated
+with the specified swap device number (aka "type").  A "put_page" will
+copy the page to transcendent memory and associate it with the type and
+offset associated with the page. A "get_page" will copy the page, if found,
+from transcendent memory into kernel memory, but will NOT remove the page
+from transcendent memory.  An "invalidate_page" will remove the page
+from transcendent memory and an "invalidate_area" will remove ALL pages
+associated with the swap type (e.g., like swapoff) and notify the "device"
+to refuse further puts with that swap type.
+
+Once a page is successfully put, a matching get on the page will normally
+succeed.  So when the kernel finds itself in a situation where it needs
+to swap out a page, it first attempts to use frontswap.  If the put returns
+success, the data has been successfully saved to transcendent memory and
+a disk write and, if the data is later read back, a disk read are avoided.
+If a put returns failure, transcendent memory has rejected the data, and the
+page can be written to swap as usual.
+
+Note that if a page is put and the page already exists in transcendent memory
+(a "duplicate" put), either the put succeeds and the data is overwritten,
+or the put fails AND the page is invalidated.  This ensures stale data may
+never be obtained from frontswap.
+
+If properly configured, monitoring of frontswap is done via debugfs in
+the /sys/kernel/debug/frontswap directory.  The effectiveness of
+frontswap can be measured (across all swap devices) with:
+
+failed_puts    - how many put attempts have failed
+gets           - how many gets were attempted (all should succeed)
+succ_puts      - how many put attempts have succeeded
+invalidates    - how many invalidates were attempted
+
+A backend implementation may provide additional metrics.
+
+FAQ
+
+1) Where's the value?
+
+When a workload starts swapping, performance falls through the floor.
+Frontswap significantly increases performance in many such workloads by
+providing a clean, dynamic interface to read and write swap pages to
+"transcendent memory" that is otherwise not directly addressable to the kernel.
+This interface is ideal when data is transformed to a different form
+and size (such as with compression) or secretly moved (as might be
+useful for write-balancing for some RAM-like devices).  Swap pages (and
+evicted page-cache pages) are a great use for this kind of slower-than-RAM-
+but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and
+cleancache) interface to transcendent memory provides a nice way to read
+and write -- and indirectly "name" -- the pages.
+
+In the virtual case, the whole point of virtualization is to statistically
+multiplex physical resources across the varying demands of multiple
+virtual machines.  This is really hard to do with RAM and efforts to do
+it well with no kernel changes have essentially failed (except in some
+well-publicized special-case workloads).  Frontswap -- and cleancache --
+with a fairly small impact on the kernel, provides a huge amount
+of flexibility for more dynamic, flexible RAM multiplexing.
+Specifically, the Xen Transcendent Memory backend allows otherwise
+"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple
+virtual machines, but the pages can be compressed and deduplicated to
+optimize RAM utilization.  And when guest OS's are induced to surrender
+underutilized RAM (e.g. with "self-ballooning"), sudden unexpected
+memory pressure may result in swapping; frontswap allows those pages
+to be swapped to and from hypervisor RAM if overall host system memory
+conditions allow.
+
+2) Sure there may be performance advantages in some situations, but
+   what's the space/time overhead of frontswap?
+
+If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into
+nothingness and the only overhead is a few extra bytes per swapon'ed
+swap device.  If CONFIG_FRONTSWAP is enabled but no frontswap "backend"
+registers, there is one extra comparison of a global variable against zero
+for every swap page read or written.  If CONFIG_FRONTSWAP is enabled
+AND a frontswap backend registers AND the backend fails every "put"
+request (i.e. provides no memory despite claiming it might),
+CPU overhead is still negligible -- and since every frontswap fail
+precedes a swap page write-to-disk, the system is highly likely
+to be I/O bound and using a small fraction of a percent of a CPU
+will be irrelevant anyway.
+
+As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend
+registers, one bit is allocated for every swap page for every swap
+device that is swapon'd.  This is added to the EIGHT bits (which
+was sixteen until about 2.6.34) that the kernel already allocates
+for every swap page for every swap device that is swapon'd.  (Hugh
+Dickins has observed that frontswap could probably steal one of
+the existing eight bits, but let's worry about that minor optimization
+later.)  For very large swap disks (which are rare) on a standard
+4K pagesize, this is 1MB per 32GB swap.
+
+3) OK, how about a quick overview of what this frontswap patch does
+   in terms that a kernel hacker can grok?
+
+Let's assume that a frontswap "backend" has registered during
+kernel initialization; this registration indicates that this
+frontswap backend has access to some "memory" that is not directly
+accessible by the kernel.  Exactly how much memory it provides is
+entirely dynamic and random.
+
+Whenever a swap-device is swapon'd frontswap_init() is called,
+passing the swap device number (aka "type") as a parameter.
+This notifies frontswap to expect attempts to "put" swap pages
+associated with that number.
+
+Whenever the swap subsystem is readying a page to write to a swap
+device (cf. swap_writepage()), frontswap_put_page is called.  Frontswap
+consults with the frontswap backend and if the backend says it does NOT
+have room, frontswap_put_page returns -1 and the kernel swaps the page
+to the swap device as normal.  Note that the response from the frontswap
+backend is unpredictable to the kernel; it may choose to never accept a
+page, it could accept every ninth page, or it might accept every
+page.  But if the backend does accept a page, the data from the page
+has already been copied and associated with the type and offset,
+and the backend guarantees the persistence of the data.  In this case,
+frontswap sets a bit in the "frontswap_map" for the swap device
+corresponding to the page offset on the swap device to which it would
+otherwise have written the data.
+
+When the swap subsystem needs to swap-in a page (swap_readpage()),
+it first calls frontswap_get_page() which checks the frontswap_map to
+see if the page was earlier accepted by the frontswap backend.  If
+it was, the page of data is filled from the frontswap backend and
+the swap-in is complete.  If not, the normal swap-in code is
+executed to obtain the page of data from the real swap device.
+
+So every time the frontswap backend accepts a page, a swap device read
+and (potentially) a swap device write are replaced by a "frontswap backend
+put" and (possibly) a "frontswap backend get", which are presumably much
+faster.
+
+4) Can't frontswap be configured as a "special" swap device that is
+   just higher priority than any real swap device (e.g. like zswap)?
+
+No.  Recall that acceptance of any swap page by the frontswap
+backend is entirely unpredictable. This is critical to the definition
+of frontswap because it grants completely dynamic discretion to the
+backend.  But since any "put" might fail, there must always be a real
+slot on a real swap device to swap the page.  Thus frontswap must be
+implemented as a "shadow" to every swapon'd device with the potential
+capability of holding every page that the swap device might have held
+and the possibility that it might hold no pages at all.
+On the downside, this also means that frontswap cannot contain more
+pages than the total of swapon'd swap devices.  For example, if NO
+swap device is configured on some installation, frontswap is useless.
+
+Further, frontswap is entirely synchronous whereas a real swap
+device is, by definition, asynchronous and uses block I/O.  The
+block I/O layer is not only unnecessary, but may perform "optimizations"
+that are inappropriate for a RAM-oriented device including delaying
+the write of some pages for a significant amount of time.  Synchrony is
+required to ensure the dynamicity of the backend and to avoid thorny race
+conditions that would unnecessarily and greatly complicate frontswap
+and/or the block I/O subsystem.
+
+In a virtualized environment, the dynamicity allows the hypervisor
+(or host OS) to do "intelligent overcommit".  For example, it can
+choose to accept pages only until host-swapping might be imminent,
+then force guests to do their own swapping.  In zcache, "poorly"
+compressible pages can be rejected, where "poorly" can itself be defined
+dynamically depending on current memory constraints.
+
+5) Why this weird definition about "duplicate puts"?  If a page
+   has been previously successfully put, can't it always be
+   successfully overwritten?
+
+Nearly always it can, but no, sometimes it cannot.  Consider an example
+where data is compressed and the original 4K page has been compressed
+to 1K.  Now an attempt is made to overwrite the page with data that
+is non-compressible and so would take the entire 4K.  But the backend
+has no more space.  In this case, the put must be rejected.  Whenever
+frontswap rejects a put that would overwrite, it also must invalidate
+the old data and ensure that it is no longer accessible.  Since the
+swap subsystem then writes the new data to the real swap device,
+this is the correct course of action to ensure coherency.
+
+6) What is frontswap_shrink for?
+
+When the (non-frontswap) swap subsystem swaps out a page to a real
+swap device, that page is only taking up low-value pre-allocated disk
+space.  But if frontswap has placed a page in transcendent memory, that
+page may be taking up valuable real estate.  The frontswap_shrink
+routine allows code outside of the swap subsystem (such as Xen tmem
+or zcache or some future tmem backend) to force pages out of the memory
+managed by frontswap and back into kernel-addressable memory.
+
+7) Why does the frontswap patch create the new include file swapfile.h?
+
+The frontswap code depends on some swap-subsystem-internal data
+structures that have, over the years, moved back and forth between
+static and global.  This seemed a reasonable compromise:  Define
+them as global but declare them in a new include file that isn't
+included by the large number of source files that include swap.h.
+
+Dan Magenheimer, last updated September 12, 2011
diff --git a/Makefile b/Makefile

index a687963..9bd666f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -65,6 +65,20 @@ ifndef KBUILD_CHECKSRC
    KBUILD_CHECKSRC = 0
  endif
  
+# Call message checker as part of the C compilation
+#
+# Use 'make D=1' to enable checking
+# Use 'make D=2' to create the message catalog
+
+ifdef D
+  ifeq ("$(origin D)", "command line")
+    KBUILD_KMSG_CHECK = $(D)
+  endif
+endif
+ifndef KBUILD_KMSG_CHECK
+  KBUILD_KMSG_CHECK = 0
+endif
+
  # Use make M=dir to specify directory of external module to build
  # Old syntax make ... SUBDIRS=$PWD is still supported
  # Setting the environment variable KBUILD_EXTMOD take precedence
@@ -347,6 +361,7 @@ CHECK               = sparse
  
  CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
                   -Wbitwise -Wno-return-void $(CF)
+KMSG_CHECK     = $(srctree)/scripts/kmsg-doc
  CFLAGS_MODULE   =
  AFLAGS_MODULE   =
  LDFLAGS_MODULE  =
@@ -376,6 +391,11 @@ KBUILD_AFLAGS_MODULE  := -DMODULE
  KBUILD_CFLAGS_MODULE  := -DMODULE
  KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
  
+# Warn about unsupported modules in kernels built inside Autobuild
+ifneq ($(wildcard /.buildenv),)
+CFLAGS         += -DUNSUPPORTED_MODULES=2
+endif
+
  # Read KERNELRELEASE from include/config/kernel.release (if it exists)
  KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
  KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION)
@@ -392,6 +412,7 @@ export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
  export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
  export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
  export KBUILD_ARFLAGS
+export KBUILD_KMSG_CHECK KMSG_CHECK
  
  # When compiling out-of-tree modules, put MODVERDIR in the module
  # tree rather than in the kernel tree. The kernel tree might
@@ -592,6 +613,11 @@ KBUILD_CFLAGS      += -fomit-frame-pointer
  endif
  endif
  
+ifdef CONFIG_UNWIND_INFO
+KBUILD_CFLAGS  += -fasynchronous-unwind-tables
+LDFLAGS_vmlinux        += --eh-frame-hdr
+endif
+
  ifdef CONFIG_DEBUG_INFO
  KBUILD_CFLAGS  += -g
  KBUILD_AFLAGS  += -gdwarf-2
@@ -1027,7 +1053,7 @@ depend dep:
  
  # ---------------------------------------------------------------------------
  # Firmware install
-INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware
+INSTALL_FW_PATH=$(INSTALL_MOD_PATH)/lib/firmware/$(KERNELRELEASE)
  export INSTALL_FW_PATH
  
  PHONY += firmware_install
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h

index fcb5757..03cdb78 100644 (file)
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -141,6 +141,16 @@
  #define page_to_phys(page)     (__pfn_to_phys(page_to_pfn(page)))
  #define phys_to_page(phys)     (pfn_to_page(__phys_to_pfn(phys)))
  
+#ifndef CONFIG_ARM_PATCH_PHYS_VIRT
+#ifndef PHYS_OFFSET
+#ifdef PLAT_PHYS_OFFSET
+#define PHYS_OFFSET    PLAT_PHYS_OFFSET
+#else
+#define PHYS_OFFSET    UL(CONFIG_PHYS_OFFSET)
+#endif
+#endif
+#endif
+
  #ifndef __ASSEMBLY__
  
  /*
@@ -188,14 +198,6 @@ static inline unsigned long __phys_to_virt(unsigned long x)
  #endif
  #endif
  
-#ifndef PHYS_OFFSET
-#ifdef PLAT_PHYS_OFFSET
-#define PHYS_OFFSET    PLAT_PHYS_OFFSET
-#else
-#define PHYS_OFFSET    UL(CONFIG_PHYS_OFFSET)
-#endif
-#endif
-
  /*
   * PFNs are used to describe any physical page; this means
   * PFN 0 == physical address 0.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig

index bd72669..12a0e6f 100644 (file)
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -234,7 +234,7 @@ config IA64_HP_SIM
  config IA64_XEN_GUEST
         bool "Xen guest"
         select SWIOTLB
-       depends on XEN
+       depends on PARAVIRT_XEN
         help
           Build a kernel that runs on Xen guest domain. At this moment only
           16KB page size in supported.
@@ -543,6 +543,7 @@ config IA64_MC_ERR_INJECT
  
  config SGI_SN
         def_bool y if (IA64_SGI_SN2 || IA64_GENERIC)
+       select HAVE_UNSTABLE_SCHED_CLOCK
  
  config IA64_ESI
         bool "ESI (Extensible SAL Interface) support"
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile

index be7bfa1..342907d 100644 (file)
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -55,7 +55,7 @@ core-$(CONFIG_IA64_XEN_GUEST) += arch/ia64/dig/
  core-$(CONFIG_IA64_SGI_SN2)    += arch/ia64/sn/
  core-$(CONFIG_IA64_SGI_UV)     += arch/ia64/uv/
  core-$(CONFIG_KVM)             += arch/ia64/kvm/
-core-$(CONFIG_XEN)             += arch/ia64/xen/
+core-$(CONFIG_PARAVIRT_XEN)    += arch/ia64/xen/
  
  drivers-$(CONFIG_PCI)          += arch/ia64/pci/
  drivers-$(CONFIG_IA64_HP_SIM)  += arch/ia64/hp/sim/
diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h

index 67455c2..aacad12 100644 (file)
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -34,13 +34,13 @@
  #define _ASM_IA64_XEN_HYPERVISOR_H
  
  #include <linux/err.h>
+#include <xen/xen.h>
+#ifdef CONFIG_PARAVIRT_XEN
  #include <xen/interface/xen.h>
  #include <xen/interface/version.h>     /* to compile feature.c */
  #include <xen/features.h>              /* to comiple xen-netfront.c */
-#include <xen/xen.h>
  #include <asm/xen/hypercall.h>
  
-#ifdef CONFIG_XEN
  extern struct shared_info *HYPERVISOR_shared_info;
  extern struct start_info *xen_start_info;
  
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h

index 09d5f7f..d950667 100644 (file)
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -56,31 +56,19 @@
  #ifndef _ASM_IA64_XEN_INTERFACE_H
  #define _ASM_IA64_XEN_INTERFACE_H
  
-#define __DEFINE_GUEST_HANDLE(name, type)      \
+#define __DEFINE_XEN_GUEST_HANDLE(name, type)  \
         typedef struct { type *p; } __guest_handle_ ## name
  
  #define DEFINE_GUEST_HANDLE_STRUCT(name)       \
-       __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name)      __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name)             __guest_handle_ ## name
-#define GUEST_HANDLE_64(name)          GUEST_HANDLE(name)
+       __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name)  __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name)         __guest_handle_ ## name
+#define XEN_GUEST_HANDLE_64(name)      XEN_GUEST_HANDLE(name)
  #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
  
  #ifndef __ASSEMBLY__
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint, unsigned int);
-__DEFINE_GUEST_HANDLE(ulong, unsigned long);
-
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(long);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
-
+typedef unsigned long xen_ulong_t;
  typedef unsigned long xen_pfn_t;
-DEFINE_GUEST_HANDLE(xen_pfn_t);
  #define PRI_xen_pfn    "lx"
  #endif
  
@@ -92,7 +80,7 @@ DEFINE_GUEST_HANDLE(xen_pfn_t);
  /* Maximum number of virtual CPUs in multi-processor guests. */
  /* keep sizeof(struct shared_page) <= PAGE_SIZE.
   * this is checked in arch/ia64/xen/hypervisor.c. */
-#define MAX_VIRT_CPUS  64
+#define XEN_LEGACY_MAX_VCPUS 64
  
  #ifndef __ASSEMBLY__
  
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c

index a48bd9a..d6d14ad 100644 (file)
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -290,7 +290,7 @@ void foo(void)
         DEFINE(IA64_ITC_LASTCYCLE_OFFSET,
                 offsetof (struct itc_jitter_data_t, itc_lastcycle));
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
         BLANK();
  
         DEFINE(XEN_NATIVE_ASM, XEN_NATIVE);
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S

index 0ccb28f..3d3f305 100644 (file)
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -182,7 +182,7 @@ SECTIONS {
                 __start_gate_section = .;
                 *(.data..gate)
                 __stop_gate_section = .;
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
                 . = ALIGN(PAGE_SIZE);
                 __xen_start_gate_section = .;
                 *(.data..gate.xen)
diff --git a/arch/ia64/xen/Kconfig b/arch/ia64/xen/Kconfig

index 515e082..14d8ac6 100644 (file)
--- a/arch/ia64/xen/Kconfig
+++ b/arch/ia64/xen/Kconfig
@@ -2,7 +2,7 @@
  # This Kconfig describes xen/ia64 options
  #
  
-config XEN
+config PARAVIRT_XEN
         bool "Xen hypervisor support"
         default y
         depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL
@@ -16,10 +16,6 @@ config XEN
           Enable Xen hypervisor support.  Resulting kernel runs
           both as a guest OS on Xen and natively on hardware.
  
-config XEN_XENCOMM
-       depends on XEN
-       bool
-
  config NO_IDLE_HZ
-       depends on XEN
+       depends on PARAVIRT_XEN
         bool
diff --git a/arch/ia64/xen/xcom_hcall.c b/arch/ia64/xen/xcom_hcall.c

index ccaf743..7690fc3 100644 (file)
--- a/arch/ia64/xen/xcom_hcall.c
+++ b/arch/ia64/xen/xcom_hcall.c
@@ -343,7 +343,7 @@ xencommize_memory_reservation(struct xencomm_mini *xc_area,
  int
  xencomm_hypercall_memory_op(unsigned int cmd, void *arg)
  {
-       GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
+       XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} };
         struct xen_memory_reservation *xmr = NULL;
         int rc;
         struct xencomm_handle *desc;
diff --git a/arch/powerpc/include/asm/serial.h b/arch/powerpc/include/asm/serial.h

index 3e8589b..d71cbd6 100644 (file)
--- a/arch/powerpc/include/asm/serial.h
+++ b/arch/powerpc/include/asm/serial.h
@@ -15,6 +15,12 @@
  /* Default baud base if not found in device-tree */
  #define BASE_BAUD ( 1843200 / 16 )
  
+#if defined(SUPPORT_SYSRQ) && defined(CONFIG_PPC_PSERIES)
+#undef arch_8250_sysrq_via_ctrl_o
+extern int do_sysrq_via_ctrl_o;
+#define arch_8250_sysrq_via_ctrl_o(ch, port) ((ch) == '\x0f' && do_sysrq_via_ctrl_o && uart_handle_break((port)))
+#endif
+
  #ifdef CONFIG_PPC_UDBG_16550
  extern void find_legacy_serial_ports(void);
  #else
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c

index bedd12e..508c093 100644 (file)
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -527,6 +527,55 @@ device_initcall(serial_dev_init);
  
  
  #ifdef CONFIG_SERIAL_8250_CONSOLE
+#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SERIAL_8250_CONSOLE)
+/*
+ * Handle the SysRq ^O Hack also via ttyS0 on POWER4 systems
+ * but only on the system console, see asm/serial.h
+ * If they run in FullSystemPartition mode, the firmware console comes in via ttyS0
+ * But BREAK does not work via the HMC, to trigger sysrq.
+ * The same is required for Cell blades
+ */
+int do_sysrq_via_ctrl_o;
+static const char __initdata *need_ctrl_o[] = {
+       "IBM,079", /* QS2x */
+       "IBM,0792-32G", /* QS21 */
+       "IBM,0793-2RZ", /* QS22 */
+       "IBM,7040-681", /* p690 */
+       "IBM,7040-671", /* p670 */
+       "IBM,7039-651", /* p655 */
+       "IBM,7038-6M2", /* p650 */
+       "IBM,7028-6E4", /* p630 tower */
+       "IBM,7028-6C4", /* p630 rack */
+       "IBM,7029-6E3", /* p615 tower */
+       "IBM,7029-6C3", /* p615 rack */
+       NULL
+};
+static void __init detect_need_for_ctrl_o(void)
+{
+       struct device_node *root;
+       const char *model, *p;
+       int i;
+
+       root = of_find_node_by_path("/");
+       if (!root)
+               return;
+       model = of_get_property(root, "model", NULL);
+       if (model) {
+               i = 0;
+               while (need_ctrl_o[i]) {
+                       p = need_ctrl_o[i];
+                       if (strncmp(p, model, strlen(p)) == 0) {
+                               do_sysrq_via_ctrl_o = 1;
+                               DBG("Enable sysrq via CTRL o on model %s\n", model);
+                               break;
+                       }
+                       i++;
+               }
+       }
+       of_node_put(root);
+}
+#endif
+
  /*
   * This is called very early, as part of console_init() (typically just after
   * time_init()). This function is respondible for trying to find a good
@@ -595,6 +644,9 @@ static int __init check_legacy_serial_console(void)
         if (i >= legacy_serial_count)
                 goto not_found;
  
+#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_SERIAL_8250_CONSOLE)
+       detect_need_for_ctrl_o();
+#endif
         of_node_put(prom_stdout);
  
         DBG("Found serial console at ttyS%d\n", offset);
diff --git a/arch/powerpc/kernel/ppc32.h b/arch/powerpc/kernel/ppc32.h

index dc16aef..cc8940c 100644 (file)
--- a/arch/powerpc/kernel/ppc32.h
+++ b/arch/powerpc/kernel/ppc32.h
@@ -136,4 +136,6 @@ struct ucontext32 {
         struct mcontext32       uc_mcontext;
  };
  
+extern int copy_siginfo_to_user32(struct compat_siginfo __user *d, siginfo_t *s);
+
  #endif  /* _PPC64_PPC32_H */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c

index 9986027..f6b156e 100644 (file)
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -163,6 +163,7 @@ static unsigned long __initdata dt_string_start, dt_string_end;
  
  static unsigned long __initdata prom_initrd_start, prom_initrd_end;
  
+static int __initdata prom_no_display;
  #ifdef CONFIG_PPC64
  static int __initdata prom_iommu_force_on;
  static int __initdata prom_iommu_off;
@@ -615,6 +616,14 @@ static void __init early_cmdline_parse(void)
  #endif /* CONFIG_CMDLINE */
         prom_printf("command line: %s\n", RELOC(prom_cmd_line));
  
+       opt = strstr(RELOC(prom_cmd_line), RELOC("prom="));
+       if (opt) {
+               opt += 5;
+               while (*opt && *opt == ' ')
+                       opt++;
+               if (!strncmp(opt, RELOC("nodisplay"), 9))
+                       RELOC(prom_no_display) = 1;
+       }
  #ifdef CONFIG_PPC64
         opt = strstr(RELOC(prom_cmd_line), RELOC("iommu="));
         if (opt) {
@@ -2873,6 +2882,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
         /* 
          * Initialize display devices
          */
+       if (RELOC(prom_no_display) == 0)
         prom_check_displays();
  
  #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c

index 469349d..87fca3a 100644 (file)
--- a/arch/powerpc/kernel/ptrace32.c
+++ b/arch/powerpc/kernel/ptrace32.c
@@ -28,12 +28,15 @@
  #include <linux/security.h>
  #include <linux/signal.h>
  #include <linux/compat.h>
+#include <linux/elf.h>
  
  #include <asm/uaccess.h>
  #include <asm/page.h>
  #include <asm/pgtable.h>
  #include <asm/switch_to.h>
  
+#include "ppc32.h"
+
  /*
   * does not yet catch signals sent when the child dies.
   * in exit.c or in signal.c.
@@ -69,6 +72,27 @@ static long compat_ptrace_old(struct task_struct *child, long request,
  #define FPRINDEX(i) TS_FPRWIDTH * FPRNUMBER(i) * 2 + FPRHALF(i)
  #define FPRINDEX_3264(i) (TS_FPRWIDTH * ((i) - PT_FPR0))
  
+static int compat_ptrace_getsiginfo(struct task_struct *child, compat_siginfo_t __user *data)
+{
+       siginfo_t lastinfo;
+       int error = -ESRCH;
+
+       read_lock(&tasklist_lock);
+       if (likely(child->sighand != NULL)) {
+               error = -EINVAL;
+               spin_lock_irq(&child->sighand->siglock);
+               if (likely(child->last_siginfo != NULL)) {
+                       lastinfo = *child->last_siginfo;
+                       error = 0;
+               }
+               spin_unlock_irq(&child->sighand->siglock);
+       }
+       read_unlock(&tasklist_lock);
+       if (!error)
+               return copy_siginfo_to_user32(data, &lastinfo);
+       return error;
+}
+
  long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
                         compat_ulong_t caddr, compat_ulong_t cdata)
  {
@@ -300,6 +324,9 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
                         0, PT_REGS_COUNT * sizeof(compat_long_t),
                         compat_ptr(data));
  
+       case PTRACE_GETSIGINFO:
+               return compat_ptrace_getsiginfo(child, compat_ptr(data));
+
         case PTRACE_GETFPREGS:
         case PTRACE_SETFPREGS:
         case PTRACE_GETVRREGS:
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c

index c665d7d..ac451a9 100644 (file)
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -293,7 +293,7 @@ static void chrp_init_early(void)
         if (!property)
                 goto out_put;
         if (!strcmp(property, "failsafe") || !strcmp(property, "serial"))
-               add_preferred_console("ttyS", 0, NULL);
+               add_preferred_console("ttyS", 0, "115200");
  out_put:
         of_node_put(node);
  }
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c

index 51ecac9..daec547 100644 (file)
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -409,7 +409,7 @@ static void __init pSeries_setup_arch(void)
  static int __init pSeries_init_panel(void)
  {
         /* Manually leave the kernel version on the panel. */
-       ppc_md.progress("Linux ppc64\n", 0);
+       ppc_md.progress("SUSE Linux\n", 0);
         ppc_md.progress(init_utsname()->version, 0);
  
         return 0;
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c

index 0f3ab06..402425e 100644 (file)
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -138,6 +138,7 @@ static struct bpt *in_breakpoint_table(unsigned long pc, unsigned long *offp);
  static int  do_step(struct pt_regs *);
  static void bpt_cmds(void);
  static void cacheflush(void);
+static void xmon_show_dmesg(void);
  static int  cpu_cmd(void);
  static void csum(void);
  static void bootcmds(void);
@@ -197,6 +198,7 @@ Commands:\n\
  #endif
    "\
    C    checksum\n\
+  D    show dmesg (printk) buffer\n\
    d    dump bytes\n\
    di   dump instructions\n\
    df   dump float values\n\
@@ -829,6 +831,9 @@ cmds(struct pt_regs *excp)
                 case 'd':
                         dump();
                         break;
+               case 'D':
+                       xmon_show_dmesg();
+                       break;
                 case 'l':
                         symbol_lookup();
                         break;
@@ -2589,6 +2594,58 @@ static void xmon_print_symbol(unsigned long address, const char *mid,
         printf("%s", after);
  }
  
+extern void kdb_syslog_data(char *syslog_data[]);
+#define SYSLOG_WRAP(p) if (p < syslog_data[0]) p = syslog_data[1]-1; \
+       else if (p >= syslog_data[1]) p = syslog_data[0];
+
+static void xmon_show_dmesg(void)
+{
+       char *syslog_data[4], *start, *end, c;
+       int logsize;
+
+       /* syslog_data[0,1] physical start, end+1.
+        * syslog_data[2,3] logical start, end+1.
+        */
+       kdb_syslog_data(syslog_data);
+       if (syslog_data[2] == syslog_data[3])
+               return;
+       logsize = syslog_data[1] - syslog_data[0];
+       start = syslog_data[0] + (syslog_data[2] - syslog_data[0]) % logsize;
+       end = syslog_data[0] + (syslog_data[3] - syslog_data[0]) % logsize;
+
+       /* Do a line at a time (max 200 chars) to reduce overhead */
+       c = '\0';
+       while(1) {
+               char *p;
+               int chars = 0;
+               if (!*start) {
+                       while (!*start) {
+                               ++start;
+                               SYSLOG_WRAP(start);
+                               if (start == end)
+                                       break;
+                       }
+                       if (start == end)
+                               break;
+               }
+               p = start;
+               while (*start && chars < 200) {
+                       c = *start;
+                       ++chars;
+                       ++start;
+                       SYSLOG_WRAP(start);
+                       if (start == end || c == '\n')
+                               break;
+               }
+               if (chars)
+                       printf("%.*s", chars, p);
+               if (start == end)
+                       break;
+       }
+       if (c != '\n')
+               printf("\n");
+}
+
  #ifdef CONFIG_PPC_BOOK3S_64
  static void dump_slb(void)
  {
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index 9015060..02d71c0 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -600,6 +600,14 @@ config S390_GUEST
           virtio transport. If KVM is detected, the virtio console will be
           the default console.
  
+config KMSG_IDS
+       bool "Kernel message numbers"
+       default y
+       help
+         Select this option if you want to include a message number in the
+         prefix of kernel messages issued by the s390 architecture and
+         driver code. See "Documentation/s390/kmsg.txt" for more details.
+
  config SECCOMP
         def_bool y
         prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/s390/Makefile b/arch/s390/Makefile

index 0ad2f1e..1eea138 100644 (file)
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -104,12 +104,12 @@ drivers-$(CONFIG_OPROFILE)        += arch/s390/oprofile/
  
  boot           := arch/s390/boot
  
-all: image bzImage
+all: image bzImage kerntypes.o
  
  install: vmlinux
         $(Q)$(MAKE) $(build)=$(boot) $@
  
-image bzImage: vmlinux
+image bzImage kerntypes.o: vmlinux
         $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
  
  zfcpdump:
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile

index f2737a0..8d4bce4 100644 (file)
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -2,15 +2,26 @@
  # Makefile for the linux s390-specific parts of the memory manager.
  #
  
-COMPILE_VERSION := __linux_compile_version_id__`hostname |  \
-                       tr -c '[0-9A-Za-z]' '_'`__`date | \
-                       tr -c '[0-9A-Za-z]' '_'`_t
+COMPILE_VERSION := __linux_compile_version_id__$(shell hostname |  \
+                       tr -c '[0-9A-Za-z]' '_')__$(shell date | \
+                       tr -c '[0-9A-Za-z]' '_')_t
  
+
+chk-option = $(shell if $(CC) $(CFLAGS) $(1) -S -o /dev/null -xc /dev/null \
+            > /dev/null 2>&1; then echo "$(1)"; fi ;)
+
+# Remove possible '-g' from CFLAGS_KERNEL, since we want to use stabs
+# debug format.
+override CFLAGS_KERNEL := $(shell echo $(CFLAGS_KERNEL) | sed 's/-g//')
  ccflags-y  := -DCOMPILE_VERSION=$(COMPILE_VERSION) -gstabs -I.
+# Assume we don't need the flag if the compiler doesn't know about it
+ccflags-y  += $(call chk-option,-fno-eliminate-unused-debug-types)
+
  
  targets := image
  targets += bzImage
  subdir- := compressed
+targets += kerntypes.o
  
  $(obj)/image: vmlinux FORCE
         $(call if_changed,objcopy)
diff --git a/arch/s390/boot/kerntypes.c b/arch/s390/boot/kerntypes.c

new file mode 100644 (file)

index 0000000..8adaaac
--- /dev/null
+++ b/arch/s390/boot/kerntypes.c
@@ -0,0 +1,310 @@
+/*
+ * kerntypes.c
+ *
+ * Dummy module that includes headers for all kernel types of interest.
+ * The kernel type information is used by the lcrash utility when
+ * analyzing system crash dumps or the live system. Using the type
+ * information for the running system, rather than kernel header files,
+ * makes for a more flexible and robust analysis tool.
+ *
+ * This source code is released under the GNU GPL.
+ */
+
+/* generate version for this file */
+typedef char *COMPILE_VERSION;
+
+/* General linux types */
+
+#include <generated/compile.h>
+#include <linux/utsname.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#ifdef CONFIG_SLUB
+ #include <linux/slub_def.h>
+#endif
+#ifdef CONFIG_SLAB
+ #include <linux/slab_def.h>
+#endif
+#ifdef CONFIG_SLQB
+ #include <linux/slqb_def.h>
+#endif
+#include <linux/bio.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bitrev.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/bootmem.h>
+#include <linux/buffer_head.h>
+#include <linux/cache.h>
+#include <linux/cdev.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/dcache.h>
+#include <linux/debugfs.h>
+#include <linux/elevator.h>
+#include <linux/fd.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/if.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/if_bonding.h>
+#include <linux/if_ether.h>
+#include <linux/if_tr.h>
+#include <linux/if_tun.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/in_route.h>
+#include <linux/inet.h>
+#include <linux/inet_diag.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/inotify.h>
+#include <linux/interrupt.h>
+#include <linux/ioctl.h>
+#include <linux/ip.h>
+#include <linux/ipsec.h>
+#include <linux/ipv6.h>
+#include <linux/ipv6_route.h>
+#include <linux/interrupt.h>
+#include <linux/irqflags.h>
+#include <linux/irqreturn.h>
+#include <linux/jbd2.h>
+#include <linux/jffs2.h>
+#include <linux/jhash.h>
+#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/kexec.h>
+#include <linux/kobject.h>
+#include <linux/kthread.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/memory.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/mm_types.h>
+#include <linux/mman.h>
+#include <linux/mmtimer.h>
+#include <linux/mmzone.h>
+#include <linux/mnt_namespace.h>
+#include <linux/module.h>
+#include <linux/moduleloader.h>
+#include <linux/moduleparam.h>
+#include <linux/mount.h>
+#include <linux/mpage.h>
+#include <linux/mqueue.h>
+#include <linux/mtio.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/neighbour.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_arp.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_decnet.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netlink.h>
+#include <linux/netpoll.h>
+#include <linux/pagemap.h>
+#include <linux/param.h>
+#include <linux/percpu.h>
+#include <linux/percpu_counter.h>
+#include <linux/pfn.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/posix-timers.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_types.h>
+#include <linux/preempt.h>
+#include <linux/prio_tree.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/ptrace.h>
+#include <linux/radix-tree.h>
+#include <linux/ramfs.h>
+#include <linux/raw.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/reboot.h>
+#include <linux/relay.h>
+#include <linux/resource.h>
+#include <linux/romfs_fs.h>
+#include <linux/root_dev.h>
+#include <linux/route.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/sem.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/shm.h>
+#include <linux/shmem_fs.h>
+#include <linux/signal.h>
+#include <linux/signalfd.h>
+#include <linux/skbuff.h>
+#include <linux/smp.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include <linux/statfs.h>
+#include <linux/stddef.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/sys.h>
+#include <linux/syscalls.h>
+#include <linux/sysctl.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <linux/sysrq.h>
+#include <linux/tc.h>
+#include <linux/tcp.h>
+#include <linux/thread_info.h>
+#include <linux/threads.h>
+#include <linux/tick.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+#include <linux/timerfd.h>
+#include <linux/times.h>
+#include <linux/timex.h>
+#include <linux/topology.h>
+#include <linux/transport_class.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+#include <linux/tty_ldisc.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/unistd.h>
+#include <linux/utime.h>
+#include <linux/uts.h>
+#include <linux/utsname.h>
+#include <generated/utsrelease.h>
+#include <linux/version.h>
+#include <linux/vfs.h>
+#include <linux/vmalloc.h>
+#include <linux/vmstat.h>
+#include <linux/wait.h>
+#include <linux/watchdog.h>
+#include <linux/workqueue.h>
+#include <linux/zconf.h>
+#include <linux/zlib.h>
+
+/*
+ * s390 specific includes
+ */
+
+#include <asm/lowcore.h>
+#include <asm/debug.h>
+#include <asm/ccwdev.h>
+#include <asm/ccwgroup.h>
+#include <asm/qdio.h>
+#include <asm/zcrypt.h>
+#include <asm/etr.h>
+#include <asm/ipl.h>
+#include <asm/setup.h>
+#include <asm/schid.h>
+#include <asm/chsc.h>
+
+/* channel subsystem driver */
+#include "drivers/s390/cio/cio.h"
+#include "drivers/s390/cio/chsc.h"
+#include "drivers/s390/cio/css.h"
+#include "drivers/s390/cio/device.h"
+#include "drivers/s390/cio/chsc_sch.h"
+
+/* dasd device driver */
+#include "drivers/s390/block/dasd_int.h"
+#include "drivers/s390/block/dasd_diag.h"
+#include "drivers/s390/block/dasd_eckd.h"
+#include "drivers/s390/block/dasd_fba.h"
+
+/* networking drivers */
+#include "include/net/iucv/iucv.h"
+#include "drivers/s390/net/fsm.h"
+#include "drivers/s390/net/ctcm_main.h"
+#include "drivers/s390/net/ctcm_fsms.h"
+#include "drivers/s390/net/lcs.h"
+#include "drivers/s390/net/qeth_core.h"
+#include "drivers/s390/net/qeth_core_mpc.h"
+#include "drivers/s390/net/qeth_l3.h"
+
+/* zfcp device driver */
+#include "drivers/s390/scsi/zfcp_def.h"
+#include "drivers/s390/scsi/zfcp_fsf.h"
+
+/* crypto device driver */
+#include "drivers/s390/crypto/ap_bus.h"
+#include "drivers/s390/crypto/zcrypt_api.h"
+#include "drivers/s390/crypto/zcrypt_cca_key.h"
+#include "drivers/s390/crypto/zcrypt_pcica.h"
+#include "drivers/s390/crypto/zcrypt_pcicc.h"
+#include "drivers/s390/crypto/zcrypt_pcixcc.h"
+#include "drivers/s390/crypto/zcrypt_cex2a.h"
+
+/* sclp device driver */
+#include "drivers/s390/char/sclp.h"
+#include "drivers/s390/char/sclp_rw.h"
+#include "drivers/s390/char/sclp_tty.h"
+
+/* vmur device driver */
+#include "drivers/s390/char/vmur.h"
+
+/* qdio device driver */
+#include "drivers/s390/cio/qdio.h"
+#include "drivers/s390/cio/qdio_thinint.c"
+
+
+/* KVM */
+#include "include/linux/kvm.h"
+#include "include/linux/kvm_host.h"
+#include "include/linux/kvm_para.h"
+
+/* Virtio */
+#include "include/linux/virtio.h"
+#include "include/linux/virtio_config.h"
+#include "include/linux/virtio_ring.h"
+#include "include/linux/virtio_9p.h"
+#include "include/linux/virtio_console.h"
+#include "include/linux/virtio_rng.h"
+#include "include/linux/virtio_balloon.h"
+#include "include/linux/virtio_net.h"
+#include "include/linux/virtio_blk.h"
+
+/*
+ * include sched.c for types:
+ *    - struct prio_array
+ *    - struct runqueue
+ */
+#include "kernel/sched.c"
+/*
+ * include slab.c for struct kmem_cache
+ */
+#ifdef CONFIG_SLUB
+ #include "mm/slub.c"
+#endif
+#ifdef CONFIG_SLAB
+ #include "mm/slab.c"
+#endif
+#ifdef CONFIG_SLQB
+ #include "mm/slqb.c"
+#endif
+
+/* include driver core private structures */
+#include "drivers/base/base.h"
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild

index 0e9dec6..006ed9e 100644 (file)
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -2,7 +2,7 @@
  obj-$(CONFIG_KVM) += kvm/
  
  # Xen paravirtualization support
-obj-$(CONFIG_XEN) += xen/
+obj-$(CONFIG_PARAVIRT_XEN) += xen/
  
  # lguest paravirtualization support
  obj-$(CONFIG_LGUEST_GUEST) += lguest/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index c9866b0..e2732a5 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,7 +8,7 @@ config 64BIT
  
  config X86_32
         def_bool !64BIT
-       select CLKSRC_I8253
+       select CLKSRC_I8253 if !XEN
  
  config X86_64
         def_bool 64BIT
@@ -20,7 +20,7 @@ config X86
         select HAVE_UNSTABLE_SCHED_CLOCK
         select HAVE_IDE
         select HAVE_OPROFILE
-       select HAVE_PCSPKR_PLATFORM
+       select HAVE_PCSPKR_PLATFORM if !XEN_UNPRIVILEGED_GUEST
         select HAVE_PERF_EVENTS
         select HAVE_IRQ_WORK
         select HAVE_IOREMAP_PROT
@@ -42,8 +42,8 @@ config X86
         select HAVE_FUNCTION_TRACE_MCOUNT_TEST
         select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
         select HAVE_SYSCALL_TRACEPOINTS
-       select HAVE_KVM
-       select HAVE_ARCH_KGDB
+       select HAVE_KVM if !XEN
+       select HAVE_ARCH_KGDB if !XEN
         select HAVE_ARCH_TRACEHOOK
         select HAVE_GENERIC_DMA_COHERENT if X86_32
         select HAVE_EFFICIENT_UNALIGNED_ACCESS
@@ -51,14 +51,14 @@ config X86
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_DMA_API_DEBUG
         select HAVE_KERNEL_GZIP
-       select HAVE_KERNEL_BZIP2
-       select HAVE_KERNEL_LZMA
-       select HAVE_KERNEL_XZ
-       select HAVE_KERNEL_LZO
+       select HAVE_KERNEL_BZIP2 if !XEN
+       select HAVE_KERNEL_LZMA if !XEN
+       select HAVE_KERNEL_XZ if !XEN
+       select HAVE_KERNEL_LZO if !XEN
         select HAVE_HW_BREAKPOINT
         select HAVE_MIXED_BREAKPOINTS_REGS
         select PERF_EVENTS
-       select HAVE_PERF_EVENTS_NMI
+       select HAVE_PERF_EVENTS_NMI if !XEN
         select ANON_INODES
         select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
         select HAVE_CMPXCHG_LOCAL if !M386
@@ -78,7 +78,7 @@ config X86
         select IRQ_FORCED_THREADING
         select USE_GENERIC_SMP_HELPERS if SMP
         select HAVE_BPF_JIT if (X86_64 && NET)
-       select CLKEVT_I8253
+       select CLKEVT_I8253 if !XEN
         select ARCH_HAVE_NMI_SAFE_CMPXCHG
         select GENERIC_IOMAP
         select DCACHE_WORD_ACCESS
@@ -101,17 +101,19 @@ config GENERIC_CMOS_UPDATE
  
  config CLOCKSOURCE_WATCHDOG
         def_bool y
+       depends on !XEN
  
  config GENERIC_CLOCKEVENTS
         def_bool y
  
  config ARCH_CLOCKSOURCE_DATA
         def_bool y
-       depends on X86_64
+       depends on X86_64 && !XEN
  
  config GENERIC_CLOCKEVENTS_BROADCAST
         def_bool y
         depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
+       depends on !XEN
  
  config LOCKDEP_SUPPORT
         def_bool y
@@ -129,7 +131,7 @@ config SBUS
         bool
  
  config NEED_DMA_MAP_STATE
-       def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG)
+       def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB)
  
  config NEED_SG_DMA_LENGTH
         def_bool y
@@ -193,6 +195,7 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK
  
  config ARCH_HIBERNATION_POSSIBLE
         def_bool y
+       depends on !XEN
  
  config ARCH_SUSPEND_POSSIBLE
         def_bool y
@@ -225,7 +228,15 @@ config X86_64_SMP
  
  config X86_HT
         def_bool y
-       depends on SMP
+       depends on SMP && !XEN
+
+config X86_NO_TSS
+       def_bool y
+       depends on XEN
+
+config X86_NO_IDT
+       def_bool y
+       depends on XEN
  
  config X86_32_LAZY_GS
         def_bool y
@@ -241,7 +252,7 @@ config KTIME_SCALAR
  
  config ARCH_CPU_PROBE_RELEASE
         def_bool y
-       depends on HOTPLUG_CPU
+       depends on HOTPLUG_CPU && !XEN
  
  source "init/Kconfig"
  source "kernel/Kconfig.freezer"
@@ -307,13 +318,22 @@ config X86_MPPARSE
           For old smp systems that do not have proper acpi support. Newer systems
           (esp with 64bit cpus) with acpi support, MADT and DSDT will override it
  
+config X86_XEN
+       bool "Xen-compatible"
+       depends on X86_32
+       select XEN
+       select X86_PAE
+       help
+         Choose this option if you plan to run this kernel on top of the
+         Xen Hypervisor.
+
  config X86_BIGSMP
         bool "Support for big SMP systems with more than 8 CPUs"
-       depends on X86_32 && SMP
+       depends on X86_32 && SMP && !XEN
         ---help---
           This option is needed for the systems that have more than 8 CPUs
  
-if X86_32
+if X86_32 && !XEN
  config X86_EXTENDED_PLATFORM
         bool "Support for extended (non-PC) x86 platforms"
         default y
@@ -336,7 +356,14 @@ config X86_EXTENDED_PLATFORM
           generic distribution kernel, say Y here - otherwise say N.
  endif
  
-if X86_64
+config X86_64_XEN
+       bool "Enable Xen compatible kernel"
+       depends on X86_64
+       select XEN
+       help
+         This option compiles a kernel that is compatible with the Xen hypervisor.
+
+if X86_64 && !XEN
  config X86_EXTENDED_PLATFORM
         bool "Support for extended (non-PC) x86 platforms"
         default y
@@ -519,7 +546,7 @@ config X86_ES7000
  
  config X86_32_IRIS
         tristate "Eurobraille/Iris poweroff module"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           The Iris machines from EuroBraille do not have APM or ACPI support
           to shut themselves down properly.  A special I/O sequence is
@@ -533,7 +560,7 @@ config X86_32_IRIS
  config SCHED_OMIT_FRAME_POINTER
         def_bool y
         prompt "Single-depth WCHAN output"
-       depends on X86
+       depends on X86 && !STACK_UNWIND
         ---help---
           Calculate simpler /proc/<PID>/wchan values. If this option
           is disabled then wchan values will recurse back to the
@@ -544,6 +571,7 @@ config SCHED_OMIT_FRAME_POINTER
  
  menuconfig PARAVIRT_GUEST
         bool "Paravirtualized guest support"
+       depends on !XEN
         ---help---
           Say Y here to get to see options related to running Linux under
           various hypervisors.  This option alone does not add any kernel code.
@@ -624,6 +652,7 @@ config NO_BOOTMEM
  
  config MEMTEST
         bool "Memtest"
+       depends on !XEN
         ---help---
           This option adds a kernel parameter 'memtest', which allows memtest
           to be set.
@@ -646,6 +675,7 @@ source "arch/x86/Kconfig.cpu"
  config HPET_TIMER
         def_bool X86_64
         prompt "HPET Timer Support" if X86_32
+       depends on !XEN
         ---help---
           Use the IA-PC HPET (High Precision Event Timer) to manage
           time in preference to the PIT and RTC, if a HPET is
@@ -683,6 +713,7 @@ config APB_TIMER
  config DMI
         default y
         bool "Enable DMI scanning" if EXPERT
+       depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           Enabled scanning of DMI to identify machine quirks. Say Y
           here unless you have verified that your setup is not
@@ -693,7 +724,7 @@ config GART_IOMMU
         bool "GART IOMMU support" if EXPERT
         default y
         select SWIOTLB
-       depends on X86_64 && PCI && AMD_NB
+       depends on X86_64 && PCI && AMD_NB && !X86_64_XEN
         ---help---
           Support for full DMA access of devices with 32bit memory access only
           on systems with more than 3GB. This is usually needed for USB,
@@ -708,7 +739,7 @@ config GART_IOMMU
  config CALGARY_IOMMU
         bool "IBM Calgary IOMMU support"
         select SWIOTLB
-       depends on X86_64 && PCI && EXPERIMENTAL
+       depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL
         ---help---
           Support for hardware IOMMUs in IBM's xSeries x366 and x460
           systems. Needed to run systems with more than 3GB of memory
@@ -736,7 +767,8 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
  
  # need this always selected by IOMMU for the VIA workaround
  config SWIOTLB
-       def_bool y if X86_64
+       def_bool y if X86_64 || XEN
+       prompt "Software I/O TLB" if XEN_UNPRIVILEGED_GUEST && !XEN_PCIDEV_FRONTEND
         ---help---
           Support for software bounce buffers used on x86-64 systems
           which don't have a hardware IOMMU (e.g. the current generation
@@ -757,11 +789,12 @@ config MAXSMP
  
  config NR_CPUS
         int "Maximum number of CPUs" if SMP && !MAXSMP
-       range 2 8 if SMP && X86_32 && !X86_BIGSMP
+       range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN
         range 2 512 if SMP && !MAXSMP
         default "1" if !SMP
         default "4096" if MAXSMP
         default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "16" if X86_64_XEN
         default "8" if SMP
         ---help---
           This allows you to specify the maximum number of CPUs which this
@@ -804,7 +837,7 @@ source "kernel/Kconfig.preempt"
  
  config X86_UP_APIC
         bool "Local APIC support on uniprocessors"
-       depends on X86_32 && !SMP && !X86_32_NON_STANDARD
+       depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST
         ---help---
           A local APIC (Advanced Programmable Interrupt Controller) is an
           integrated interrupt controller in the CPU. If you have a single-CPU
@@ -830,10 +863,12 @@ config X86_UP_IOAPIC
  config X86_LOCAL_APIC
         def_bool y
         depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+       depends on !XEN_UNPRIVILEGED_GUEST
  
  config X86_IO_APIC
         def_bool y
         depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
+       depends on !XEN_UNPRIVILEGED_GUEST
  
  config X86_VISWS_APIC
         def_bool y
@@ -841,7 +876,7 @@ config X86_VISWS_APIC
  
  config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
         bool "Reroute for broken boot IRQs"
-       depends on X86_IO_APIC
+       depends on X86_IO_APIC && !XEN
         ---help---
           This option enables a workaround that fixes a source of
           spurious interrupts. This is recommended when threaded
@@ -864,6 +899,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
  
  config X86_MCE
         bool "Machine Check / overheating reporting"
+       depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           Machine Check support allows the processor to notify the
           kernel if it detects a problem (e.g. overheating, data corruption).
@@ -873,7 +909,7 @@ config X86_MCE
  config X86_MCE_INTEL
         def_bool y
         prompt "Intel MCE features"
-       depends on X86_MCE && X86_LOCAL_APIC
+       depends on X86_MCE && X86_LOCAL_APIC && !XEN
         ---help---
            Additional support for intel specific MCE features such as
            the thermal monitor.
@@ -881,14 +917,14 @@ config X86_MCE_INTEL
  config X86_MCE_AMD
         def_bool y
         prompt "AMD MCE features"
-       depends on X86_MCE && X86_LOCAL_APIC
+       depends on X86_MCE && X86_LOCAL_APIC && !XEN
         ---help---
            Additional support for AMD specific MCE features such as
            the DRAM Error Threshold.
  
  config X86_ANCIENT_MCE
         bool "Support for old Pentium 5 / WinChip machine checks"
-       depends on X86_32 && X86_MCE
+       depends on X86_32 && X86_MCE && !XEN
         ---help---
           Include support for machine check handling on old Pentium 5 or WinChip
           systems. These typically need to be enabled explicitely on the command
@@ -906,6 +942,10 @@ config X86_MCE_INJECT
           If you don't know what a machine check is and you don't do kernel
           QA it is safe to say n.
  
+config X86_XEN_MCE
+       def_bool y
+       depends on XEN && X86_MCE
+
  config X86_THERMAL_VECTOR
         def_bool y
         depends on X86_MCE_INTEL
@@ -959,7 +999,7 @@ config I8K
  
  config X86_REBOOTFIXUPS
         bool "Enable X86 board specific fixups for reboot"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           This enables chipset and/or board specific fixups to be done
           in order to get reboot to work correctly. This is only needed on
@@ -976,6 +1016,7 @@ config X86_REBOOTFIXUPS
  
  config MICROCODE
         tristate "/dev/cpu/microcode - microcode support"
+       depends on !XEN_UNPRIVILEGED_GUEST
         select FW_LOADER
         ---help---
           If you say Y here, you will be able to update the microcode on
@@ -994,7 +1035,7 @@ config MICROCODE
  
  config MICROCODE_INTEL
         bool "Intel microcode patch loading support"
-       depends on MICROCODE
+       depends on MICROCODE && !XEN
         default MICROCODE
         select FW_LOADER
         ---help---
@@ -1007,7 +1048,7 @@ config MICROCODE_INTEL
  
  config MICROCODE_AMD
         bool "AMD microcode patch loading support"
-       depends on MICROCODE
+       depends on MICROCODE && !XEN
         select FW_LOADER
         ---help---
           If you select this option, microcode patch loading support for AMD
@@ -1019,6 +1060,7 @@ config MICROCODE_OLD_INTERFACE
  
  config X86_MSR
         tristate "/dev/cpu/*/msr - Model-specific register support"
+       select XEN_DOMCTL if XEN_PRIVILEGED_GUEST
         ---help---
           This device gives privileged processes access to the x86
           Model-Specific Registers (MSRs).  It is a character device with
@@ -1036,7 +1078,7 @@ config X86_CPUID
  
  choice
         prompt "High Memory Support"
-       default HIGHMEM64G if X86_NUMAQ
+       default HIGHMEM64G if X86_NUMAQ || XEN
         default HIGHMEM4G
         depends on X86_32
  
@@ -1079,7 +1121,7 @@ config NOHIGHMEM
  
  config HIGHMEM4G
         bool "4GB"
-       depends on !X86_NUMAQ
+       depends on !X86_NUMAQ && !XEN
         ---help---
           Select this if you have a 32-bit processor and between 1 and 4
           gigabytes of physical RAM.
@@ -1155,12 +1197,12 @@ config ARCH_PHYS_ADDR_T_64BIT
         def_bool X86_64 || X86_PAE
  
  config ARCH_DMA_ADDR_T_64BIT
-       def_bool X86_64 || HIGHMEM64G
+       def_bool X86_64 || XEN || HIGHMEM64G
  
  config DIRECT_GBPAGES
         bool "Enable 1GB pages for kernel pagetables" if EXPERT
         default y
-       depends on X86_64
+       depends on X86_64 && !XEN
         ---help---
           Allow the kernel linear mapping to use 1GB pages on CPUs that
           support it. This can improve the kernel's performance a tiny bit by
@@ -1169,7 +1211,7 @@ config DIRECT_GBPAGES
  # Common NUMA Features
  config NUMA
         bool "Numa Memory Allocation and Scheduler Support"
-       depends on SMP
+       depends on SMP && !XEN
         depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
         default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
         ---help---
@@ -1270,12 +1312,13 @@ config ARCH_DISCONTIGMEM_DEFAULT
  config ARCH_SPARSEMEM_ENABLE
         def_bool y
         depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
+       depends on !XEN
         select SPARSEMEM_STATIC if X86_32
         select SPARSEMEM_VMEMMAP_ENABLE if X86_64
  
  config ARCH_SPARSEMEM_DEFAULT
         def_bool y
-       depends on X86_64
+       depends on X86_64 && !X86_64_XEN
  
  config ARCH_SELECT_MEMORY_MODEL
         def_bool y
@@ -1307,6 +1350,7 @@ config HIGHPTE
  
  config X86_CHECK_BIOS_CORRUPTION
         bool "Check for low memory corruption"
+       depends on !XEN
         ---help---
           Periodically check for memory corruption in low memory, which
           is suspected to be caused by BIOS.  Even when enabled in the
@@ -1337,6 +1381,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
  
  config X86_RESERVE_LOW
         int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+       depends on !XEN
         default 64
         range 4 640
         ---help---
@@ -1367,6 +1412,7 @@ config X86_RESERVE_LOW
  config MATH_EMULATION
         bool
         prompt "Math emulation" if X86_32
+       depends on !XEN
         ---help---
           Linux can emulate a math coprocessor (used for floating point
           operations) if you don't have one. 486DX and Pentium processors have
@@ -1393,6 +1439,7 @@ config MATH_EMULATION
  config MTRR
         def_bool y
         prompt "MTRR (Memory Type Range Register) support" if EXPERT
+       depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           On Intel P6 family processors (Pentium Pro, Pentium II and later)
           the Memory Type Range Registers (MTRRs) may be used to control
@@ -1428,7 +1475,7 @@ config MTRR
  config MTRR_SANITIZER
         def_bool y
         prompt "MTRR cleanup support"
-       depends on MTRR
+       depends on MTRR && !XEN
         ---help---
           Convert MTRR layout from continuous to discrete, so X drivers can
           add writeback entries.
@@ -1458,8 +1505,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
  
  config X86_PAT
         def_bool y
-       prompt "x86 PAT support" if EXPERT
-       depends on MTRR
+       prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST
+       depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
         ---help---
           Use PAT attributes to setup page level cache control.
  
@@ -1486,7 +1533,7 @@ config ARCH_RANDOM
  
  config EFI
         bool "EFI runtime service support"
-       depends on ACPI
+       depends on ACPI && !XEN_UNPRIVILEGED_GUEST
         ---help---
           This enables the kernel to use EFI runtime services that are
           available (such as the EFI variable services).
@@ -1500,7 +1547,7 @@ config EFI
  
  config EFI_STUB
         bool "EFI stub support"
-       depends on EFI
+       depends on EFI && !XEN
         ---help---
            This kernel feature allows a bzImage to be loaded directly
           by EFI firmware without the use of a bootloader.
@@ -1541,6 +1588,7 @@ source kernel/Kconfig.hz
  
  config KEXEC
         bool "kexec system call"
+       depends on !XEN_UNPRIVILEGED_GUEST
         ---help---
           kexec is a system call that implements the ability to shutdown your
           current kernel, and to start another kernel.  It is like a reboot
@@ -1558,6 +1606,7 @@ config KEXEC
  config CRASH_DUMP
         bool "kernel crash dumps"
         depends on X86_64 || (X86_32 && HIGHMEM)
+       depends on !XEN
         ---help---
           Generate crash dump after being started by kexec.
           This should be normally only set in special crash dump kernels
@@ -1578,7 +1627,8 @@ config KEXEC_JUMP
           code in physical address mode via KEXEC
  
  config PHYSICAL_START
-       hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
+       hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN)
+       default 0x100000 if XEN
         default "0x1000000"
         ---help---
           This gives the physical address where the kernel is loaded.
@@ -1620,6 +1670,7 @@ config PHYSICAL_START
  
  config RELOCATABLE
         bool "Build a relocatable kernel"
+       depends on !XEN
         default y
         ---help---
           This builds a kernel image that retains relocation information
@@ -1641,7 +1692,8 @@ config X86_NEED_RELOCS
         depends on X86_32 && RELOCATABLE
  
  config PHYSICAL_ALIGN
-       hex "Alignment value to which kernel should be aligned" if X86_32
+       hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN
+       default 0x2000 if XEN
         default "0x1000000"
         range 0x2000 0x1000000
         ---help---
@@ -1734,6 +1786,7 @@ endmenu
  config ARCH_ENABLE_MEMORY_HOTPLUG
         def_bool y
         depends on X86_64 || (X86_32 && HIGHMEM)
+       depends on !XEN
  
  config ARCH_ENABLE_MEMORY_HOTREMOVE
         def_bool y
@@ -1751,6 +1804,8 @@ config ARCH_HIBERNATION_HEADER
  
  source "kernel/power/Kconfig"
  
+if !XEN_UNPRIVILEGED_GUEST
+
  source "drivers/acpi/Kconfig"
  
  source "drivers/sfi/Kconfig"
@@ -1761,7 +1816,7 @@ config X86_APM_BOOT
  
  menuconfig APM
         tristate "APM (Advanced Power Management) BIOS support"
-       depends on X86_32 && PM_SLEEP
+       depends on X86_32 && PM_SLEEP && !XEN
         ---help---
           APM is a BIOS specification for saving power using several different
           techniques. This is mostly useful for battery powered laptops with
@@ -1886,6 +1941,8 @@ source "drivers/cpuidle/Kconfig"
  
  source "drivers/idle/Kconfig"
  
+endif # !XEN_UNPRIVILEGED_GUEST
+
  endmenu
  
  
@@ -1895,6 +1952,7 @@ config PCI
         bool "PCI support"
         default y
         select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
+       select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND)
         ---help---
           Find out whether you have a PCI motherboard. PCI is the name of a
           bus system, i.e. the way the CPU talks to the other stuff inside
@@ -1922,25 +1980,36 @@ choice
  
  config PCI_GOBIOS
         bool "BIOS"
+       depends on !XEN
  
  config PCI_GOMMCONFIG
         bool "MMConfig"
+       depends on !XEN_UNPRIVILEGED_GUEST
  
  config PCI_GODIRECT
         bool "Direct"
+       depends on !XEN_UNPRIVILEGED_GUEST
  
  config PCI_GOOLPC
         bool "OLPC XO-1"
-       depends on OLPC
+       depends on OLPC && !XEN_UNPRIVILEGED_GUEST
+
+config PCI_GOXEN_FE
+       bool "Xen PCI Frontend"
+       depends on X86_XEN
+       help
+         The PCI device frontend driver allows the kernel to import arbitrary
+         PCI devices from a PCI backend to support PCI driver domains.
  
  config PCI_GOANY
         bool "Any"
+       depends on !XEN_UNPRIVILEGED_GUEST
  
  endchoice
  
  config PCI_BIOS
         def_bool y
-       depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY)
+       depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY)
  
  # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
  config PCI_DIRECT
@@ -1957,7 +2026,7 @@ config PCI_OLPC
  
  config PCI_XEN
         def_bool y
-       depends on PCI && XEN
+       depends on PCI && PARAVIRT_XEN
         select SWIOTLB_XEN
  
  config PCI_DOMAINS
@@ -1988,7 +2057,7 @@ source "drivers/pci/Kconfig"
  
  # x86_64 have no ISA slots, but can have ISA-style DMA.
  config ISA_DMA_API
-       bool "ISA-style DMA support" if (X86_64 && EXPERT)
+       bool "ISA-style DMA support" if ((X86_64 || XEN) && EXPERT) || XEN_UNPRIVILEGED_GUEST
         default y
         help
           Enables ISA-style DMA support for devices requiring such controllers.
@@ -1998,6 +2067,7 @@ if X86_32
  
  config ISA
         bool "ISA support"
+       depends on !XEN
         ---help---
           Find out whether you have ISA slots on your motherboard.  ISA is the
           name of a bus system, i.e. the way the CPU talks to the other stuff
@@ -2025,6 +2095,7 @@ source "drivers/eisa/Kconfig"
  
  config MCA
         bool "MCA support"
+       depends on !XEN
         ---help---
           MicroChannel Architecture is found in some IBM PS/2 machines and
           laptops.  It is a bus system similar to PCI or ISA. See
@@ -2056,7 +2127,7 @@ config SCx200HR_TIMER
  
  config OLPC
         bool "One Laptop Per Child support"
-       depends on !X86_PAE
+       depends on !X86_PAE && !XEN
         select GPIOLIB
         select OF
         select OF_PROMTREE
@@ -2135,7 +2206,7 @@ endif # X86_32
  
  config AMD_NB
         def_bool y
-       depends on CPU_SUP_AMD && PCI
+       depends on CPU_SUP_AMD && PCI && !XEN_UNPRIVILEGED_GUEST
  
  source "drivers/pcmcia/Kconfig"
  
@@ -2219,7 +2290,9 @@ source "net/Kconfig"
  
  source "drivers/Kconfig"
  
+if !XEN_UNPRIVILEGED_GUEST
  source "drivers/firmware/Kconfig"
+endif
  
  source "fs/Kconfig"
  
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu

index 706e12e..d731b0d 100644 (file)
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -6,7 +6,7 @@ choice
  
  config M386
         bool "386"
-       depends on X86_32 && !UML
+       depends on X86_32 && !UML && !XEN
         ---help---
           This is the processor type of your CPU. This information is used for
           optimizing purposes. In order to compile a kernel that can run on
@@ -47,7 +47,7 @@ config M386
  
  config M486
         bool "486"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           Select this for a 486 series processor, either Intel or one of the
           compatible processors from AMD, Cyrix, IBM, or Intel.  Includes DX,
@@ -56,7 +56,7 @@ config M486
  
  config M586
         bool "586/K5/5x86/6x86/6x86MX"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           Select this for an 586 or 686 series processor such as the AMD K5,
           the Cyrix 5x86, 6x86 and 6x86MX.  This choice does not
@@ -64,14 +64,14 @@ config M586
  
  config M586TSC
         bool "Pentium-Classic"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           Select this for a Pentium Classic processor with the RDTSC (Read
           Time Stamp Counter) instruction for benchmarking.
  
  config M586MMX
         bool "Pentium-MMX"
-       depends on X86_32
+       depends on X86_32 && !XEN
         ---help---
           Select this for a Pentium with the MMX graphics/multimedia
           extended instructions.
@@ -395,6 +395,7 @@ config X86_P6_NOP
  config X86_TSC
         def_bool y
         depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
+       depends on !XEN
  
  config X86_CMPXCHG64
         def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index e46c214..2b27700 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -25,6 +25,7 @@ config STRICT_DEVMEM
  config X86_VERBOSE_BOOTUP
         bool "Enable verbose x86 bootup info messages"
         default y
+       depends on !XEN
         ---help---
           Enables the informational output from the decompression stage
           (e.g. bzImage) of the boot. If you disable this you will still
@@ -32,6 +33,7 @@ config X86_VERBOSE_BOOTUP
  
  config EARLY_PRINTK
         bool "Early printk" if EXPERT
+       depends on !XEN_UNPRIVILEGED_GUEST
         default y
         ---help---
           Write kernel log output directly into the VGA buffer or to a serial
@@ -122,7 +124,7 @@ config DEBUG_NX_TEST
  config DOUBLEFAULT
         default y
         bool "Enable doublefault exception handler" if EXPERT
-       depends on X86_32
+       depends on X86_32 && !X86_NO_TSS
         ---help---
           This option allows trapping of rare doublefault exceptions that
           would otherwise cause a system to silently reboot. Disabling this
@@ -162,6 +164,7 @@ config IOMMU_LEAK
  
  config HAVE_MMIOTRACE_SUPPORT
         def_bool y
+       depends on !XEN
  
  config X86_DECODER_SELFTEST
         bool "x86 instruction decoder selftest"
@@ -250,6 +253,7 @@ config DEBUG_BOOT_PARAMS
         bool "Debug boot parameters"
         depends on DEBUG_KERNEL
         depends on DEBUG_FS
+       depends on !XEN
         ---help---
           This option will cause struct boot_params to be exported via debugfs.
  
diff --git a/arch/x86/Makefile b/arch/x86/Makefile

index 94e91e4..b30eaa3 100644 (file)
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -126,7 +126,9 @@ KBUILD_CFLAGS += -pipe
  # Workaround for a gcc prelease that unfortunately was shipped in a suse release
  KBUILD_CFLAGS += -Wno-sign-compare
  #
+ifneq ($(CONFIG_UNWIND_INFO),y)
  KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+endif
  # prevent gcc from generating any FP code by mistake
  KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
  KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
@@ -175,9 +177,28 @@ boot := arch/x86/boot
  
  BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage
  
-PHONY += bzImage $(BOOT_TARGETS)
+PHONY += bzImage vmlinuz $(BOOT_TARGETS)
+
+ifdef CONFIG_XEN
+LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \
+       -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE)
+
+ifdef CONFIG_X86_64
+LDFLAGS_vmlinux := -e startup_64
+endif
  
  # Default kernel to build
+all: vmlinuz
+
+# KBUILD_IMAGE specifies the target image being built
+KBUILD_IMAGE := $(boot)/vmlinuz
+
+vmlinuz: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+       $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
+       $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+else
+# Default kernel to build
  all: bzImage
  
  # KBUILD_IMAGE specify target image being built
@@ -190,6 +211,7 @@ endif
         $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
         $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
         $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
+endif
  
  $(BOOT_TARGETS): vmlinux
         $(Q)$(MAKE) $(build)=$(boot) $@
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile

index 5a747dd..8ad5384 100644 (file)
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -17,6 +17,7 @@
  SVGA_MODE      := -DSVGA_MODE=NORMAL_VGA
  
  targets                := vmlinux.bin setup.bin setup.elf bzImage
+targets                += vmlinuz vmlinux-stripped
  targets                += fdimage fdimage144 fdimage288 image.iso mtools.conf
  subdir-                := compressed
  
@@ -189,6 +190,20 @@ bzlilo: $(obj)/bzImage
         cp System.map $(INSTALL_PATH)/
         if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
  
+$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE
+       $(call if_changed,gzip)
+       @echo 'Kernel: $@ is ready' ' (#'`cat .version`')'
+
+$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded
+$(obj)/vmlinux-stripped: vmlinux FORCE
+       $(call if_changed,objcopy)
+
+ifndef CONFIG_XEN
+bzImage := bzImage
+else
+bzImage := vmlinuz
+endif
+
  install:
-       sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+       sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \
                 System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/ia32/ia32entry-xen.S b/arch/x86/ia32/ia32entry-xen.S

new file mode 100644 (file)

index 0000000..dce835b
--- /dev/null
+++ b/arch/x86/ia32/ia32entry-xen.S
@@ -0,0 +1,383 @@
+/*
+ * Compatibility mode system call entry point for x86-64. 
+ *             
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */             
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>   
+#include <asm/thread_info.h>   
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386                (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE           0x40000000
+
+       .section .entry.text, "ax"
+
+       .macro IA32_ARG_FIXUP noebp=0   /* move syscall Arg1..Arg6 into %edi,%esi,%edx,%ecx,%r8d,%r9d */
+       movl    %edi,%r8d       /* Arg5 -> %r8d (must happen before %edi is overwritten below) */
+       .if \noebp
+       jmp     .Lia32_common   /* leave %r9d alone; the label is emitted by the noebp=0 expansion */
+       .else
+       movl    %ebp,%r9d       /* Arg6 -> %r9d */
+.Lia32_common:
+       .endif
+       xchg    %ecx,%esi       /* Arg2 -> %esi, Arg4 -> %ecx */
+       movl    %ebx,%edi       /* Arg1 -> %edi */
+       movl    %edx,%edx       /* zero extension */
+       .endm 
+
+       /* Zero the R8/R10/R11 stack slots at \offset(%rsp); the R9 slot gets %_r9 (default: the zeroed %rax). Clobbers %eax. */
+       .macro  CLEAR_RREGS offset=0, _r9=rax
+       xorl    %eax,%eax
+       movq    %rax,\offset+R11(%rsp)
+       movq    %rax,\offset+R10(%rsp)
+       movq    %\_r9,\offset+R9(%rsp)
+       movq    %rax,\offset+R8(%rsp)
+       .endm
+
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %eax because syscall_trace_enter() returned
+        * the %rax value we should see.  Instead, we just truncate that
+        * value to 32 bits again as we did on entry from user mode.
+        * If it's a new value set by user_regset during entry tracing,
+        * this matches the normal truncation of the user-mode value.
+        * If it's -1 to make us punt the syscall, then (u32)-1 is still
+        * an appropriately invalid value.
+        */
+       .macro LOAD_ARGS32 offset, _r9=0
+       .if \_r9
+       movl \offset+16(%rsp),%r9d
+       .endif
+       movl \offset+40(%rsp),%ecx
+       movl \offset+48(%rsp),%edx
+       movl \offset+56(%rsp),%esi
+       movl \offset+64(%rsp),%edi
+       movl %eax,%eax                  /* zero extension */
+       .endm
+
+       .macro CFI_STARTPROC32 simple
+       CFI_STARTPROC   \simple
+       CFI_UNDEFINED   r8
+       CFI_UNDEFINED   r9
+       CFI_UNDEFINED   r10
+       CFI_UNDEFINED   r11
+       CFI_UNDEFINED   r12
+       CFI_UNDEFINED   r13
+       CFI_UNDEFINED   r14
+       CFI_UNDEFINED   r15
+       .endm
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+       swapgs
+       sysretl
+ENDPROC(native_usergs_sysret32)
+
+ENTRY(native_irq_enable_sysexit)
+       swapgs
+       sti
+       sysexit
+ENDPROC(native_irq_enable_sysexit)
+#endif
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax        System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6        
+ *     
+ * Interrupts on.
+ *     
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */    
+ENTRY(ia32_sysenter_target)
+       CFI_STARTPROC32 simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,SS+8-RIP+16
+       /*CFI_REL_OFFSET        ss,SS-RIP+16*/
+       CFI_REL_OFFSET  rsp,RSP-RIP+16
+       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
+       /*CFI_REL_OFFSET        cs,CS-RIP+16*/
+       CFI_REL_OFFSET  rip,RIP-RIP+16
+       CFI_REL_OFFSET  r11,8
+       CFI_REL_OFFSET  rcx,0
+       movq    8(%rsp),%r11
+       CFI_RESTORE     r11
+       popq_cfi %rcx
+       CFI_RESTORE     rcx
+       movl    %ebp,%ebp               /* zero extension */
+       movl    %eax,%eax
+       movl    TI_sysenter_return+THREAD_INFO(%rsp,8*6-KERNEL_STACK_OFFSET),%r10d
+       movl    $__USER32_DS,40(%rsp)
+       movq    %rbp,32(%rsp)
+       movl    $__USER32_CS,16(%rsp)
+       movq    %r10,8(%rsp)
+       movq    %rax,(%rsp)
+       cld
+       SAVE_ARGS 0,1,0
+       /* no need to do an access_ok check here because rbp has been
+          32bit zero extended */ 
+1:     movl    (%rbp),%ebp
+       .section __ex_table,"a"
+       .quad 1b,ia32_badarg
+       .previous       
+       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jnz  sysenter_tracesys
+       jmp .Lia32_check_call
+
+#ifdef CONFIG_AUDITSYSCALL
+       .macro auditsys_entry_common
+       movl %esi,%r9d                  /* 6th arg: 4th syscall arg */
+       movl %edx,%r8d                  /* 5th arg: 3rd syscall arg */
+       /* (already in %ecx)               4th arg: 2nd syscall arg */
+       movl %ebx,%edx                  /* 3rd arg: 1st syscall arg */
+       movl %eax,%esi                  /* 2nd arg: syscall number */
+       movl $AUDIT_ARCH_I386,%edi      /* 1st arg: audit arch */
+       call __audit_syscall_entry
+       movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall number */
+       cmpq $(IA32_NR_syscalls-1),%rax
+       ja ia32_badsys
+       movl %ebx,%edi                  /* reload 1st syscall arg */
+       movl RCX-ARGOFFSET(%rsp),%esi   /* reload 2nd syscall arg */
+       movl RDX-ARGOFFSET(%rsp),%edx   /* reload 3rd syscall arg */
+       movl RSI-ARGOFFSET(%rsp),%ecx   /* reload 4th syscall arg */
+       movl RDI-ARGOFFSET(%rsp),%r8d   /* reload 5th syscall arg */
+       .endm
+
+sysenter_auditsys:
+       auditsys_entry_common
+       movl %ebp,%r9d                  /* reload 6th syscall arg */
+       jmp .Lia32_dispatch
+#endif
+       CFI_ENDPROC
+ENDPROC(ia32_sysenter_target)
+
+/*
+ * 32bit SYSCALL instruction entry.
+ *
+ * Arguments:
+ * %eax        System call number.
+ * %ebx Arg1
+ * %ecx return EIP 
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
+ * %esp user stack 
+ * 0(%esp) Arg6
+ *     
+ * Interrupts on.
+ *     
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.     
+ */    
+ENTRY(ia32_cstar_target)
+       CFI_STARTPROC32 simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,SS+8-RIP+16
+       /*CFI_REL_OFFSET        ss,SS-RIP+16*/
+       CFI_REL_OFFSET  rsp,RSP-RIP+16
+       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
+       /*CFI_REL_OFFSET        cs,CS-RIP+16*/
+       CFI_REL_OFFSET  rip,RIP-RIP+16
+       movl    %eax,%eax       /* zero extension */
+       movl    RSP-RIP+16(%rsp),%r8d
+       SAVE_ARGS -8,0,0
+       movq    %rax,ORIG_RAX-ARGOFFSET(%rsp)
+       movq    %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+       movl    %ebp,%ecx
+       movl    $__USER32_CS,CS-ARGOFFSET(%rsp)
+       movl    $__USER32_DS,SS-ARGOFFSET(%rsp)
+       /* no need to do an access_ok check here because r8 has been
+          32bit zero extended */ 
+       /* hardware stack frame is complete now */      
+1:     movl    (%r8),%r9d
+       .section __ex_table,"a"
+       .quad 1b,ia32_badarg
+       .previous       
+       orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jnz   cstar_tracesys
+       cmpq $IA32_NR_syscalls-1,%rax
+       ja  ia32_badsys
+cstar_do_call:
+       IA32_ARG_FIXUP 1
+       
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+       movl %r9d,R9-ARGOFFSET(%rsp)    /* register to be clobbered by call */
+       auditsys_entry_common
+       movl R9-ARGOFFSET(%rsp),%r9d    /* reload 6th syscall arg */
+       jmp .Lia32_dispatch
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jz cstar_auditsys
+#endif
+       xchgl %r9d,%ebp
+       SAVE_REST
+       CLEAR_RREGS 0, r9
+       movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+       movq %rsp,%rdi        /* &pt_regs -> arg1 */
+       call syscall_trace_enter
+       LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed them */
+       RESTORE_REST
+       xchgl %ebp,%r9d
+       cmpq $(IA32_NR_syscalls-1),%rax
+       ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
+       jmp cstar_do_call
+END(ia32_cstar_target)
+                               
+ia32_badarg:
+       movq $-EFAULT,%rax
+       jmp ia32_sysret
+       CFI_ENDPROC
+
+/* 
+ * Emulated IA32 system calls via int 0x80. 
+ *
+ * Arguments:   
+ * %eax        System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.    
+ * All registers except %eax must be saved (but ptrace may violate that)
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts on.
+ */                            
+
+ENTRY(ia32_syscall)
+       CFI_STARTPROC32 simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,SS+8-RIP+16
+       /*CFI_REL_OFFSET        ss,SS-RIP+16*/
+       CFI_REL_OFFSET  rsp,RSP-RIP+16
+       /*CFI_REL_OFFSET        rflags,EFLAGS-RIP+16*/
+       /*CFI_REL_OFFSET        cs,CS-RIP+16*/
+       CFI_REL_OFFSET  rip,RIP-RIP+16
+       CFI_REL_OFFSET  r11,8
+       CFI_REL_OFFSET  rcx,0
+       movq 8(%rsp),%r11
+       CFI_RESTORE     r11
+       popq_cfi %rcx
+       CFI_RESTORE     rcx
+       movl %eax,%eax
+       movq %rax,(%rsp)
+       cld
+       /* note the registers are not zero extended to the sf.
+          this could be a problem. */
+       SAVE_ARGS 0,1,0
+       orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jnz ia32_tracesys
+.Lia32_check_call:
+       cmpq $(IA32_NR_syscalls-1),%rax
+       ja ia32_badsys
+ia32_do_call:
+       IA32_ARG_FIXUP
+.Lia32_dispatch:
+       call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
+ia32_sysret:
+       movq %rax,RAX-ARGOFFSET(%rsp)
+       CLEAR_RREGS -ARGOFFSET
+       jmp int_ret_from_sys_call 
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jz sysenter_auditsys
+#endif
+ia32_tracesys:                  
+       SAVE_REST
+       CLEAR_RREGS
+       movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+       movq %rsp,%rdi        /* &pt_regs -> arg1 */
+       call syscall_trace_enter
+       LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed them */
+       RESTORE_REST
+       cmpq $(IA32_NR_syscalls-1),%rax
+       ja  int_ret_from_sys_call       /* ia32_tracesys has set RAX(%rsp) */
+       jmp ia32_do_call
+END(ia32_syscall)
+
+ia32_badsys:
+       movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+       movq $-ENOSYS,%rax
+       jmp ia32_sysret
+
+       CFI_ENDPROC
+       
+       .macro PTREGSCALL label, func, arg      /* emit global stub \label that tail-jumps to ia32_ptregs_common */
+       ALIGN
+GLOBAL(\label)
+       leaq \func(%rip),%rax   /* handler address; invoked by "call *%rax" in ia32_ptregs_common */
+       leaq -ARGOFFSET+8(%rsp),\arg    /* 8 for return address */
+       jmp  ia32_ptregs_common 
+       .endm
+
+       CFI_STARTPROC32
+
+       PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
+       PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
+       PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
+       PTREGSCALL stub32_execve, sys32_execve, %rcx
+       PTREGSCALL stub32_fork, sys_fork, %rdi
+       PTREGSCALL stub32_clone, sys32_clone, %rdx
+       PTREGSCALL stub32_vfork, sys_vfork, %rdi
+       PTREGSCALL stub32_iopl, sys_iopl, %rsi
+
+       ALIGN
+ia32_ptregs_common:
+       popq %r11
+       CFI_ENDPROC
+       CFI_STARTPROC32 simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp,SS+8-ARGOFFSET
+       CFI_REL_OFFSET  rax,RAX-ARGOFFSET
+       CFI_REL_OFFSET  rcx,RCX-ARGOFFSET
+       CFI_REL_OFFSET  rdx,RDX-ARGOFFSET
+       CFI_REL_OFFSET  rsi,RSI-ARGOFFSET
+       CFI_REL_OFFSET  rdi,RDI-ARGOFFSET
+       CFI_REL_OFFSET  rip,RIP-ARGOFFSET
+/*     CFI_REL_OFFSET  cs,CS-ARGOFFSET*/
+/*     CFI_REL_OFFSET  rflags,EFLAGS-ARGOFFSET*/
+       CFI_REL_OFFSET  rsp,RSP-ARGOFFSET
+/*     CFI_REL_OFFSET  ss,SS-ARGOFFSET*/
+       SAVE_REST
+       call *%rax
+       RESTORE_REST
+       jmp  ia32_sysret        /* misbalances the return cache */
+       CFI_ENDPROC
+END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h

index 610001d..8e3270f 100644 (file)
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -31,6 +31,10 @@
  #include <asm/mpspec.h>
  #include <asm/trampoline.h>
  
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#include <xen/interface/platform.h>
+#endif
+
  #define COMPILER_DEPENDENT_INT64   long long
  #define COMPILER_DEPENDENT_UINT64  unsigned long long
  
@@ -115,7 +119,11 @@ static inline void acpi_disable_pci(void)
  }
  
  /* Low-level suspend routine. */
+#ifdef CONFIG_ACPI_PV_SLEEP
+#define acpi_suspend_lowlevel() acpi_enter_sleep_state(ACPI_STATE_S3, 0)
+#else
  extern int acpi_suspend_lowlevel(void);
+#endif
  
  extern const unsigned char acpi_wakeup_code[];
  #define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
@@ -123,11 +131,33 @@ extern const unsigned char acpi_wakeup_code[];
  /* early initialization routine */
  extern void acpi_reserve_wakeup_memory(void);
  
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static inline int acpi_notify_hypervisor_state(u8 sleep_state,
+                                              u32 pm1a_cnt_val,
+                                              u32 pm1b_cnt_val)
+{
+       struct xen_platform_op op = {   /* XENPF_enter_acpi_sleep request carrying the three arguments */
+               .cmd = XENPF_enter_acpi_sleep,
+               .interface_version = XENPF_INTERFACE_VERSION,
+               .u = {
+                       .enter_acpi_sleep = {
+                               .pm1a_cnt_val = pm1a_cnt_val,
+                               .pm1b_cnt_val = pm1b_cnt_val,
+                               .sleep_state = sleep_state,
+                       },
+               },
+       };
+
+       return HYPERVISOR_platform_op(&op);     /* result of the platform op is returned unchanged */
+}
+#endif
+
  /*
   * Check if the CPU can handle C2 and deeper
   */
  static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
  {
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         /*
          * Early models (<=5) of AMD Opterons are not supposed to go into
          * C2 state.
@@ -142,6 +172,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
         else if (amd_e400_c1e_detected)
                 return 1;
         else
+#endif
                 return max_cstate;
  }
  
@@ -181,7 +212,9 @@ static inline void disable_acpi(void) { }
  
  #endif /* !CONFIG_ACPI */
  
+#ifndef CONFIG_XEN
  #define ARCH_HAS_POWER_INIT    1
+#endif
  
  #ifdef CONFIG_ACPI_NUMA
  extern int acpi_numa;
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h

index eec2a70..91e72c0 100644 (file)
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -15,6 +15,9 @@
  #define map_page_into_agp(page) set_pages_uc(page, 1)
  #define unmap_page_from_agp(page) set_pages_wb(page, 1)
  
+#define map_pages_into_agp set_pages_array_uc
+#define unmap_pages_from_agp set_pages_array_wb
+
  /*
   * Could use CLFLUSH here if the cpu supports it. But then it would
   * need to be called for each cacheline of the whole page so it may
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index d854101..3f77acd 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -9,11 +9,15 @@
  #include <asm/processor.h>
  #include <asm/apicdef.h>
  #include <linux/atomic.h>
+#ifndef CONFIG_XEN
  #include <asm/fixmap.h>
+#endif
  #include <asm/mpspec.h>
  #include <asm/msr.h>
  
+#ifndef CONFIG_XEN
  #define ARCH_APICTIMER_STOPS_ON_C3     1
+#endif
  
  /*
   * Debugging macros
@@ -45,6 +49,7 @@ static inline void generic_apic_probe(void)
  #ifdef CONFIG_X86_LOCAL_APIC
  
  extern unsigned int apic_verbosity;
+#ifndef CONFIG_XEN
  extern int local_apic_timer_c2_ok;
  
  extern int disable_apic;
@@ -118,6 +123,8 @@ extern u64 native_apic_icr_read(void);
  
  extern int x2apic_mode;
  
+#endif /* CONFIG_XEN */
+
  #ifdef CONFIG_X86_X2APIC
  /*
   * Make previous memory operations globally visible before
@@ -237,7 +244,11 @@ extern void setup_local_APIC(void);
  extern void end_local_APIC_setup(void);
  extern void bsp_end_local_APIC_setup(void);
  extern void init_apic_mappings(void);
+#ifndef CONFIG_XEN
  void register_lapic_address(unsigned long address);
+#else
+#define register_lapic_address(address)
+#endif
  extern void setup_boot_APIC_clock(void);
  extern void setup_secondary_APIC_clock(void);
  extern int APIC_init_uniprocessor(void);
@@ -285,16 +296,19 @@ static inline void disable_local_APIC(void) { }
  struct apic {
         char *name;
  
+#ifndef CONFIG_XEN
         int (*probe)(void);
         int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
         int (*apic_id_valid)(int apicid);
         int (*apic_id_registered)(void);
+#endif
  
         u32 irq_delivery_mode;
         u32 irq_dest_mode;
  
         const struct cpumask *(*target_cpus)(void);
  
+#ifndef CONFIG_XEN
         int disable_esr;
  
         int dest_logical;
@@ -313,8 +327,10 @@ struct apic {
         void (*setup_portio_remap)(void);
         int (*check_phys_apicid_present)(int phys_apicid);
         void (*enable_apic_mode)(void);
+#endif
         int (*phys_pkg_id)(int cpuid_apic, int index_msb);
  
+#ifndef CONFIG_XEN
         /*
          * When one of the next two hooks returns 1 the apic
          * is switched to this. Essentially they are additional
@@ -329,6 +345,7 @@ struct apic {
         unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
         unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
                                                const struct cpumask *andmask);
+#endif
  
         /* ipi */
         void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@ -338,6 +355,7 @@ struct apic {
         void (*send_IPI_all)(int vector);
         void (*send_IPI_self)(int vector);
  
+#ifndef CONFIG_XEN
         /* wakeup_secondary_cpu */
         int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
  
@@ -377,6 +395,7 @@ struct apic {
          */
         int (*x86_32_numa_cpu_node)(int cpu);
  #endif
+#endif /* CONFIG_XEN */
  };
  
  /*
@@ -386,6 +405,8 @@ struct apic {
   */
  extern struct apic *apic;
  
+#ifndef CONFIG_XEN
+
  /*
   * APIC drivers are probed based on how they are listed in the .apicdrivers
   * section. So the order is important and enforced by the ordering
@@ -504,6 +525,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
  
  extern void generic_bigsmp_probe(void);
  
+#endif /* CONFIG_XEN */
  
  #ifdef CONFIG_X86_LOCAL_APIC
  
@@ -520,6 +542,8 @@ static inline const struct cpumask *default_target_cpus(void)
  #endif
  }
  
+#ifndef CONFIG_XEN
+
  DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
  
  
@@ -631,6 +655,8 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
  extern int default_check_phys_apicid_present(int phys_apicid);
  #endif
  
+#endif /* CONFIG_XEN */
+
  #endif /* CONFIG_X86_LOCAL_APIC */
  
  #endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h

index 134bba0..96fd18b 100644 (file)
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -17,6 +17,8 @@
   */
  #define IO_APIC_SLOT_SIZE              1024
  
+#ifndef CONFIG_XEN
+
  #define        APIC_ID         0x20
  
  #define        APIC_LVR        0x30
@@ -147,6 +149,16 @@
  #define XAPIC_ENABLE   (1UL << 11)
  #define X2APIC_ENABLE  (1UL << 10)
  
+#else /* CONFIG_XEN */
+
+enum {
+       APIC_DEST_ALLBUT = 0x1,
+       APIC_DEST_SELF,
+       APIC_DEST_ALLINC
+};
+
+#endif /* CONFIG_XEN */
+
  #ifdef CONFIG_X86_32
  # define MAX_IO_APICS 64
  # define MAX_LOCAL_APIC 256
@@ -155,6 +167,8 @@
  # define MAX_LOCAL_APIC 32768
  #endif
  
+#ifndef CONFIG_XEN
+
  /*
   * All x86-64 systems are xAPIC compatible.
   * In the following, "apicid" is a physical APIC ID.
@@ -425,6 +439,8 @@ struct local_apic {
  
  #undef u32
  
+#endif /* CONFIG_XEN */
+
  #ifdef CONFIG_X86_32
   #define BAD_APICID 0xFFu
  #else
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h

index 5e1a2ee..2d2275a 100644 (file)
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -16,7 +16,7 @@
                                 & ~(CONFIG_PHYSICAL_ALIGN - 1))
  
  /* Minimum kernel alignment, as a power of two */
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
  #define MIN_KERNEL_ALIGN_LG2   PMD_SHIFT
  #else
  #define MIN_KERNEL_ALIGN_LG2   (PAGE_SHIFT + THREAD_ORDER)
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h

index d680579..70209da 100644 (file)
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -229,7 +229,11 @@ static inline void __user *arch_compat_alloc_user_space(long len)
                 sp = task_pt_regs(current)->sp;
         } else {
                 /* -128 for the x32 ABI redzone */
+#ifndef CONFIG_XEN
                 sp = percpu_read(old_rsp) - 128;
+#else
+               sp = task_pt_regs(current)->sp - 128;
+#endif
         }
  
         return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h

index 340ee49..e7c76f9 100644 (file)
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -296,7 +296,11 @@ extern const char * const x86_power_flags[32];
  #define cpu_has_xmm4_1         boot_cpu_has(X86_FEATURE_XMM4_1)
  #define cpu_has_xmm4_2         boot_cpu_has(X86_FEATURE_XMM4_2)
  #define cpu_has_x2apic         boot_cpu_has(X86_FEATURE_X2APIC)
+#ifndef CONFIG_XEN
  #define cpu_has_xsave          boot_cpu_has(X86_FEATURE_XSAVE)
+#else
+#define cpu_has_xsave          boot_cpu_has(X86_FEATURE_OSXSAVE)
+#endif
  #define cpu_has_osxsave                boot_cpu_has(X86_FEATURE_OSXSAVE)
  #define cpu_has_hypervisor     boot_cpu_has(X86_FEATURE_HYPERVISOR)
  #define cpu_has_pclmulqdq      boot_cpu_has(X86_FEATURE_PCLMULQDQ)
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h

index 2d91580..dddccdd 100644 (file)
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,6 +94,7 @@ DECLARE_PER_CPU(unsigned long, cpu_dr7);
  
  static inline unsigned long native_get_debugreg(int regno)
  {
+#ifndef CONFIG_XEN
         unsigned long val = 0;  /* Damn you, gcc! */
  
         switch (regno) {
@@ -119,10 +120,14 @@ static inline unsigned long native_get_debugreg(int regno)
                 BUG();
         }
         return val;
+#else
+       return HYPERVISOR_get_debugreg(regno);
+#endif
  }
  
  static inline void native_set_debugreg(int regno, unsigned long value)
  {
+#ifndef CONFIG_XEN
         switch (regno) {
         case 0:
                 asm("mov %0, %%db0"     ::"r" (value));
@@ -145,6 +150,9 @@ static inline void native_set_debugreg(int regno, unsigned long value)
         default:
                 BUG();
         }
+#else
+       WARN_ON(HYPERVISOR_set_debugreg(regno, value));
+#endif
  }
  
  static inline void hw_breakpoint_disable(void)
@@ -168,7 +176,7 @@ extern void aout_dump_debugregs(struct user *dump);
  
  extern void hw_breakpoint_restore(void);
  
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_IDT)
  DECLARE_PER_CPU(int, debug_stack_usage);
  static inline void debug_stack_usage_inc(void)
  {
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h

index f6f1598..699a973 100644 (file)
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -35,7 +35,8 @@
  #define CFI_SIGNAL_FRAME
  #endif
  
-#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+#if !defined(CONFIG_UNWIND_INFO) && defined(CONFIG_AS_CFI_SECTIONS) \
+    && defined(__ASSEMBLY__)
         /*
          * Emit CFI data in .debug_frame sections, not .eh_frame sections.
          * The latter we currently just discard since we don't do DWARF
@@ -53,7 +54,7 @@
   * Due to the structure of pre-exisiting code, don't use assembler line
   * comment character # to ignore the arguments. Instead, use a dummy macro.
   */
-.macro cfi_ignore a=0, b=0, c=0, d=0
+.macro cfi_ignore a=0, b=0, c=0, d=0, e=0, f=0, g=0, h=0
  .endm
  
  #define CFI_STARTPROC          cfi_ignore
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h

index 3778256..ba52483 100644 (file)
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -66,7 +66,11 @@ struct e820map {
         struct e820entry map[E820_X_MAX];
  };
  
+#ifndef CONFIG_XEN
  #define ISA_START_ADDRESS      0xa0000
+#else
+#define ISA_START_ADDRESS      0
+#endif
  #define ISA_END_ADDRESS                0x100000
  
  #define BIOS_BEGIN             0x000a0000
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h

index 382f75d..031153e 100644 (file)
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -18,7 +18,11 @@ typedef struct {
  #ifdef CONFIG_SMP
         unsigned int irq_resched_count;
         unsigned int irq_call_count;
+#ifndef CONFIG_XEN
         unsigned int irq_tlb_count;
+#else
+       unsigned int irq_lock_count;
+#endif
  #endif
  #ifdef CONFIG_X86_THERMAL_VECTOR
         unsigned int irq_thermal_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h

index eb92a6e..b54fa50 100644 (file)
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -101,6 +101,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
         irq_attr->polarity      = polarity;
  }
  
+#ifndef CONFIG_XEN
  struct irq_2_iommu {
         struct intel_iommu *iommu;
         u16 irte_index;
@@ -123,6 +124,9 @@ struct irq_cfg {
         struct irq_2_iommu      irq_2_iommu;
  #endif
  };
+#else
+struct irq_cfg;
+#endif
  
  extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
  extern void send_cleanup_vector(struct irq_cfg *);
@@ -159,9 +163,15 @@ extern void smp_invalidate_interrupt(struct pt_regs *);
  #else
  extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
  #endif
+extern void smp_irq_work_interrupt(struct pt_regs *);
+#ifdef CONFIG_XEN
+extern void smp_reboot_interrupt(struct pt_regs *);
+#endif
  #endif
  
+#ifndef CONFIG_XEN
  extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+#endif
  
  typedef int vector_irq_t[NR_VECTORS];
  DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h

index 7a15153..2bb1d90 100644 (file)
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -60,3 +60,7 @@ static inline bool hypervisor_x2apic_available(void)
  }
  
  #endif
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include_next <asm/hypervisor.h>
+#endif
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h

index a203659..b9daf61 100644 (file)
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -54,6 +54,7 @@ extern struct irq_chip i8259A_chip;
  
  struct legacy_pic {
         int nr_legacy_irqs;
+#ifndef CONFIG_XEN
         struct irq_chip *chip;
         void (*mask)(unsigned int irq);
         void (*unmask)(unsigned int irq);
@@ -61,6 +62,7 @@ struct legacy_pic {
         void (*restore_mask)(void);
         void (*init)(int auto_eoi);
         int (*irq_pending)(unsigned int irq);
+#endif
         void (*make_irq)(unsigned int irq);
  };
  
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h

index d8e8eef..51cd71d 100644 (file)
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -331,7 +331,7 @@ extern void early_iounmap(void __iomem *addr, unsigned long size);
  extern void fixup_early_ioremap(void);
  extern bool is_early_ioremap_ptep(pte_t *ptep);
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
  #include <xen/xen.h>
  struct bio_vec;
  
@@ -341,7 +341,7 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
  #define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
         (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
          (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
  
  #define IO_SPACE_LIMIT 0xffff
  
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h

index 317ff17..2e587bc 100644 (file)
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -5,14 +5,30 @@
  # define PA_CONTROL_PAGE       0
  # define VA_CONTROL_PAGE       1
  # define PA_PGD                        2
+# ifndef CONFIG_XEN
  # define PA_SWAP_PAGE          3
  # define PAGES_NR              4
+# else /* CONFIG_XEN */
+/*
+ * The hypervisor interface implicitly requires that all entries (except
+ * for possibly the final one) are arranged in matching PA_/VA_ pairs.
+#  define VA_PGD               3
+ */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
  #else
  # define PA_CONTROL_PAGE       0
  # define VA_CONTROL_PAGE       1
  # define PA_TABLE_PAGE         2
+# ifndef CONFIG_XEN
  # define PA_SWAP_PAGE          3
  # define PAGES_NR              4
+# else /* CONFIG_XEN, see comment above
+#  define VA_TABLE_PAGE                3 */
+#  define PA_SWAP_PAGE         4
+#  define PAGES_NR             5
+# endif /* CONFIG_XEN */
  #endif
  
  # define KEXEC_CONTROL_CODE_MAX_SIZE   2048
@@ -163,6 +179,19 @@ struct kimage_arch {
  };
  #endif
  
+/* Under Xen we need to work with machine addresses. These macros give the
+ * machine address of a certain page to the generic kexec code instead of
+ * the pseudo physical address which would be given by the default macros.
+ */
+
+#ifdef CONFIG_XEN
+#define KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page)  pfn_to_mfn(page_to_pfn(page))
+#define kexec_pfn_to_page(pfn)   pfn_to_page(mfn_to_pfn(pfn))
+#define kexec_virt_to_phys(addr) virt_to_machine(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr))
+#endif
+
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h

index a01e7ec..a39d3e1 100644 (file)
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -5,6 +5,8 @@
  #ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
  #define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
  
+#include <linux/nmi.h>
+#include <asm/delay.h>
  #include <asm/mc146818rtc.h>
  
  #define NMI_REASON_PORT                0x61
@@ -22,6 +24,29 @@ static inline unsigned char default_get_nmi_reason(void)
         return inb(NMI_REASON_PORT);
  }
  
+static inline void clear_serr_error(unsigned char reason)
+{
+       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+       outb(reason, NMI_REASON_PORT);
+}
+
+static inline void clear_io_check_error(unsigned char reason)
+{
+       unsigned long i;
+
+       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+       outb(reason, NMI_REASON_PORT);
+
+       i = 20000;
+       while (--i) {
+               touch_nmi_watchdog();
+               udelay(100);
+       }
+
+       reason &= ~NMI_REASON_CLEAR_IOCHK;
+       outb(reason, NMI_REASON_PORT);
+}
+
  static inline void reassert_nmi(void)
  {
         int old_reg = -1;
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h

index d354fb7..9198ee4 100644 (file)
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -13,7 +13,7 @@
  #define RTC_ALWAYS_BCD 1       /* RTC operates in binary mode */
  #endif
  
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) && defined(__HAVE_ARCH_CMPXCHG)
  /*
   * This lock provides nmi access to the CMOS/RTC registers.  It has some
   * special properties.  It is owned by a CPU and stores the index register
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h

index 5f55e69..e9b162a 100644 (file)
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -16,12 +16,15 @@ typedef struct {
         /* True if mm supports a task running in 32 bit compatibility mode. */
         unsigned short ia32_compat;
  #endif
+#ifdef CONFIG_XEN
+       bool has_foreign_mappings:1;
+#endif
  
         struct mutex lock;
         void *vdso;
  } mm_context_t;
  
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
  void leave_mm(int cpu);
  #else
  static inline void leave_mm(int cpu)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h

index fd3f9f1..d3d8968 100644 (file)
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -17,7 +17,10 @@ struct ctl_table;
  extern int proc_nmi_enabled(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
  extern int unknown_nmi_panic;
+#endif
  
+#if (defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)) || \
+    (defined(CONFIG_XEN_SMPBOOT) && CONFIG_XEN_COMPAT >= 0x030200)
  void arch_trigger_all_cpu_backtrace(void);
  #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
  #endif
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h

index 7639dbf..63775df 100644 (file)
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
  #endif /* !__ASSEMBLY__ */
  
  #ifdef CONFIG_FLATMEM
+/*
+ * Although max_pfn is not exported, non-Xen has to use it here, as max_mapnr
+ * never gets initialized there other than for hotplugged memory.
+ */
+#ifndef CONFIG_XEN
  #define pfn_valid(pfn)          ((pfn) < max_pfn)
+#else
+#define pfn_valid(pfn)          ((pfn) < max_mapnr)
+#endif
  #endif
  
  #endif /* _ASM_X86_PAGE_64_DEFS_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h

index dcfde52..9ca586f 100644 (file)
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -133,6 +133,8 @@ struct pt_regs {
  #include <linux/init.h>
  #ifdef CONFIG_PARAVIRT
  #include <asm/paravirt_types.h>
+#elif defined(CONFIG_X86_64_XEN)
+#include <xen/interface/xen.h>
  #endif
  
  struct cpuinfo_x86;
@@ -192,7 +194,13 @@ static inline int v8086_mode(struct pt_regs *regs)
  #ifdef CONFIG_X86_64
  static inline bool user_64bit_mode(struct pt_regs *regs)
  {
-#ifndef CONFIG_PARAVIRT
+#if defined(CONFIG_XEN)
+       /*
+        * On Xen, these are the only long mode CPL 3 selectors.
+        * We do not allow long mode selectors in the LDT.
+        */
+       return regs->cs == __USER_CS || regs->cs == FLAT_USER_CS64;
+#elif !defined(CONFIG_PARAVIRT)
         /*
          * On non-paravirt systems, this is the only long mode CPL 3
          * selector.  We do not allow long mode selectors in the LDT.
@@ -285,7 +293,9 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
  }
  
  #define arch_has_single_step() (1)
-#ifdef CONFIG_X86_DEBUGCTLMSR
+#if defined(CONFIG_XEN)
+#define arch_has_block_step()  (0)
+#elif defined(CONFIG_X86_DEBUGCTLMSR)
  #define arch_has_block_step()  (1)
  #else
  #define arch_has_block_step()  (boot_cpu_data.x86 >= 6)
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h

index 6c7fc25..b0549bf 100644 (file)
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,7 +48,7 @@
  #endif
  
  #ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT
+#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN)
  /* Paravirtualized systems may not have PSE or PGE available */
  #define NEED_PSE       0
  #define NEED_PGE       0
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h

index 1654662..9be96d7 100644 (file)
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -188,7 +188,9 @@
  #define __KERNEL_DS    (GDT_ENTRY_KERNEL_DS*8)
  #define __USER_DS      (GDT_ENTRY_DEFAULT_USER_DS*8+3)
  #define __USER_CS      (GDT_ENTRY_DEFAULT_USER_CS*8+3)
-#ifndef CONFIG_PARAVIRT
+#if defined(CONFIG_X86_XEN)
+#define get_kernel_rpl()  (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
+#elif !defined(CONFIG_PARAVIRT)
  #define get_kernel_rpl()  0
  #endif
  
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h

index 70bbe39..50525cb 100644 (file)
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,6 +37,9 @@ print_context_stack_bp(struct thread_info *tinfo,
  /* Generic stack tracer with callbacks */
  
  struct stacktrace_ops {
+       void (*warning)(void *data, char *msg);
+       /* msg must contain %s for the symbol */
+       void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
         void (*address)(void *data, unsigned long address, int reliable);
         /* On negative return stop dumping */
         int (*stack)(void *data, char *name);
@@ -89,6 +92,10 @@ extern void
  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                    unsigned long *sp, unsigned long bp, char *log_lvl);
  
+int try_stack_unwind(struct task_struct *task, struct pt_regs *regs,
+                     unsigned long **stack, unsigned long *bp,
+                     const struct stacktrace_ops *ops, void *data);
+
  extern unsigned int code_bytes;
  
  /* The form of the top of the frame on the stack */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h

index 4ec45b3..5746248 100644 (file)
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -100,12 +100,22 @@ do {                                                                      \
  #define __switch_canary_iparam
  #endif /* CC_STACKPROTECTOR */
  
+/* The stack unwind code needs this but it pollutes traces otherwise */
+#ifdef CONFIG_UNWIND_INFO
+#define THREAD_RETURN_SYM \
+       ".globl thread_return\n" \
+       "thread_return:\n\t"
+#else
+#define THREAD_RETURN_SYM
+#endif
+
  /* Save restore flags to clear handle leaking NT */
  #define switch_to(prev, next, last) \
         asm volatile(SAVE_CONTEXT                                         \
              "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
              "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
              "call __switch_to\n\t"                                       \
+            THREAD_RETURN_SYM                                            \
              "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
              __switch_canary                                              \
              "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index ad6df8c..d6ac42b 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -97,6 +97,9 @@ struct thread_info {
  #define TIF_SYSCALL_TRACEPOINT 28      /* syscall tracepoint instrumentation */
  #define TIF_ADDR32             29      /* 32-bit address space on 64 bits */
  #define TIF_X32                        30      /* 32-bit native x86-64 binary */
+#if defined(CONFIG_X86_XEN) && defined(CONFIG_CPU_SUP_AMD)
+#define TIF_CSTAR              31      /* cstar-based syscall (special handling) */
+#endif
  
  #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
  #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
@@ -120,6 +123,7 @@ struct thread_info {
  #define _TIF_SYSCALL_TRACEPOINT        (1 << TIF_SYSCALL_TRACEPOINT)
  #define _TIF_ADDR32            (1 << TIF_ADDR32)
  #define _TIF_X32               (1 << TIF_X32)
+#define _TIF_CSTAR             (1 << TIF_CSTAR)
  
  /* work to do in syscall_trace_enter() */
  #define _TIF_WORK_SYSCALL_ENTRY        \
@@ -147,9 +151,13 @@ struct thread_info {
          _TIF_USER_RETURN_NOTIFY)
  
  /* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
  #define _TIF_WORK_CTXSW                                                        \
         (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
  
+#else
+#define _TIF_WORK_CTXSW (_TIF_NOTSC /*todo | _TIF_BLOCKSTEP */)
+#endif
  #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
  #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
  
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h

index b9676ae..7fb31d4 100644 (file)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -30,7 +30,7 @@
  #  define ENABLE_TOPO_DEFINES
  # endif
  #else
-# ifdef CONFIG_SMP
+# if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
  #  define ENABLE_TOPO_DEFINES
  # endif
  #endif
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h

index feca311..0a55878 100644 (file)
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -1,4 +1,4 @@
-#ifndef _ASM_X86_TRAMPOLINE_H
+#if !defined(_ASM_X86_TRAMPOLINE_H) && !defined(CONFIG_XEN)
  #define _ASM_X86_TRAMPOLINE_H
  
  #ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h

index 88eae2a..6d16bc7 100644 (file)
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,6 +40,9 @@ asmlinkage void alignment_check(void);
  asmlinkage void machine_check(void);
  #endif /* CONFIG_X86_MCE */
  asmlinkage void simd_coprocessor_error(void);
+#ifdef CONFIG_X86_XEN
+asmlinkage void fixup_4gb_segment(void);
+#endif
  
  dotraplinkage void do_divide_error(struct pt_regs *, long);
  dotraplinkage void do_debug(struct pt_regs *, long);
@@ -68,6 +71,9 @@ dotraplinkage void do_machine_check(struct pt_regs *, long);
  dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
  #ifdef CONFIG_X86_32
  dotraplinkage void do_iret_error(struct pt_regs *, long);
+#ifdef CONFIG_XEN
+void do_fixup_4gb_segment(struct pt_regs *, long);
+#endif
  #endif
  
  static inline int get_si_code(unsigned long condition)
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h

new file mode 100644 (file)

index 0000000..d5a2411
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,163 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2009 Novell, Inc.
+ *     Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ */
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+
+struct unwind_frame_info
+{
+       struct pt_regs regs;
+       struct task_struct *task;
+       unsigned call_frame:1;
+};
+
+#define UNW_PC(frame)      (frame)->regs.ip
+#define UNW_SP(frame)      (frame)->regs.sp
+#ifdef CONFIG_FRAME_POINTER
+#define UNW_FP(frame)      (frame)->regs.bp
+#define FRAME_LINK_OFFSET  0
+#define STACK_BOTTOM(tsk)  STACK_LIMIT((tsk)->thread.sp0)
+#define TSK_STACK_TOP(tsk) ((tsk)->thread.sp0)
+#else
+#define UNW_FP(frame)      ((void)(frame), 0UL)
+#endif
+/* On x86-64, might need to account for the special exception and interrupt
+   handling stacks here, since normally
+       EXCEPTION_STACK_ORDER < THREAD_ORDER < IRQSTACK_ORDER,
+   but the construct is needed only for getting across the stack switch to
+   the interrupt stack - thus considering the IRQ stack itself is unnecessary,
+   and the overhead of comparing against all exception handling stacks seems
+   not desirable. */
+#define STACK_LIMIT(ptr)   (((ptr) - 1) & ~(THREAD_SIZE - 1))
+
+#ifdef CONFIG_X86_64
+
+#include <asm/vsyscall.h>
+
+#define FRAME_RETADDR_OFFSET 8
+
+#define UNW_REGISTER_INFO \
+       PTREGS_INFO(ax), \
+       PTREGS_INFO(dx), \
+       PTREGS_INFO(cx), \
+       PTREGS_INFO(bx), \
+       PTREGS_INFO(si), \
+       PTREGS_INFO(di), \
+       PTREGS_INFO(bp), \
+       PTREGS_INFO(sp), \
+       PTREGS_INFO(r8), \
+       PTREGS_INFO(r9), \
+       PTREGS_INFO(r10), \
+       PTREGS_INFO(r11), \
+       PTREGS_INFO(r12), \
+       PTREGS_INFO(r13), \
+       PTREGS_INFO(r14), \
+       PTREGS_INFO(r15), \
+       PTREGS_INFO(ip)
+
+#else /* X86_32 */
+
+#include <asm/fixmap.h>
+
+#define FRAME_RETADDR_OFFSET 4
+
+#define UNW_REGISTER_INFO \
+       PTREGS_INFO(ax), \
+       PTREGS_INFO(cx), \
+       PTREGS_INFO(dx), \
+       PTREGS_INFO(bx), \
+       PTREGS_INFO(sp), \
+       PTREGS_INFO(bp), \
+       PTREGS_INFO(si), \
+       PTREGS_INFO(di), \
+       PTREGS_INFO(ip)
+
+#endif
+
+#define UNW_DEFAULT_RA(raItem, dataAlign) \
+       ((raItem).where == Memory && \
+        !((raItem).value * (dataAlign) + sizeof(void *)))
+
+static inline void arch_unw_init_frame_info(struct unwind_frame_info *info,
+                                            /*const*/ struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+       info->regs = *regs;
+#else
+       if (user_mode_vm(regs))
+               info->regs = *regs;
+       else {
+               memcpy(&info->regs, regs, offsetof(struct pt_regs, sp));
+               info->regs.sp = (unsigned long)&regs->sp;
+               info->regs.ss = __KERNEL_DS;
+       }
+#endif
+}
+
+static inline void arch_unw_init_blocked(struct unwind_frame_info *info)
+{
+#ifdef CONFIG_X86_64
+       extern const char thread_return[];
+
+       memset(&info->regs, 0, sizeof(info->regs));
+       info->regs.ip = (unsigned long)thread_return;
+       info->regs.cs = __KERNEL_CS;
+       probe_kernel_address(info->task->thread.sp, info->regs.bp);
+       info->regs.sp = info->task->thread.sp;
+       info->regs.ss = __KERNEL_DS;
+#else
+       memset(&info->regs, 0, sizeof(info->regs));
+       info->regs.ip = info->task->thread.ip;
+       info->regs.cs = __KERNEL_CS;
+       probe_kernel_address(info->task->thread.sp, info->regs.bp);
+       info->regs.sp = info->task->thread.sp;
+       info->regs.ss = __KERNEL_DS;
+       info->regs.ds = __USER_DS;
+       info->regs.es = __USER_DS;
+#endif
+}
+
+extern asmlinkage int
+arch_unwind_init_running(struct unwind_frame_info *,
+                        unwind_callback_fn,
+                        const struct stacktrace_ops *, void *data);
+
+static inline int arch_unw_user_mode(/*const*/ struct unwind_frame_info *info)
+{
+#ifdef CONFIG_X86_64
+       return user_mode(&info->regs)
+              || (long)info->regs.ip >= 0
+              || (info->regs.ip >= VSYSCALL_START && info->regs.ip < VSYSCALL_END)
+              || (long)info->regs.sp >= 0;
+#else
+       return user_mode_vm(&info->regs)
+              || info->regs.ip < PAGE_OFFSET
+              || (info->regs.ip >= __fix_to_virt(FIX_VDSO)
+                  && info->regs.ip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
+              || info->regs.sp < PAGE_OFFSET;
+#endif
+}
+
+#else
+
+#define UNW_PC(frame) ((void)(frame), 0UL)
+#define UNW_SP(frame) ((void)(frame), 0UL)
+#define UNW_FP(frame) ((void)(frame), 0UL)
+
+static inline int arch_unw_user_mode(const void *info)
+{
+       return 0;
+}
+
+#endif
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h

index 21f7385..1c44df1 100644 (file)
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -11,7 +11,7 @@
  #ifndef _ASM_X86_UV_UV_HUB_H
  #define _ASM_X86_UV_UV_HUB_H
  
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_UV
  #include <linux/numa.h>
  #include <linux/percpu.h>
  #include <linux/timer.h>
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h

index 5728852..9219196 100644 (file)
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -48,6 +48,7 @@
  #include <xen/interface/sched.h>
  #include <xen/interface/physdev.h>
  #include <xen/interface/platform.h>
+#include <xen/interface/tmem.h>
  
  /*
   * The hypercall asms have to meet several constraints:
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h

index 66d0fff..41ff2bd 100644 (file)
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -58,7 +58,7 @@ static inline uint32_t xen_cpuid_base(void)
         return 0;
  }
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
  extern bool xen_hvm_need_lapic(void);
  
  static inline bool xen_x2apic_para_available(void)
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h

index cbf0c9d..40c95d2 100644 (file)
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -10,17 +10,20 @@
  #define _ASM_X86_XEN_INTERFACE_H
  
  #ifdef __XEN__
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
      typedef struct { type *p; } __guest_handle_ ## name
  #else
-#define __DEFINE_GUEST_HANDLE(name, type) \
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
      typedef type * __guest_handle_ ## name
  #endif
  
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
+    ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
+    ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
  #define DEFINE_GUEST_HANDLE_STRUCT(name) \
-       __DEFINE_GUEST_HANDLE(name, struct name)
-#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
-#define GUEST_HANDLE(name)        __guest_handle_ ## name
+       __DEFINE_XEN_GUEST_HANDLE(name, struct name)
+#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
  
  #ifdef __XEN__
  #if defined(__i386__)
@@ -47,16 +50,8 @@
  #endif
  
  #ifndef __ASSEMBLY__
-/* Guest handles for primitive C types. */
-__DEFINE_GUEST_HANDLE(uchar, unsigned char);
-__DEFINE_GUEST_HANDLE(uint,  unsigned int);
-__DEFINE_GUEST_HANDLE(ulong, unsigned long);
-DEFINE_GUEST_HANDLE(char);
-DEFINE_GUEST_HANDLE(int);
-DEFINE_GUEST_HANDLE(long);
-DEFINE_GUEST_HANDLE(void);
-DEFINE_GUEST_HANDLE(uint64_t);
-DEFINE_GUEST_HANDLE(uint32_t);
+typedef unsigned long xen_pfn_t;
+typedef unsigned long xen_ulong_t;
  #endif
  
  #ifndef HYPERVISOR_VIRT_START
@@ -68,7 +63,7 @@ DEFINE_GUEST_HANDLE(uint32_t);
  #define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
  
  /* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
+#define XEN_LEGACY_MAX_VCPUS 32
  
  /*
   * SEGMENT DESCRIPTOR TABLES
diff --git a/arch/x86/include/mach-xen/asm/agp.h b/arch/x86/include/mach-xen/asm/agp.h

new file mode 100644 (file)

index 0000000..45ba49d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/agp.h
@@ -0,0 +1,58 @@
+#ifndef _ASM_X86_AGP_H
+#define _ASM_X86_AGP_H
+
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
+#include <asm/special_insns.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU. The
+ * GART gives the CPU a physical alias of pages in memory. The alias
+ * region is mapped uncacheable. Make sure there are no conflicting
+ * mappings with different cacheability attributes for the same
+ * page. This avoids data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) ( \
+       xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \
+       ?: set_pages_uc(page, 1))
+#define unmap_page_from_agp(page) ( \
+       xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \
+       /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+       set_pages_wb(page, 1))
+
+#define map_pages_into_agp(pages, nr) ({ \
+       __typeof__(nr) n__; \
+       int rc__ = 0; \
+       for (n__ = 0; n__ < (nr) && !rc__; ++n__) \
+               rc__ = xen_create_contiguous_region( \
+                       (unsigned long)page_address((pages)[n__]), 0, 32); \
+       rc__ ?: set_pages_array_uc(pages, nr); \
+})
+#define unmap_pages_from_agp(pages, nr) ({ \
+       __typeof__(nr) n__; \
+       for (n__ = 0; n__ < (nr); ++n__) \
+               xen_destroy_contiguous_region( \
+                       (unsigned long)page_address((pages)[n__]), 0); \
+       /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \
+       set_pages_array_wb(pages, nr); \
+})
+
+/*
+ * Could use CLFLUSH here if the cpu supports it. But then it would
+ * need to be called for each cacheline of the whole page so it may
+ * not be worth it. Would need a page for it.
+ */
+#define flush_agp_cache() wbinvd()
+
+#define virt_to_gart virt_to_machine
+
+/* GATT allocation. Returns/accepts GATT kernel virtual address. */
+#define alloc_gatt_pages(order)        ({                                          \
+       char *_t; dma_addr_t _d;                                            \
+       _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL);    \
+       _t; })
+#define free_gatt_pages(table, order)  \
+       dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table))
+
+#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/mach-xen/asm/cmpxchg.h b/arch/x86/include/mach-xen/asm/cmpxchg.h

new file mode 100644 (file)

index 0000000..17fde1d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/cmpxchg.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_XEN_CMPXCHG_H
+#define _ASM_X86_XEN_CMPXCHG_H
+
+#include_next <asm/cmpxchg.h>
+#ifdef CONFIG_X86_32
+# include "cmpxchg_32.h"
+#else
+# include "cmpxchg_64.h"
+#endif
+
+#endif /* _ASM_X86_XEN_CMPXCHG_H */
diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_32.h b/arch/x86/include/mach-xen/asm/cmpxchg_32.h

new file mode 100644 (file)

index 0000000..9effb00
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/cmpxchg_32.h
@@ -0,0 +1,24 @@
+#ifndef _ASM_X86_XEN_CMPXCHG_32_H
+#define _ASM_X86_XEN_CMPXCHG_32_H
+
+static inline u64 get_64bit(const volatile u64 *ptr)
+{
+       u64 res;
+       __asm__("movl %%ebx,%%eax\n"
+               "movl %%ecx,%%edx\n"
+               LOCK_PREFIX "cmpxchg8b %1"
+               : "=&A" (res) : "m" (*ptr));
+       return res;
+}
+
+static inline u64 get_64bit_local(const volatile u64 *ptr)
+{
+       u64 res;
+       __asm__("movl %%ebx,%%eax\n"
+               "movl %%ecx,%%edx\n"
+               "cmpxchg8b %1"
+               : "=&A" (res) : "m" (*ptr));
+       return res;
+}
+
+#endif /* _ASM_X86_XEN_CMPXCHG_32_H */
diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_64.h b/arch/x86/include/mach-xen/asm/cmpxchg_64.h

new file mode 100644 (file)

index 0000000..092b27b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/cmpxchg_64.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_X86_XEN_CMPXCHG_64_H
+#define _ASM_X86_XEN_CMPXCHG_64_H
+
+static inline u64 get_64bit(const volatile u64 *ptr)
+{
+       return *ptr;
+}
+
+#define get_64bit_local get_64bit
+
+#endif /* _ASM_X86_XEN_CMPXCHG_64_H */
diff --git a/arch/x86/include/mach-xen/asm/desc.h b/arch/x86/include/mach-xen/asm/desc.h

new file mode 100644 (file)

index 0000000..14862a0
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/desc.h
@@ -0,0 +1,433 @@
+#ifndef _ASM_X86_DESC_H
+#define _ASM_X86_DESC_H
+
+#include <asm/desc_defs.h>
+#include <asm/ldt.h>
+#include <asm/mmu.h>
+
+#include <linux/smp.h>
+
+static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
+{
+       desc->limit0            = info->limit & 0x0ffff;
+
+       desc->base0             = (info->base_addr & 0x0000ffff);
+       desc->base1             = (info->base_addr & 0x00ff0000) >> 16;
+
+       desc->type              = (info->read_exec_only ^ 1) << 1;
+       desc->type             |= info->contents << 2;
+
+       desc->s                 = 1;
+       desc->dpl               = 0x3;
+       desc->p                 = info->seg_not_present ^ 1;
+       desc->limit             = (info->limit & 0xf0000) >> 16;
+       desc->avl               = info->useable;
+       desc->d                 = info->seg_32bit;
+       desc->g                 = info->limit_in_pages;
+
+       desc->base2             = (info->base_addr & 0xff000000) >> 24;
+       /*
+        * Don't allow setting of the lm bit. It would confuse
+        * user_64bit_mode and would get overridden by sysret anyway.
+        */
+       desc->l                 = 0;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+extern struct desc_ptr idt_descr;
+extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
+#endif
+
+struct gdt_page {
+       struct desc_struct gdt[GDT_ENTRIES];
+} __attribute__((aligned(PAGE_SIZE)));
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+       return per_cpu(gdt_page, cpu).gdt;
+}
+
+#ifdef CONFIG_X86_64
+
+static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
+                            unsigned dpl, unsigned ist, unsigned seg)
+{
+       gate->offset_low        = PTR_LOW(func);
+       gate->segment           = __KERNEL_CS;
+       gate->ist               = ist;
+       gate->p                 = 1;
+       gate->dpl               = dpl;
+       gate->zero0             = 0;
+       gate->zero1             = 0;
+       gate->type              = type;
+       gate->offset_middle     = PTR_MIDDLE(func);
+       gate->offset_high       = PTR_HIGH(func);
+}
+
+#else
+static inline void pack_gate(gate_desc *gate, unsigned char type,
+                            unsigned long base, unsigned dpl, unsigned flags,
+                            unsigned short seg)
+{
+       gate->a = (seg << 16) | (base & 0xffff);
+       gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
+}
+
+#endif
+
+static inline int desc_empty(const void *ptr)
+{
+       const u32 *word = ptr;  /* the two 32-bit words that get inspected */
+
+       return word[0] == 0 && word[1] == 0;
+}
+
+#ifndef CONFIG_XEN
+#define load_TR_desc()                         native_load_tr_desc()
+#define load_gdt(dtr)                          native_load_gdt(dtr)
+#define load_idt(dtr)                          native_load_idt(dtr)
+#define load_tr(tr)                            asm volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt)                          asm volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr)                         native_store_gdt(dtr)
+#define store_idt(dtr)                         native_store_idt(dtr)
+#define store_tr(tr)                           (tr = native_store_tr())
+
+#define load_TLS(t, cpu)                       native_load_tls(t, cpu)
+#define set_ldt                                        native_set_ldt
+
+#define write_ldt_entry(dt, entry, desc)       native_write_ldt_entry(dt, entry, desc)
+#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
+#define write_idt_entry(dt, entry, g)          native_write_idt_entry(dt, entry, g)
+
+static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
+{
+}
+
+#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
+
+static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
+{
+       memcpy(&idt[entry], gate, sizeof(*gate));
+}
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+{
+       memcpy(&ldt[entry], desc, 8);
+}
+
+static inline void
+native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
+{
+       unsigned int size;
+
+       switch (type) {
+       case DESC_TSS:  size = sizeof(tss_desc);        break;
+       case DESC_LDT:  size = sizeof(ldt_desc);        break;
+       default:        size = sizeof(*gdt);            break;
+       }
+
+       memcpy(&gdt[entry], desc, size);
+}
+#endif
+
+static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
+                                  unsigned long limit, unsigned char type,
+                                  unsigned char flags)
+{
+       desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
+       desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+               (limit & 0x000f0000) | ((type & 0xff) << 8) |
+               ((flags & 0xf) << 20);
+       desc->p = 1;
+}
+
+
+#ifndef CONFIG_XEN
+static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
+{
+#ifdef CONFIG_X86_64
+       struct ldttss_desc64 *desc = d;
+
+       memset(desc, 0, sizeof(*desc));
+
+       desc->limit0            = size & 0xFFFF;
+       desc->base0             = PTR_LOW(addr);
+       desc->base1             = PTR_MIDDLE(addr) & 0xFF;
+       desc->type              = type;
+       desc->p                 = 1;
+       desc->limit1            = (size >> 16) & 0xF;
+       desc->base2             = (PTR_MIDDLE(addr) >> 8) & 0xFF;
+       desc->base3             = PTR_HIGH(addr);
+#else
+       pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
+#endif
+}
+
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+{
+       struct desc_struct *d = get_cpu_gdt_table(cpu);
+       tss_desc tss;
+
+       /*
+        * The sizeof(unsigned long) term comes from an extra "long" at
+        * the end of the iobitmap; see the tss_struct definition in
+        * processor.h.
+        * The -1 is needed because seg base+limit should be pointing to
+        * the address of the last valid byte.
+        */
+       set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
+                             IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
+                             sizeof(unsigned long) - 1);
+       write_gdt_entry(d, entry, &tss, DESC_TSS);
+}
+
+#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
+static inline void native_set_ldt(const void *addr, unsigned int entries)
+{
+       if (likely(entries == 0))
+               asm volatile("lldt %w0"::"q" (0));
+       else {
+               unsigned cpu = smp_processor_id();
+               ldt_desc ldt;
+
+               set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
+                                     entries * LDT_ENTRY_SIZE - 1);
+               write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+                               &ldt, DESC_LDT);
+               asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+       }
+}
+
+static inline void native_load_tr_desc(void)
+{
+       asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+       asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+       asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+       asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+       asm volatile("sidt %0":"=m" (*dtr));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+       unsigned long tr;
+
+       asm volatile("str %0":"=r" (tr));
+
+       return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+       unsigned int i;
+
+       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+               gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
+#else
+#include <asm/pgtable.h>
+
+#define load_TLS(t, cpu) xen_load_tls(t, cpu)
+#define set_ldt xen_set_ldt
+
+extern int write_ldt_entry(struct desc_struct *ldt, int entry,
+                          const void *desc);
+extern int write_gdt_entry(struct desc_struct *gdt, int entry,
+                          const void *desc, int type);
+
+static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       unsigned int i;
+       struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
+
+       for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+               if (HYPERVISOR_update_descriptor(
+                               arbitrary_virt_to_machine(&gdt[i]),
+                               *(u64 *)&t->tls_array[i]))
+                       BUG();
+}
+#endif
+
+#define _LDT_empty(info)                               \
+       ((info)->base_addr              == 0    &&      \
+        (info)->limit                  == 0    &&      \
+        (info)->contents               == 0    &&      \
+        (info)->read_exec_only         == 1    &&      \
+        (info)->seg_32bit              == 0    &&      \
+        (info)->limit_in_pages         == 0    &&      \
+        (info)->seg_not_present        == 1    &&      \
+        (info)->useable                == 0)
+
+#ifdef CONFIG_X86_64
+#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
+#else
+#define LDT_empty(info) (_LDT_empty(info))
+#endif
+
+static inline void clear_LDT(void)
+{
+       set_ldt(NULL, 0);
+}
+
+/*
+ * load one particular LDT into the current CPU
+ */
+static inline void load_LDT_nolock(mm_context_t *pc)
+{
+       set_ldt(pc->ldt, pc->size);
+}
+
+static inline void load_LDT(mm_context_t *pc)
+{
+       preempt_disable();
+       load_LDT_nolock(pc);
+       preempt_enable();
+}
+
+static inline unsigned long get_desc_base(const struct desc_struct *desc)
+{
+       return desc->base0 | ((u32)desc->base1 << 16) | ((u32)desc->base2 << 24);
+}
+
+static inline void set_desc_base(struct desc_struct *desc, unsigned long base)
+{
+       desc->base0 = base & 0xffff;
+       desc->base1 = (base >> 16) & 0xff;
+       desc->base2 = (base >> 24) & 0xff;
+}
+
+static inline unsigned long get_desc_limit(const struct desc_struct *desc)
+{
+       return desc->limit0 | (desc->limit << 16);
+}
+
+static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
+{
+       desc->limit0 = limit & 0xffff;
+       desc->limit = (limit >> 16) & 0xf;
+}
+
+#ifndef CONFIG_X86_NO_IDT
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+       gate_desc s;
+
+       pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+       write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
+static inline void _set_gate(int gate, unsigned type, void *addr,
+                            unsigned dpl, unsigned ist, unsigned seg)
+{
+       gate_desc s;
+
+       pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
+       /*
+        * does not need to be atomic because it is only done once at
+        * setup time
+        */
+       write_idt_entry(idt_table, gate, &s);
+}
+
+/*
+ * This needs to use 'idt_table' rather than 'idt', and
+ * thus use the _nonmapped_ version of the IDT, as the
+ * Pentium F0 0F bugfix can have resulted in the mapped
+ * IDT being write-protected.
+ */
+static inline void set_intr_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
+}
+
+extern int first_system_vector;
+/* used_vectors is a bitmap of the irqs not managed by percpu vector_irq */
+extern unsigned long used_vectors[];
+
+static inline void alloc_system_vector(int vector)
+{
+       if (!test_bit(vector, used_vectors)) {
+               set_bit(vector, used_vectors);
+               if (first_system_vector > vector)
+                       first_system_vector = vector;
+       } else {
+               BUG();
+       }
+}
+
+static inline void alloc_intr_gate(unsigned int n, void *addr)
+{
+       alloc_system_vector(n);
+       set_intr_gate(n, addr);
+}
+
+/*
+ * Install an interrupt gate for vector n with descriptor privilege level 3.
+ */
+static inline void set_system_intr_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_system_trap_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
+}
+
+static inline void set_trap_gate(unsigned int n, void *addr)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
+}
+
+static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
+}
+
+static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
+}
+
+static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
+{
+       BUG_ON((unsigned)n > 0xFF);
+       _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
+}
+#endif
+
+#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/mach-xen/asm/dma-mapping.h b/arch/x86/include/mach-xen/asm/dma-mapping.h

new file mode 100644 (file)

index 0000000..17ca77c
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/dma-mapping.h
@@ -0,0 +1,26 @@
+#ifndef _ASM_X86_DMA_MAPPING_H_
+
+#define phys_to_dma _phys_to_dma_
+#define dma_to_phys _dma_to_phys_
+
+#include_next <asm/dma-mapping.h>
+
+#undef phys_to_dma
+#undef dma_to_phys
+
+static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
+{
+       return phys_to_machine(paddr);
+}
+
+static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+{
+       return machine_to_phys(daddr);
+}
+
+void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t,
+                              struct dma_attrs *);
+
+extern int range_straddles_page_boundary(paddr_t p, size_t size);
+
+#endif /* _ASM_X86_DMA_MAPPING_H_ */
diff --git a/arch/x86/include/mach-xen/asm/fixmap.h b/arch/x86/include/mach-xen/asm/fixmap.h

new file mode 100644 (file)

index 0000000..dccdd97
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/fixmap.h
@@ -0,0 +1,240 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009
+ */
+
+#ifndef _ASM_X86_FIXMAP_H
+#define _ASM_X86_FIXMAP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/page.h>
+#ifdef CONFIG_X86_32
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#else
+#include <asm/vsyscall.h>
+#endif
+
+/*
+ * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall
+ * uses fixmaps that relies on FIXADDR_TOP for proper address calculation.
+ * Because of this, FIXADDR_TOP x86 integration was left as later work.
+ */
+#ifdef CONFIG_X86_32
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
+
+#define FIXADDR_USER_START     __fix_to_virt(FIX_VDSO)
+#define FIXADDR_USER_END       __fix_to_virt(FIX_VDSO - 1)
+#else
+#define FIXADDR_TOP    (VSYSCALL_END-PAGE_SIZE)
+
+/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
+#define FIXADDR_USER_START     ((unsigned long)VSYSCALL32_VSYSCALL)
+#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
+#endif
+
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process.
+ * for x86_32: We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * These 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages (or larger if used with an increment
+ * higher than 1). Use set_fixmap(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+#ifdef CONFIG_X86_32
+       FIX_HOLE,
+       FIX_VDSO,
+#else
+       VSYSCALL_LAST_PAGE,
+       VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
+                           + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+       VVAR_PAGE,
+       VSYSCALL_HPET,
+#endif
+       FIX_DBGP_BASE,
+       FIX_EARLYCON_MEM_BASE,
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       FIX_OHCI1394_BASE,
+#endif
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_LOCAL_APIC
+       FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+       FIX_IO_APIC_BASE_0,
+       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
+#endif
+#else
+       FIX_SHARED_INFO,
+#define NR_FIX_ISAMAPS 256
+       FIX_ISAMAP_END,
+       FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+       FIX_CO_CPU,     /* Cobalt timer */
+       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */
+       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
+       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+       FIX_F00F_IDT,   /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+       FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif
+#ifdef CONFIG_X86_32
+       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#ifdef CONFIG_PCI_MMCONFIG
+       FIX_PCIE_MCFG,
+#endif
+#endif
+#ifdef CONFIG_PARAVIRT
+       FIX_PARAVIRT_BOOTMAP,
+#endif
+       FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
+       FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
+#ifdef CONFIG_X86_INTEL_MID
+       FIX_LNW_VRTC,
+#endif
+       __end_of_permanent_fixed_addresses,
+
+       /*
+        * 256 temporary boot-time mappings, used by early_ioremap(),
+        * before ioremap() is functional.
+        *
+        * If necessary we round it up to the next 256 pages boundary so
+        * that we can have a single pgd entry and a single pte table:
+        */
+#define NR_FIX_BTMAPS          64
+#define FIX_BTMAPS_SLOTS       4
+#define TOTAL_FIX_BTMAPS       (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
+       FIX_BTMAP_END =
+        (__end_of_permanent_fixed_addresses ^
+         (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
+        -PTRS_PER_PTE
+        ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
+          (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
+        : __end_of_permanent_fixed_addresses,
+       FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
+#ifdef CONFIG_X86_32
+       FIX_WP_TEST,
+#endif
+#ifdef CONFIG_INTEL_TXT
+       FIX_TBOOT_BASE,
+#endif
+       __end_of_fixed_addresses
+};
+
+
+extern void reserve_top_address(unsigned long reserve);
+
+#define FIXADDR_SIZE   (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_BOOT_SIZE      (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START          (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_BOOT_START     (FIXADDR_TOP - FIXADDR_BOOT_SIZE)
+
+extern int fixmaps_set;
+
+extern pte_t *kmap_pte;
+extern pgprot_t kmap_prot;
+extern pte_t *pkmap_page_table;
+
+void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t);
+
+static inline void __set_fixmap(enum fixed_addresses idx,
+                               phys_addr_t phys, pgprot_t flags)
+{
+       xen_set_fixmap(idx, phys, flags);
+}
+
+#define set_fixmap(idx, phys)                          \
+       __set_fixmap(idx, phys, PAGE_KERNEL)
+
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys)                  \
+       __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx)                      \
+       __set_fixmap(idx, 0, __pgprot(0))
+
+#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+       /*
+        * this branch gets completely eliminated after inlining,
+        * except when someone tries to use fixaddr indices in an
+        * illegal way. (such as mixing up address types or using
+        * out-of-range indices).
+        *
+        * If it doesn't get removed, the linker will complain
+        * loudly with a reasonably clear error message.
+        */
+       if (idx >= __end_of_fixed_addresses)
+               __this_fixmap_does_not_exist();
+
+       return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+       return __virt_to_fix(vaddr);
+}
+
+/* Set the fixmap, then return its virtual address plus the page offset of phys */
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+       __set_fixmap(idx, phys, flags);
+       return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
+}
+
+#define set_fixmap_offset(idx, phys)                   \
+       __set_fixmap_offset(idx, phys, PAGE_KERNEL)
+
+#define set_fixmap_offset_nocache(idx, phys)                   \
+       __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#endif /* !__ASSEMBLY__ */
+#endif /* _ASM_X86_FIXMAP_H */
diff --git a/arch/x86/include/mach-xen/asm/fpu-internal.h b/arch/x86/include/mach-xen/asm/fpu-internal.h

new file mode 100644 (file)

index 0000000..becacb6
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/fpu-internal.h
@@ -0,0 +1,54 @@
+#ifndef _FPU_INTERNAL_H
+#define switch_fpu_prepare native_switch_fpu_prepare
+#include <asm/i387.h>
+#include_next <asm/fpu-internal.h>
+
+static inline void xen_thread_fpu_begin(struct task_struct *tsk,
+                                       multicall_entry_t *mcl)
+{
+       if (mcl) {
+               mcl->op = __HYPERVISOR_fpu_taskswitch;
+               mcl->args[0] = 0;
+       }
+       __thread_set_has_fpu(tsk);
+}
+
+static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old,
+                                                 struct task_struct *new,
+                                                 int cpu,
+                                                 multicall_entry_t **mcl)
+{
+       fpu_switch_t fpu;
+
+       fpu.preload = tsk_used_math(new) && new->fpu_counter > 5;
+       if (__thread_has_fpu(old)) {
+               if (!__save_init_fpu(old))
+                       cpu = ~0;
+               old->thread.fpu.last_cpu = cpu;
+               old->thread.fpu.has_fpu = 0;    /* But leave fpu_owner_task! */
+
+               /* Don't change CR0.TS if we just switch! */
+               if (fpu.preload) {
+                       new->fpu_counter++;
+                       __thread_set_has_fpu(new);
+                       prefetch(new->thread.fpu.state);
+               } else {
+                       (*mcl)->op = __HYPERVISOR_fpu_taskswitch;
+                       (*mcl)++->args[0] = 1;
+               }
+       } else {
+               old->fpu_counter = 0;
+               old->thread.fpu.last_cpu = ~0;
+               if (fpu.preload) {
+                       new->fpu_counter++;
+                       if (fpu_lazy_restore(new, cpu))
+                               fpu.preload = 0;
+                       else
+                               prefetch(new->thread.fpu.state);
+                       xen_thread_fpu_begin(new, (*mcl)++);
+               }
+       }
+       return fpu;
+}
+
+#endif
diff --git a/arch/x86/include/mach-xen/asm/gnttab_dma.h b/arch/x86/include/mach-xen/asm/gnttab_dma.h

new file mode 100644 (file)

index 0000000..fd7197c
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/gnttab_dma.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
+ * Copyright (c) 2007 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _ASM_I386_GNTTAB_DMA_H
+#define _ASM_I386_GNTTAB_DMA_H
+
+static inline int gnttab_dma_local_pfn(struct page *page)
+{
+       /* Has it become a local MFN? */
+       return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page))));
+}
+
+static inline maddr_t gnttab_dma_map_page(struct page *page)
+{
+       __gnttab_dma_map_page(page);
+       return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT);
+}
+
+static inline void gnttab_dma_unmap_page(maddr_t maddr)
+{
+       __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr)));
+}
+
+#endif /* _ASM_I386_GNTTAB_DMA_H */
diff --git a/arch/x86/include/mach-xen/asm/highmem.h b/arch/x86/include/mach-xen/asm/highmem.h

new file mode 100644 (file)

index 0000000..0b43fd4
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/highmem.h
@@ -0,0 +1,98 @@
+/*
+ * highmem.h: virtual kernel memory mappings for high memory
+ *
+ * Used in CONFIG_HIGHMEM systems for memory pages which
+ * are not addressable by direct kernel virtual addresses.
+ *
+ * Copyright (C) 1999 Gerhard Wichert, Siemens AG
+ *                   Gerhard.Wichert@pdb.siemens.de
+ *
+ *
+ * Redesigned the x86 32-bit VM architecture to deal with
+ * up to 16 Terabyte physical memory. With current x86 CPUs
+ * we now support up to 64 Gigabytes physical RAM.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#ifndef _ASM_X86_HIGHMEM_H
+#define _ASM_X86_HIGHMEM_H
+
+#ifdef __KERNEL__
+
+#include <linux/interrupt.h>
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+
+/* declarations for highmem.c */
+extern unsigned long highstart_pfn, highend_pfn;
+
+/*
+ * Right now we initialize only a single pte table. It can be extended
+ * easily, subsequent pte tables have to be allocated in one physical
+ * chunk of RAM.
+ */
+/*
+ * Ordering is:
+ *
+ * FIXADDR_TOP
+ *                     fixed_addresses
+ * FIXADDR_START
+ *                     temp fixed addresses
+ * FIXADDR_BOOT_START
+ *                     Persistent kmap area
+ * PKMAP_BASE
+ * VMALLOC_END
+ *                     Vmalloc area
+ * VMALLOC_START
+ * high_memory
+ */
+#define LAST_PKMAP_MASK (LAST_PKMAP-1)
+#define PKMAP_NR(virt)  (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
+#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
+
+extern void *kmap_high(struct page *page);
+extern void kunmap_high(struct page *page);
+
+void *kmap(struct page *page);
+void kunmap(struct page *page);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+struct page *kmap_atomic_to_page(void *ptr);
+
+#define kmap_atomic_pte(page) \
+       kmap_atomic_prot(page, \
+                        PagePinned(page) ? PAGE_KERNEL_RO : kmap_prot)
+
+#define flush_cache_kmaps()    do { } while (0)
+
+extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
+                                       unsigned long end_pfn);
+
+void clear_highpage(struct page *);
+static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+       clear_highpage(page);
+}
+#define __HAVE_ARCH_CLEAR_HIGHPAGE
+#define clear_user_highpage clear_user_highpage
+#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE
+
+void copy_highpage(struct page *to, struct page *from);
+static inline void copy_user_highpage(struct page *to, struct page *from,
+       unsigned long vaddr, struct vm_area_struct *vma)
+{
+       copy_highpage(to, from);
+}
+#define __HAVE_ARCH_COPY_HIGHPAGE
+#define __HAVE_ARCH_COPY_USER_HIGHPAGE
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_HIGHMEM_H */
diff --git a/arch/x86/include/mach-xen/asm/hypercall.h b/arch/x86/include/mach-xen/asm/hypercall.h

new file mode 100644 (file)

index 0000000..573ce8d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall.h
@@ -0,0 +1,439 @@
+/******************************************************************************
+ * hypercall.h
+ *
+ * Linux-specific hypervisor handling.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * 64-bit updates:
+ *   Benjamin Liu <benjamin.liu@intel.com>
+ *   Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+
+#ifndef __HYPERVISOR_H__
+# error "please don't include this file directly"
+#endif
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+# include <xen/interface/platform.h>
+# include <xen/interface/arch-x86/xen-mca.h>
+#endif
+#if CONFIG_XEN_COMPAT <= 0x030002
+# include <linux/string.h> /* memcpy() */
+# include <xen/interface/event_channel.h>
+# include <xen/interface/physdev.h>
+#endif
+
+#ifdef CONFIG_XEN
+#define HYPERCALL_ASM_OPERAND "%c"
+#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32)
+#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
+#else
+#define HYPERCALL_ASM_OPERAND "*%"
+#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32)
+#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name))
+#endif
+
+#define HYPERCALL_ARG(arg, n) \
+       register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg)
+
+#define _hypercall0(type, name)                                        \
+({                                                             \
+       type __res;                                             \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "1"               \
+               : "=a" (__res)                                  \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall1(type, name, arg)                           \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(arg, 1);                                  \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "2"               \
+               : "=a" (__res), "+r" (__arg1)                   \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall2(type, name, a1, a2)                                \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(a1, 1);                                   \
+       HYPERCALL_ARG(a2, 2);                                   \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "3"               \
+               : "=a" (__res), "+r" (__arg1), "+r" (__arg2)    \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall3(type, name, a1, a2, a3)                    \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(a1, 1);                                   \
+       HYPERCALL_ARG(a2, 2);                                   \
+       HYPERCALL_ARG(a3, 3);                                   \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "4"               \
+               : "=a" (__res), "+r" (__arg1),                  \
+                 "+r" (__arg2), "+r" (__arg3)                  \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall4(type, name, a1, a2, a3, a4)                        \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(a1, 1);                                   \
+       HYPERCALL_ARG(a2, 2);                                   \
+       HYPERCALL_ARG(a3, 3);                                   \
+       HYPERCALL_ARG(a4, 4);                                   \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "5"               \
+               : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
+                 "+r" (__arg3), "+r" (__arg4)                  \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall5(type, name, a1, a2, a3, a4, a5)            \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(a1, 1);                                   \
+       HYPERCALL_ARG(a2, 2);                                   \
+       HYPERCALL_ARG(a3, 3);                                   \
+       HYPERCALL_ARG(a4, 4);                                   \
+       HYPERCALL_ARG(a5, 5);                                   \
+       asm volatile (                                          \
+               "call " HYPERCALL_ASM_OPERAND "6"               \
+               : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
+                 "+r" (__arg3), "+r" (__arg4), "+r" (__arg5)   \
+               : HYPERCALL_C_OPERAND(name)                     \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#define _hypercall(type, op, a1, a2, a3, a4, a5)               \
+({                                                             \
+       type __res;                                             \
+       HYPERCALL_ARG(a1, 1);                                   \
+       HYPERCALL_ARG(a2, 2);                                   \
+       HYPERCALL_ARG(a3, 3);                                   \
+       HYPERCALL_ARG(a4, 4);                                   \
+       HYPERCALL_ARG(a5, 5);                                   \
+       asm volatile (                                          \
+               "call *%6"                                      \
+               : "=a" (__res), "+r" (__arg1), "+r" (__arg2),   \
+                 "+r" (__arg3), "+r" (__arg4), "+r" (__arg5)   \
+               : "g" (HYPERCALL_LOCATION(op))                  \
+               : "memory" );                                   \
+       __res;                                                  \
+})
+
+#ifdef CONFIG_X86_32
+# include "hypercall_32.h"
+#else
+# include "hypercall_64.h"
+#endif
+
+static inline int __must_check
+HYPERVISOR_set_trap_table(
+       const trap_info_t *table)
+{
+       return _hypercall1(int, set_trap_table, table);
+}
+
+static inline int __must_check
+HYPERVISOR_mmu_update(
+       mmu_update_t *req, unsigned int count, unsigned int *success_count,
+       domid_t domid)
+{
+       if (arch_use_lazy_mmu_mode())
+               return xen_multi_mmu_update(req, count, success_count, domid);
+       return _hypercall4(int, mmu_update, req, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_mmuext_op(
+       struct mmuext_op *op, unsigned int count, unsigned int *success_count,
+       domid_t domid)
+{
+       if (arch_use_lazy_mmu_mode())
+               return xen_multi_mmuext_op(op, count, success_count, domid);
+       return _hypercall4(int, mmuext_op, op, count, success_count, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_gdt(
+       unsigned long *frame_list, unsigned int entries)
+{
+       return _hypercall2(int, set_gdt, frame_list, entries);
+}
+
+static inline int __must_check
+HYPERVISOR_stack_switch(
+       unsigned long ss, unsigned long esp)
+{
+       return _hypercall2(int, stack_switch, ss, esp);
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+       int set)
+{
+       return _hypercall1(int, fpu_taskswitch, set);
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_sched_op_compat(
+       int cmd, unsigned long arg)
+{
+       return _hypercall2(int, sched_op_compat, cmd, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_sched_op(
+       int cmd, void *arg)
+{
+       return _hypercall2(int, sched_op, cmd, arg);
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static inline int __must_check
+HYPERVISOR_platform_op(
+       struct xen_platform_op *platform_op)
+{
+       platform_op->interface_version = XENPF_INTERFACE_VERSION;
+       return _hypercall1(int, platform_op, platform_op);
+}
+
+static inline int __must_check
+HYPERVISOR_mca(
+       struct xen_mc *mc_op)
+{
+       mc_op->interface_version = XEN_MCA_INTERFACE_VERSION;
+       return _hypercall1(int, mca, mc_op);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_set_debugreg(
+       unsigned int reg, unsigned long value)
+{
+       return _hypercall2(int, set_debugreg, reg, value);
+}
+
+static inline unsigned long __must_check
+HYPERVISOR_get_debugreg(
+       unsigned int reg)
+{
+       return _hypercall1(unsigned long, get_debugreg, reg);
+}
+
+static inline int __must_check
+HYPERVISOR_memory_op(
+       unsigned int cmd, void *arg)
+{
+       if (arch_use_lazy_mmu_mode())
+               xen_multicall_flush();
+       return _hypercall2(int, memory_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_multicall(
+       multicall_entry_t *call_list, unsigned int nr_calls)
+{
+       return _hypercall2(int, multicall, call_list, nr_calls);
+}
+
+static inline int __must_check
+HYPERVISOR_event_channel_op(
+       int cmd, void *arg)
+{
+       int rc = _hypercall2(int, event_channel_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               struct evtchn_op op;
+               op.cmd = cmd;
+               memcpy(&op.u, arg, sizeof(op.u));
+               rc = _hypercall1(int, event_channel_op_compat, &op);
+               memcpy(arg, &op.u, sizeof(op.u));
+       }
+#endif
+
+       return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_xen_version(
+       int cmd, void *arg)
+{
+       return _hypercall2(int, xen_version, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_console_io(
+       int cmd, unsigned int count, char *str)
+{
+       return _hypercall3(int, console_io, cmd, count, str);
+}
+
+static inline int __must_check
+HYPERVISOR_physdev_op(
+       int cmd, void *arg)
+{
+       int rc = _hypercall2(int, physdev_op, cmd, arg);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               struct physdev_op op;
+               op.cmd = cmd;
+               memcpy(&op.u, arg, sizeof(op.u));
+               rc = _hypercall1(int, physdev_op_compat, &op);
+               memcpy(arg, &op.u, sizeof(op.u));
+       }
+#endif
+
+       return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_grant_table_op(
+       unsigned int cmd, void *uop, unsigned int count)
+{
+       bool fixup = false;
+       int rc;
+
+       if (arch_use_lazy_mmu_mode())
+               xen_multicall_flush();
+#ifdef GNTTABOP_map_grant_ref
+       if (cmd == GNTTABOP_map_grant_ref)
+#endif
+               fixup = gnttab_pre_map_adjust(cmd, uop, count);
+       rc = _hypercall3(int, grant_table_op, cmd, uop, count);
+       if (rc == 0 && fixup)
+               rc = gnttab_post_map_adjust(uop, count);
+       return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_vm_assist(
+       unsigned int cmd, unsigned int type)
+{
+       return _hypercall2(int, vm_assist, cmd, type);
+}
+
+static inline int __must_check
+HYPERVISOR_vcpu_op(
+       int cmd, unsigned int vcpuid, void *extra_args)
+{
+       return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
+}
+
+static inline int __must_check
+HYPERVISOR_suspend(
+       unsigned long srec)
+{
+       struct sched_shutdown sched_shutdown = {
+               .reason = SHUTDOWN_suspend
+       };
+
+       int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown,
+                            &sched_shutdown, srec);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (rc == -ENOSYS)
+               rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown,
+                                SHUTDOWN_suspend, srec);
+#endif
+
+       return rc;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int
+HYPERVISOR_nmi_op(
+       unsigned long op, void *arg)
+{
+       return _hypercall2(int, nmi_op, op, arg);
+}
+#endif
+
+#ifndef CONFIG_XEN
+static inline unsigned long __must_check
+HYPERVISOR_hvm_op(
+    int op, void *arg)
+{
+    return _hypercall2(unsigned long, hvm_op, op, arg);
+}
+#endif
+
+static inline int __must_check
+HYPERVISOR_callback_op(
+       int cmd, const void *arg)
+{
+       return _hypercall2(int, callback_op, cmd, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_xenoprof_op(
+       int op, void *arg)
+{
+       return _hypercall2(int, xenoprof_op, op, arg);
+}
+
+static inline int __must_check
+HYPERVISOR_kexec_op(
+       unsigned long op, void *args)
+{
+       return _hypercall2(int, kexec_op, op, args);
+}
+
+struct tmem_op;
+
+static inline int __must_check
+HYPERVISOR_tmem_op(
+       struct tmem_op *op)
+{
+       return _hypercall1(int, tmem_op, (void *)op);
+}
+
+#endif /* __HYPERCALL_H__ */
diff --git a/arch/x86/include/mach-xen/asm/hypercall_32.h b/arch/x86/include/mach-xen/asm/hypercall_32.h

new file mode 100644 (file)

index 0000000..3987b2e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_32.h
@@ -0,0 +1,62 @@
+#define HYPERCALL_arg1 "ebx"
+#define HYPERCALL_arg2 "ecx"
+#define HYPERCALL_arg3 "edx"
+#define HYPERCALL_arg4 "esi"
+#define HYPERCALL_arg5 "edi"
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+       unsigned long event_selector, unsigned long event_address,
+       unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+       return _hypercall4(int, set_callbacks,
+                          event_selector, event_address,
+                          failsafe_selector, failsafe_address);
+}
+#endif
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+       u64 timeout)
+{
+       return _hypercall2(long, set_timer_op,
+                          (unsigned long)timeout,
+                          (unsigned long)(timeout>>32));
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+       u64 ma, u64 desc)
+{
+       return _hypercall4(int, update_descriptor,
+                          (unsigned long)ma, (unsigned long)(ma>>32),
+                          (unsigned long)desc, (unsigned long)(desc>>32));
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+       unsigned long va, pte_t new_val, unsigned long flags)
+{
+       unsigned long pte_hi = 0;
+
+       if (arch_use_lazy_mmu_mode())
+               return xen_multi_update_va_mapping(va, new_val, flags);
+#ifdef CONFIG_X86_PAE
+       pte_hi = new_val.pte_high;
+#endif
+       return _hypercall4(int, update_va_mapping, va,
+                          new_val.pte_low, pte_hi, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+       unsigned long pte_hi = 0;
+#ifdef CONFIG_X86_PAE
+       pte_hi = new_val.pte_high;
+#endif
+       return _hypercall5(int, update_va_mapping_otherdomain, va,
+                          new_val.pte_low, pte_hi, flags, domid);
+}
diff --git a/arch/x86/include/mach-xen/asm/hypercall_64.h b/arch/x86/include/mach-xen/asm/hypercall_64.h

new file mode 100644 (file)

index 0000000..97d9445
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypercall_64.h
@@ -0,0 +1,54 @@
+#define HYPERCALL_arg1 "rdi"
+#define HYPERCALL_arg2 "rsi"
+#define HYPERCALL_arg3 "rdx"
+#define HYPERCALL_arg4 "r10"
+#define HYPERCALL_arg5 "r8"
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+static inline int __must_check
+HYPERVISOR_set_callbacks(
+       unsigned long event_address, unsigned long failsafe_address, 
+       unsigned long syscall_address)
+{
+       return _hypercall3(int, set_callbacks,
+                          event_address, failsafe_address, syscall_address);
+}
+#endif
+
+static inline long __must_check
+HYPERVISOR_set_timer_op(
+       u64 timeout)
+{
+       return _hypercall1(long, set_timer_op, timeout);
+}
+
+static inline int __must_check
+HYPERVISOR_update_descriptor(
+       unsigned long ma, unsigned long word)
+{
+       return _hypercall2(int, update_descriptor, ma, word);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping(
+       unsigned long va, pte_t new_val, unsigned long flags)
+{
+       if (arch_use_lazy_mmu_mode())
+               return xen_multi_update_va_mapping(va, new_val, flags);
+       return _hypercall3(int, update_va_mapping, va, new_val.pte, flags);
+}
+
+static inline int __must_check
+HYPERVISOR_update_va_mapping_otherdomain(
+       unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+       return _hypercall4(int, update_va_mapping_otherdomain, va,
+                          new_val.pte, flags, domid);
+}
+
+static inline int __must_check
+HYPERVISOR_set_segment_base(
+       int reg, unsigned long value)
+{
+       return _hypercall2(int, set_segment_base, reg, value);
+}
diff --git a/arch/x86/include/mach-xen/asm/hypervisor.h b/arch/x86/include/mach-xen/asm/hypervisor.h

new file mode 100644 (file)

index 0000000..f668981
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/hypervisor.h
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * hypervisor.h
+ * 
+ * Linux-specific hypervisor handling.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+#include <asm/ptrace.h>
+#include <asm/pgtable_types.h>
+#include <asm/smp-processor-id.h>
+
+extern shared_info_t *HYPERVISOR_shared_info;
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+DECLARE_PER_CPU(struct vcpu_info, vcpu_info);
+#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu))
+#define current_vcpu_info() (&__get_cpu_var(vcpu_info))
+#define vcpu_info_read(fld) percpu_read(vcpu_info.fld)
+#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val)
+#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val)
+void setup_vcpu_info(unsigned int cpu);
+void adjust_boot_vcpu_info(void);
+#else
+#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu))
+#ifdef CONFIG_SMP
+#define current_vcpu_info() vcpu_info(smp_processor_id())
+#else
+#define current_vcpu_info() vcpu_info(0)
+#endif
+#define vcpu_info_read(fld) (current_vcpu_info()->fld)
+#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val))
+static inline void setup_vcpu_info(unsigned int cpu) {}
+#endif
+
+#ifdef CONFIG_X86_32
+extern unsigned long hypervisor_virt_start;
+#endif
+
+/* arch/xen/i386/kernel/setup.c */
+extern start_info_t *xen_start_info;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN)
+#else
+#define is_initial_xendomain() 0
+#endif
+
+#define init_hypervisor(c) ((void)(c))
+#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data)
+
+DECLARE_PER_CPU(struct vcpu_runstate_info, runstate);
+#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running)
+
+/* arch/xen/kernel/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+/* arch/xen/kernel/process.c */
+void xen_cpu_idle (void);
+
+/* arch/xen/i386/kernel/hypervisor.c */
+void do_hypervisor_callback(struct pt_regs *regs);
+
+/* arch/xen/i386/mm/hypervisor.c */
+/*
+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
+ * be MACHINE addresses.
+ */
+
+void xen_pt_switch(pgd_t *);
+void xen_new_user_pt(pgd_t *); /* x86_64 only */
+void xen_load_gs(unsigned int selector); /* x86_64 only */
+void xen_tlb_flush(void);
+void xen_invlpg(unsigned long ptr);
+
+void xen_l1_entry_update(pte_t *ptr, pte_t val);
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
+void xen_pgd_pin(pgd_t *);
+void xen_pgd_unpin(pgd_t *);
+
+void xen_init_pgd_pin(void);
+#ifdef CONFIG_PM_SLEEP
+void setup_pfn_to_mfn_frame_list(void *(*)(unsigned long, unsigned long,
+                                          unsigned long));
+#endif
+
+void xen_set_ldt(const void *ptr, unsigned int ents);
+
+#ifdef CONFIG_SMP
+#include <linux/cpumask.h>
+void xen_tlb_flush_all(void);
+void xen_invlpg_all(unsigned long ptr);
+void xen_tlb_flush_mask(const cpumask_t *mask);
+void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr);
+#else
+#define xen_tlb_flush_all xen_tlb_flush
+#define xen_invlpg_all xen_invlpg
+#endif
+
+/* Returns zero on success else negative errno. */
+int xen_create_contiguous_region(
+    unsigned long vstart, unsigned int order, unsigned int address_bits);
+void xen_destroy_contiguous_region(
+    unsigned long vstart, unsigned int order);
+int early_create_contiguous_region(unsigned long pfn, unsigned int order,
+                                  unsigned int address_bits);
+
+struct page;
+
+int xen_limit_pages_to_max_mfn(
+       struct page *pages, unsigned int order, unsigned int address_bits);
+
+bool __cold hypervisor_oom(void);
+
+/* Turn jiffies into Xen system time. */
+u64 jiffies_to_st(unsigned long jiffies);
+
+#ifdef CONFIG_XEN_SCRUB_PAGES
+void xen_scrub_pages(void *, unsigned int);
+#else
+#define xen_scrub_pages(_p,_n) ((void)0)
+#endif
+
+#if defined(CONFIG_XEN) && !defined(MODULE)
+
+DECLARE_PER_CPU(bool, xen_lazy_mmu);
+
+void xen_multicall_flush(void);
+
+int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t,
+                                            unsigned long flags);
+int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count,
+                                     unsigned int *success_count, domid_t);
+int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count,
+                                    unsigned int *success_count, domid_t);
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+       percpu_write(xen_lazy_mmu, true);
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+       percpu_write(xen_lazy_mmu, false);
+       xen_multicall_flush();
+}
+
+#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu))
+
+#if 0 /* All uses are in places potentially called asynchronously, but
+       * asynchronous code should rather not make use of lazy mode at all.
+       * Therefore, all uses of this function get commented out, proper
+       * detection of asynchronous invocations is added wherever needed,
+       * and this function is disabled to catch any new (improper) uses.
+       */
+static inline void arch_flush_lazy_mmu_mode(void)
+{
+       if (arch_use_lazy_mmu_mode())
+               xen_multicall_flush();
+}
+#endif
+
+#else /* !CONFIG_XEN || MODULE */
+
+static inline void xen_multicall_flush(void) {}
+#define arch_use_lazy_mmu_mode() false
+#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; })
+#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN && !MODULE */
+
+#ifdef CONFIG_XEN
+
+struct gnttab_map_grant_ref;
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *,
+                          unsigned int count);
+#if CONFIG_XEN_COMPAT < 0x030400
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int);
+#else
+static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m,
+                                        unsigned int count)
+{
+       BUG();
+       return -ENOSYS;
+}
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define gnttab_pre_map_adjust(...) false
+#define gnttab_post_map_adjust(...) ({ BUG(); -ENOSYS; })
+
+#endif /* CONFIG_XEN */
+
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
+#ifdef CONFIG_XEN
+#define is_running_on_xen() 1
+extern char hypercall_page[PAGE_SIZE];
+#else
+extern char *hypercall_stubs;
+#define is_running_on_xen() (!!hypercall_stubs)
+#endif
+
+#include <xen/hypercall.h>
+
+static inline int
+HYPERVISOR_yield(
+       void)
+{
+       int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (rc == -ENOSYS)
+               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+       return rc;
+}
+
+static inline int
+HYPERVISOR_block(
+       void)
+{
+       int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (rc == -ENOSYS)
+               rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0);
+#endif
+
+       return rc;
+}
+
+static inline void __noreturn
+HYPERVISOR_shutdown(
+       unsigned int reason)
+{
+       struct sched_shutdown sched_shutdown = {
+               .reason = reason
+       };
+
+       VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown));
+#if CONFIG_XEN_COMPAT <= 0x030002
+       VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason));
+#endif
+       /* Don't recurse needlessly. */
+       BUG_ON(reason != SHUTDOWN_crash);
+       for(;;);
+}
+
+static inline int __must_check
+HYPERVISOR_poll(
+       evtchn_port_t *ports, unsigned int nr_ports, u64 timeout)
+{
+       int rc;
+       struct sched_poll sched_poll = {
+               .nr_ports = nr_ports,
+               .timeout = jiffies_to_st(timeout)
+       };
+       set_xen_guest_handle(sched_poll.ports, ports);
+
+       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (rc == -ENOSYS)
+               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+       return rc;
+}
+
+static inline int __must_check
+HYPERVISOR_poll_no_timeout(
+       evtchn_port_t *ports, unsigned int nr_ports)
+{
+       int rc;
+       struct sched_poll sched_poll = {
+               .nr_ports = nr_ports
+       };
+       set_xen_guest_handle(sched_poll.ports, ports);
+
+       rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (rc == -ENOSYS)
+               rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0);
+#endif
+
+       return rc;
+}
+
+#ifdef CONFIG_XEN
+
+static inline void
+MULTI_update_va_mapping(
+    multicall_entry_t *mcl, unsigned long va,
+    pte_t new_val, unsigned long flags)
+{
+    mcl->op = __HYPERVISOR_update_va_mapping;
+    mcl->args[0] = va;
+#if defined(CONFIG_X86_64)
+    mcl->args[1] = new_val.pte;
+#elif defined(CONFIG_X86_PAE)
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = new_val.pte_high;
+#else
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = 0;
+#endif
+    mcl->args[MULTI_UVMFLAGS_INDEX] = flags;
+}
+
+static inline void
+MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req,
+                unsigned int count, unsigned int *success_count,
+                domid_t domid)
+{
+    mcl->op = __HYPERVISOR_mmu_update;
+    mcl->args[0] = (unsigned long)req;
+    mcl->args[1] = count;
+    mcl->args[2] = (unsigned long)success_count;
+    mcl->args[3] = domid;
+}
+
+static inline void
+MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd,
+                    void *uop, unsigned int count)
+{
+    mcl->op = __HYPERVISOR_grant_table_op;
+    mcl->args[0] = cmd;
+    mcl->args[1] = (unsigned long)uop;
+    mcl->args[2] = count;
+}
+
+#else /* !defined(CONFIG_XEN) */
+
+/* Multicalls not supported for HVM guests. */
+#define MULTI_update_va_mapping(a,b,c,d) ((void)0)
+#define MULTI_grant_table_op(a,b,c,d) ((void)0)
+
+#endif
+
+#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI)
+
+#ifdef LINUX
+/* drivers/staging/ use Windows-style types, including VOID */
+#undef VOID
+#endif
+
+#endif /* __HYPERVISOR_H__ */
diff --git a/arch/x86/include/mach-xen/asm/io.h b/arch/x86/include/mach-xen/asm/io.h

new file mode 100644 (file)

index 0000000..2d07f8a
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/io.h
@@ -0,0 +1,343 @@
+#ifndef _ASM_X86_IO_H
+#define _ASM_X86_IO_H
+
+/*
+ * This file contains the definitions for the x86 IO instructions
+ * inb/inw/inl/outb/outw/outl and the "string versions" of the same
+ * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
+ * versions of the single-IO instructions (inb_p/inw_p/..).
+ *
+ * This file is not meant to be obfuscating: it's just complicated
+ * to (a) handle it all in a way that makes gcc able to optimize it
+ * as well as possible and (b) trying to avoid writing the same thing
+ * over and over again with slight variations and possibly making a
+ * mistake somewhere.
+ */
+
+/*
+ * Thanks to James van Artsdalen for a better timing-fix than
+ * the two short jumps: using outb's to a nonexistent port seems
+ * to guarantee better timings even on fast machines.
+ *
+ * On the other hand, I'd like to be sure of a non-existent port:
+ * I feel a bit unsafe about using 0x80 (should be safe, though)
+ *
+ *             Linus
+ */
+
+ /*
+  *  Bit simplified and optimized by Jan Hubicka
+  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
+  *
+  *  isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
+  *  isa_read[wl] and isa_write[wl] fixed
+  *  - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+  */
+
+#define ARCH_HAS_IOREMAP_WC
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <asm/page.h>
+#ifdef __KERNEL__
+#include <asm/fixmap.h>
+#endif
+
+#define build_mmio_read(name, size, type, reg, barrier) \
+static inline type name(const volatile void __iomem *addr) \
+{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
+:"m" (*(volatile type __force *)addr) barrier); return ret; }
+
+#define build_mmio_write(name, size, type, reg, barrier) \
+static inline void name(type val, volatile void __iomem *addr) \
+{ asm volatile("mov" size " %0,%1": :reg (val), \
+"m" (*(volatile type __force *)addr) barrier); }
+
+build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
+build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
+build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
+
+build_mmio_read(__readb, "b", unsigned char, "=q", )
+build_mmio_read(__readw, "w", unsigned short, "=r", )
+build_mmio_read(__readl, "l", unsigned int, "=r", )
+
+build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
+build_mmio_write(writew, "w", unsigned short, "r", :"memory")
+build_mmio_write(writel, "l", unsigned int, "r", :"memory")
+
+build_mmio_write(__writeb, "b", unsigned char, "q", )
+build_mmio_write(__writew, "w", unsigned short, "r", )
+build_mmio_write(__writel, "l", unsigned int, "r", )
+
+#define readb_relaxed(a) __readb(a)
+#define readw_relaxed(a) __readw(a)
+#define readl_relaxed(a) __readl(a)
+#define __raw_readb __readb
+#define __raw_readw __readw
+#define __raw_readl __readl
+
+#define __raw_writeb __writeb
+#define __raw_writew __writew
+#define __raw_writel __writel
+
+#define mmiowb() barrier()
+
+#ifdef CONFIG_X86_64
+
+build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
+build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
+
+#define readq_relaxed(a)       readq(a)
+
+#define __raw_readq(a)         readq(a)
+#define __raw_writeq(val, addr)        writeq(val, addr)
+
+/* Let people know that we have them */
+#define readq                  readq
+#define writeq                 writeq
+
+#endif
+
+/**
+ *     virt_to_phys    -       map virtual addresses to physical
+ *     @address: address to remap
+ *
+ *     The returned physical address is the physical (CPU) mapping for
+ *     the memory address given. It is only valid to use this function on
+ *     addresses directly mapped or allocated via kmalloc.
+ *
+ *     This function does not give bus mappings for DMA transfers. In
+ *     almost all conceivable cases a device driver should not be using
+ *     this function.
+ */
+
+static inline phys_addr_t virt_to_phys(volatile void *address)
+{
+       return __pa(address);
+}
+
+/**
+ *     phys_to_virt    -       map physical address to virtual
+ *     @address: address to remap
+ *
+ *     The returned virtual address is a current CPU mapping for
+ *     the memory address given. It is only valid to use this function on
+ *     addresses that have a kernel mapping
+ *
+ *     This function does not handle bus mappings for DMA transfers. In
+ *     almost all conceivable cases a device driver should not be using
+ *     this function.
+ */
+
+static inline void *phys_to_virt(phys_addr_t address)
+{
+       return __va(address);
+}
+
+/*
+ * Change "struct page" to physical address.
+ */
+#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
+#undef page_to_phys
+#define page_to_phys(page)      (phys_to_machine(page_to_pseudophys(page)))
+#define page_to_bus(page)       (phys_to_machine(page_to_pseudophys(page)))
+
+/*
+ * ISA I/O bus memory addresses are 1:1 with the physical address.
+ * However, we truncate the address to unsigned int to avoid undesirable
+ * promotions in legacy drivers.
+ */
+#define isa_virt_to_bus(_x) ({ \
+       unsigned long _va_ = (unsigned long)(_x); \
+       _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \
+       ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \
+       : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); })
+#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x))
+
+/*
+ * However PCI ones are not necessarily 1:1 and therefore these interfaces
+ * are forbidden in portable PCI drivers.
+ *
+ * Allow them on x86 for legacy drivers, though.
+ */
+#define virt_to_bus(_x) phys_to_machine(__pa(_x))
+#define bus_to_virt(_x) __va(machine_to_phys(_x))
+
+/**
+ * ioremap     -   map bus memory into CPU space
+ * @offset:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * If the area you are trying to map is a PCI BAR you should have a
+ * look at pci_iomap().
+ */
+extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
+                               unsigned long prot_val);
+
+/*
+ * The default ioremap() behavior is non-cached:
+ */
+static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
+{
+       return ioremap_nocache(offset, size);
+}
+
+extern void iounmap(volatile void __iomem *addr);
+
+extern void set_iounmap_nonlazy(void);
+
+#ifdef __KERNEL__
+
+#include <asm-generic/iomap.h>
+
+#include <linux/vmalloc.h>
+
+/*
+ * Convert a virtual cached pointer to an uncached pointer
+ */
+#define xlate_dev_kmem_ptr(p)  p
+
+static inline void
+memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
+{
+       memset((void __force *)addr, val, count);
+}
+
+static inline void
+memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
+{
+       memcpy(dst, (const void __force *)src, count);
+}
+
+static inline void
+memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
+{
+       memcpy((void __force *)dst, src, count);
+}
+
+/*
+ *     Cache management
+ *
+ *     This is needed for two cases
+ *     1. Out of order aware processors
+ *     2. Accidentally out of order processors (PPro errata #51)
+ */
+
+static inline void flush_write_buffers(void)
+{
+#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
+       asm volatile("lock; addl $0,0(%%esp)": : :"memory");
+#endif
+}
+
+#endif /* __KERNEL__ */
+
+extern void native_io_delay(void);
+
+extern int io_delay_type;
+extern void io_delay_init(void);
+
+static inline void slow_down_io(void)
+{
+       native_io_delay();
+#ifdef REALLY_SLOW_IO
+       native_io_delay();
+       native_io_delay();
+       native_io_delay();
+#endif
+}
+
+#define BUILDIO(bwl, bw, type)                                         \
+static inline void out##bwl(unsigned type value, int port)             \
+{                                                                      \
+       asm volatile("out" #bwl " %" #bw "0, %w1"                       \
+                    : : "a"(value), "Nd"(port));                       \
+}                                                                      \
+                                                                       \
+static inline unsigned type in##bwl(int port)                          \
+{                                                                      \
+       unsigned type value;                                            \
+       asm volatile("in" #bwl " %w1, %" #bw "0"                        \
+                    : "=a"(value) : "Nd"(port));                       \
+       return value;                                                   \
+}                                                                      \
+                                                                       \
+static inline void out##bwl##_p(unsigned type value, int port)         \
+{                                                                      \
+       out##bwl(value, port);                                          \
+       slow_down_io();                                                 \
+}                                                                      \
+                                                                       \
+static inline unsigned type in##bwl##_p(int port)                      \
+{                                                                      \
+       unsigned type value = in##bwl(port);                            \
+       slow_down_io();                                                 \
+       return value;                                                   \
+}                                                                      \
+                                                                       \
+static inline void outs##bwl(int port, const void *addr, unsigned long count) \
+{                                                                      \
+       asm volatile("rep; outs" #bwl                                   \
+                    : "+S"(addr), "+c"(count) : "d"(port));            \
+}                                                                      \
+                                                                       \
+static inline void ins##bwl(int port, void *addr, unsigned long count) \
+{                                                                      \
+       asm volatile("rep; ins" #bwl                                    \
+                    : "+D"(addr), "+c"(count) : "d"(port));            \
+}
+
+BUILDIO(b, b, char)
+BUILDIO(w, w, short)
+BUILDIO(l, , int)
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
+/* We will be supplying our own /dev/mem implementation */
+#define ARCH_HAS_DEV_MEM
+
+#define bvec_to_pseudophys(bv)  (page_to_pseudophys((bv)->bv_page) + \
+                                 (unsigned long)(bv)->bv_offset)
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
+        && bvec_to_pseudophys(vec1) + (vec1)->bv_len \
+           == bvec_to_pseudophys(vec2))
+
+#endif
+
+extern void *xlate_dev_mem_ptr(unsigned long phys);
+extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
+
+extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
+                                    unsigned long prot_val);
+extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+
+/*
+ * early_ioremap() and early_iounmap() are for temporary early boot-time
+ * mappings, before the real ioremap() is functional.
+ * A boot-time mapping is currently limited to at most 16 pages.
+ */
+extern void early_ioremap_init(void);
+extern void early_ioremap_reset(void);
+extern void __iomem *early_ioremap(resource_size_t phys_addr,
+                                  unsigned long size);
+extern void __iomem *early_memremap(resource_size_t phys_addr,
+                                   unsigned long size);
+extern void __iomem *early_memremap_ro(resource_size_t phys_addr,
+                                      unsigned long size);
+extern void early_iounmap(void __iomem *addr, unsigned long size);
+extern void fixup_early_ioremap(void);
+extern bool is_early_ioremap_ptep(pte_t *ptep);
+
+#define IO_SPACE_LIMIT 0xffff
+
+#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/mach-xen/asm/ipi.h b/arch/x86/include/mach-xen/asm/ipi.h

new file mode 100644 (file)

index 0000000..4bdda1d
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/ipi.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_IPI_H
+#define _ASM_X86_IPI_H
+
+#include <asm/hw_irq.h>
+#include <asm/smp.h>
+
+void xen_send_IPI_mask(const struct cpumask *, int vector);
+void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector);
+void xen_send_IPI_allbutself(int vector);
+void xen_send_IPI_all(int vector);
+void xen_send_IPI_self(int vector);
+
+#endif /* _ASM_X86_IPI_H */
diff --git a/arch/x86/include/mach-xen/asm/irq_vectors.h b/arch/x86/include/mach-xen/asm/irq_vectors.h

new file mode 100644 (file)

index 0000000..7798731
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irq_vectors.h
@@ -0,0 +1,98 @@
+#ifndef _ASM_X86_IRQ_VECTORS_H
+#define _ASM_X86_IRQ_VECTORS_H
+
+#define MCE_VECTOR                     0x12
+
+#define IA32_SYSCALL_VECTOR            0x80
+#ifdef CONFIG_X86_32
+# define SYSCALL_VECTOR                        0x80
+#endif
+
+#define RESCHEDULE_VECTOR              0
+#define CALL_FUNCTION_VECTOR           1
+#define NMI_VECTOR                     0x02
+#define CALL_FUNC_SINGLE_VECTOR                3
+#define REBOOT_VECTOR                  4
+#ifdef CONFIG_IRQ_WORK
+#define IRQ_WORK_VECTOR                        5
+#define NR_IPIS                                6
+#else
+#define NR_IPIS                                5
+#endif
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS                      256
+
+#define        FIRST_VM86_IRQ                     3
+#define LAST_VM86_IRQ                    15
+
+#ifndef __ASSEMBLY__
+static inline int invalid_vm86_irq(int irq)
+{
+       return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+#endif
+
+/*
+ * Size the maximum number of interrupts.
+ *
+ * If the irq_desc[] array has a sparse layout, we can size things
+ * generously - it scales up linearly with the maximum number of CPUs,
+ * and the maximum number of IO-APICs, whichever is higher.
+ *
+ * In other cases we size more conservatively, to not create too large
+ * static arrays.
+ */
+
+#define NR_IRQS_LEGACY                   16
+
+/*
+ * The flat IRQ space is divided into two regions:
+ *  1. A one-to-one mapping of real physical IRQs. This space is only used
+ *     if we have physical device-access privilege. This region is at the
+ *     start of the IRQ space so that existing device drivers do not need
+ *     to be modified to translate physical IRQ numbers into our IRQ space.
+ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ *     are bound using the provided bind/unbind functions.
+ */
+#define PIRQ_BASE                      0
+/* PHYSDEVOP_pirq_eoi_gmfn restriction: */
+#define PIRQ_MAX(n) ((n) < (1 << (PAGE_SHIFT + 3)) - NR_VECTORS \
+                  ? (n) : (1 << (PAGE_SHIFT + 3)) - NR_VECTORS)
+
+#define IO_APIC_VECTOR_LIMIT           PIRQ_MAX(32 * MAX_IO_APICS)
+#define CPU_VECTOR_LIMIT               PIRQ_MAX(64 * NR_CPUS)
+
+#if defined(CONFIG_X86_IO_APIC)
+# define NR_PIRQS                                      \
+       (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?      \
+               (NR_VECTORS + CPU_VECTOR_LIMIT)  :      \
+               (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
+#elif defined(CONFIG_XEN_PCIDEV_FRONTEND)
+# define NR_PIRQS                      (NR_VECTORS + CPU_VECTOR_LIMIT)
+#else /* !CONFIG_X86_IO_APIC: */
+# define NR_PIRQS                      NR_IRQS_LEGACY
+#endif
+
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_SPARSE_IRQ
+extern int nr_pirqs;
+#else
+# define nr_pirqs                      NR_PIRQS
+#endif
+#endif
+
+#define DYNIRQ_BASE                    (PIRQ_BASE + nr_pirqs)
+#ifdef CONFIG_SPARSE_IRQ
+#define NR_DYNIRQS                     (CPU_VECTOR_LIMIT + CONFIG_XEN_NR_GUEST_DEVICES)
+#else
+#define NR_DYNIRQS                     (64 + CONFIG_XEN_NR_GUEST_DEVICES)
+#endif
+
+#define NR_IRQS                                (NR_PIRQS + NR_DYNIRQS)
+
+#endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/mach-xen/asm/irqflags.h b/arch/x86/include/mach-xen/asm/irqflags.h

new file mode 100644 (file)

index 0000000..95d336f
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/irqflags.h
@@ -0,0 +1,212 @@
+#ifndef _X86_IRQFLAGS_H_
+#define _X86_IRQFLAGS_H_
+
+#include <asm/smp-processor-id.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <xen/interface/vcpu.h>
+/*
+ * The use of 'barrier' in the following reflects their use as local-lock
+ * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following
+ * critical operations are executed. All critical operations must complete
+ * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also
+ * includes these barriers, for example.
+ */
+
+#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask)
+
+#define xen_restore_fl(f)                                      \
+do {                                                           \
+       vcpu_info_t *_vcpu;                                     \
+       barrier();                                              \
+       _vcpu = current_vcpu_info();                            \
+       if ((_vcpu->evtchn_upcall_mask = (f)) == 0) {           \
+               barrier(); /* unmask then check (avoid races) */\
+               if (unlikely(_vcpu->evtchn_upcall_pending))     \
+                       force_evtchn_callback();                \
+       }                                                       \
+} while (0)
+
+#define xen_irq_disable()                                      \
+do {                                                           \
+       vcpu_info_write(evtchn_upcall_mask, 1);                 \
+       barrier();                                              \
+} while (0)
+
+#define xen_irq_enable()                                       \
+do {                                                           \
+       vcpu_info_t *_vcpu;                                     \
+       barrier();                                              \
+       _vcpu = current_vcpu_info();                            \
+       _vcpu->evtchn_upcall_mask = 0;                          \
+       barrier(); /* unmask then check (avoid races) */        \
+       if (unlikely(_vcpu->evtchn_upcall_pending))             \
+               force_evtchn_callback();                        \
+} while (0)
+
+#define arch_local_save_flags() xen_save_fl()
+
+#define arch_local_irq_restore(flags) xen_restore_fl(flags)
+
+#define arch_local_irq_disable()       xen_irq_disable()
+
+#define arch_local_irq_enable() xen_irq_enable()
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+#define arch_safe_halt HYPERVISOR_block
+
+/*
+ * Used when interrupts are already enabled or to
+ * shut down the processor:
+ */
+#define halt() VOID(irqs_disabled()                                    \
+                   ? HYPERVISOR_vcpu_op(VCPUOP_down,                   \
+                                        smp_processor_id(), NULL)      \
+                   : 0)
+
+/*
+ * For spinlocks, etc:
+ */
+#define arch_local_irq_save()                                          \
+({                                                                     \
+       unsigned long flags = arch_local_save_flags();                  \
+                                                                       \
+       arch_local_irq_disable();                                       \
+                                                                       \
+       flags;                                                          \
+})
+#else
+
+/* Offsets into the per-VCPU vcpu_info area (see GET_VCPU_INFO). */
+#define evtchn_upcall_pending          /* 0 */
+#define evtchn_upcall_mask             1
+
+#ifdef CONFIG_X86_64
+# define __REG_si %rsi
+# define __CPU_num PER_CPU_VAR(cpu_number)
+#else
+# define __REG_si %esi
+# define __CPU_num TI_cpu(%ebp)
+#endif
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+
+#define GET_VCPU_INFO          PER_CPU(vcpu_info, __REG_si)
+#define __DISABLE_INTERRUPTS   movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
+#define __ENABLE_INTERRUPTS    movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask)
+#define __TEST_PENDING         cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0)
+#define DISABLE_INTERRUPTS(clb)        __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS
+
+#define __SIZEOF_DISABLE_INTERRUPTS 8
+#define __SIZEOF_TEST_PENDING  8
+
+#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
+
+#define sizeof_vcpu_shift      6
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO          movl __CPU_num,%esi                     ; \
+                               shl $sizeof_vcpu_shift,%esi             ; \
+                               add HYPERVISOR_shared_info,__REG_si
+#else
+#define GET_VCPU_INFO          mov HYPERVISOR_shared_info,__REG_si
+#endif
+
+#define __DISABLE_INTERRUPTS   movb $1,evtchn_upcall_mask(__REG_si)
+#define __ENABLE_INTERRUPTS    movb $0,evtchn_upcall_mask(__REG_si)
+#define __TEST_PENDING         testb $0xFF,evtchn_upcall_pending(__REG_si)
+#define DISABLE_INTERRUPTS(clb)        GET_VCPU_INFO                           ; \
+                               __DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO                           ; \
+                               __ENABLE_INTERRUPTS
+
+#define __SIZEOF_DISABLE_INTERRUPTS 4
+#define __SIZEOF_TEST_PENDING  3
+
+#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */
+
+#ifndef CONFIG_X86_64
+#define INTERRUPT_RETURN               iret
+#define ENABLE_INTERRUPTS_SYSEXIT                                        \
+       movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */      ; \
+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/            ; \
+       cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */        ; \
+       jnz  14f        /* process more events if necessary... */       ; \
+       movl PT_ESI(%esp), %esi                                         ; \
+       sysexit                                                         ; \
+14:    movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */     ; \
+       TRACE_IRQS_OFF                                                  ; \
+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/              ; \
+       mov  $__KERNEL_PERCPU, %ecx                                     ; \
+       push %esp                                                       ; \
+       mov  %ecx, %fs                                                  ; \
+       SET_KERNEL_GS %ecx                                              ; \
+       call evtchn_do_upcall                                           ; \
+       add  $4,%esp                                                    ; \
+       jmp  ret_from_intr
+#endif
+
+
+#endif /* __ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+static inline int arch_irqs_disabled_flags(unsigned long flags)
+{
+       return (flags != 0);
+}
+
+#define arch_irqs_disabled()                                           \
+({                                                                     \
+       unsigned long flags = arch_local_save_flags();                  \
+                                                                       \
+       arch_irqs_disabled_flags(flags);                                \
+})
+
+#else
+
+#ifdef CONFIG_X86_64
+#define ARCH_LOCKDEP_SYS_EXIT          call lockdep_sys_exit_thunk
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ      \
+       TRACE_IRQS_ON; \
+       ENABLE_INTERRUPTS(CLBR_NONE); \
+       SAVE_REST; \
+       LOCKDEP_SYS_EXIT; \
+       RESTORE_REST; \
+       __DISABLE_INTERRUPTS; \
+       TRACE_IRQS_OFF;
+
+#else
+#define ARCH_LOCKDEP_SYS_EXIT                  \
+       pushl %eax;                             \
+       pushl %ecx;                             \
+       pushl %edx;                             \
+       call lockdep_sys_exit;                  \
+       popl %edx;                              \
+       popl %ecx;                              \
+       popl %eax;
+
+#define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+#  define TRACE_IRQS_ON                call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF       call trace_hardirqs_off_thunk;
+#else
+#  define TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#  define LOCKDEP_SYS_EXIT     ARCH_LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
+# else
+#  define LOCKDEP_SYS_EXIT
+#  define LOCKDEP_SYS_EXIT_IRQ
+# endif
+
+#endif /* __ASSEMBLY__ */
+#endif
diff --git a/arch/x86/include/mach-xen/asm/mach_traps.h b/arch/x86/include/mach-xen/asm/mach_traps.h

new file mode 100644 (file)

index 0000000..99314d3
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mach_traps.h
@@ -0,0 +1,37 @@
+/*
+ *  arch/x86/include/mach-xen/asm/mach_traps.h
+ *
+ *  Machine specific NMI handling for Xen
+ */
+#ifndef _MACH_TRAPS_H
+#define _MACH_TRAPS_H
+
+#include <linux/bitops.h>
+#include <xen/interface/nmi.h>
+
+#define NMI_REASON_SERR                0x80
+#define NMI_REASON_IOCHK       0x40
+#define NMI_REASON_MASK                (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+static inline void clear_serr_error(unsigned char reason) {}
+static inline void clear_io_check_error(unsigned char reason) {}
+
+static inline unsigned char xen_get_nmi_reason(void)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       unsigned char reason = 0;
+
+       /* construct a value which looks like it came from
+        * port 0x61.
+        */
+       if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason))
+               reason |= NMI_REASON_IOCHK;
+       if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason))
+               reason |= NMI_REASON_SERR;
+
+        return reason;
+}
+
+static inline void reassert_nmi(void) {}
+
+#endif /* !_MACH_TRAPS_H */
diff --git a/arch/x86/include/mach-xen/asm/maddr.h b/arch/x86/include/mach-xen/asm/maddr.h

new file mode 100644 (file)

index 0000000..455e848
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr.h
@@ -0,0 +1,155 @@
+#ifndef _X86_MADDR_H
+#define _X86_MADDR_H
+
+#include <asm/asm.h>
+#include <asm/bug.h>
+#include <xen/features.h>
+#include <xen/interface/xen.h>
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+#define INVALID_P2M_ENTRY      (~0UL)
+#define FOREIGN_FRAME_BIT      (1UL << (BITS_PER_LONG - 1))
+#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
+
+/* Definitions for machine and pseudophysical addresses. */
+#ifdef CONFIG_X86_PAE
+typedef unsigned long long paddr_t;
+typedef unsigned long long maddr_t;
+#else
+typedef unsigned long paddr_t;
+typedef unsigned long maddr_t;
+#endif
+
+#ifdef CONFIG_XEN
+
+extern unsigned long *phys_to_machine_mapping;
+extern unsigned long  max_mapnr;
+
+#undef machine_to_phys_mapping
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned long  machine_to_phys_nr;
+
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+               return pfn;
+       if (likely(max_mapnr))
+               BUG_ON(pfn >= max_mapnr);
+       return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT;
+}
+
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+               return 1;
+       if (likely(max_mapnr))
+               BUG_ON(pfn >= max_mapnr);
+       return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+}
+
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+       unsigned long pfn;
+
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
+               return mfn;
+
+       if (unlikely(mfn >= machine_to_phys_nr))
+               return max_mapnr;
+
+       /* The array access can fail (e.g., device space beyond end of RAM). */
+       asm (
+               "1:     "_ASM_MOV" %1,%0\n"
+               "2:\n"
+               ".section .fixup,\"ax\"\n"
+               "3:     "_ASM_MOV" %2,%0\n"
+               "       jmp  2b\n"
+               ".previous\n"
+               _ASM_EXTABLE(1b,3b)
+               : "=r" (pfn)
+               : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) );
+
+       return pfn;
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(phys_addr_t mfn)
+{
+       unsigned long pfn = mfn_to_pfn(mfn);
+       if (likely(pfn < max_mapnr)
+           && likely(!xen_feature(XENFEAT_auto_translated_physmap))
+           && unlikely(phys_to_machine_mapping[pfn] != mfn))
+               return max_mapnr; /* force !pfn_valid() */
+       return pfn;
+}
+
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+       if (likely(max_mapnr))
+               BUG_ON(pfn >= max_mapnr);
+       if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+               BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+               return;
+       }
+       phys_to_machine_mapping[pfn] = mfn;
+}
+
+static inline maddr_t phys_to_machine(paddr_t phys)
+{
+       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+       machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK);
+       return machine;
+}
+
+static inline paddr_t machine_to_phys(maddr_t machine)
+{
+       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+       phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK);
+       return phys;
+}
+
+#ifdef CONFIG_X86_32
+# include "maddr_32.h"
+#else
+# include "maddr_64.h"
+#endif
+
+#else /* !CONFIG_XEN */
+
+#define pfn_to_mfn(pfn) (pfn)
+#define mfn_to_pfn(mfn) (mfn)
+#define mfn_to_local_pfn(mfn) (mfn)
+#define set_phys_to_machine(pfn, mfn) ((void)0)
+#define phys_to_machine_mapping_valid(pfn) 1
+#define phys_to_machine(phys) ((maddr_t)(phys))
+#define machine_to_phys(mach) ((paddr_t)(mach))
+#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot)
+#define __pte_ma(x) __pte(x)
+
+#endif /* !CONFIG_XEN */
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)     phys_to_machine(__pa(v))
+#define virt_to_mfn(v)         pfn_to_mfn(__pa(v) >> PAGE_SHIFT)
+#define mfn_to_virt(m)         __va(mfn_to_pfn(m) << PAGE_SHIFT)
+
+#endif /* _X86_MADDR_H */
diff --git a/arch/x86/include/mach-xen/asm/maddr_32.h b/arch/x86/include/mach-xen/asm/maddr_32.h

new file mode 100644 (file)

index 0000000..de34d87
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_32.h
@@ -0,0 +1,35 @@
+#ifndef _I386_MADDR_H
+#define _I386_MADDR_H
+
+#ifdef CONFIG_X86_PAE
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+       /*
+        * PAE entries can carry the NX bit above the frame number.  On
+        * i386 the unsigned long argument of pfn_to_mfn() truncates it
+        * away; x86_64 has to mask it off explicitly instead.
+        */
+       maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT);
+       machine <<= PAGE_SHIFT;
+       machine |= phys & ~PHYSICAL_PAGE_MASK;
+       return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+       /*
+        * PAE entries can carry the NX bit above the frame number.  On
+        * i386 the unsigned long argument of mfn_to_pfn() truncates it
+        * away; x86_64 has to mask it off explicitly instead.
+        */
+       paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT);
+       phys <<= PAGE_SHIFT;
+       phys |= machine & ~PHYSICAL_PAGE_MASK;
+       return phys;
+}
+#else
+#define pte_phys_to_machine phys_to_machine
+#define pte_machine_to_phys machine_to_phys
+#endif
+
+#endif /* _I386_MADDR_H */
diff --git a/arch/x86/include/mach-xen/asm/maddr_64.h b/arch/x86/include/mach-xen/asm/maddr_64.h

new file mode 100644 (file)

index 0000000..e2c271e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/maddr_64.h
@@ -0,0 +1,21 @@
+#ifndef _X86_64_MADDR_H
+#define _X86_64_MADDR_H
+
+static inline paddr_t pte_phys_to_machine(paddr_t phys)
+{
+       maddr_t machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+       machine <<= PAGE_SHIFT;                 /* frame number -> address */
+       machine |= phys & ~PHYSICAL_PAGE_MASK;  /* bits outside the mask pass through */
+       return machine;
+}
+
+static inline paddr_t pte_machine_to_phys(maddr_t machine)
+{
+       paddr_t phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT);
+       phys <<= PAGE_SHIFT;                    /* frame number -> address */
+       phys |= machine & ~PHYSICAL_PAGE_MASK;  /* bits outside the mask pass through */
+       return phys;
+}
+
+#endif /* _X86_64_MADDR_H */
+
diff --git a/arch/x86/include/mach-xen/asm/mmu_context.h b/arch/x86/include/mach-xen/asm/mmu_context.h

new file mode 100644 (file)

index 0000000..1fbe9dd
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mmu_context.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_X86_MMU_CONTEXT_H
+#define _ASM_X86_MMU_CONTEXT_H
+
+#include <asm/desc.h>
+#include <linux/atomic.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+void arch_exit_mmap(struct mm_struct *mm);
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+void mm_pin(struct mm_struct *mm);
+void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
+
+static inline void xen_activate_mm(struct mm_struct *prev,
+                                  struct mm_struct *next)
+{ /* Pin next's page tables unless its pgd page is already marked pinned. */
+       if (!PagePinned(virt_to_page(next->pgd)))       /* prev is not consulted */
+               mm_pin(next);
+}
+
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{ /* Moves this CPU's tlbstate from TLBSTATE_OK to TLBSTATE_LAZY; mm and tsk are unused. */
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+       if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+               percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
+#endif /* CONFIG_SMP && !CONFIG_XEN; otherwise the body is empty */
+}
+
+#define prepare_arch_switch(next)      __prepare_arch_switch()
+
+static inline void __prepare_arch_switch(void)
+{ /* Saves the current task's segment selectors into current->thread before a switch. */
+#ifdef CONFIG_X86_32
+       /*
+        * Save away %gs. No need to save %fs, as it was saved on the
+        * stack on entry.  No need to save %es and %ds, as those are
+        * always kernel segments while inside the kernel.
+        */
+       lazy_save_gs(current->thread.gs);
+       lazy_load_gs(__KERNEL_STACK_CANARY);
+#else /* !CONFIG_X86_32 */
+       /*
+        * Save away %es, %ds, %fs and %gs. Must happen before reload
+        * of cr3/ldt (i.e., not in __switch_to).
+        */
+       __asm__ __volatile__ (
+               "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3"
+               : "=m" (current->thread.es),
+                 "=m" (current->thread.ds),
+                 "=m" (current->thread.fsindex),
+                 "=m" (current->thread.gsindex) );
+
+       if (current->thread.ds)         /* saved selector was non-zero: load 0 */
+               __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) );
+
+       if (current->thread.es)         /* saved selector was non-zero: load 0 */
+               __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) );
+
+       if (current->thread.fsindex) {
+               __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) );
+               current->thread.fs = 0; /* the saved fs value is reset as well */
+       }
+
+       if (current->thread.gsindex) {
+               load_gs_index(0);
+               current->thread.gs = 0; /* the saved gs value is reset as well */
+       }
+#endif /* CONFIG_X86_32 */
+}
+
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+                            struct task_struct *tsk)
+{ /* Switches this CPU from prev to next by batching mmuext ops into one hypercall; tsk is unused. */
+       unsigned cpu = smp_processor_id();
+       struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; /* one slot per op queued below; third slot only where sizeof(long) > 4 */
+#ifdef CONFIG_X86_64
+       pgd_t *upgd;
+#endif
+
+       if (likely(prev != next)) {
+               BUG_ON(!xen_feature(XENFEAT_writable_page_tables) &&
+                      !PagePinned(virt_to_page(next->pgd)));
+
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+               percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               percpu_write(cpu_tlbstate.active_mm, next);
+#endif
+               cpumask_set_cpu(cpu, mm_cpumask(next));
+
+               /* Re-load page tables: load_cr3(next->pgd) */
+               op->cmd = MMUEXT_NEW_BASEPTR;
+               op->arg1.mfn = virt_to_mfn(next->pgd);
+               op++;
+
+               /* xen_new_user_pt(next->pgd) */
+#ifdef CONFIG_X86_64
+               op->cmd = MMUEXT_NEW_USER_BASEPTR;
+               upgd = __user_pgd(next->pgd);
+               op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; /* mfn 0 when there is no user pgd */
+               op++;
+#endif
+
+               /*
+                * load the LDT, if the LDT is different:
+                */
+               if (unlikely(prev->context.ldt != next->context.ldt)) {
+                       /* load_LDT_nolock(&next->context) */
+                       op->cmd = MMUEXT_SET_LDT;
+                       op->arg1.linear_addr = (unsigned long)next->context.ldt;
+                       op->arg2.nr_ents     = next->context.size;
+                       op++;
+               }
+
+               BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); /* op-_op = number of ops queued above */
+
+               /* stop TLB flushes for the previous mm */
+               cpumask_clear_cpu(cpu, mm_cpumask(prev));
+       }
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */
+       else {
+               percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+               BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
+
+               if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
+                       /* We were in lazy tlb mode and leave_mm disabled
+                        * tlb flush IPI delivery. We must reload CR3
+                        * to make sure to use no freed page tables.
+                        */
+                       load_cr3(next->pgd);
+                       xen_new_user_pt(next->pgd);
+                       load_LDT_nolock(&next->context);
+               }
+       }
+#endif /* CONFIG_SMP && !CONFIG_XEN */
+}
+
+#define activate_mm(prev, next)                        \
+do {                                           \
+       xen_activate_mm(prev, next);            \
+       switch_mm((prev), (next), NULL);        \
+} while (0);
+
+#ifdef CONFIG_X86_32
+#define deactivate_mm(tsk, mm) /* neither argument is used */ \
+do {                                           \
+       lazy_load_gs(0);                        \
+} while (0)
+#else /* !CONFIG_X86_32 */
+#define deactivate_mm(tsk, mm) /* neither argument is used */ \
+do {                                           \
+       load_gs_index(0);                       \
+       loadsegment(fs, 0);                     \
+} while (0)
+#endif /* CONFIG_X86_32 */
+
+#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/mach-xen/asm/mutex.h b/arch/x86/include/mach-xen/asm/mutex.h

new file mode 100644 (file)

index 0000000..ee9126e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/mutex.h
@@ -0,0 +1,3 @@
+#define arch_cpu_is_running(cpu) vcpu_running(cpu)
+
+#include_next <asm/mutex.h>
diff --git a/arch/x86/include/mach-xen/asm/pci.h b/arch/x86/include/mach-xen/asm/pci.h

new file mode 100644 (file)

index 0000000..54289aa
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pci.h
@@ -0,0 +1,180 @@
+#ifndef _ASM_X86_PCI_H
+#define _ASM_X86_PCI_H
+
+#include <linux/mm.h> /* for struct page */
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/scatterlist.h>
+#include <asm/io.h>
+#include <asm/x86_init.h>
+
+#ifdef __KERNEL__
+
+struct pci_sysdata {
+       int             domain;         /* PCI domain */
+       int             node;           /* NUMA node */
+#ifdef CONFIG_X86_64
+       void            *iommu;         /* IOMMU private data */
+#endif
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+       struct pcifront_device *pdev;
+#endif
+};
+
+extern int pci_routeirq;
+extern int noioapicquirk;
+extern int noioapicreroute;
+
+/* scan a bus after allocating a pci_sysdata for it */
+extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
+                                           int node);
+extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
+
+#ifdef CONFIG_PCI
+
+#ifdef CONFIG_PCI_DOMAINS
+static inline int pci_domain_nr(struct pci_bus *bus)
+{
+       /* bus->sysdata is interpreted as this bus's struct pci_sysdata. */
+       return ((struct pci_sysdata *)bus->sysdata)->domain;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+       return pci_domain_nr(bus);
+}
+#endif
+
+/* Can be used to override the logic in pci_scan_bus for skipping
+   already-configured bus numbers - to be used for buggy BIOSes
+   or architectures with incomplete PCI setup by the loader */
+
+extern unsigned int pcibios_assign_all_busses(void);
+extern int pci_legacy_init(void);
+# ifdef CONFIG_ACPI
+#  define x86_default_pci_init pci_acpi_init
+# else
+#  define x86_default_pci_init pci_legacy_init
+# endif
+#else
+# define pcibios_assign_all_busses()   0
+# define x86_default_pci_init          NULL
+#endif
+
+#include <asm/hypervisor.h>
+#define pcibios_scan_all_fns(a, b)     (!is_initial_xendomain())
+
+extern unsigned long pci_mem_start;
+#define PCIBIOS_MIN_IO         0x1000
+#define PCIBIOS_MIN_MEM                (pci_mem_start)
+
+#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+
+extern int pcibios_enabled;
+void pcibios_config_init(void);
+struct pci_bus *pcibios_scan_root(int bus);
+
+void pcibios_set_master(struct pci_dev *dev);
+void pcibios_penalize_isa_irq(int irq, int active);
+struct irq_routing_table *pcibios_get_irq_routing_table(void);
+int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+
+
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+                              enum pci_mmap_state mmap_state,
+                              int write_combine);
+
+
+#ifdef CONFIG_PCI
+extern void early_quirks(void);
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+                                       enum pci_dma_burst_strategy *strat,
+                                       unsigned long *strategy_parameter)
+{ /* Same fixed answer for every device; pdev is not consulted. */
+       *strat = PCI_DMA_BURST_INFINITY;
+       *strategy_parameter = ~0UL;     /* all bits set */
+}
+#else
+static inline void early_quirks(void) { }
+#endif
+
+extern void pci_iommu_alloc(void);
+
+#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
+/* MSI arch specific hooks */
+static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+       return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
+{
+       x86_msi.teardown_msi_irqs(dev);
+}
+
+static inline void x86_teardown_msi_irq(unsigned int irq)
+{
+       x86_msi.teardown_msi_irq(irq);
+}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+       x86_msi.restore_msi_irqs(dev, irq);
+}
+#define arch_setup_msi_irqs x86_setup_msi_irqs
+#define arch_teardown_msi_irqs x86_teardown_msi_irqs
+#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
+/* implemented in arch/x86/kernel/apic/io_apic.c */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
+/* default to the implementation in drivers/pci/msi.c */
+#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
+void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
+#else
+#define native_setup_msi_irqs          NULL
+#define native_teardown_msi_irq                NULL
+#define default_teardown_msi_irqs      NULL
+#define default_restore_msi_irqs       NULL
+#endif
+
+#define PCI_DMA_BUS_IS_PHYS 0
+
+#endif  /* __KERNEL__ */
+
+#ifdef CONFIG_X86_64
+#include "../../asm/pci_64.h"
+#endif
+
+/* implement the pci_ DMA API in terms of the generic device dma_ one */
+#include <asm-generic/pci-dma-compat.h>
+
+/* generic pci stuff */
+#include <asm-generic/pci.h>
+#define PCIBIOS_MAX_MEM_32 0xffffffff
+
+#ifdef CONFIG_NUMA
+/* Returns the node based on pci bus */
+static inline int __pcibus_to_node(const struct pci_bus *bus)
+{
+       /* bus->sysdata is interpreted as this bus's struct pci_sysdata. */
+       const struct pci_sysdata *sysdata = bus->sysdata;
+       return sysdata->node;
+}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+       const int nid = __pcibus_to_node(bus);
+       /* A node of -1 selects the online CPU mask instead. */
+       if (nid == -1)
+               return cpu_online_mask;
+       return cpumask_of_node(nid);
+}
+#endif
+
+#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/mach-xen/asm/percpu.h b/arch/x86/include/mach-xen/asm/percpu.h

new file mode 100644 (file)

index 0000000..336a525
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/percpu.h
@@ -0,0 +1,61 @@
+#ifndef _ASM_X86_XEN_PERCPU_H
+#define _ASM_X86_XEN_PERCPU_H
+
+#include_next <asm/percpu.h>
+
+#define this_vcpu_read_1 this_cpu_read_1
+#define this_vcpu_read_2 this_cpu_read_2
+#define this_vcpu_read_4 this_cpu_read_4
+
+#ifdef CONFIG_64BIT
+# define this_vcpu_read_8 this_cpu_read_8
+#else /* !CONFIG_64BIT: read all 8 bytes with a single cmpxchg8b */
+# define this_vcpu_read_8(pcp) ({ \
+       typeof(pcp) res__; \
+       __asm__ ("movl %%ebx,%%eax\n" /* comparand low := replacement low */ \
+                "movl %%ecx,%%edx\n" /* comparand high := replacement high */ \
+                "cmpxchg8b " __percpu_arg(1) /* match or not, edx:eax ends up holding the memory value */ \
+                : "=&A" (res__) : "m" (pcp)); \
+       res__; })
+#endif /* CONFIG_64BIT */
+
+#define this_vcpu_read(pcp) __pcpu_size_call_return(this_vcpu_read_, pcp)
+
+#define percpu_exchange_op(op, var, val)               \
+({                                                     \
+       typedef typeof(var) pxo_T__;                    \
+       pxo_T__ pxo_ret__;                              \
+       if (0) {        /* never executed; presumably only type-checks val against var */ \
+               pxo_ret__ = (val);                      \
+               (void)pxo_ret__;                        \
+       }                                               \
+       switch (sizeof(var)) {  /* operand-size suffix follows sizeof(var) */ \
+       case 1:                                         \
+               asm(op "b %0,"__percpu_arg(1)           \
+                   : "=q" (pxo_ret__), "+m" (var)      \
+                   : "0" ((pxo_T__)(val)));            \
+               break;                                  \
+       case 2:                                         \
+               asm(op "w %0,"__percpu_arg(1)           \
+                   : "=r" (pxo_ret__), "+m" (var)      \
+                   : "0" ((pxo_T__)(val)));            \
+               break;                                  \
+       case 4:                                         \
+               asm(op "l %0,"__percpu_arg(1)           \
+                   : "=r" (pxo_ret__), "+m" (var)      \
+                   : "0" ((pxo_T__)(val)));            \
+               break;                                  \
+       case 8:                                         \
+               asm(op "q %0,"__percpu_arg(1)           \
+                   : "=r" (pxo_ret__), "+m" (var)      \
+                   : "0" ((pxo_T__)(val)));            \
+               break;                                  \
+       default: __bad_percpu_size();                   \
+       }                                               \
+       pxo_ret__;      /* whatever op left in the register that carried val in */ \
+})
+
+#define percpu_xchg(var, val)          percpu_exchange_op("xchg", var, val)
+#define percpu_xadd(var, val)          percpu_exchange_op("xadd", var, val)
+
+#endif /* _ASM_X86_XEN_PERCPU_H */
diff --git a/arch/x86/include/mach-xen/asm/perf_event.h b/arch/x86/include/mach-xen/asm/perf_event.h

new file mode 100644 (file)

index 0000000..6c784d1
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/perf_event.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_X86_PERF_EVENT_H
+#define _ASM_X86_PERF_EVENT_H
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT      (1UL << 3)
+
+#define perf_instruction_pointer(regs) instruction_pointer(regs)
+
+#define perf_misc_flags(regs) ({ \
+       struct pt_regs *_r_ = (regs); \
+       unsigned long _f_ = user_mode(_r_) ? PERF_RECORD_MISC_USER \
+                                          : PERF_RECORD_MISC_KERNEL; \
+       _r_->flags & PERF_EFLAGS_EXACT ? _f_ | PERF_RECORD_MISC_EXACT_IP : _f_; \
+})
+
+#include <asm/stacktrace.h>
+
+/*
+ * We abuse bit 3 from flags to pass exact information, see perf_misc_flags
+ * and the comment with PERF_EFLAGS_EXACT.
+ */
+#define perf_arch_fetch_caller_regs(regs, __ip)                {       \
+       (regs)->ip = (__ip);                                    \
+       (regs)->bp = caller_frame_pointer();                    \
+       (regs)->cs = __KERNEL_CS;                               \
+       (regs)->flags = 0;      /* parenthesised like the other uses; also clears PERF_EFLAGS_EXACT */ \
+       asm volatile(                                           \
+               _ASM_MOV "%%"_ASM_SP ", %0\n"   /* current stack pointer -> (regs)->sp */ \
+               : "=m" ((regs)->sp)                             \
+               :: "memory"                                     \
+       );                                                      \
+}
+
+#endif
+
+#endif /* _ASM_X86_PERF_EVENT_H */
diff --git a/arch/x86/include/mach-xen/asm/pgalloc.h b/arch/x86/include/mach-xen/asm/pgalloc.h

new file mode 100644 (file)

index 0000000..3879075
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgalloc.h
@@ -0,0 +1,159 @@
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h>          /* for struct page */
+#include <linux/pagemap.h>
+
+#include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
+
+static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
+static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
+
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+                                           unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
+
+#ifdef CONFIG_X86_64
+void early_make_page_readonly(void *va, unsigned int feature);
+pmd_t *early_get_pmd(unsigned long va);
+#define make_lowmem_page_readonly make_page_readonly
+#define make_lowmem_page_writable make_page_writable
+#endif
+
+/*
+ * Flags to use when allocating a user page table page.
+ */
+extern gfp_t __userpte_alloc_gfp;
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
+/* Should really implement gc for free page table pages. This could be
+   done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{ /* Makes the pte page writable again, then frees it; mm is unused. */
+       BUG_ON((unsigned long)pte & (PAGE_SIZE-1));     /* pte must be page aligned */
+       make_lowmem_page_writable(pte, XENFEAT_writable_page_tables);
+       free_page((unsigned long)pte);
+}
+
+extern void __pte_free(pgtable_t);
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+       __pte_free(pte);
+}
+
+extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
+                                 unsigned long address)
+{
+       ___pte_free_tlb(tlb, pte);
+}
+
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+                                      pmd_t *pmd, pte_t *pte)
+{
+       paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
+       set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+                               struct page *pte)
+{ /* Points *pmd at the pte page, using set_pmd() only when the pmd's page is pinned. */
+       unsigned long pfn = page_to_pfn(pte);
+       pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE);
+
+       paravirt_alloc_pte(mm, pfn);
+       if (PagePinned(virt_to_page(pmd))) {    /* page holding pmd is pinned */
+#ifndef CONFIG_HIGHPTE
+               BUG_ON(PageHighMem(pte));
+#endif
+               set_pmd(pmd, ent);
+       } else                                  /* not pinned: plain store */
+               *pmd = ent;
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr);
+extern void __pmd_free(pgtable_t);
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+       BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+       __pmd_free(virt_to_page(pmd));
+}
+
+extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+                                 unsigned long address)
+{
+       ___pmd_free_tlb(tlb, pmd);
+}
+
+#ifdef CONFIG_X86_PAE
+extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
+#else  /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{ /* Points *pud at the pmd page, using set_pud() only when the pud's page is pinned. */
+       pud_t ent = __pud(_PAGE_TABLE | __pa(pmd));
+
+       paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+       if (PagePinned(virt_to_page(pud)))      /* page holding pud is pinned */
+               set_pud(pud, ent);
+       else                                    /* not pinned: plain store */
+               *pud = ent;
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{ /* Points *pgd at the pud page; a pinned pgd page is updated via xen_l4_entry_update(). */
+       pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud));
+
+       paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
+       if (unlikely(PagePinned(virt_to_page(pgd))))
+               xen_l4_entry_update(pgd, ent);
+       else
+               *__user_pgd(pgd) = *pgd = ent;  /* same entry stored in pgd and in its __user_pgd() slot */
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+       return (pud_t *)pmd_alloc_one(mm, addr);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+       BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+       __pmd_free(virt_to_page(pud));
+}
+
+extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+                                 unsigned long address)
+{
+       ___pud_free_tlb(tlb, pud);
+}
+
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level.h b/arch/x86/include/mach-xen/asm/pgtable-3level.h

new file mode 100644 (file)

index 0000000..71e906c
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h
@@ -0,0 +1,152 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_H
+#define _ASM_X86_PGTABLE_3LEVEL_H
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e)                                                   \
+       printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n",                \
+               __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e)                                                   \
+       printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n",                \
+              __FILE__, __LINE__, &(e), __pmd_val(e),                  \
+              (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e)                                                   \
+       printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n",                \
+              __FILE__, __LINE__, &(e), __pgd_val(e),                  \
+              (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte.  In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it.  -ben
+ */
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{ /* Stores the two 32-bit halves of pte separately, high half first. */
+       ptep->pte_high = pte.pte_high;  /* high half first ... */
+       smp_wmb();                      /* ... ordered before ... */
+       ptep->pte_low = pte.pte_low;    /* ... the low half */
+}
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+       xen_l2_entry_update(pmdp, pmd);
+}
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+       xen_l3_entry_update(pudp, pud);
+}
+
+/*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a write
+ * barrier (smp_wmb()) before clearing the top half.
+ */
+static inline void __xen_pte_clear(pte_t *ptep)
+{
+       ptep->pte_low = 0;
+       smp_wmb();
+       ptep->pte_high = 0;
+}
+
+#define xen_pmd_clear(pmd)                     \
+({                                             \
+       pmd_t *__pmdp = (pmd);                  \
+       PagePinned(virt_to_page(__pmdp))        \
+       ? set_pmd(__pmdp, __pmd(0))             \
+       : (void)(*__pmdp = __pmd(0));           \
+})
+
+static inline void __xen_pud_clear(pud_t *pudp)
+{
+       set_pud(pudp, __pud(0));
+
+       /*
+        * According to Intel App note "TLBs, Paging-Structure Caches,
+        * and Their Invalidation", April 2007, document 317080-001,
+        * section 8.1: in PAE mode we explicitly have to flush the
+        * TLB via cr3 if the top-level pgd is changed...
+        *
+        * Currently all places where pud_clear() is called either have
+        * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
+        * pud_clear_bad()), so we don't need TLB flush here.
+        */
+}
+
+#define xen_pud_clear(pudp)                    \
+({                                             \
+       pud_t *__pudp = (pudp);                 \
+       PagePinned(virt_to_page(__pudp))        \
+       ? __xen_pud_clear(__pudp)               \
+       : (void)(*__pudp = __pud(0));           \
+})
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{ /* Zeroes *ptep and returns what it held; res is the caller's expected old value. */
+       uint64_t val = __pte_val(res);
+       if (__cmpxchg64(&ptep->pte, val, 0) != val) {   /* entry no longer matched res */
+               /* xchg acts as a barrier before the setting of the high bits */
+               res.pte_low = xchg(&ptep->pte_low, 0);
+               res.pte_high = ptep->pte_high;
+               ptep->pte_high = 0;
+       }
+       return res;
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
+                        ((_pte).pte_high << (32-PAGE_SHIFT)))
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_SMP
+union split_pmd {
+       struct {
+               u32 pmd_low;
+               u32 pmd_high;
+       };
+       pmd_t pmd;
+};
+static inline pmd_t xen_pmdp_get_and_clear(pmd_t *pmdp)
+{ /* Zeroes *pmdp in two 32-bit steps (low half first) and returns the old halves. */
+       union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+       /* xchg acts as a barrier before setting of the high bits */
+       res.pmd_low = xchg(&orig->pmd_low, 0);
+       res.pmd_high = orig->pmd_high;
+       orig->pmd_high = 0;
+
+       return res.pmd;
+}
+#else
+#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
+#endif
+#endif
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off)                                              \
+       ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
+#define PTE_FILE_MAX_BITS       32
+
+/* Encode and de-code a swap entry */
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
+#define __swp_type(x)                  (((x).val) & 0x1f)
+#define __swp_offset(x)                        ((x).val >> 5)
+#define __swp_entry(type, offset)      ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte)                ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x)          ((pte_t){ { .pte_high = (x).val } })
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level_types.h b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h

new file mode 100644 (file)

index 0000000..36d6f2b
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h
@@ -0,0 +1,44 @@
+#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+typedef u64    pteval_t;
+typedef u64    pmdval_t;
+typedef u64    pudval_t;
+typedef u64    pgdval_t;
+typedef u64    pgprotval_t;
+
+typedef union {
+       struct {
+               unsigned long pte_low, pte_high;
+       };
+       pteval_t pte;
+} pte_t;
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD      0
+
+#define PAGETABLE_LEVELS       3
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT    30
+#define PTRS_PER_PGD   4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT      21
+#define PTRS_PER_PMD   512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE   512
+
+
+#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable.h b/arch/x86/include/mach-xen/asm/pgtable.h

new file mode 100644 (file)

index 0000000..cd43083
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable.h
@@ -0,0 +1,885 @@
+#ifndef _ASM_X86_PGTABLE_H
+#define _ASM_X86_PGTABLE_H
+
+#include <asm/page.h>
+#include <asm/e820.h>
+
+#include <asm/pgtable_types.h>
+
+/*
+ * Macro to mark a page protection value as UC-
+ */
+#define pgprot_noncached(prot)                                 \
+       ((boot_cpu_data.x86 > 3)                                \
+        ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS))  \
+        : (prot))
+
+#ifndef __ASSEMBLY__
+
+#include <asm/x86_init.h>
+
+/*
+ * ZERO_PAGE is a global shared page that is always zero: used
+ * for zero-mapped memory areas etc..
+ */
+extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+extern struct mm_struct *pgd_page_get_mm(struct page *page);
+
+#define set_pte(ptep, pte)             xen_set_pte(ptep, pte)
+#define set_pte_at(mm, addr, ptep, pte)        xen_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd)        xen_set_pmd_at(mm, addr, pmdp, pmd)
+
+#define set_pmd(pmdp, pmd)             xen_set_pmd(pmdp, pmd)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define set_pgd(pgdp, pgd)             xen_set_pgd(pgdp, pgd)
+#define pgd_clear(pgd)                 xen_pgd_clear(pgd)
+#endif
+
+#ifndef set_pud
+# define set_pud(pudp, pud)            xen_set_pud(pudp, pud)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pud_clear(pud)                 xen_pud_clear(pud)
+#endif
+
+#define pte_clear(mm, addr, ptep)      xen_pte_clear(mm, addr, ptep)
+#define pmd_clear(pmd)                 xen_pmd_clear(pmd)
+
+#define pte_update(mm, addr, ptep)              do { } while (0)
+#define pte_update_defer(mm, addr, ptep)        do { } while (0)
+#define pmd_update(mm, addr, ptep)              do { } while (0)
+#define pmd_update_defer(mm, addr, ptep)        do { } while (0)
+
+#define pgd_val(x)     xen_pgd_val(x)
+#define __pgd(x)       xen_make_pgd(x)
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define pud_val(x)     xen_pud_val(x)
+#define __pud(x)       xen_make_pud(x)
+#endif
+
+#ifndef __PAGETABLE_PMD_FOLDED
+#define pmd_val(x)     xen_pmd_val(x)
+#define __pmd(x)       xen_make_pmd(x)
+#endif
+
+#define pte_val(x)     xen_pte_val(x)
+#define __pte(x)       xen_make_pte(x)
+
+#define arch_end_context_switch(prev)  do {} while(0)
+
+/*
+ * The following only work if pte_present() is true.
+ * Undefined behaviour if not..
+ */
+static inline int pte_dirty(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_DIRTY;
+}
+
+static inline int pte_young(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_ACCESSED;
+}
+
+static inline int pmd_young(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
+static inline int pte_write(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_RW;
+}
+
+static inline int pte_file(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_FILE;
+}
+
+static inline int pte_huge(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_PSE;
+}
+
+static inline int pte_global(pte_t pte)
+{
+       return 0;       /* unconditional: no flag bit is examined */
+}
+
+static inline int pte_exec(pte_t pte)
+{
+       return !(pte_flags(pte) & _PAGE_NX);    /* 1 when _PAGE_NX is clear */
+}
+
+static inline int pte_special(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_SPECIAL;
+}
+
+#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
+       __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
+#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \
+                      (_pte).pte_low & _PAGE_PRESENT ?           \
+                      mfn_to_local_pfn(__pte_mfn(_pte)) :        \
+                      __pte_mfn(_pte))
+
+#define pte_page(pte)  pfn_to_page(pte_pfn(pte))
+
+static inline unsigned long pmd_pfn(pmd_t pmd)
+{
+       return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
+static inline int pmd_large(pmd_t pte)
+{
+       return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+               (_PAGE_PSE | _PAGE_PRESENT);    /* both bits must be set */
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+       return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+       return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+       return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
+{
+       pteval_t v = __pte_val(pte);
+
+       return __pte_ma(v | set);
+}
+
+static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
+{
+       pteval_t v = __pte_val(pte);
+
+       return __pte_ma(v & ~clear);
+}
+
+static inline pte_t pte_mkclean(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkold(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkexec(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_NX);
+}
+
+static inline pte_t pte_mkdirty(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_DIRTY);
+}
+
+static inline pte_t pte_mkyoung(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_ACCESSED);
+}
+
+static inline pte_t pte_mkwrite(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_RW);
+}
+
+static inline pte_t pte_mkhuge(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_clrhuge(pte_t pte)
+{
+       return pte_clear_flags(pte, _PAGE_PSE);
+}
+
+static inline pte_t pte_mkglobal(pte_t pte)
+{
+       return pte;     /* returned unchanged: no flag is set here */
+}
+
+static inline pte_t pte_clrglobal(pte_t pte)
+{
+       return pte;     /* returned unchanged: no flag is cleared here */
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+       return pte_set_flags(pte, _PAGE_SPECIAL);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+       pmdval_t v = native_pmd_val(pmd);
+
+       return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+       pmdval_t v = native_pmd_val(pmd);
+
+       return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+       return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+       return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+
+/*
+ * Mask out unsupported bits in a present pgprot.  Non-present pgprots
+ * can use those bits for other purposes, so leave them be.
+ */
+static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
+{
+       pgprotval_t protval = pgprot_val(pgprot);
+
+       if (protval & _PAGE_PRESENT)
+               protval &= __supported_pte_mask;
+
+       return protval;
+}
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+       return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
+                    massage_pgprot(pgprot));
+}
+
+static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot)
+{
+       return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot));
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+       return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
+                    massage_pgprot(pgprot));
+}
+
+static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+{
+       pteval_t val = pte_val(pte) & _PAGE_CHG_MASK;
+
+       val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
+
+       return __pte(val);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+       pmdval_t val = pmd_val(pmd);
+
+       val &= _HPAGE_CHG_MASK;
+       val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+       return __pmd(val);
+}
+#endif
+
+/* mprotect needs to preserve PAT bits when updating vm_page_prot */
+#define pgprot_modify pgprot_modify
+static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
+{
+       pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
+       pgprotval_t addbits = pgprot_val(newprot);
+       return __pgprot(preservebits | addbits);
+}
+
+#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
+
+#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+
+static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
+                                        unsigned long flags,
+                                        unsigned long new_flags)
+{
+       /*
+        * PAT type is always WB for untracked ranges, so no need to check.
+        */
+       if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
+               return 1;
+
+       /*
+        * Certain new memtypes are not allowed with certain
+        * requested memtype:
+        * - request is uncached, return cannot be write-back
+        * - request is write-combine, return cannot be write-back
+        */
+       if ((flags == _PAGE_CACHE_UC_MINUS &&
+            new_flags == _PAGE_CACHE_WB) ||
+           (flags == _PAGE_CACHE_WC &&
+            new_flags == _PAGE_CACHE_WB)) {
+               return 0;
+       }
+
+       return 1;
+}
+
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
+#endif /* __ASSEMBLY__ */
+
+#ifdef CONFIG_X86_32
+# include "pgtable_32.h"
+#else
+# include "pgtable_64.h"
+#endif
+
+#ifndef __ASSEMBLY__
+#include <linux/mm_types.h>
+
+static inline int pte_none(pte_t pte)
+{
+       return !pte.pte;
+}
+
+#define __HAVE_ARCH_PTE_SAME
+static inline int pte_same(pte_t a, pte_t b)
+{
+       return a.pte == b.pte;
+}
+
+static inline int pte_present(pte_t a)
+{
+       return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+}
+
+static inline int pte_hidden(pte_t pte)
+{
+       return pte_flags(pte) & _PAGE_HIDDEN;
+}
+
+static inline int pmd_present(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t.
+   (presumably Xen's writable page tables) can temporarily clear it. */
+       return __pmd_val(pmd) != 0;
+#else
+       return pmd_flags(pmd) & _PAGE_PRESENT;
+#endif
+}
+
+static inline int pmd_none(pmd_t pmd)
+{
+       /* Only check low word on 32-bit platforms, since it might be
+          out of sync with upper half. */
+       return (unsigned long)__pmd_val(pmd) == 0;
+}
+
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+       return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pmd_page(pmd)  pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+/*
+ * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
+ *
+ * this function returns the index of the entry in the pmd page which would
+ * control the given virtual address
+ */
+static inline unsigned long pmd_index(unsigned long address)
+{
+       return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
+}
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ *
+ * (Currently stuck as a macro because of indirect forward reference
+ * to linux/mm.h:page_to_nid())
+ */
+#define mk_pte(page, pgprot)   pfn_pte(page_to_pfn(page), (pgprot))
+
+/*
+ * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
+ *
+ * this function returns the index of the entry in the pte page which would
+ * control the given virtual address
+ */
+static inline unsigned long pte_index(unsigned long address)
+{
+       return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+}
+
+static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
+{
+       return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+       return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT)
+              != (_KERNPG_TABLE & ~_PAGE_PRESENT);
+#else
+       return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+#endif
+}
+
+static inline unsigned long pages_to_mb(unsigned long npg)
+{
+       return npg >> (20 - PAGE_SHIFT);        /* 2^20 bytes per MB */
+}
+
+#define io_remap_pfn_range(vma, vaddr, pfn, size, prot)        \
+       direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO)
+
+#if PAGETABLE_LEVELS > 2
+static inline int pud_none(pud_t pud)
+{
+       return __pud_val(pud) == 0;
+}
+
+static inline int pud_present(pud_t pud)
+{
+       return pud_flags(pud) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+       return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pud_page(pud)          pfn_to_page(pud_val(pud) >> PAGE_SHIFT)
+
+/* Find an entry in the second-level page table.. */
+static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
+{
+       return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
+}
+
+static inline int pud_large(pud_t pud)
+{
+       return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
+               (_PAGE_PSE | _PAGE_PRESENT);
+}
+
+static inline int pud_bad(pud_t pud)
+{
+       return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#else
+static inline int pud_large(pud_t pud)
+{
+       return 0;       /* always 0 when PAGETABLE_LEVELS <= 2 */
+}
+#endif /* PAGETABLE_LEVELS > 2 */
+
+#if PAGETABLE_LEVELS > 3
+static inline int pgd_present(pgd_t pgd)
+{
+       return pgd_flags(pgd) & _PAGE_PRESENT;
+}
+
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+       return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define pgd_page(pgd)          pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
+
+/* to find an entry in a page-table-directory. */
+static inline unsigned long pud_index(unsigned long address)
+{
+       return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
+{
+       return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+}
+
+static inline int pgd_bad(pgd_t pgd)
+{
+       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+}
+
+static inline int pgd_none(pgd_t pgd)
+{
+       return !__pgd_val(pgd);
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
+ *
+ * this macro returns the index of the entry in the pgd page which would
+ * control the given virtual address
+ */
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
+
+/*
+ * pgd_offset() returns a (pgd_t *)
+ * pgd_index() is used to get the offset into the pgd page's array of pgd_t's
+ */
+#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+/*
+ * a shortcut which implies the use of the kernel's pgd, instead
+ * of a process's
+ */
+#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
+
+
+#define KERNEL_PGD_BOUNDARY    pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS                (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
+#ifndef __ASSEMBLY__
+
+#define direct_gbpages 0
+
+/* local pte updates need not use xchg for locking */
+static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
+{
+       xen_set_pte(ptep, __pte(0));
+       return res;     /* caller-supplied value; *ptep is not read here */
+}
+
+static inline pmd_t xen_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+       pmd_t res = *pmdp;
+
+       xen_set_pmd(pmdp, __pmd(0));
+       return res;
+}
+
+static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
+                                 pte_t *ptep , pte_t pte)
+{
+       if ((mm != current->mm && mm != &init_mm) ||    /* neither current nor init mm */
+           HYPERVISOR_update_va_mapping(addr, pte, 0)) /* or hypercall returned non-zero */
+               xen_set_pte(ptep, pte);                 /* then write through ptep instead */
+}
+
+static inline void xen_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+                                 pmd_t *pmdp , pmd_t pmd)
+{
+       xen_set_pmd(pmdp, pmd);
+}
+
+static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr,
+                                pte_t *ptep)
+{
+       if ((mm != current->mm && mm != &init_mm)       /* neither current nor init mm */
+           || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) /* or hypercall returned non-zero */
+               __xen_pte_clear(ptep);                  /* then clear through ptep instead */
+}
+
+#ifndef CONFIG_PARAVIRT
+/*
+ * Rules for using pte_update - it must be called after any PTE update which
+ * has not been done using the set_pte / clear_pte interfaces.  It is used by
+ * shadow mode hypervisors to resynchronize the shadow page tables.  Kernel PTE
+ * updates should either be sets, clears, or set_pte_atomic for P->P
+ * transitions, which means this hook should only be called for user PTEs.
+ * This hook implies a P->P protection or access change has taken place, which
+ * requires a subsequent TLB flush.  The notification can optionally be delayed
+ * until the TLB flush event by using the pte_update_defer form of the
+ * interface, but care must be taken to assure that the flush happens while
+ * still holding the same page table lock so that the shadow and primary pages
+ * do not become out of sync on SMP.
+ */
+#define pte_update(mm, addr, ptep)             do { } while (0)
+#define pte_update_defer(mm, addr, ptep)       do { } while (0)
+#endif
+
+/*
+ * We only update the dirty/accessed state if we set
+ * the dirty bit by hand in the kernel, since the hardware
+ * will do the accessed bit for us, and we don't want to
+ * race with other CPUs that might be updating the dirty
+ * bit at the same time.
+ */
+struct vm_area_struct;
+
+#define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pte_t *ptep,
+                                pte_t entry, int dirty);
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+                                    unsigned long addr, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pte_t *ptep);
+
+#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
+#define ptep_clear_flush(vma, addr, ptep)                      \
+({                                                             \
+       pte_t *__ptep = (ptep);                                 \
+       pte_t __res = *__ptep;                                  \
+       if (!pte_none(__res) &&                                 \
+           ((vma)->vm_mm != current->mm ||                     \
+            HYPERVISOR_update_va_mapping(addr, __pte(0),       \
+                       uvm_multi(mm_cpumask((vma)->vm_mm)) |   \
+                               UVMF_INVLPG))) {                \
+               __xen_pte_clear(__ptep);                        \
+               flush_tlb_page(vma, addr);                      \
+       }                                                       \
+       __res;                                                  \
+})
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                                      pte_t *ptep)
+{
+       pte_t pte = *ptep;
+       if (!pte_none(pte)
+           && (mm != &init_mm
+               || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+               pte = xen_ptep_get_and_clear(ptep, pte);
+               pte_update(mm, addr, ptep);
+       }
+       return pte;
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
+#define ptep_get_and_clear_full(mm, addr, ptep, full)          \
+       ((full) ? ({                                            \
+               pte_t *__ptep = (ptep);                         \
+               pte_t __res = *__ptep;                          \
+               if (!PagePinned(virt_to_page((mm)->pgd)))       \
+                       __xen_pte_clear(__ptep);                \
+               else if (!pte_none(__res))                      \
+                       xen_l1_entry_update(__ptep, __pte(0));  \
+               __res;                                          \
+        }) :                                                   \
+        ptep_get_and_clear(mm, addr, ptep))
+
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int);
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+                                     unsigned long addr, pte_t *ptep)
+{
+       pte_t pte = *ptep;
+       if (pte_write(pte))     /* entries without _PAGE_RW are left untouched */
+               set_pte_at(mm, addr, ptep, pte_wrprotect(pte));
+}
+
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+
+#define mk_pmd(page, pgprot)   pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+                                unsigned long address, pmd_t *pmdp,
+                                pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                                    unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                                 unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+                                unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+       return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+                                      pmd_t *pmdp)
+{
+       pmd_t pmd = xen_pmdp_get_and_clear(pmdp);
+       pmd_update(mm, addr, pmdp);
+       return pmd;
+}
+#endif
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+                                     unsigned long addr, pmd_t *pmdp)
+{
+       clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); /* clears RW in place in *pmdp */
+       pmd_update(mm, addr, pmdp);     /* defined as an empty statement in this header */
+}
+#endif
+
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ *  dst - pointer to pgd range anywhere on a pgd page
+ *  src - ""
+ *  count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+       memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+#define arbitrary_virt_to_mfn(va)                                      \
+({                                                                     \
+       unsigned int __lvl;                                             \
+       pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl);    \
+       BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\
+       pte_mfn(*__ptep);                                               \
+})
+
+#define arbitrary_virt_to_machine(va)                                  \
+       (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT)             \
+        | ((unsigned long)(va) & (PAGE_SIZE - 1)))
+
+#ifdef CONFIG_HIGHPTE
+#include <asm/io.h>
+struct page *kmap_atomic_to_page(void *);
+#define ptep_to_machine(ptep)                                          \
+({                                                                     \
+       pte_t *__ptep = (ptep);                                         \
+       page_to_phys(kmap_atomic_to_page(__ptep))                       \
+               | ((unsigned long)__ptep & (PAGE_SIZE - 1));            \
+})
+#else
+#define ptep_to_machine(ptep)  virt_to_machine(ptep)
+#endif
+
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+                                          pte_t *ptep)
+{
+#if CONFIG_XEN_COMPAT < 0x030300
+       if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad)))
+               return ptep_get_and_clear(mm, addr, ptep);
+#endif
+       return *ptep;
+}
+
+static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+                                          pte_t *ptep, pte_t pte)
+{
+       mmu_update_t u;
+
+#if CONFIG_XEN_COMPAT < 0x030300
+       if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) {
+               set_pte_at(mm, addr, ptep, pte);        /* feature not reported: plain update */
+               return;
+       }
+#endif
+       u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD;
+       u.val = __pte_val(pte);
+       if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF))
+               BUG();  /* a non-zero hypercall return is treated as fatal */
+}
+
+#include <asm-generic/pgtable.h>
+
+#include <xen/features.h>
+void make_page_readonly(void *va, unsigned int feature);
+void make_page_writable(void *va, unsigned int feature);
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature);
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature);
+
+struct vm_area_struct;
+
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+                           unsigned long address,
+                           phys_addr_t mfn,
+                           unsigned long size,
+                           pgprot_t prot,
+                           domid_t  domid);
+int direct_kernel_remap_pfn_range(unsigned long address,
+                                 unsigned long mfn,
+                                 unsigned long size,
+                                 pgprot_t prot,
+                                 domid_t  domid);
+int create_lookup_pte_addr(struct mm_struct *mm,
+                           unsigned long address,
+                           uint64_t *ptep);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable_32.h b/arch/x86/include/mach-xen/asm/pgtable_32.h

new file mode 100644 (file)

index 0000000..7d89873
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_32.h
@@ -0,0 +1,89 @@
+#ifndef _ASM_X86_PGTABLE_32_H
+#define _ASM_X86_PGTABLE_32_H
+
+#include <asm/pgtable_32_types.h>
+
+/*
+ * The Linux memory management assumes a three-level page table setup. On
+ * the i386, we use that, but "fold" the mid level into the top-level page
+ * table, so that we physically have the same two-level page table as the
+ * i386 mmu expects.
+ *
+ * This file contains the functions and defines necessary to modify and use
+ * the i386 page table tree.
+ */
+#ifndef __ASSEMBLY__
+#include <asm/processor.h>
+#include <asm/fixmap.h>
+#include <linux/threads.h>
+
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+struct vm_area_struct;
+
+extern pgd_t *swapper_pg_dir;
+extern pgd_t initial_page_table[1024];
+
+static inline void pgtable_cache_init(void) { }
+static inline void check_pgt_cache(void) { }
+void paging_init(void);
+
+extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
+
+
+/*
+ * Define this if things work differently on an i386 and an i486:
+ * it will (on an i486) warn about kernel memory accesses that are
+ * done without an 'access_ok(VERIFY_WRITE,..)'
+ */
+#undef TEST_ACCESS_OK
+
+#ifdef CONFIG_X86_PAE
+# include <asm/pgtable-3level.h>
+#else
+# include <asm/pgtable-2level.h>
+#endif
+
+#if defined(CONFIG_HIGHPTE)
+#define pte_offset_map(dir, address)                                   \
+       ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) +           \
+        pte_index((address)))
+#define pte_unmap(pte) kunmap_atomic((pte))
+#else
+#define pte_offset_map(dir, address)                                   \
+       ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
+#define pte_unmap(pte) do { } while (0)
+#endif
+
+/* Clear the kernel PTE mapping vaddr and flush it from the TLB (ptep is unused) */
+#define kpte_clear_flush(ptep, vaddr)                                  \
+do {                                                                   \
+       if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
+               BUG(); \
+} while (0)
+
+/*
+ * The i386 doesn't have any external MMU info: the kernel page
+ * tables contain all the necessary information.
+ */
+#define update_mmu_cache(vma, address, ptep) do { } while (0)
+
+void make_lowmem_page_readonly(void *va, unsigned int feature);
+void make_lowmem_page_writable(void *va, unsigned int feature);
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * kern_addr_valid() is (1) for FLATMEM and (0) for
+ * SPARSEMEM and DISCONTIGMEM
+ */
+#ifdef CONFIG_FLATMEM
+#define kern_addr_valid(addr)  (1)
+#else
+#define kern_addr_valid(kaddr) (0)
+#endif
+
+#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable_64.h b/arch/x86/include/mach-xen/asm/pgtable_64.h

new file mode 100644 (file)

index 0000000..f58b2ef
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64.h
@@ -0,0 +1,203 @@
+#ifndef _ASM_X86_PGTABLE_64_H
+#define _ASM_X86_PGTABLE_64_H
+
+#include <linux/const.h>
+#include <asm/pgtable_64_types.h>
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the x86-64 page table tree.
+ */
+#include <asm/processor.h>
+#include <linux/bitops.h>
+#include <linux/threads.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_XEN
+extern pud_t level3_user_pgt[512];
+
+extern void xen_init_pt(void);
+extern void xen_switch_pt(void);
+#endif
+
+extern pud_t level3_kernel_pgt[512];
+extern pud_t level3_ident_pgt[512];
+extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
+extern pgd_t init_level4_pgt[];
+
+#define swapper_pg_dir init_level4_pgt
+
+extern void paging_init(void);
+
+#define pte_ERROR(e)   /* KERN_ERR: log file:line, address, raw value, pfn */ \
+       printk(KERN_ERR "%s:%d: bad pte %p(%016lx pfn %010lx).\n",      \
+              __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e))
+#define pmd_ERROR(e)   /* as pte_ERROR, for a PMD entry */             \
+       printk(KERN_ERR "%s:%d: bad pmd %p(%016lx pfn %010lx).\n",      \
+              __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e))
+#define pud_ERROR(e)   /* as pte_ERROR; pfn is derived from pud_val() */ \
+       printk(KERN_ERR "%s:%d: bad pud %p(%016lx pfn %010Lx).\n",      \
+              __FILE__, __LINE__, &(e), __pud_val(e),                  \
+              (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+#define pgd_ERROR(e)   /* as pte_ERROR; pfn is derived from pgd_val() */ \
+       printk(KERN_ERR "%s:%d: bad pgd %p(%016lx pfn %010Lx).\n",      \
+              __FILE__, __LINE__, &(e), __pgd_val(e),                  \
+              (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT)
+
+struct mm_struct;
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
+
+
+#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0))
+
+static inline void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+       *ptep = pte;
+}
+
+static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+       xen_l2_entry_update(pmdp, pmd);
+}
+
+#define xen_pmd_clear(pmd)     /* zero one PMD entry */ \
+({                                             \
+       pmd_t *__pmdp = (pmd);  /* evaluate the argument once */ \
+       PagePinned(virt_to_page(__pmdp))        /* pinned page: go via set_pmd() */ \
+       ? set_pmd(__pmdp, xen_make_pmd(0))      \
+       : (void)(*__pmdp = xen_make_pmd(0));    /* otherwise a plain store */ \
+})
+
+#ifdef CONFIG_SMP
+static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret)
+{
+       return __pte_ma(xchg(&xp->pte, 0));
+}
+#else
+#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_SMP
+static inline pmd_t xen_pmdp_get_and_clear(pmd_t *xp)
+{
+       return xen_make_pmd(xchg(&xp->pmd, 0));
+}
+#else
+#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp)
+#endif
+#endif
+
+static inline void xen_set_pud(pud_t *pudp, pud_t pud)
+{
+       xen_l3_entry_update(pudp, pud);
+}
+
+#define xen_pud_clear(pud)     /* zero one PUD entry */ \
+({                                             \
+       pud_t *__pudp = (pud);  /* evaluate the argument once */ \
+       PagePinned(virt_to_page(__pudp))        /* pinned page: go via set_pud() */ \
+       ? set_pud(__pudp, xen_make_pud(0))      \
+       : (void)(*__pudp = xen_make_pud(0));    /* otherwise a plain store */ \
+})
+
+static inline pgd_t *__user_pgd(pgd_t *pgd)
+{
+       unsigned long addr = (unsigned long)pgd;
+       /* init_level4_pgt's page yields NULL; else ->private + in-page offset. */
+       if (unlikely((addr & PAGE_MASK) == (unsigned long)init_level4_pgt))
+               return NULL;
+       return (pgd_t *)(virt_to_page(pgd)->private + (addr & ~PAGE_MASK));
+}
+
+static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       xen_l4_entry_update(pgdp, pgd);
+}
+
+#define xen_pgd_clear(pgd)     /* zero one PGD entry */ \
+({     /* NOTE(review): __user_pgd() can return NULL - confirm the unpinned arm never sees that */ \
+       pgd_t *__pgdp = (pgd);  /* evaluate the argument once */ \
+       PagePinned(virt_to_page(__pgdp))        /* pinned page: use the update call */ \
+       ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \
+       : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \
+})
+
+#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT)
+
+extern unsigned long early_arbitrary_virt_to_mfn(void *va);
+
+extern void sync_global_pgds(unsigned long start, unsigned long end);
+
+/*
+ * Conversion functions: convert a page and protection to a page entry,
+ * and a page entry and page directory to the page they refer to.
+ */
+
+/*
+ * Level 4 access.
+ */
+static inline int pgd_large(pgd_t pgd) { return 0; }
+#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
+
+/* PUD - Level3 access */
+
+/* PMD  - Level 2 access */
+#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |    \
+                                           _PAGE_FILE })
+#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
+
+/* PTE - Level 1 access. */
+
+/* x86-64 always has all page tables mapped. */
+#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
+#define pte_unmap(pte) ((void)(pte))/* NOP */
+
+#define update_mmu_cache(vma, address, ptep) do { } while (0)
+
+/* Encode and de-code a swap entry */
+#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+#else
+#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
+#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
+#endif
+
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+
+#define __swp_type(x)                  (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+                                        & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x)                        ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_entry(type, offset)      ((swp_entry_t) { \
+                                        ((type) << (_PAGE_BIT_PRESENT + 1)) \
+                                        | ((offset) << SWP_OFFSET_SHIFT) })
+#define __pte_to_swp_entry(pte)                ((swp_entry_t) { __pte_val(pte) })
+#define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
+
+extern int kern_addr_valid(unsigned long addr);
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
+#define pgtable_cache_init()   do { } while (0)
+#define check_pgt_cache()      do { } while (0)
+
+#define PAGE_AGP    PAGE_KERNEL_NOCACHE
+#define HAVE_PAGE_AGP 1
+
+/* fs/proc/kcore.c */
+#define        kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
+#define        kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
+
+#define __HAVE_ARCH_PTE_SAME
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable_64_types.h b/arch/x86/include/mach-xen/asm/pgtable_64_types.h

new file mode 100644 (file)

index 0000000..c4c4665
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_64_types.h
@@ -0,0 +1,64 @@
+#ifndef _ASM_X86_PGTABLE_64_DEFS_H
+#define _ASM_X86_PGTABLE_64_DEFS_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/*
+ * These are used to make use of C type-checking..
+ */
+typedef unsigned long  pteval_t;
+typedef unsigned long  pmdval_t;
+typedef unsigned long  pudval_t;
+typedef unsigned long  pgdval_t;
+typedef unsigned long  pgprotval_t;
+
+typedef union { pteval_t pte; unsigned int pte_low; } pte_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#define SHARED_KERNEL_PMD      0
+#define PAGETABLE_LEVELS       4
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT    39
+#define PTRS_PER_PGD   512
+
+/*
+ * 3rd level page
+ */
+#define PUD_SHIFT      30
+#define PTRS_PER_PUD   512
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT      21
+#define PTRS_PER_PMD   512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE   512
+
+#define PMD_SIZE       (_AC(1, UL) << PMD_SHIFT)
+#define PMD_MASK       (~(PMD_SIZE - 1))
+#define PUD_SIZE       (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK       (~(PUD_SIZE - 1))
+#define PGDIR_SIZE     (_AC(1, UL) << PGDIR_SHIFT)
+#define PGDIR_MASK     (~(PGDIR_SIZE - 1))
+
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#define MAX_PHYSMEM_BITS 43
+#define MAXMEM          _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define VMALLOC_START    _AC(0xffffc90000000000, UL)
+#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START   _AC(0xffffea0000000000, UL)
+#define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
+#define MODULES_END      _AC(0xffffffffff000000, UL)
+#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
+
+#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/include/mach-xen/asm/pgtable_types.h b/arch/x86/include/mach-xen/asm/pgtable_types.h

new file mode 100644 (file)

index 0000000..d0ca475
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/pgtable_types.h
@@ -0,0 +1,392 @@
+#ifndef _ASM_X86_PGTABLE_DEFS_H
+#define _ASM_X86_PGTABLE_DEFS_H
+
+#include <linux/const.h>
+#include <asm/page_types.h>
+
+#define FIRST_USER_ADDRESS     0
+
+#define _PAGE_BIT_PRESENT      0       /* is present */
+#define _PAGE_BIT_RW           1       /* writeable */
+#define _PAGE_BIT_USER         2       /* userspace addressable */
+#define _PAGE_BIT_PWT          3       /* page write through */
+#define _PAGE_BIT_PCD          4       /* page cache disabled */
+#define _PAGE_BIT_ACCESSED     5       /* was accessed (raised by CPU) */
+#define _PAGE_BIT_DIRTY                6       /* was written to (raised by CPU) */
+#define _PAGE_BIT_PSE          7       /* 4 MB (or 2MB) page */
+#define _PAGE_BIT_PAT          7       /* on 4KB pages */
+#define _PAGE_BIT_GLOBAL       8       /* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1      9       /* available for programmer */
+#define _PAGE_BIT_IOMAP                10      /* flag used to indicate IO mapping */
+#define _PAGE_BIT_HIDDEN       11      /* hidden by kmemcheck */
+#define _PAGE_BIT_PAT_LARGE    12      /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SPECIAL      _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_CPA_TEST     _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING    _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
+#define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
+
+/* If _PAGE_BIT_PRESENT is clear, we use these: */
+/* - if the user mapped it with PROT_NONE; pte_present gives true */
+#define _PAGE_BIT_PROTNONE     _PAGE_BIT_GLOBAL
+/* - set: nonlinear file mapping, saved PTE; unset: swap */
+#define _PAGE_BIT_FILE         _PAGE_BIT_DIRTY
+
+#define _PAGE_PRESENT  (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
+#define _PAGE_RW       (_AT(pteval_t, 1) << _PAGE_BIT_RW)
+#define _PAGE_USER     (_AT(pteval_t, 1) << _PAGE_BIT_USER)
+#define _PAGE_PWT      (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
+#define _PAGE_PCD      (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
+#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
+#define _PAGE_DIRTY    (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
+#define _PAGE_PSE      (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
+#define _PAGE_GLOBAL   (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#define _PAGE_UNUSED1  (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
+#define _PAGE_IOMAP    (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_PAT      (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+#define _PAGE_SPECIAL  (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
+#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING        (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
+#define __HAVE_ARCH_PTE_SPECIAL
+
+#ifdef CONFIG_KMEMCHECK
+#define _PAGE_HIDDEN   (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
+#else
+#define _PAGE_HIDDEN   (_AT(pteval_t, 0))
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_NX       (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#else
+#define _PAGE_NX       (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_FILE     (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+
+#ifndef __ASSEMBLY__
+#if defined(CONFIG_X86_64) && CONFIG_XEN_COMPAT <= 0x030002
+extern unsigned int __kernel_page_user;
+#else
+#define __kernel_page_user 0
+#endif
+#endif
+
+#define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
+                        _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |    \
+                        _PAGE_DIRTY | __kernel_page_user)
+
+/* Set of bits not changed in pte_modify */
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \
+                        _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+
+/*
+ * PAT settings are part of the hypervisor interface, which sets the
+ * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]).
+ */
+#define _PAGE_CACHE_MASK       (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT)
+#define _PAGE_CACHE_WB         (0)
+#define _PAGE_CACHE_WT         (_PAGE_PWT)
+#define _PAGE_CACHE_WC         (_PAGE_PAT)
+#define _PAGE_CACHE_WP         (_PAGE_PAT | _PAGE_PWT)
+#define _PAGE_CACHE_UC_MINUS   (_PAGE_PCD)
+#define _PAGE_CACHE_UC         (_PAGE_PCD | _PAGE_PWT)
+
+#define PAGE_NONE      __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
+#define PAGE_SHARED    __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
+                                _PAGE_ACCESSED | _PAGE_NX)
+
+#define PAGE_SHARED_EXEC       __pgprot(_PAGE_PRESENT | _PAGE_RW |     \
+                                        _PAGE_USER | _PAGE_ACCESSED)
+#define PAGE_COPY_NOEXEC       __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
+                                        _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_COPY_EXEC         __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
+                                        _PAGE_ACCESSED)
+#define PAGE_COPY              PAGE_COPY_NOEXEC
+#define PAGE_READONLY          __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
+                                        _PAGE_ACCESSED | _PAGE_NX)
+#define PAGE_READONLY_EXEC     __pgprot(_PAGE_PRESENT | _PAGE_USER |   \
+                                        _PAGE_ACCESSED)
+
+#define __PAGE_KERNEL_EXEC                                             \
+       (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user)
+#define __PAGE_KERNEL          (__PAGE_KERNEL_EXEC | _PAGE_NX)
+
+#define __PAGE_KERNEL_RO               (__PAGE_KERNEL & ~_PAGE_RW)
+#define __PAGE_KERNEL_RX               (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
+#define __PAGE_KERNEL_EXEC_NOCACHE     (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_WC               (__PAGE_KERNEL | _PAGE_CACHE_WC)
+#define __PAGE_KERNEL_NOCACHE          (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_UC_MINUS         (__PAGE_KERNEL | _PAGE_PCD)
+#define __PAGE_KERNEL_VSYSCALL         (__PAGE_KERNEL_RX | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR             (__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE     (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_LARGE            (__PAGE_KERNEL | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_NOCACHE    (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
+#define __PAGE_KERNEL_LARGE_EXEC       (__PAGE_KERNEL_EXEC | _PAGE_PSE)
+
+#define __PAGE_KERNEL_IO               (__PAGE_KERNEL | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_NOCACHE       (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_UC_MINUS      (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO_WC            (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+
+#define PAGE_KERNEL                    __pgprot(__PAGE_KERNEL)
+#define PAGE_KERNEL_RO                 __pgprot(__PAGE_KERNEL_RO)
+#define PAGE_KERNEL_EXEC               __pgprot(__PAGE_KERNEL_EXEC)
+#define PAGE_KERNEL_RX                 __pgprot(__PAGE_KERNEL_RX)
+#define PAGE_KERNEL_WC                 __pgprot(__PAGE_KERNEL_WC)
+#define PAGE_KERNEL_NOCACHE            __pgprot(__PAGE_KERNEL_NOCACHE)
+#define PAGE_KERNEL_UC_MINUS           __pgprot(__PAGE_KERNEL_UC_MINUS)
+#define PAGE_KERNEL_EXEC_NOCACHE       __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
+#define PAGE_KERNEL_LARGE              __pgprot(__PAGE_KERNEL_LARGE)
+#define PAGE_KERNEL_LARGE_NOCACHE      __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
+#define PAGE_KERNEL_LARGE_EXEC         __pgprot(__PAGE_KERNEL_LARGE_EXEC)
+#define PAGE_KERNEL_VSYSCALL           __pgprot(__PAGE_KERNEL_VSYSCALL)
+#define PAGE_KERNEL_VVAR               __pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE       __pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
+
+#define PAGE_KERNEL_IO                 __pgprot(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE         __pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO_UC_MINUS                __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
+#define PAGE_KERNEL_IO_WC              __pgprot(__PAGE_KERNEL_IO_WC)
+
+/*         xwr */
+#define __P000 PAGE_NONE
+#define __P001 PAGE_READONLY
+#define __P010 PAGE_COPY
+#define __P011 PAGE_COPY
+#define __P100 PAGE_READONLY_EXEC
+#define __P101 PAGE_READONLY_EXEC
+#define __P110 PAGE_COPY_EXEC
+#define __P111 PAGE_COPY_EXEC
+
+#define __S000 PAGE_NONE
+#define __S001 PAGE_READONLY
+#define __S010 PAGE_SHARED
+#define __S011 PAGE_SHARED
+#define __S100 PAGE_READONLY_EXEC
+#define __S101 PAGE_READONLY_EXEC
+#define __S110 PAGE_SHARED_EXEC
+#define __S111 PAGE_SHARED_EXEC
+
+/*
+ * Early identity-mapping PTE attribute macros.
+ */
+#ifdef CONFIG_X86_64
+#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#else
+/*
+ * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
+ * bits are combined, this will allow user to access the high address mapped
+ * VDSO in the presence of CONFIG_COMPAT_VDSO
+ */
+#define PTE_IDENT_ATTR  0x003          /* PRESENT+RW */
+#define PDE_IDENT_ATTR  0x067          /* PRESENT+RW+USER+DIRTY+ACCESSED */
+#define PGD_IDENT_ATTR  0x001          /* PRESENT (no other attributes) */
+#endif
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_types.h>
+#else
+# include "pgtable_64_types.h"
+#endif
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
+#define PTE_PFN_MASK           ((pteval_t)PHYSICAL_PAGE_MASK)
+
+/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
+#define PTE_FLAGS_MASK         (~PTE_PFN_MASK)
+
+typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
+
+#include <asm/maddr.h>
+
+typedef struct { pgdval_t pgd; } pgd_t;
+
+#define __pgd_ma(x) ((pgd_t) { (x) } )
+static inline pgd_t xen_make_pgd(pgdval_t val)
+{
+       /* Only present values are run through pte_phys_to_machine(). */
+       return (pgd_t) { likely(val & _PAGE_PRESENT)
+                        ? pte_phys_to_machine(val) : val };
+}
+
+#define __pgd_val(x) ((x).pgd)
+static inline pgdval_t xen_pgd_val(pgd_t pgd)
+{
+       pgdval_t ret = __pgd_val(pgd);  /* raw value as stored in the entry */
+#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002
+       if (likely(ret))        /* here any non-zero value is converted ... */
+               ret = machine_to_phys(ret) | _PAGE_PRESENT; /* ... and marked present */
+#else
+       if (likely(ret & _PAGE_PRESENT))        /* non-present: returned as is */
+               ret = pte_machine_to_phys(ret);
+#endif
+       return ret;
+}
+
+static inline pgdval_t pgd_flags(pgd_t pgd)
+{
+       return __pgd_val(pgd) & PTE_FLAGS_MASK;
+}
+
+#if PAGETABLE_LEVELS > 3
+typedef struct { pudval_t pud; } pud_t;
+
+#define __pud_ma(x) ((pud_t) { (x) } )
+static inline pud_t xen_make_pud(pudval_t val)
+{
+       /* Non-present values are stored unchanged. */
+       return (pud_t) { likely(val & _PAGE_PRESENT)
+                        ? pte_phys_to_machine(val) : val };
+}
+
+#define __pud_val(x) ((x).pud)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+       const pudval_t raw = __pud_val(pud);
+
+       /* Non-present entries are handed back untranslated. */
+       return likely(raw & _PAGE_PRESENT) ? pte_machine_to_phys(raw) : raw;
+}
+#else
+#include <asm-generic/pgtable-nopud.h>
+
+#define __pud_val(x) __pgd_val((x).pgd)
+static inline pudval_t xen_pud_val(pud_t pud)
+{
+       return xen_pgd_val(pud.pgd);
+}
+#endif
+
+#if PAGETABLE_LEVELS > 2
+typedef struct { pmdval_t pmd; } pmd_t;
+
+#define __pmd_ma(x)    ((pmd_t) { (x) } )
+static inline pmd_t xen_make_pmd(pmdval_t val)
+{
+       /* Only present values are run through pte_phys_to_machine(). */
+       return (pmd_t) { likely(val & _PAGE_PRESENT)
+                        ? pte_phys_to_machine(val) : val };
+}
+
+#define __pmd_val(x) ((x).pmd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+       pmdval_t ret = __pmd_val(pmd);  /* raw value as stored in the entry */
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (likely(ret))        /* here any non-zero value is converted ... */
+               ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; /* ... and marked present */
+#else
+       if (likely(ret & _PAGE_PRESENT))        /* non-present: returned as is */
+               ret = pte_machine_to_phys(ret);
+#endif
+       return ret;
+}
+#else
+#include <asm-generic/pgtable-nopmd.h>
+
+#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } )
+#define __pmd_val(x) __pgd_val((x).pud.pgd)
+static inline pmdval_t xen_pmd_val(pmd_t pmd)
+{
+       return xen_pgd_val(pmd.pud.pgd);
+}
+#endif
+
+static inline pudval_t pud_flags(pud_t pud)
+{
+       return __pud_val(pud) & PTE_FLAGS_MASK;
+}
+
+static inline pmdval_t pmd_flags(pmd_t pmd)
+{
+       return __pmd_val(pmd) & PTE_FLAGS_MASK;
+}
+
+#define __pte_ma(x) ((pte_t) { .pte = (x) } )
+static inline pte_t xen_make_pte(pteval_t val)
+{
+       if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT))
+               val = pte_phys_to_machine(val); /* present and _PAGE_IOMAP clear */
+       return (pte_t) { .pte = val };  /* anything else is stored unchanged */
+}
+
+#define __pte_val(x) ((x).pte)
+static inline pteval_t xen_pte_val(pte_t pte)
+{
+       const pteval_t low = pte.pte_low & (_PAGE_PRESENT | _PAGE_IOMAP);
+       /* Present and not _PAGE_IOMAP: run through pte_machine_to_phys(). */
+       return likely(low == _PAGE_PRESENT)
+               ? pte_machine_to_phys(__pte_val(pte)) : __pte_val(pte);
+}
+
+static inline pteval_t pte_flags(pte_t pte)
+{
+       return __pte_val(pte) & PTE_FLAGS_MASK;
+}
+
+#define pgprot_val(x)  ((x).pgprot)
+#define __pgprot(x)    ((pgprot_t) { (x) } )
+
+
+typedef struct page *pgtable_t;
+
+extern pteval_t __supported_pte_mask;
+extern void set_nx(void);
+extern int nx_enabled;
+
+#define pgprot_writecombine    pgprot_writecombine
+extern pgprot_t pgprot_writecombine(pgprot_t prot);
+
+#ifndef CONFIG_XEN
+/* Indicate that x86 has its own track and untrack pfn vma functions */
+#define __HAVE_PFNMAP_TRACKING
+#endif
+
+#define __HAVE_PHYS_MEM_ACCESS_PROT
+struct file;
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+                              unsigned long size, pgprot_t vma_prot);
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+                              unsigned long size, pgprot_t *vma_prot);
+
+/* Install a pte for a particular vaddr in kernel space. */
+void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+
+extern void xen_pagetable_reserve(u64 start, u64 end);
+
+struct seq_file;
+extern void arch_report_meminfo(struct seq_file *m);
+
+enum {
+       PG_LEVEL_NONE,
+       PG_LEVEL_4K,
+       PG_LEVEL_2M,
+       PG_LEVEL_1G,
+       PG_LEVEL_NUM
+};
+
+#ifdef CONFIG_PROC_FS
+extern void update_page_count(int level, unsigned long pages);
+#else
+static inline void update_page_count(int level, unsigned long pages) { }
+#endif
+
+/*
+ * Helper function that returns the kernel pagetable entry controlling
+ * the virtual address 'address'. NULL means no pagetable entry present.
+ * NOTE: the return type is pte_t but if the pmd is PSE then we return it
+ * as a pte too.
+ */
+extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/include/mach-xen/asm/probe_roms.h b/arch/x86/include/mach-xen/asm/probe_roms.h

new file mode 100644 (file)

index 0000000..da90d01
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/probe_roms.h
@@ -0,0 +1,10 @@
+#if !defined(CONFIG_XEN_UNPRIVILEGED_GUEST)
+# include_next <asm/probe_roms.h>
+#elif !defined(_PROBE_ROMS_H_)
+# define _PROBE_ROMS_H_
+struct pci_dev;
+/* CONFIG_XEN_UNPRIVILEGED_GUEST: stubs that map nothing and report size 0. */
+static inline void __iomem *pci_map_biosrom(struct pci_dev *pdev) { return NULL; }
+static inline void pci_unmap_biosrom(void __iomem *rom) { }
+static inline size_t pci_biosrom_size(struct pci_dev *pdev) { return 0; }
+#endif
diff --git a/arch/x86/include/mach-xen/asm/processor.h b/arch/x86/include/mach-xen/asm/processor.h

new file mode 100644 (file)

index 0000000..6bc8580
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/processor.h
@@ -0,0 +1,986 @@
+#ifndef _ASM_X86_PROCESSOR_H
+#define _ASM_X86_PROCESSOR_H
+
+#include <asm/processor-flags.h>
+
+/* Forward declaration, a strange C thing */
+struct task_struct;
+struct mm_struct;
+
+#include <asm/vm86.h>
+#include <asm/math_emu.h>
+#include <asm/segment.h>
+#include <asm/types.h>
+#include <asm/sigcontext.h>
+#include <asm/current.h>
+#include <asm/cpufeature.h>
+#include <asm/page.h>
+#include <asm/pgtable_types.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/desc_defs.h>
+#include <asm/nops.h>
+#include <asm/special_insns.h>
+
+#include <linux/personality.h>
+#include <linux/cpumask.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/math64.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/irqflags.h>
+
+#include <xen/interface/physdev.h>
+
+/*
+ * We handle most unaligned accesses in hardware.  On the other hand
+ * unaligned DMA can be quite expensive on some Nehalem processors.
+ *
+ * Based on this we disable the IP header alignment in network drivers.
+ */
+#define NET_IP_ALIGN   0
+
+#define HBP_NUM 4
+/*
+ * Default implementation of the helper that returns the current
+ * instruction pointer ("program counter").
+ */
+static inline void *current_text_addr(void)
+{
+       void *pc;
+       /* "$1f" is the address of local label 1, placed right after the mov. */
+       asm volatile("mov $1f, %0; 1:":"=r" (pc));
+
+       return pc;
+}
+
+#ifdef CONFIG_X86_VSMP
+# define ARCH_MIN_TASKALIGN            (1 << INTERNODE_CACHE_SHIFT)
+# define ARCH_MIN_MMSTRUCT_ALIGN       (1 << INTERNODE_CACHE_SHIFT)
+#else
+# define ARCH_MIN_TASKALIGN            16
+# define ARCH_MIN_MMSTRUCT_ALIGN       0
+#endif
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+
+struct cpuinfo_x86 {
+       __u8                    x86;            /* CPU family */
+       __u8                    x86_vendor;     /* CPU vendor */
+       __u8                    x86_model;
+       __u8                    x86_mask;
+#ifdef CONFIG_X86_32
+       char                    wp_works_ok;    /* It doesn't on 386's */
+
+       /* Problems on some 486Dx4's and old 386's: */
+#ifndef CONFIG_XEN
+       char                    hlt_works_ok;
+#endif
+       char                    hard_math;
+#ifndef CONFIG_XEN
+       char                    rfu;
+       char                    fdiv_bug;
+       char                    f00f_bug;
+       char                    coma_bug;
+       char                    pad0;
+#endif
+#else
+       /* Number of 4K pages in DTLB/ITLB combined (in pages): */
+       int                     x86_tlbsize;
+#endif
+       __u8                    x86_virt_bits;
+       __u8                    x86_phys_bits;
+#ifndef CONFIG_XEN
+       /* CPUID returned core id bits: */
+       __u8                    x86_coreid_bits;
+#endif
+       /* Max extended CPUID function supported: */
+       __u32                   extended_cpuid_level;
+       /* Maximum supported CPUID level, -1=no CPUID: */
+       int                     cpuid_level;
+       __u32                   x86_capability[NCAPINTS];
+       char                    x86_vendor_id[16];
+       char                    x86_model_id[64];
+       /* in KB - valid for CPUS which support this call: */
+       int                     x86_cache_size;
+       int                     x86_cache_alignment;    /* In bytes */
+       int                     x86_power;
+       unsigned long           loops_per_jiffy;
+#ifndef CONFIG_XEN
+       /* cpuid returned max cores value: */
+       u16                      x86_max_cores;
+       u16                     apicid;
+       u16                     initial_apicid;
+#endif
+       u16                     x86_clflush_size;
+#ifndef CONFIG_XEN
+       /* number of cores as seen by the OS: */
+       u16                     booted_cores;
+       /* Physical processor id: */
+       u16                     phys_proc_id;
+       /* Core id: */
+       u16                     cpu_core_id;
+       /* Compute unit id */
+       u8                      compute_unit_id;
+#endif
+       /* Index into per_cpu list: */
+       u16                     cpu_index;
+#ifndef CONFIG_XEN
+       u32                     microcode;
+#endif
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define X86_VENDOR_INTEL       0
+#define X86_VENDOR_CYRIX       1
+#define X86_VENDOR_AMD         2
+#define X86_VENDOR_UMC         3
+#define X86_VENDOR_CENTAUR     5
+#define X86_VENDOR_TRANSMETA   7
+#define X86_VENDOR_NSC         8
+#define X86_VENDOR_NUM         9
+
+#define X86_VENDOR_UNKNOWN     0xff
+
+/*
+ * capabilities of CPUs
+ */
+extern struct cpuinfo_x86      boot_cpu_data;
+extern struct cpuinfo_x86      new_cpu_data;
+
+extern __u32                   cpu_caps_cleared[NCAPINTS];
+extern __u32                   cpu_caps_set[NCAPINTS];
+
+#ifdef CONFIG_SMP
+DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
+#define cpu_data(cpu)          per_cpu(cpu_info, cpu)
+#else
+#define cpu_info               boot_cpu_data
+#define cpu_data(cpu)          boot_cpu_data
+#endif
+
+extern const struct seq_operations cpuinfo_op;
+
+static inline int hlt_works(int cpu)
+{
+#if !defined(CONFIG_X86_32) || defined(CONFIG_XEN)
+       return 1;       /* cpuinfo_x86 has no hlt_works_ok field in this case */
+#else
+       return cpu_data(cpu).hlt_works_ok;
+#endif
+}
+
+#define cache_line_size()      (boot_cpu_data.x86_cache_alignment)
+
+extern void cpu_detect(struct cpuinfo_x86 *c);
+
+extern struct pt_regs *idle_regs(struct pt_regs *);
+
+extern void early_cpu_init(void);
+extern void identify_boot_cpu(void);
+extern void identify_secondary_cpu(struct cpuinfo_x86 *);
+extern void print_cpu_info(struct cpuinfo_x86 *);
+void print_cpu_msr(struct cpuinfo_x86 *);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern unsigned short num_cache_leaves;
+
+extern void detect_extended_topology(struct cpuinfo_x86 *c);
+extern void detect_ht(struct cpuinfo_x86 *c);
+
+static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+                            unsigned int *ecx, unsigned int *edx)
+{
+       /* *eax and *ecx are inputs as well as outputs ("0"/"2" constraints). */
+       asm volatile(XEN_CPUID
+           : "=a" (*eax),
+             "=b" (*ebx),
+             "=c" (*ecx),
+             "=d" (*edx)
+           : "0" (*eax), "2" (*ecx)
+           : "memory");
+}
+
+static inline void load_cr3(pgd_t *pgdir)
+{
+       write_cr3(__pa(pgdir));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+#ifdef CONFIG_X86_32
+/*
+ * This is the TSS defined by the hardware.
+ *
+ * The struct is packed, so member offsets follow the declaration order
+ * with no compiler-inserted padding.  Most 16-bit members are followed by
+ * a 16-bit filler member (the __*h names).
+ */
+struct x86_hw_tss {
+       unsigned short          back_link, __blh;
+       unsigned long           sp0;
+       unsigned short          ss0, __ss0h;
+       unsigned long           sp1;
+       /* ss1 caches MSR_IA32_SYSENTER_CS: */
+       unsigned short          ss1, __ss1h;
+       unsigned long           sp2;
+       unsigned short          ss2, __ss2h;
+       unsigned long           __cr3;
+       unsigned long           ip;
+       unsigned long           flags;
+       unsigned long           ax;
+       unsigned long           cx;
+       unsigned long           dx;
+       unsigned long           bx;
+       unsigned long           sp;
+       unsigned long           bp;
+       unsigned long           si;
+       unsigned long           di;
+       unsigned short          es, __esh;
+       unsigned short          cs, __csh;
+       unsigned short          ss, __ssh;
+       unsigned short          ds, __dsh;
+       unsigned short          fs, __fsh;
+       unsigned short          gs, __gsh;
+       unsigned short          ldt, __ldth;
+       unsigned short          trace;
+       unsigned short          io_bitmap_base;
+
+} __attribute__((packed));
+extern struct tss_struct doublefault_tss;
+#else
+/*
+ * 64-bit variant of the hardware TSS: three stack pointers (sp0-sp2),
+ * seven ist[] entries and the I/O bitmap base, separated by reserved
+ * fields.  Packed, and additionally cache-line aligned.
+ */
+struct x86_hw_tss {
+       u32                     reserved1;
+       u64                     sp0;
+       u64                     sp1;
+       u64                     sp2;
+       u64                     reserved2;
+       u64                     ist[7];
+       u32                     reserved3;
+       u32                     reserved4;
+       u16                     reserved5;
+       u16                     io_bitmap_base;
+
+} __attribute__((packed)) ____cacheline_aligned;
+#endif
+#endif /* CONFIG_X86_NO_TSS */
+
+/*
+ * IO-bitmap sizes:
+ *
+ * IO_BITMAP_BYTES and IO_BITMAP_LONGS are IO_BITMAP_BITS expressed in
+ * bytes and in longs.  IO_BITMAP_OFFSET is the byte offset of the
+ * io_bitmap member inside struct tss_struct.
+ */
+#define IO_BITMAP_BITS                 65536
+#define IO_BITMAP_BYTES                        (IO_BITMAP_BITS/8)
+#define IO_BITMAP_LONGS                        (IO_BITMAP_BYTES/sizeof(long))
+#define IO_BITMAP_OFFSET               offsetof(struct tss_struct, io_bitmap)
+#define INVALID_IO_BITMAP_OFFSET       0x8000
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Software view of a TSS: the hardware-defined part followed by the I/O
+ * permission bitmap and a small emergency stack.  Cache-line aligned.
+ */
+struct tss_struct {
+       /*
+        * The hardware state:
+        */
+       struct x86_hw_tss       x86_tss;
+
+       /*
+        * The extra 1 is there because the CPU will access an
+        * additional byte beyond the end of the IO permission
+        * bitmap. The extra byte must be all 1 bits, and must
+        * be within the limit.
+        */
+       unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
+
+       /*
+        * .. and then another 64 longs (0x100 bytes when long is
+        * 4 bytes wide) for the emergency kernel stack:
+        */
+       unsigned long           stack[64];
+
+} ____cacheline_aligned;
+
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+
+/*
+ * Save the original ist values for checking stack pointers during debugging
+ */
+struct orig_ist {
+       unsigned long           ist[7];
+};
+#endif /* CONFIG_X86_NO_TSS */
+
+#define        MXCSR_DEFAULT           0x1f80
+
+/*
+ * FPU save area in FSAVE layout: seven 32-bit control/status words, the
+ * register stack, and one software-only status word at the end.
+ */
+struct i387_fsave_struct {
+       u32                     cwd;    /* FPU Control Word             */
+       u32                     swd;    /* FPU Status Word              */
+       u32                     twd;    /* FPU Tag Word                 */
+       u32                     fip;    /* FPU IP Offset                */
+       u32                     fcs;    /* FPU IP Selector              */
+       u32                     foo;    /* FPU Operand Pointer Offset   */
+       u32                     fos;    /* FPU Operand Pointer Selector */
+
+       /* 8 FP-regs * 10 bytes each = 80 bytes (20 u32 words):         */
+       u32                     st_space[20];
+
+       /* Software status information [not touched by FSAVE ]:         */
+       u32                     status;
+};
+
+/*
+ * FPU/SSE save area in FXSAVE layout, aligned to 16 bytes.
+ *
+ * The instruction/data pointer area is a union: either two 64-bit
+ * pointers (rip/rdp) or four 32-bit offset/selector words
+ * (fip/fcs/foo/fos) overlaying the same 16 bytes.
+ */
+struct i387_fxsave_struct {
+       u16                     cwd; /* Control Word                    */
+       u16                     swd; /* Status Word                     */
+       u16                     twd; /* Tag Word                        */
+       u16                     fop; /* Last Instruction Opcode         */
+       union {
+               struct {
+                       u64     rip; /* Instruction Pointer             */
+                       u64     rdp; /* Data Pointer                    */
+               };
+               struct {
+                       u32     fip; /* FPU IP Offset                   */
+                       u32     fcs; /* FPU IP Selector                 */
+                       u32     foo; /* FPU Operand Offset              */
+                       u32     fos; /* FPU Operand Selector            */
+               };
+       };
+       u32                     mxcsr;          /* MXCSR Register State */
+       u32                     mxcsr_mask;     /* MXCSR Mask           */
+
+       /* 8 FP-regs * 16 bytes each = 128 bytes (32 u32 words):        */
+       u32                     st_space[32];
+
+       /* 16 XMM-regs * 16 bytes each = 256 bytes (64 u32 words):      */
+       u32                     xmm_space[64];
+
+       u32                     padding[12];
+
+       /* Final 48 bytes: same storage under two names.                */
+       union {
+               u32             padding1[12];
+               u32             sw_reserved[12];
+       };
+
+} __attribute__((aligned(16)));
+
+struct i387_soft_struct {
+       u32                     cwd;
+       u32                     swd;
+       u32                     twd;
+       u32                     fip;
+       u32                     fcs;
+       u32                     foo;
+       u32                     fos;
+       /* 8*10 bytes for each FP-reg = 80 bytes: */
+       u32                     st_space[20];
+       u8                      ftop;
+       u8                      changed;
+       u8                      lookahead;
+       u8                      no_update;
+       u8                      rm;
+       u8                      alimit;
+       struct math_emu_info    *info;
+       u32                     entry_eip;
+};
+
+struct ymmh_struct {
+       /* 16 * 16 bytes for each YMMH-reg = 256 bytes */
+       u32 ymmh_space[64];
+};
+
+/*
+ * Header placed between the FXSAVE image and the extended state in
+ * struct xsave_struct.  Packed; 8 u64 words = 64 bytes in total.
+ */
+struct xsave_hdr_struct {
+       u64 xstate_bv;
+       u64 reserved1[2];
+       u64 reserved2[5];
+} __attribute__((packed));
+
+/*
+ * XSAVE area: the FXSAVE image first, then the xsave header, then the
+ * YMM high halves.  Packed and aligned to 64 bytes.
+ */
+struct xsave_struct {
+       struct i387_fxsave_struct i387;
+       struct xsave_hdr_struct xsave_hdr;
+       struct ymmh_struct ymmh;
+       /* new processor state extensions will go here */
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * One FPU save area, viewed in whichever of the four layouts is in use
+ * (FSAVE, FXSAVE, soft-float emulation, or XSAVE).
+ */
+union thread_xstate {
+       struct i387_fsave_struct        fsave;
+       struct i387_fxsave_struct       fxsave;
+       struct i387_soft_struct         soft;
+       struct xsave_struct             xsave;
+};
+
+/*
+ * Per-thread FPU bookkeeping: a pointer to the save area plus two state
+ * words.
+ * NOTE(review): the exact meaning of last_cpu and has_fpu is set by code
+ * outside this header -- confirm before relying on it.
+ */
+struct fpu {
+       unsigned int last_cpu;
+       unsigned int has_fpu;
+       union thread_xstate *state;
+};
+
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_X86_NO_TSS
+DECLARE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+/*
+ * Per-CPU IRQ stack, overlaid with a small struct that pins the stack
+ * canary at byte offset 40 of the same storage.
+ */
+union irq_stack_union {
+       char irq_stack[IRQ_STACK_SIZE];
+       /*
+        * GCC hardcodes the stack canary as %gs:40.  Since the
+        * irq_stack is the object at %gs:0, we reserve the bottom
+        * 48 bytes of the irq stack for the canary.
+        */
+       struct {
+               char gs_base[40];
+               unsigned long stack_canary;
+       };
+};
+
+DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union);
+DECLARE_INIT_PER_CPU(irq_stack_union);
+
+DECLARE_PER_CPU(char *, irq_stack_ptr);
+DECLARE_PER_CPU(unsigned int, irq_count);
+extern unsigned long kernel_eflags;
+extern asmlinkage void ignore_sysret(void);
+#else  /* X86_64 */
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Make sure stack canary segment base is cached-aligned:
+ *   "For Intel Atom processors, avoid non zero segment base address
+ *    that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+       char __pad[20];         /* canary at %gs:20 */
+       unsigned long canary;
+};
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+#endif /* X86_64 */
+
+extern unsigned int xstate_size;
+extern void free_thread_xstate(struct task_struct *);
+extern struct kmem_cache *task_xstate_cachep;
+
+struct perf_event;
+
+/*
+ * Architecture-specific per-thread state: cached TLS descriptors, saved
+ * stack pointers and segment values, debug and fault information, the FPU
+ * state handle, vm86 data (32-bit only) and I/O permission data.
+ */
+struct thread_struct {
+       /* Cached TLS descriptors: */
+       struct desc_struct      tls_array[GDT_ENTRY_TLS_ENTRIES];
+       /* Handed to the TSS / hypervisor by the *_load_sp0() helpers: */
+       unsigned long           sp0;
+       unsigned long           sp;
+#ifdef CONFIG_X86_32
+       /* Compared against the TSS ss1 cache in native_load_sp0(): */
+       unsigned long           sysenter_cs;
+#else
+       unsigned short          es;
+       unsigned short          ds;
+       unsigned short          fsindex;
+       unsigned short          gsindex;
+#endif
+#ifdef CONFIG_X86_32
+       unsigned long           ip;
+#endif
+#ifdef CONFIG_X86_64
+       unsigned long           fs;
+#endif
+       unsigned long           gs;
+       /* Save middle states of ptrace breakpoints */
+       struct perf_event       *ptrace_bps[HBP_NUM];
+       /* Debug status used for traps, single steps, etc... */
+       unsigned long           debugreg6;
+       /* Keep track of the exact dr7 value set by the user */
+       unsigned long           ptrace_dr7;
+       /* Fault info: */
+       unsigned long           cr2;
+       unsigned long           trap_nr;
+       unsigned long           error_code;
+       /* floating point and extended processor state */
+       struct fpu              fpu;
+#ifdef CONFIG_X86_32
+       /* Virtual 86 mode info */
+       struct vm86_struct __user *vm86_info;
+       unsigned long           screen_bitmap;
+       unsigned long           v86flags, v86mask, saved_sp0;
+       unsigned int            saved_fs, saved_gs;
+#endif
+       /* IO permissions: */
+       unsigned long           *io_bitmap_ptr;
+       unsigned long           iopl;
+       /* Max allowed port in the bitmap, in bytes: */
+       unsigned                io_bitmap_max;
+};
+
+/*
+ * Ask the hypervisor to set the I/O privilege level.
+ *
+ * @mask carries the level in EFLAGS.IOPL position: it is shifted right by
+ * 12 and masked with 3.  A mask of 0 requests level 1 instead of 0.
+ * A non-zero hypercall result triggers WARN_ON().
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+       struct physdev_set_iopl set_iopl;
+
+       if (mask == 0)
+               /* Force the change at ring 0. */
+               set_iopl.iopl = 1;
+       else
+               set_iopl.iopl = (mask >> 12) & 3;
+
+       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copy the thread's sp0 into the TSS.  On 32-bit, also refresh the
+ * SYSENTER_CS MSR (and its cached copy in tss ss1) when the thread's
+ * sysenter_cs differs from the cached value.
+ */
+static inline void
+native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+       tss->x86_tss.sp0 = thread->sp0;
+#ifdef CONFIG_X86_32
+       /* Only happens when SEP is enabled, no need to test "SEP"arately: */
+       if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
+               tss->x86_tss.ss1 = thread->sysenter_cs;
+               wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+       }
+#endif
+}
+#else
+/*
+ * Without a TSS: hand the new kernel stack (__KERNEL_DS, thread->sp0) to
+ * the hypervisor; a non-zero hypercall result is fatal (BUG()).
+ * The tss argument is not used.
+ */
+#define xen_load_sp0(tss, thread) do { \
+       if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \
+               BUG(); \
+} while (0)
+#endif
+
+/* Route the generic names to their Xen implementations. */
+#define __cpuid                        xen_cpuid
+#define paravirt_enabled()     1
+
+/*
+ * NOTE(review): xen_load_sp0 is only defined above when
+ * CONFIG_X86_NO_TSS is set; without it only native_load_sp0() exists.
+ * Confirm that load_sp0 is never used in a build without
+ * CONFIG_X86_NO_TSS.
+ */
+#define load_sp0 xen_load_sp0
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Save the cr4 feature set we're using (i.e.
+ * Pentium 4MB enable and PPro Global page
+ * enable), so that any CPUs that boot up
+ * after us can get the correct flags.
+ */
+extern unsigned long           mmu_cr4_features;
+
+/*
+ * Turn on the CR4 bits in @mask: record them in mmu_cr4_features first,
+ * then read-modify-write the live register.
+ */
+static inline void set_in_cr4(unsigned long mask)
+{
+       mmu_cr4_features |= mask;
+       write_cr4(read_cr4() | mask);
+}
+
+/*
+ * Turn off the CR4 bits in @mask: drop them from mmu_cr4_features first,
+ * then read-modify-write the live register.
+ */
+static inline void clear_in_cr4(unsigned long mask)
+{
+       mmu_cr4_features &= ~mask;
+       write_cr4(read_cr4() & ~mask);
+}
+
+typedef struct {
+       unsigned long           seg;
+} mm_segment_t;
+
+
+/*
+ * create a kernel thread without removing it from tasklists
+ */
+extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+
+/* Free all resources held by a thread. */
+extern void release_thread(struct task_struct *);
+
+/* Prepare to copy thread state - unlazy all lazy state */
+extern void prepare_to_copy(struct task_struct *tsk);
+
+unsigned long get_wchan(struct task_struct *p);
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ *
+ * @op is stored to *eax and 0 to *ecx before __cpuid() is invoked; all
+ * four output locations are then filled in by __cpuid().
+ */
+static inline void cpuid(unsigned int op,
+                        unsigned int *eax, unsigned int *ebx,
+                        unsigned int *ecx, unsigned int *edx)
+{
+       *eax = op;
+       *ecx = 0;
+       __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * Some CPUID calls want 'count' to be placed in ecx.
+ *
+ * Same as cpuid(), except that *ecx is preloaded with @count instead of 0.
+ */
+static inline void cpuid_count(unsigned int op, int count,
+                              unsigned int *eax, unsigned int *ebx,
+                              unsigned int *ecx, unsigned int *edx)
+{
+       *eax = op;
+       *ecx = count;
+       __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * Single-register CPUID helpers: run cpuid() for @op and hand back just
+ * one of the four result registers.
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[0];
+}
+
+/* Run cpuid() for @op and return only the EBX result. */
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[1];
+}
+
+/* Run cpuid() for @op and return only the ECX result. */
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[2];
+}
+
+/* Run cpuid() for @op and return only the EDX result. */
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+       unsigned int regs[4];   /* eax, ebx, ecx, edx */
+
+       cpuid(op, &regs[0], &regs[1], &regs[2], &regs[3]);
+       return regs[3];
+}
+
+/*
+ * REP NOP (PAUSE) is a good thing to insert into busy-wait loops.
+ * The "memory" clobber also makes this a compiler barrier.
+ */
+static inline void rep_nop(void)
+{
+       asm volatile("rep; nop" ::: "memory");
+}
+
+/* Busy-wait hint: simply issues rep_nop(). */
+static inline void cpu_relax(void)
+{
+       rep_nop();
+}
+
+/*
+ * Stop speculative execution and prefetching of modified code.
+ *
+ * On kernels built for 386/486, CPUs with boot_cpu_data.x86 < 5 take the
+ * jmp path; every other case executes CPUID with eax = 1 and discards
+ * the result.
+ */
+static inline void sync_core(void)
+{
+       int tmp;
+
+#if defined(CONFIG_M386) || defined(CONFIG_M486)
+       if (boot_cpu_data.x86 < 5)
+               /* There is no speculative execution.
+                * jmp is a barrier to prefetching. */
+               asm volatile("jmp 1f\n1:\n" ::: "memory");
+       else
+#endif
+               /* cpuid is a barrier to speculative execution.
+                * Prefetched instructions are automatically
+                * invalidated when modified. */
+               asm volatile("cpuid" : "=a" (tmp) : "0" (1)
+                            : "ebx", "ecx", "edx", "memory");
+}
+
+/*
+ * Emit the MONITOR opcode bytes directly, with @eax, @ecx and @edx loaded
+ * into the registers of the same names.
+ */
+static inline void __monitor(const void *eax, unsigned long ecx,
+                            unsigned long edx)
+{
+       /* "monitor %eax, %ecx, %edx;" */
+       asm volatile(".byte 0x0f, 0x01, 0xc8;"
+                    :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
+/*
+ * Emit the MWAIT opcode bytes directly, with @eax and @ecx loaded into
+ * the registers of the same names.
+ */
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+       /* "mwait %eax, %ecx;" */
+       asm volatile(".byte 0x0f, 0x01, 0xc9;"
+                    :: "a" (eax), "c" (ecx));
+}
+
+/*
+ * Like __mwait(), but with an STI issued immediately before the MWAIT
+ * bytes in the same asm statement; trace_hardirqs_on() is called first.
+ */
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+       trace_hardirqs_on();
+       /* "mwait %eax, %ecx;" */
+       asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+                    :: "a" (eax), "c" (ecx));
+}
+
+extern void select_idle_routine(const struct cpuinfo_x86 *c);
+extern void init_amd_e400_c1e_mask(void);
+
+extern unsigned long           boot_option_idle_override;
+extern bool                    amd_e400_c1e_detected;
+
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+                        IDLE_POLL, IDLE_FORCE_MWAIT};
+
+extern void enable_sep_cpu(void);
+extern int sysenter_setup(void);
+
+extern void early_trap_init(void);
+
+/* Defined in head.S */
+extern struct desc_ptr         early_gdt_descr;
+
+extern void cpu_set_gdt(int);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
+extern void cpu_init(void);
+
+/*
+ * Read MSR_IA32_DEBUGCTLMSR.
+ *
+ * When CONFIG_X86_DEBUGCTLMSR is not set, CPUs with
+ * boot_cpu_data.x86 < 6 return 0 without touching the MSR.
+ */
+static inline unsigned long get_debugctlmsr(void)
+{
+       unsigned long debugctlmsr = 0;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return 0;
+#endif
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+
+       return debugctlmsr;
+}
+
+/*
+ * Write @debugctlmsr to MSR_IA32_DEBUGCTLMSR.
+ *
+ * When CONFIG_X86_DEBUGCTLMSR is not set, the write is skipped on CPUs
+ * with boot_cpu_data.x86 < 6.
+ */
+static inline void update_debugctlmsr(unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return;
+#endif
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
+}
+
+/*
+ * from system description table in BIOS. Mostly for MCA use, but
+ * others may find it useful:
+ */
+extern unsigned int            machine_id;
+extern unsigned int            machine_submodel_id;
+extern unsigned int            BIOS_revision;
+
+/* Boot loader type from the setup header: */
+extern int                     bootloader_type;
+extern int                     bootloader_version;
+
+extern char                    ignore_fpu_irq;
+
+#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
+#define ARCH_HAS_PREFETCHW
+#define ARCH_HAS_SPINLOCK_PREFETCH
+
+#ifdef CONFIG_X86_32
+# define BASE_PREFETCH         ASM_NOP4
+# define ARCH_HAS_PREFETCH
+#else
+# define BASE_PREFETCH         "prefetcht0 (%1)"
+#endif
+
+/*
+ * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
+ *
+ * It's not worth caring about 3dnow prefetches for the K6
+ * because they are microcoded there and very slow.
+ *
+ * Uses BASE_PREFETCH by default and "prefetchnta" as the alternative
+ * when X86_FEATURE_XMM is present.
+ */
+static inline void prefetch(const void *x)
+{
+       alternative_input(BASE_PREFETCH,
+                         "prefetchnta (%1)",
+                         X86_FEATURE_XMM,
+                         "r" (x));
+}
+
+/*
+ * 3dnow prefetch to get an exclusive cache line.
+ * Useful for spinlocks to avoid one state transition in the
+ * cache coherency protocol:
+ *
+ * Uses BASE_PREFETCH by default and "prefetchw" as the alternative when
+ * X86_FEATURE_3DNOW is present.
+ */
+static inline void prefetchw(const void *x)
+{
+       alternative_input(BASE_PREFETCH,
+                         "prefetchw (%1)",
+                         X86_FEATURE_3DNOW,
+                         "r" (x));
+}
+
+/* Prefetch a lock's cache line for writing; forwards to prefetchw(). */
+static inline void spin_lock_prefetch(const void *x)
+{
+       prefetchw(x);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * User space process size: 3GB (default).
+ */
+#define TASK_SIZE              PAGE_OFFSET
+#define TASK_SIZE_MAX          TASK_SIZE
+#define STACK_TOP              TASK_SIZE
+#define STACK_TOP_MAX          STACK_TOP
+
+#define INIT_THREAD  {                                                   \
+       .sp0                    = sizeof(init_stack) + (long)&init_stack, \
+       .vm86_info              = NULL,                                   \
+       .sysenter_cs            = __KERNEL_CS,                            \
+       .io_bitmap_ptr          = NULL,                                   \
+}
+
+/*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+#define INIT_TSS  {                                                      \
+       .x86_tss = {                                                      \
+               .sp0            = sizeof(init_stack) + (long)&init_stack, \
+               .ss0            = __KERNEL_DS,                            \
+               .ss1            = __KERNEL_CS,                            \
+               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,               \
+        },                                                               \
+       .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },       \
+}
+
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
+/*
+ * KSTK_TOP(info): address THREAD_SIZE bytes above @info, i.e. one past
+ * the end of a THREAD_SIZE-sized area starting at @info.
+ */
+#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
+#define KSTK_TOP(info)                                                 \
+({                                                                     \
+       unsigned long *__ptr = (unsigned long *)(info);                 \
+       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
+})
+
+/*
+ * task_pt_regs(task): pointer to the struct pt_regs that ends 8 bytes
+ * below the top of the task's stack page area.
+ *
+ * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * This is necessary to guarantee that the entire "struct pt_regs"
+ * is accessible even if the CPU hasn't stored the SS/ESP registers
+ * on the stack (interrupt gate does not save these registers
+ * when switching to the same priv ring).
+ * Therefore beware: accessing the ss/esp fields of the
+ * "struct pt_regs" is possible, but they may contain the
+ * completely wrong values.
+ */
+#define task_pt_regs(task)                                             \
+({                                                                     \
+       struct pt_regs *__regs__;                                       \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ - 1;                                                   \
+})
+
+#else
+/*
+ * User space process size. 47bits minus one guard page.
+ */
+#define TASK_SIZE_MAX  ((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET       ((current->personality & ADDR_LIMIT_3GB) ? \
+                                       0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE              (test_thread_flag(TIF_ADDR32) ? \
+                                       IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+#define TASK_SIZE_OF(child)    ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
+                                       IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+
+#define STACK_TOP              TASK_SIZE
+#define STACK_TOP_MAX          TASK_SIZE_MAX
+
+#define INIT_THREAD  { \
+       .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+#define INIT_TSS  { \
+       .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+}
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+#define thread_saved_pc(t)     (*(unsigned long *)((t)->thread.sp - 8))
+
+#define task_pt_regs(tsk)      ((struct pt_regs *)(tsk)->thread.sp0 - 1)
+
+/*
+ * User space RSP while inside the SYSCALL fast path
+ */
+DECLARE_PER_CPU(unsigned long, old_rsp);
+
+#endif /* CONFIG_X86_64 */
+
+extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
+                                              unsigned long new_sp);
+
+/*
+ * This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define TASK_UNMAPPED_BASE     (PAGE_ALIGN(TASK_SIZE / 3))
+
+#define KSTK_EIP(task)         (task_pt_regs(task)->ip)
+#define KSTK_ESP(task)         (task_pt_regs(task)->sp)
+
+/* Get/set a process' ability to use the timestamp counter instruction */
+#define GET_TSC_CTL(adr)       get_tsc_mode((adr))
+#define SET_TSC_CTL(val)       set_tsc_mode((val))
+
+extern int get_tsc_mode(unsigned long adr);
+extern int set_tsc_mode(unsigned int val);
+
+extern int amd_get_nb_id(int cpu);
+
+struct aperfmperf {
+       u64 aperf, mperf;
+};
+
+/*
+ * Sample MSR_IA32_APERF and MSR_IA32_MPERF into @am.  Warns once if the
+ * boot CPU does not advertise X86_FEATURE_APERFMPERF; the MSRs are read
+ * regardless.
+ */
+static inline void get_aperfmperf(struct aperfmperf *am)
+{
+       WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF));
+
+       rdmsrl(MSR_IA32_APERF, am->aperf);
+       rdmsrl(MSR_IA32_MPERF, am->mperf);
+}
+
+#define APERFMPERF_SHIFT 10
+
+/*
+ * Ratio of APERF progress to MPERF progress between two samples.
+ *
+ * The MPERF delta is scaled down by APERFMPERF_SHIFT bits before the
+ * division.  If the scaled delta is zero, the raw APERF delta is
+ * returned instead.
+ */
+static inline
+unsigned long calc_aperfmperf_ratio(struct aperfmperf *old,
+                                   struct aperfmperf *new)
+{
+       u64 aperf_delta = new->aperf - old->aperf;
+       u64 mperf_scaled = (new->mperf - old->mperf) >> APERFMPERF_SHIFT;
+
+       /* Nothing to divide by: report the raw APERF delta. */
+       if (!mperf_scaled)
+               return aperf_delta;
+
+       return div64_u64(aperf_delta, mperf_scaled);
+}
+
+/*
+ * AMD errata checking
+ */
+#ifdef CONFIG_CPU_SUP_AMD
+extern const int amd_erratum_383[];
+extern const int amd_erratum_400[];
+extern bool cpu_has_amd_erratum(const int *);
+
+#define AMD_LEGACY_ERRATUM(...)                { -1, __VA_ARGS__, 0 }
+#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
+/*
+ * Pack an AMD family plus a model/stepping range into one int:
+ *   bits 31-24 family, bits 23-16 start model, bits 15-12 start stepping,
+ *   bits 11-4 end model, bits 3-0 end stepping.
+ * Every argument is parenthesized so that an expression argument
+ * (e.g. "a | b") is shifted as a whole.
+ */
+#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
+       (((f) << 24) | ((m_start) << 16) | ((s_start) << 12) | \
+        ((m_end) << 4) | (s_end))
+#define AMD_MODEL_RANGE_FAMILY(range)  (((range) >> 24) & 0xff)
+#define AMD_MODEL_RANGE_START(range)   (((range) >> 12) & 0xfff)
+#define AMD_MODEL_RANGE_END(range)     ((range) & 0xfff)
+
+#else
+#define cpu_has_amd_erratum(x) (false)
+#endif /* CONFIG_CPU_SUP_AMD */
+
+void cpu_idle_wait(void);
+
+extern unsigned long arch_align_stack(unsigned long sp);
+extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
+
+void xen_idle(void);
+bool set_pm_idle_to_default(void);
+
+void stop_this_cpu(void *dummy);
+
+#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/mach-xen/asm/setup.h b/arch/x86/include/mach-xen/asm/setup.h

new file mode 100644 (file)

index 0000000..aaa418c
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/setup.h
@@ -0,0 +1,21 @@
+#ifndef __ASSEMBLY__
+
+void xen_start_kernel(void);
+void xen_arch_setup(void);
+
+#ifdef CONFIG_X86_64
+void reserve_pfn_range(unsigned long pfn, unsigned long nr);
+void reserve_pgtable_low(void);
+#endif
+
+extern unsigned long xen_initrd_start;
+
+#ifdef CONFIG_EFI
+void efi_probe(void);
+#else
+#define efi_probe() ((void)0)
+#endif
+
+#endif
+
+#include_next <asm/setup.h>
diff --git a/arch/x86/include/mach-xen/asm/smp-processor-id.h b/arch/x86/include/mach-xen/asm/smp-processor-id.h

new file mode 100644 (file)

index 0000000..c6c1ec5
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp-processor-id.h
@@ -0,0 +1,36 @@
+#ifndef _ASM_X86_SMP_PROCESSOR_ID_H
+#define _ASM_X86_SMP_PROCESSOR_ID_H
+
+#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__)
+
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU(int, cpu_number);
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+#define raw_smp_processor_id() percpu_read(cpu_number)
+#define safe_smp_processor_id() smp_processor_id()
+
+#ifdef CONFIG_X86_64_SMP
+/*
+ * CPU number taken from the thread_info found by masking the stack
+ * pointer with CURRENT_MASK; reads ti->cpu rather than per-cpu data.
+ */
+#define stack_smp_processor_id()                                       \
+({                                                                     \
+       struct thread_info *ti;                                         \
+       __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK));      \
+       ti->cpu;                                                        \
+})
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+extern unsigned int debug_smp_processor_id(void);
+# define smp_processor_id() debug_smp_processor_id()
+#else
+# define smp_processor_id() raw_smp_processor_id()
+#endif
+
+#endif /* SMP && !__ASSEMBLY__ */
+
+#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */
diff --git a/arch/x86/include/mach-xen/asm/smp.h b/arch/x86/include/mach-xen/asm/smp.h

new file mode 100644 (file)

index 0000000..76f78bf
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/smp.h
@@ -0,0 +1,241 @@
+#ifndef _ASM_X86_SMP_H
+#define _ASM_X86_SMP_H
+#ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <asm/percpu.h>
+
+/*
+ * We need the APIC definitions automatically as part of 'smp.h'
+ */
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <asm/mpspec.h>
+# include <asm/apic.h>
+# ifdef CONFIG_X86_IO_APIC
+#  include <asm/io_apic.h>
+# endif
+#endif
+#include <linux/thread_info.h>
+#include <asm/cpumask.h>
+#include <asm/cpufeature.h>
+
+extern unsigned int num_processors;
+
+#ifndef CONFIG_XEN
+/*
+ * True only on SMP builds where the CPU reports HT and more than one
+ * sibling exists; always false on UP builds.
+ */
+static inline bool cpu_has_ht_siblings(void)
+{
+#ifdef CONFIG_SMP
+       return cpu_has_ht && smp_num_siblings > 1;
+#else
+       return false;
+#endif
+}
+
+DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+/* cpus sharing the last level cache: */
+DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(int, cpu_number);
+#endif
+
+/*
+ * Sibling mask for @cpu.  This variant returns the single-CPU mask
+ * cpumask_of(cpu), i.e. each CPU is reported as its own only sibling.
+ */
+static inline const struct cpumask *cpu_sibling_mask(int cpu)
+{
+       return cpumask_of(cpu);
+}
+
+/*
+ * Core mask for @cpu.  This variant returns the single-CPU mask
+ * cpumask_of(cpu), i.e. each CPU is reported as a core of its own.
+ */
+static inline const struct cpumask *cpu_core_mask(int cpu)
+{
+       return cpumask_of(cpu);
+}
+
+#ifndef CONFIG_XEN
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+       return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
+DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+#endif
+#endif
+
+#ifdef CONFIG_SMP
+
+#ifndef CONFIG_XEN
+
+/* Static state in head.S used to set up a CPU */
+extern unsigned long stack_start; /* Initial stack pointer address */
+
+/*
+ * Table of SMP operations.  The inline wrappers declared below this
+ * struct forward to these hooks through the global smp_ops instance.
+ */
+struct smp_ops {
+       void (*smp_prepare_boot_cpu)(void);
+       void (*smp_prepare_cpus)(unsigned max_cpus);
+       void (*smp_cpus_done)(unsigned max_cpus);
+
+       /* Called with wait=0 by smp_send_stop(), wait=1 by stop_other_cpus(). */
+       void (*stop_other_cpus)(int wait);
+       void (*smp_send_reschedule)(int cpu);
+
+       int (*cpu_up)(unsigned cpu);
+       int (*cpu_disable)(void);
+       void (*cpu_die)(unsigned int cpu);
+       void (*play_dead)(void);
+
+       void (*send_call_func_ipi)(const struct cpumask *mask);
+       void (*send_call_func_single_ipi)(int cpu);
+};
+
+/* Globals due to paravirt */
+extern void set_cpu_sibling_map(int cpu);
+
+extern struct smp_ops smp_ops;
+
+/*
+ * Thin wrappers that forward each generic SMP entry point to the
+ * matching hook in the global smp_ops table.
+ */
+
+/* Stop the other CPUs; passes wait=0 to the hook. */
+static inline void smp_send_stop(void)
+{
+       smp_ops.stop_other_cpus(0);
+}
+
+/* Stop the other CPUs; passes wait=1 to the hook. */
+static inline void stop_other_cpus(void)
+{
+       smp_ops.stop_other_cpus(1);
+}
+
+static inline void smp_prepare_boot_cpu(void)
+{
+       smp_ops.smp_prepare_boot_cpu();
+}
+
+static inline void smp_prepare_cpus(unsigned int max_cpus)
+{
+       smp_ops.smp_prepare_cpus(max_cpus);
+}
+
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+       smp_ops.smp_cpus_done(max_cpus);
+}
+
+/* Returns whatever the cpu_up hook returns. */
+static inline int __cpu_up(unsigned int cpu)
+{
+       return smp_ops.cpu_up(cpu);
+}
+
+/* Returns whatever the cpu_disable hook returns. */
+static inline int __cpu_disable(void)
+{
+       return smp_ops.cpu_disable();
+}
+
+static inline void __cpu_die(unsigned int cpu)
+{
+       smp_ops.cpu_die(cpu);
+}
+
+static inline void play_dead(void)
+{
+       smp_ops.play_dead();
+}
+
+static inline void smp_send_reschedule(int cpu)
+{
+       smp_ops.smp_send_reschedule(cpu);
+}
+
+static inline void arch_send_call_function_single_ipi(int cpu)
+{
+       smp_ops.send_call_func_single_ipi(cpu);
+}
+
+static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
+{
+       smp_ops.send_call_func_ipi(mask);
+}
+
+void cpu_disable_common(void);
+void native_smp_prepare_boot_cpu(void);
+void native_smp_prepare_cpus(unsigned int max_cpus);
+void native_smp_cpus_done(unsigned int max_cpus);
+int native_cpu_up(unsigned int cpunum);
+int native_cpu_disable(void);
+void native_cpu_die(unsigned int cpu);
+void native_play_dead(void);
+void play_dead_common(void);
+void wbinvd_on_cpu(int cpu);
+int wbinvd_on_all_cpus(void);
+
+void smp_store_cpu_info(int id);
+#define cpu_physical_id(cpu)   per_cpu(x86_cpu_to_apicid, cpu)
+
+#else /* CONFIG_XEN */
+
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+void xen_stop_other_cpus(int wait);
+void xen_smp_send_reschedule(int cpu);
+void xen_send_call_func_ipi(const struct cpumask *mask);
+void xen_send_call_func_single_ipi(int cpu);
+
+/* Xen build: stop the other CPUs via xen_stop_other_cpus() with wait=0. */
+static inline void smp_send_stop(void)
+{
+       xen_stop_other_cpus(0);
+}
+
+#define smp_send_reschedule    xen_smp_send_reschedule
+#define arch_send_call_function_single_ipi     xen_send_call_func_single_ipi
+#define arch_send_call_function_ipi_mask       xen_send_call_func_ipi
+
+void play_dead(void);
+
+#endif /* CONFIG_XEN */
+
+/*
+ * We don't mark CPUs online until __cpu_up(), so we need another measure:
+ * the number of bits set in cpu_callout_mask.
+ */
+static inline int num_booting_cpus(void)
+{
+       return cpumask_weight(cpu_callout_mask);
+}
+#elif /* !CONFIG_SMP && */ !defined(CONFIG_XEN)
+/*
+ * Non-SMP, non-Xen stubs: with a single CPU both helpers reduce to a
+ * local wbinvd(); wbinvd_on_all_cpus() always returns 0.
+ */
+#define wbinvd_on_cpu(cpu)     wbinvd()
+static inline int wbinvd_on_all_cpus(void)
+{
+       wbinvd();
+       return 0;
+}
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_XEN
+int wbinvd_on_all_cpus(void);
+#endif
+
+extern unsigned disabled_cpus __cpuinitdata;
+
+#include <asm/smp-processor-id.h>
+
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+
+#ifndef CONFIG_X86_64
+static inline int logical_smp_processor_id(void)
+{
+       /* we don't want to mark this access volatile - bad code generation */
+       return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+}
+
+#endif
+
+extern int hard_smp_processor_id(void);
+
+#else /* CONFIG_X86_LOCAL_APIC */
+
+# ifndef CONFIG_SMP
+#  define hard_smp_processor_id()      0
+# endif
+
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/mach-xen/asm/special_insns.h b/arch/x86/include/mach-xen/asm/special_insns.h

new file mode 100644 (file)

index 0000000..652bd99
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/special_insns.h
@@ -0,0 +1,188 @@
+#ifndef _ASM_X86_SPECIAL_INSNS_H
+#define _ASM_X86_SPECIAL_INSNS_H
+
+
+#ifdef __KERNEL__
+
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+static inline void xen_clts(void)
+{
+       HYPERVISOR_fpu_taskswitch(0);
+}
+
+static inline void xen_stts(void)
+{
+       HYPERVISOR_fpu_taskswitch(1);
+}
+
+/*
+ * Volatile isn't enough to prevent the compiler from reordering the
+ * read/write functions for the control registers and messing everything up.
+ * A memory clobber would solve the problem, but would prevent reordering of
+ * all loads and stores around it, which can hurt performance. Solution is to
+ * use a variable and mimic reads and writes to it to enforce serialization.
+ */
+static unsigned long __force_order;
+
+static inline unsigned long xen_read_cr0(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
+       return val;
+}
+
+static inline void xen_write_cr0(unsigned long val)
+{
+       asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
+}
+
+#define xen_read_cr2() vcpu_info_read(arch.cr2)
+#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val)
+
+static inline unsigned long xen_read_cr3(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
+#ifdef CONFIG_X86_32
+       return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
+#else
+       return machine_to_phys(val);
+#endif
+}
+
+static inline void xen_write_cr3(unsigned long val)
+{
+#ifdef CONFIG_X86_32
+       val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
+#else
+       val = phys_to_machine(val);
+#endif
+       asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
+}
+
+static inline unsigned long xen_read_cr4(void)
+{
+       unsigned long val;
+       asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
+       return val;
+}
+
+#define xen_read_cr4_safe() xen_read_cr4()
+
+static inline void xen_write_cr4(unsigned long val)
+{
+       asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
+}
+
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+       return 0;
+}
+
+static inline void xen_write_cr8(unsigned long val)
+{
+       BUG_ON(val);
+}
+#endif
+
+static inline void native_wbinvd(void)
+{
+       asm volatile("wbinvd": : :"memory");
+}
+
+extern void xen_load_gs_index(unsigned);
+
+static inline unsigned long read_cr0(void)
+{
+       return xen_read_cr0();
+}
+
+static inline void write_cr0(unsigned long x)
+{
+       xen_write_cr0(x);
+}
+
+static inline unsigned long read_cr2(void)
+{
+       return xen_read_cr2();
+}
+
+static inline void write_cr2(unsigned long x)
+{
+       xen_write_cr2(x);
+}
+
+static inline unsigned long read_cr3(void)
+{
+       return xen_read_cr3();
+}
+
+static inline void write_cr3(unsigned long x)
+{
+       xen_write_cr3(x);
+}
+
+static inline unsigned long read_cr4(void)
+{
+       return xen_read_cr4();
+}
+
+static inline unsigned long read_cr4_safe(void)
+{
+       return xen_read_cr4_safe();
+}
+
+static inline void write_cr4(unsigned long x)
+{
+       xen_write_cr4(x);
+}
+
+static inline void wbinvd(void)
+{
+       native_wbinvd();
+}
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long read_cr8(void)
+{
+       return xen_read_cr8();
+}
+
+static inline void write_cr8(unsigned long x)
+{
+       xen_write_cr8(x);
+}
+
+static inline void load_gs_index(unsigned selector)
+{
+       xen_load_gs_index(selector);
+}
+
+#endif
+
+/* Clear the 'TS' bit */
+static inline void clts(void)
+{
+       xen_clts();
+}
+
+static inline void stts(void)
+{
+       xen_stts();
+}
+
+static inline void clflush(volatile void *__p)
+{
+       asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
+}
+
+#define nop() asm volatile ("nop")
+
+
+#endif /* __KERNEL__ */
+
+#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/mach-xen/asm/spinlock.h b/arch/x86/include/mach-xen/asm/spinlock.h

new file mode 100644 (file)

index 0000000..f034a6e
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock.h
@@ -0,0 +1,370 @@
+#ifndef _ASM_X86_SPINLOCK_H
+#define _ASM_X86_SPINLOCK_H
+
+#include <linux/atomic.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+/*
+ * Your basic SMP spinlocks, allowing only a single CPU anywhere
+ *
+ * Simple spin lock operations.  There are two variants, one clears IRQ's
+ * on the local processor, one does not.
+ *
+ * With TICKET_SHIFT defined these are fair FIFO ticket locks (ticket width
+ * is chosen in asm/spinlock_types.h); otherwise plain byte locks are used.
+ *
+ * (the type definitions are in asm/spinlock_types.h)
+ */
+
+#ifdef CONFIG_X86_32
+# define LOCK_PTR_REG "a"
+# define REG_PTR_MODE "k"
+#else
+# define LOCK_PTR_REG "D"
+# define REG_PTR_MODE "q"
+#endif
+
+#if defined(CONFIG_XEN) || (defined(CONFIG_X86_32) && \
+       (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)))
+/*
+ * On Xen, as we read back the result of the unlocking increment, we must use
+ * a locked access (or insert a full memory barrier) in all cases (so that we
+ * read what is globally visible).
+ *
+ * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
+ * (PPro errata 66, 92)
+ */
+# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
+#else
+# define UNLOCK_LOCK_PREFIX
+#endif
+
+#ifdef TICKET_SHIFT
+
+#include <asm/irqflags.h>
+#include <asm/smp-processor-id.h>
+
+int xen_spinlock_init(unsigned int cpu);
+void xen_spinlock_cleanup(unsigned int cpu);
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *,
+                                    struct __raw_tickets);
+#else
+#define xen_spin_adjust(lock, raw_tickets) (raw_tickets)
+#define xen_spin_wait(l, t, f) xen_spin_wait(l, t)
+#endif
+unsigned int xen_spin_wait(arch_spinlock_t *, struct __raw_tickets *,
+                          unsigned int flags);
+void xen_spin_kick(const arch_spinlock_t *, unsigned int ticket);
+
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an xadd covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide xadd increment of the low part would carry
+ * up and contaminate the high part.
+ */
+#define __spin_count_dec(c, l) (vcpu_running((l)->owner) ? --(c) : ((c) >>= 1))
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
+{
+       struct __raw_tickets inc = { .tail = 1 };
+       unsigned int count, flags = arch_local_irq_save();
+
+       inc = xadd(&lock->tickets, inc);
+       if (likely(inc.head == inc.tail))
+               arch_local_irq_restore(flags);
+       else {
+               inc = xen_spin_adjust(lock, inc);
+               arch_local_irq_restore(flags);
+               count = 1 << 12;
+               do {
+                       while (inc.head != inc.tail
+                              && __spin_count_dec(count, lock)) {
+                               cpu_relax();
+                               inc.head = ACCESS_ONCE(lock->tickets.head);
+                       }
+               } while (unlikely(!count)
+                        && (count = xen_spin_wait(lock, &inc, flags)));
+       }
+       barrier();              /* make sure nothing creeps before the lock is taken */
+       lock->owner = raw_smp_processor_id();
+}
+#else
+#define __ticket_spin_lock(lock) __ticket_spin_lock_flags(lock, -1)
+#endif
+
+static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock,
+                                                    unsigned long flags)
+{
+       struct __raw_tickets inc = { .tail = 1 };
+
+       inc = xadd(&lock->tickets, inc);
+       if (unlikely(inc.head != inc.tail)) {
+               unsigned int count = 1 << 12;
+
+               inc = xen_spin_adjust(lock, inc);
+               do {
+                       while (inc.head != inc.tail
+                              && __spin_count_dec(count, lock)) {
+                               cpu_relax();
+                               inc.head = ACCESS_ONCE(lock->tickets.head);
+                       }
+               } while (unlikely(!count)
+                        && (count = xen_spin_wait(lock, &inc, flags)));
+       }
+       barrier();              /* make sure nothing creeps before the lock is taken */
+       lock->owner = raw_smp_processor_id();
+}
+
+#undef __spin_count_dec
+
+static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
+{
+       arch_spinlock_t old;
+
+       old.tickets = ACCESS_ONCE(lock->tickets);
+       if (old.tickets.head != old.tickets.tail)
+               return 0;
+
+       /* cmpxchg is a full barrier, so nothing can move before it */
+       if (cmpxchg(&lock->head_tail, old.head_tail,
+                   old.head_tail + (1 << TICKET_SHIFT)) != old.head_tail)
+               return 0;
+       lock->owner = raw_smp_processor_id();
+       return 1;
+}
+
+static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
+{
+       register struct __raw_tickets new;
+
+       __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
+#if !defined(XEN_SPINLOCK_SOURCE) || !CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+# undef UNLOCK_LOCK_PREFIX
+#endif
+       new = ACCESS_ONCE(lock->tickets);
+       if (new.head != new.tail)
+               xen_spin_kick(lock, new.head);
+}
+
+static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
+{
+       struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+       return tmp.tail != tmp.head;
+}
+
+static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
+{
+       struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+       return (__ticket_t)(tmp.tail - tmp.head) > 1;
+}
+
+#define __arch_spin(n) __ticket_spin_##n
+
+#else /* TICKET_SHIFT */
+
+static inline int xen_spinlock_init(unsigned int cpu) { return 0; }
+static inline void xen_spinlock_cleanup(unsigned int cpu) {}
+
+static inline int __byte_spin_is_locked(arch_spinlock_t *lock)
+{
+       return lock->lock != 0;
+}
+
+static inline int __byte_spin_is_contended(arch_spinlock_t *lock)
+{
+       return lock->spinners != 0;
+}
+
+static inline void __byte_spin_lock(arch_spinlock_t *lock)
+{
+       s8 val = 1;
+
+       asm("1: xchgb %1, %0\n"
+           "   test %1,%1\n"
+           "   jz 3f\n"
+           "   " LOCK_PREFIX "incb %2\n"
+           "2: rep;nop\n"
+           "   cmpb $1, %0\n"
+           "   je 2b\n"
+           "   " LOCK_PREFIX "decb %2\n"
+           "   jmp 1b\n"
+           "3:"
+           : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory");
+}
+
+#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock)
+
+static inline int __byte_spin_trylock(arch_spinlock_t *lock)
+{
+       u8 old = 1;
+
+       asm("xchgb %1,%0"
+           : "+m" (lock->lock), "+q" (old) : : "memory");
+
+       return old == 0;
+}
+
+static inline void __byte_spin_unlock(arch_spinlock_t *lock)
+{
+       smp_wmb();
+       lock->lock = 0;
+}
+
+#define __arch_spin(n) __byte_spin_##n
+
+#endif /* TICKET_SHIFT */
+
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+       return __arch_spin(is_locked)(lock);
+}
+
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
+{
+       return __arch_spin(is_contended)(lock);
+}
+#define arch_spin_is_contended arch_spin_is_contended
+
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
+{
+       __arch_spin(lock)(lock);
+}
+
+static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
+{
+       return __arch_spin(trylock)(lock);
+}
+
+static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
+{
+       __arch_spin(unlock)(lock);
+}
+
+static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
+                                                 unsigned long flags)
+{
+       __arch_spin(lock_flags)(lock, flags);
+}
+
+#undef __arch_spin
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+       while (arch_spin_is_locked(lock))
+               cpu_relax();
+}
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ *
+ * NOTE! it is quite common to have readers in interrupts
+ * but no interrupt writers. For those circumstances we
+ * can "mix" irq-safe locks - any writer needs to get an
+ * irq-safe write-lock, but readers can get non-irqsafe
+ * read-locks.
+ *
+ * On x86, we implement read-write locks as a 32-bit counter
+ * with the high bit (sign) being the "contended" bit.
+ */
+
+/**
+ * arch_read_can_lock - would arch_read_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_read_can_lock(arch_rwlock_t *lock)
+{
+       return lock->lock > 0;
+}
+
+/**
+ * arch_write_can_lock - would arch_write_trylock() succeed?
+ * @lock: the rwlock in question.
+ */
+static inline int arch_write_can_lock(arch_rwlock_t *lock)
+{
+       return lock->write == WRITE_LOCK_CMP;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
+                    "jns 1f\n"
+                    "call __read_lock_failed\n\t"
+                    "1:\n"
+                    ::LOCK_PTR_REG (rw) : "memory");
+}
+
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
+                    "jz 1f\n"
+                    "call __write_lock_failed\n\t"
+                    "1:\n"
+                    ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+                    : "memory");
+}
+
+static inline int arch_read_trylock(arch_rwlock_t *lock)
+{
+       READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
+
+       if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
+               return 1;
+       READ_LOCK_ATOMIC(inc)(count);
+       return 0;
+}
+
+static inline int arch_write_trylock(arch_rwlock_t *lock)
+{
+       atomic_t *count = (atomic_t *)&lock->write;
+
+       if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
+               return 1;
+       atomic_add(WRITE_LOCK_CMP, count);
+       return 0;
+}
+
+static inline void arch_read_unlock(arch_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+                    :"+m" (rw->lock) : : "memory");
+}
+
+static inline void arch_write_unlock(arch_rwlock_t *rw)
+{
+       asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+                    : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
+}
+
+#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
+#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
+
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
+#define arch_spin_relax(lock)  cpu_relax()
+#define arch_read_relax(lock)  cpu_relax()
+#define arch_write_relax(lock) cpu_relax()
+
+/* The {read|write|spin}_lock() on x86 are full memory barriers. */
+static inline void smp_mb__after_lock(void) { }
+#define ARCH_HAS_SMP_MB_AFTER_LOCK
+
+#endif /* _ASM_X86_SPINLOCK_H */
diff --git a/arch/x86/include/mach-xen/asm/spinlock_types.h b/arch/x86/include/mach-xen/asm/spinlock_types.h

new file mode 100644 (file)

index 0000000..d78bbc0
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/spinlock_types.h
@@ -0,0 +1,62 @@
+#ifndef _ASM_X86_SPINLOCK_TYPES_H
+#define _ASM_X86_SPINLOCK_TYPES_H
+
+#ifndef __LINUX_SPINLOCK_TYPES_H
+# error "please don't include this file directly"
+#endif
+
+#include <linux/types.h>
+
+#ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+/*
+ * On Xen we support CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING levels of
+ * interrupt re-enabling per IRQ-safe lock. Hence we can have
+ * (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) times as many outstanding
+ * tickets. Thus the cut-off for using byte register pairs must be at
+ * a sufficiently smaller number of CPUs.
+ */
+#if (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) * CONFIG_NR_CPUS < 256
+typedef u8  __ticket_t;
+# define TICKET_SHIFT 8
+typedef u16 __ticketpair_t;
+#else
+typedef u16 __ticket_t;
+# define TICKET_SHIFT 16
+typedef u32 __ticketpair_t;
+#endif
+
+typedef union {
+       __ticketpair_t head_tail;
+       struct {
+               struct __raw_tickets {
+                       __ticket_t head, tail;
+               } tickets;
+#if CONFIG_NR_CPUS <= 256
+               u8 owner;
+#else
+               u16 owner;
+#endif
+       };
+#else /* ndef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */
+typedef struct {
+/*
+ * This differs from the pre-2.6.24 spinlock by always using xchgb
+ * rather than decb to take the lock; this allows it to use a
+ * zero-initialized lock structure.  It also maintains a 1-byte
+ * contention counter, so that we can implement
+ * __byte_spin_is_contended.
+ */
+       u8 lock;
+#if CONFIG_NR_CPUS < 256
+       u8 spinners;
+#else
+# error NR_CPUS >= 256 not implemented
+#endif
+#endif /* def CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */
+} arch_spinlock_t;
+
+#define __ARCH_SPIN_LOCK_UNLOCKED      { 0 }
+
+#include <asm/rwlock.h>
+
+#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/mach-xen/asm/swiotlb.h b/arch/x86/include/mach-xen/asm/swiotlb.h

new file mode 100644 (file)

index 0000000..e82aad1
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/swiotlb.h
@@ -0,0 +1,8 @@
+#include_next <asm/swiotlb.h>
+
+#ifndef CONFIG_SWIOTLB
+#define swiotlb_init(verbose) ((void)(verbose))
+#endif
+
+dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size,
+                                  int dir);
diff --git a/arch/x86/include/mach-xen/asm/switch_to.h b/arch/x86/include/mach-xen/asm/switch_to.h

new file mode 100644 (file)

index 0000000..4c1d729
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/switch_to.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_SWITCH_TO_H
+
+#define __switch_to_xtra(prev, next, tss) __switch_to_xtra(prev, next)
+
+#include_next <asm/switch_to.h>
+
+#undef __switch_to_xtra
+
+#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/mach-xen/asm/time.h b/arch/x86/include/mach-xen/asm/time.h

new file mode 100644 (file)

index 0000000..d898756
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/time.h
@@ -0,0 +1,18 @@
+#ifndef _XEN_ASM_TIME_H
+#define _XEN_ASM_TIME_H
+
+unsigned long xen_read_wallclock(void);
+int xen_write_wallclock(unsigned long);
+
+struct timespec;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+int xen_update_wallclock(const struct timespec *);
+#else
+static inline int xen_update_wallclock(const struct timespec *tv) {
+       return -EPERM;
+}
+#endif
+
+#endif /* _XEN_ASM_TIME_H */
+
+#include_next <asm/time.h>
diff --git a/arch/x86/include/mach-xen/asm/tlbflush.h b/arch/x86/include/mach-xen/asm/tlbflush.h

new file mode 100644 (file)

index 0000000..54d0ee0
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/tlbflush.h
@@ -0,0 +1,114 @@
+#ifndef _ASM_X86_TLBFLUSH_H
+#define _ASM_X86_TLBFLUSH_H
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+
+#include <asm/processor.h>
+#include <asm/special_insns.h>
+
+#define __flush_tlb() xen_tlb_flush()
+#define __flush_tlb_global() xen_tlb_flush()
+#define __flush_tlb_single(addr) xen_invlpg(addr)
+#define __flush_tlb_all() xen_tlb_flush()
+#define __flush_tlb_one(addr) xen_invlpg(addr)
+
+#ifdef CONFIG_X86_32
+# define TLB_FLUSH_ALL 0xffffffff
+#else
+# define TLB_FLUSH_ALL -1ULL
+#endif
+
+/*
+ * TLB flushing:
+ *
+ *  - flush_tlb() flushes the current mm struct TLBs
+ *  - flush_tlb_all() flushes all processes TLBs
+ *  - flush_tlb_mm(mm) flushes the specified mm context TLB's
+ *  - flush_tlb_page(vma, vmaddr) flushes one page
+ *  - flush_tlb_range(vma, start, end) flushes a range of pages
+ *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
+ *
+ * ..but the i386 has somewhat limited tlb flushing capabilities,
+ * and page-granular flushes are available only on i486 and up.
+ *
+ * x86-64 can only flush individual pages or full VMs. For a range flush
+ * we always do the full VM. It might be worth testing whether, for a
+ * small range, a few INVLPGs in a row are a win.
+ */
+
+#ifndef CONFIG_SMP
+
+#define flush_tlb() __flush_tlb()
+#define flush_tlb_all() __flush_tlb_all()
+#define local_flush_tlb() __flush_tlb()
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+       if (mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+                                 unsigned long addr)
+{
+       if (vma->vm_mm == current->active_mm)
+               __flush_tlb_one(addr);
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       if (vma->vm_mm == current->active_mm)
+               __flush_tlb();
+}
+
+static inline void reset_lazy_tlbstate(void)
+{
+}
+
+#else  /* SMP */
+
+#include <asm/smp.h>
+
+#define local_flush_tlb() __flush_tlb()
+
+#define flush_tlb_all xen_tlb_flush_all
+#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm))
+#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm))
+#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va)
+
+#define flush_tlb()    flush_tlb_current_task()
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+                                  unsigned long start, unsigned long end)
+{
+       flush_tlb_mm(vma->vm_mm);
+}
+
+#ifndef CONFIG_XEN
+#define TLBSTATE_OK    1
+#define TLBSTATE_LAZY  2
+
+struct tlb_state {
+       struct mm_struct *active_mm;
+       int state;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+static inline void reset_lazy_tlbstate(void)
+{
+       percpu_write(cpu_tlbstate.state, 0);
+       percpu_write(cpu_tlbstate.active_mm, &init_mm);
+}
+#endif
+
+#endif /* SMP */
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+                                         unsigned long end)
+{
+       flush_tlb_all();
+}
+
+#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/mach-xen/asm/vga.h b/arch/x86/include/mach-xen/asm/vga.h

new file mode 100644 (file)

index 0000000..fe4a3c4
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/vga.h
@@ -0,0 +1,20 @@
+/*
+ *     Access to VGA videoram
+ *
+ *     (c) 1998 Martin Mares <mj@ucw.cz>
+ */
+
+#ifndef _ASM_X86_VGA_H
+#define _ASM_X86_VGA_H
+
+/*
+ *     On the PC, we can just recalculate addresses and then
+ *     access the videoram directly without any black magic.
+ */
+
+#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x)
+
+#define vga_readb(x) (*(x))
+#define vga_writeb(x, y) (*(y) = (x))
+
+#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/mach-xen/asm/xenoprof.h b/arch/x86/include/mach-xen/asm/xenoprof.h

new file mode 100644 (file)

index 0000000..2733e00
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xenoprof.h
@@ -0,0 +1,48 @@
+/******************************************************************************
+ * arch/x86/include/mach-xen/asm/xenoprof.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#ifndef __ASM_XENOPROF_H__
+#define __ASM_XENOPROF_H__
+#ifdef CONFIG_XEN
+
+struct super_block;
+struct dentry;
+int xenoprof_create_files(struct super_block * sb, struct dentry * root);
+#define HAVE_XENOPROF_CREATE_FILES
+
+struct xenoprof_init;
+void xenoprof_arch_init_counter(struct xenoprof_init *init);
+void xenoprof_arch_counter(void);
+void xenoprof_arch_start(void);
+void xenoprof_arch_stop(void);
+
+struct xenoprof_arch_shared_buffer {
+       /* nothing */
+};
+struct xenoprof_shared_buffer;
+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_get_buffer;
+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf);
+struct xenoprof_passive;
+int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf);
+
+#endif /* CONFIG_XEN */
+#endif /* __ASM_XENOPROF_H__ */
diff --git a/arch/x86/include/mach-xen/asm/xor.h b/arch/x86/include/mach-xen/asm/xor.h

new file mode 100644 (file)

index 0000000..edb08e6
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xor.h
@@ -0,0 +1,8 @@
+#ifdef CONFIG_KMEMCHECK
+/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
+# include <asm-generic/xor.h>
+#elif defined(CONFIG_X86_32)
+# include "../../asm/xor_32.h"
+#else
+# include "xor_64.h"
+#endif
diff --git a/arch/x86/include/mach-xen/asm/xor_64.h b/arch/x86/include/mach-xen/asm/xor_64.h

new file mode 100644 (file)

index 0000000..420d6fd
--- /dev/null
+++ b/arch/x86/include/mach-xen/asm/xor_64.h
@@ -0,0 +1,339 @@
+#ifndef _ASM_X86_XOR_64_H
+#define _ASM_X86_XOR_64_H
+
+#include <asm/fpu-internal.h>
+
+/*
+ * x86-64 changes / gcc fixes from Andi Kleen.
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ *
+ * This hasn't been optimized for the hammer yet, but there are likely
+ * no advantages to be gotten from x86-64 here anyways.
+ */
+
+typedef struct {
+       unsigned long a, b;
+} __attribute__((aligned(16))) xmm_store_t;
+
+/* Stores xmm0-xmm3 into the caller's xmm_save[] by hand rather than via gcc,
+   because there is no easy way to tell gcc to do a clts before the saving. */
+#define XMMS_SAVE                              \
+do {                                           \
+       preempt_disable();                      \
+       if (!__thread_has_fpu(current))         \
+               clts();                         \
+       asm volatile(                           \
+               "movups %%xmm0,(%1)     ;\n\t"  \
+               "movups %%xmm1,0x10(%1) ;\n\t"  \
+               "movups %%xmm2,0x20(%1) ;\n\t"  \
+               "movups %%xmm3,0x30(%1) ;\n\t"  \
+               : "=&r" (cr0) /* %0: not referenced by the asm text */ \
+               : "r" (xmm_save)                \
+               : "memory");                    \
+} while (0)
+
+#define XMMS_RESTORE   /* reloads xmm0-xmm3 from xmm_save[] */ \
+do {                                           \
+       asm volatile(                           \
+               "sfence                 ;\n\t"  \
+               "movups (%1),%%xmm0     ;\n\t"  \
+               "movups 0x10(%1),%%xmm1 ;\n\t"  \
+               "movups 0x20(%1),%%xmm2 ;\n\t"  \
+               "movups 0x30(%1),%%xmm3 ;\n\t"  \
+               :                               \
+               : "r" (cr0), "r" (xmm_save) /* only %1 is used */ \
+               : "memory");                    \
+       if (!__thread_has_fpu(current))         \
+               stts();                         \
+       preempt_enable();                       \
+} while (0)
+
+#define OFFS(x)                "16*("#x")"
+#define PF_OFFS(x)     "256+16*("#x")"
+#define        PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
+#define LD(x, y)       "       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"    ;\n"
+#define ST(x, y)       "       movaps %%xmm"#y",   "OFFS(x)"(%[p1])    ;\n"
+#define PF1(x)         "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
+#define PF2(x)         "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
+#define PF3(x)         "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
+#define PF4(x)         "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
+#define PF5(x)         "       prefetchnta "PF_OFFS(x)"(%[p6])         ;\n"
+#define XO1(x, y)      "       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"     ;\n"
+#define XO2(x, y)      "       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"     ;\n"
+#define XO3(x, y)      "       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"     ;\n"
+#define XO4(x, y)      "       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"     ;\n"
+#define XO5(x, y)      "       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"     ;\n"
+
+
+static void
+xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       unsigned int lines = bytes >> 8;        /* number of 256-byte passes */
+       unsigned long cr0;
+       xmm_store_t xmm_save[4];
+
+       XMMS_SAVE;
+       /* p1 ^= p2: each BLOCK(i) handles 4 x 16 bytes, four BLOCKs per pass. */
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+               LD(i, 0)                                \
+                       LD(i + 1, 1)                    \
+               PF1(i)                                  \
+                               PF1(i + 2)              \
+                               LD(i + 2, 2)            \
+                                       LD(i + 3, 3)    \
+               PF0(i + 4)                              \
+                               PF0(i + 6)              \
+               XO1(i, 0)                               \
+                       XO1(i + 1, 1)                   \
+                               XO1(i + 2, 2)           \
+                                       XO1(i + 3, 3)   \
+               ST(i, 0)                                \
+                       ST(i + 1, 1)                    \
+                               ST(i + 2, 2)            \
+                                       ST(i + 3, 3)    \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       addq %[inc], %[p1]           ;\n"
+       "       addq %[inc], %[p2]           ;\n"
+               "               decl %[cnt] ; jnz 1b"
+       : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
+       : [inc] "r" (256UL)     /* bytes each pointer advances per pass */
+       : "memory");
+
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3)
+{
+       unsigned int lines = bytes >> 8;        /* number of 256-byte passes */
+       xmm_store_t xmm_save[4];
+       unsigned long cr0;
+
+       XMMS_SAVE;
+       /* p1 ^= p2 ^ p3: each BLOCK(i) handles 4 x 16 bytes, four per pass. */
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i + 2)              \
+               LD(i, 0)                                        \
+                       LD(i + 1, 1)                    \
+                               LD(i + 2, 2)            \
+                                       LD(i + 3, 3)    \
+               PF2(i)                                  \
+                               PF2(i + 2)              \
+               PF0(i + 4)                              \
+                               PF0(i + 6)              \
+               XO1(i, 0)                               \
+                       XO1(i + 1, 1)                   \
+                               XO1(i + 2, 2)           \
+                                       XO1(i + 3, 3)   \
+               XO2(i, 0)                               \
+                       XO2(i + 1, 1)                   \
+                               XO2(i + 2, 2)           \
+                                       XO2(i + 3, 3)   \
+               ST(i, 0)                                \
+                       ST(i + 1, 1)                    \
+                               ST(i + 2, 2)            \
+                                       ST(i + 3, 3)    \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       addq %[inc], %[p1]           ;\n"
+       "       addq %[inc], %[p2]          ;\n"
+       "       addq %[inc], %[p3]           ;\n"
+               "               decl %[cnt] ; jnz 1b"
+       : [cnt] "+r" (lines),
+         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+       : [inc] "r" (256UL)     /* bytes each pointer advances per pass */
+       : "memory");
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3, unsigned long *p4)
+{
+       unsigned int lines = bytes >> 8;        /* number of 256-byte passes */
+       xmm_store_t xmm_save[4];
+       unsigned long cr0;
+
+       XMMS_SAVE;
+       /* p1 ^= p2 ^ p3 ^ p4: each BLOCK(i) handles 4 x 16 bytes, four per pass. */
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i + 2)              \
+               LD(i, 0)                                \
+                       LD(i + 1, 1)                    \
+                               LD(i + 2, 2)            \
+                                       LD(i + 3, 3)    \
+               PF2(i)                                  \
+                               PF2(i + 2)              \
+               XO1(i, 0)                               \
+                       XO1(i + 1, 1)                   \
+                               XO1(i + 2, 2)           \
+                                       XO1(i + 3, 3)   \
+               PF3(i)                                  \
+                               PF3(i + 2)              \
+               PF0(i + 4)                              \
+                               PF0(i + 6)              \
+               XO2(i, 0)                               \
+                       XO2(i + 1, 1)                   \
+                               XO2(i + 2, 2)           \
+                                       XO2(i + 3, 3)   \
+               XO3(i, 0)                               \
+                       XO3(i + 1, 1)                   \
+                               XO3(i + 2, 2)           \
+                                       XO3(i + 3, 3)   \
+               ST(i, 0)                                \
+                       ST(i + 1, 1)                    \
+                               ST(i + 2, 2)            \
+                                       ST(i + 3, 3)    \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       addq %[inc], %[p1]           ;\n"
+       "       addq %[inc], %[p2]           ;\n"
+       "       addq %[inc], %[p3]           ;\n"
+       "       addq %[inc], %[p4]           ;\n"
+       "       decl %[cnt] ; jnz 1b"
+       : [cnt] "+c" (lines),
+         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+       : [inc] "r" (256UL)     /* bytes each pointer advances per pass */
+       : "memory" );
+
+       XMMS_RESTORE;
+}
+
+static void
+xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+         unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       unsigned int lines = bytes >> 8;        /* number of 256-byte passes */
+       xmm_store_t xmm_save[4];
+       unsigned long cr0;
+
+       XMMS_SAVE;
+       /* p1 ^= p2 ^ p3 ^ p4 ^ p5: each BLOCK(i) handles 4 x 16 bytes. */
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i) \
+               PF1(i)                                  \
+                               PF1(i + 2)              \
+               LD(i, 0)                                \
+                       LD(i + 1, 1)                    \
+                               LD(i + 2, 2)            \
+                                       LD(i + 3, 3)    \
+               PF2(i)                                  \
+                               PF2(i + 2)              \
+               XO1(i, 0)                               \
+                       XO1(i + 1, 1)                   \
+                               XO1(i + 2, 2)           \
+                                       XO1(i + 3, 3)   \
+               PF3(i)                                  \
+                               PF3(i + 2)              \
+               XO2(i, 0)                               \
+                       XO2(i + 1, 1)                   \
+                               XO2(i + 2, 2)           \
+                                       XO2(i + 3, 3)   \
+               PF4(i)                                  \
+                               PF4(i + 2)              \
+               PF0(i + 4)                              \
+                               PF0(i + 6)              \
+               XO3(i, 0)                               \
+                       XO3(i + 1, 1)                   \
+                               XO3(i + 2, 2)           \
+                                       XO3(i + 3, 3)   \
+               XO4(i, 0)                               \
+                       XO4(i + 1, 1)                   \
+                               XO4(i + 2, 2)           \
+                                       XO4(i + 3, 3)   \
+               ST(i, 0)                                \
+                       ST(i + 1, 1)                    \
+                               ST(i + 2, 2)            \
+                                       ST(i + 3, 3)    \
+
+
+               PF0(0)
+                               PF0(2)
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       addq %[inc], %[p1]           ;\n"
+       "       addq %[inc], %[p2]           ;\n"
+       "       addq %[inc], %[p3]           ;\n"
+       "       addq %[inc], %[p4]           ;\n"
+       "       addq %[inc], %[p5]           ;\n"
+       "       decl %[cnt] ; jnz 1b"
+       : [cnt] "+c" (lines),
+         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
+         [p5] "+r" (p5)
+       : [inc] "r" (256UL)     /* bytes each pointer advances per pass */
+       : "memory");
+
+       XMMS_RESTORE;
+}
+
+static struct xor_block_template xor_block_sse = {
+       .name = "generic_sse",
+       .do_2 = xor_sse_2,
+       .do_3 = xor_sse_3,
+       .do_4 = xor_sse_4,
+       .do_5 = xor_sse_5,
+};
+
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES                      \
+do {                                           \
+       xor_speed(&xor_block_sse);              \
+} while (0)
+
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
+#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+
+#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index 532d2e0..5da6a86 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -102,6 +102,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
  obj-$(CONFIG_SWIOTLB)                  += pci-swiotlb.o
  obj-$(CONFIG_OF)                       += devicetree.o
  
+obj-$(CONFIG_X86_XEN)          += fixup.o
+
  ###
  # 64 bit specific files
  ifeq ($(CONFIG_X86_64),y)
@@ -113,3 +115,8 @@ ifeq ($(CONFIG_X86_64),y)
         obj-$(CONFIG_PCI_MMCONFIG)      += mmconf-fam10h_64.o
         obj-y                           += vsmp_64.o
  endif
+
+disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8237.o i8253.o \
+       i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o trampoline%.o \
+       tsc.o tsc_sync.o vsmp_64.o
+disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile

index 6f35260..528e3de 100644 (file)
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -5,6 +5,9 @@ obj-$(CONFIG_ACPI_SLEEP)        += sleep.o wakeup_rm.o wakeup_$(BITS).o
  
  ifneq ($(CONFIG_ACPI_PROCESSOR),)
  obj-y                          += cstate.o
+ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),)
+obj-$(CONFIG_XEN)              += processor_extcntl_xen.o
+endif
  endif
  
  $(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
@@ -12,3 +15,4 @@ $(obj)/wakeup_rm.o:    $(obj)/realmode/wakeup.bin
  $(obj)/realmode/wakeup.bin: FORCE
         $(Q)$(MAKE) $(build)=$(obj)/realmode
  
+disabled-obj-$(CONFIG_XEN)     := cstate.o sleep.o wakeup_%.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c

index 7c439fe..79c14db 100644 (file)
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -70,6 +70,7 @@ int acpi_strict;
  
  u8 acpi_sci_flags __initdata;
  int acpi_sci_override_gsi __initdata;
+#ifndef CONFIG_XEN
  int acpi_skip_timer_override __initdata;
  int acpi_use_timer_override __initdata;
  int acpi_fix_pin2_polarity __initdata;
@@ -77,6 +78,10 @@ int acpi_fix_pin2_polarity __initdata;
  #ifdef CONFIG_X86_LOCAL_APIC
  static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  #endif
+#else
+#define acpi_skip_timer_override 0
+#define acpi_fix_pin2_polarity 0
+#endif
  
  #ifndef __HAVE_ARCH_CMPXCHG
  #warning ACPI uses CMPXCHG, i486 and later hardware
@@ -182,6 +187,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
                 return -ENODEV;
         }
  
+#ifndef CONFIG_XEN
         if (madt->address) {
                 acpi_lapic_addr = (u64) madt->address;
  
@@ -191,12 +197,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
  
         default_acpi_madt_oem_check(madt->header.oem_id,
                                     madt->header.oem_table_id);
+#endif
  
         return 0;
  }
  
  static void __cpuinit acpi_register_lapic(int id, u8 enabled)
  {
+#ifndef CONFIG_XEN
         unsigned int ver = 0;
  
         if (id >= (MAX_LOCAL_APIC-1)) {
@@ -213,6 +221,7 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
                 ver = apic_version[boot_cpu_physical_apicid];
  
         generic_processor_info(id, ver);
+#endif
  }
  
  static int __init
@@ -243,7 +252,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
                 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
         else
                 acpi_register_lapic(apic_id, enabled);
-#else
+#elif !defined(CONFIG_XEN)
         printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
  #endif
  
@@ -297,6 +306,7 @@ static int __init
  acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
                           const unsigned long end)
  {
+#ifndef CONFIG_XEN
         struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL;
  
         lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header;
@@ -305,6 +315,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
                 return -EINVAL;
  
         acpi_lapic_addr = lapic_addr_ovr->address;
+#endif
  
         return 0;
  }
@@ -593,6 +604,7 @@ void __init acpi_set_irq_model_ioapic(void)
  #ifdef CONFIG_ACPI_HOTPLUG_CPU
  #include <acpi/processor.h>
  
+#ifndef CONFIG_XEN
  static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
  {
  #ifdef CONFIG_ACPI_NUMA
@@ -679,6 +691,9 @@ free_tmp_map:
  out:
         return retval;
  }
+#else
+#define _acpi_map_lsapic(h, p) (-EINVAL)
+#endif
  
  /* wrapper to silence section mismatch warning */
  int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
@@ -689,9 +704,11 @@ EXPORT_SYMBOL(acpi_map_lsapic);
  
  int acpi_unmap_lsapic(int cpu)
  {
+#ifndef CONFIG_XEN
         per_cpu(x86_cpu_to_apicid, cpu) = -1;
         set_cpu_present(cpu, false);
         num_processors--;
+#endif
  
         return (0);
  }
@@ -1333,6 +1350,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
         return 0;
  }
  
+#ifndef CONFIG_XEN
  /*
   * Force ignoring BIOS IRQ0 pin2 override
   */
@@ -1350,6 +1368,22 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
         }
         return 0;
  }
+#endif
+
+/* DMI callback: sets acpi_rsdt_forced unless acpi=force was requested. */
+static int __init force_acpi_rsdt(const struct dmi_system_id *d)
+{
+       if (acpi_force) {
+               printk(KERN_NOTICE
+                      "Warning: acpi=force overrules DMI blacklist: "
+                      "acpi=rsdt\n");
+               return 0;
+       }
+
+       printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n", d->ident);
+       acpi_rsdt_forced = 1;
+       return 0;
+}
  
  /*
   * If your system is blacklisted here, but you find that acpi=force
@@ -1426,9 +1460,36 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
                      DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
                      },
          },
+
+       /*
+        * Boxes that need RSDT as ACPI root table
+        */
+       {
+           .callback = force_acpi_rsdt,
+           .ident = "ThinkPad ", /* R40e, broken C-states */
+           .matches = {
+               DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+               DMI_MATCH(DMI_BIOS_VERSION, "1SET")},
+       },
+       {
+           .callback = force_acpi_rsdt,
+           .ident = "ThinkPad ", /* R50e, slow booting */
+           .matches = {
+               DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+               DMI_MATCH(DMI_BIOS_VERSION, "1WET")},
+       },
+       {
+           .callback = force_acpi_rsdt,
+           .ident = "ThinkPad ", /* T40, T40p, T41, T41p, T42, T42p
+                                    R50, R50p */
+           .matches = {
+               DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+               DMI_MATCH(DMI_BIOS_VERSION, "1RET")},
+       },
         {}
  };
  
+#ifndef CONFIG_XEN
  /* second table for DMI checks that should run after early-quirks */
  static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
         /*
@@ -1475,6 +1536,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
          },
         {}
  };
+#endif
  
  /*
   * acpi_boot_table_init() and acpi_boot_init()
@@ -1547,8 +1609,10 @@ int __init early_acpi_boot_init(void)
  
  int __init acpi_boot_init(void)
  {
+#ifndef CONFIG_XEN
         /* those are executed after early-quirks are executed */
         dmi_check_system(acpi_dmi_table_late);
+#endif
  
         /*
          * If acpi_disabled, bail out
@@ -1613,6 +1677,18 @@ static int __init parse_acpi(char *arg)
  }
  early_param("acpi", parse_acpi);
  
+/* Alias for acpi=rsdt for compatibility with openSUSE 11.1 and SLE11 */
+static int __init parse_acpi_root_table(char *opt)
+{
+       if (!opt || strcmp(opt, "rsdt"))
+               return opt ? 0 : -EINVAL;       /* NULL: no "=value" given */
+       acpi_rsdt_forced = 1;
+       printk(KERN_WARNING "acpi_root_table=rsdt is deprecated. "
+              "Please use acpi=rsdt instead.\n");
+       return 0;
+}
+early_param("acpi_root_table", parse_acpi_root_table);
+
  /* FIXME: Using pci= for an ACPI parameter is a travesty. */
  static int __init parse_pci(char *arg)
  {
@@ -1636,7 +1712,7 @@ int __init acpi_mps_check(void)
         return 0;
  }
  
-#ifdef CONFIG_X86_IO_APIC
+#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
  static int __init parse_acpi_skip_timer_override(char *arg)
  {
         acpi_skip_timer_override = 1;
diff --git a/arch/x86/kernel/acpi/processor_extcntl_xen.c b/arch/x86/kernel/acpi/processor_extcntl_xen.c

new file mode 100644 (file)

index 0000000..b927803
--- /dev/null
+++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c
@@ -0,0 +1,301 @@
+/*
+ * processor_extcntl_xen.c - interface to notify Xen
+ *
+ *  Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/cpufreq.h>
+#include <acpi/processor.h>
+#include <asm/hypercall.h>
+
+/*
+ * Hand the valid C-states of @pr to Xen.  Returns the hypercall result,
+ * -ENOMEM, or -EINVAL (a state carries _CSD data, or none is valid).
+ */
+static int xen_cx_notifier(struct acpi_processor *pr, int action)
+{
+       int rc = -EINVAL, nr = 0, idx;
+       xen_platform_op_t op = {
+               .cmd                    = XENPF_set_processor_pminfo,
+               .interface_version      = XENPF_INTERFACE_VERSION,
+               .u.set_pminfo.id        = pr->acpi_id,
+               .u.set_pminfo.type      = XEN_PM_CX,
+       };
+       struct xen_processor_cx *dst, *buf;
+       struct acpi_processor_cx *cx;
+
+       /* Convert to Xen defined structure and hypercall */
+       buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx),
+                       GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       for (idx = 1, dst = buf; idx <= pr->power.count; idx++) {
+               cx = &pr->power.states[idx];
+               /* Skip invalid cstate entry */
+               if (!cx->valid)
+                       continue;
+
+               /* Dependency relationships are not supported: give up */
+               if (cx->csd_count) {
+                       pr_warning("_CSD found: Not supported for now!\n");
+                       goto free_buf;
+               }
+
+               dst->type = cx->type;
+               dst->latency = cx->latency;
+               dst->power = cx->power;
+               dst->reg.space_id = cx->reg.space_id;
+               dst->reg.bit_width = cx->reg.bit_width;
+               dst->reg.bit_offset = cx->reg.bit_offset;
+               dst->reg.access_size = cx->reg.access_size;
+               dst->reg.address = cx->reg.address;
+               dst->dpcnt = 0;
+               set_xen_guest_handle(dst->dp, NULL);
+               dst++;
+               nr++;
+       }
+
+       if (!nr) {
+               pr_info("No available Cx info for cpu %d\n", pr->acpi_id);
+               goto free_buf;
+       }
+
+       op.u.set_pminfo.u.power.count = nr;
+       op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control;
+       op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check;
+       op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst;
+       op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done;
+
+       set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf);
+       rc = HYPERVISOR_platform_op(&op);
+free_buf:
+       kfree(buf);
+       return rc;
+}
+
+/*
+ * Report ACPI P-state data for @pr to Xen: only the platform limit on
+ * PROCESSOR_PM_CHANGE, the full set on PROCESSOR_PM_INIT; -EINVAL otherwise.
+ */
+static int xen_px_notifier(struct acpi_processor *pr, int action)
+{
+       int ret = -EINVAL;
+       xen_platform_op_t op = {
+               .cmd                    = XENPF_set_processor_pminfo,
+               .interface_version      = XENPF_INTERFACE_VERSION,
+               .u.set_pminfo.type      = XEN_PM_PX,
+       };
+       struct xen_processor_performance *perf = &op.u.set_pminfo.u.perf;
+       struct xen_processor_px *states = NULL;
+       struct acpi_processor_performance *px;
+       struct acpi_psd_package *pdomain;
+
+       /* Validate @pr before reading any of its fields. */
+       if (!pr || !pr->performance)
+               return -EINVAL;
+
+       op.u.set_pminfo.id = pr->acpi_id;
+       px = pr->performance;
+
+       switch (action) {
+       case PROCESSOR_PM_CHANGE:
+               /* ppc dynamic handle */
+               perf->flags = XEN_PX_PPC;
+               perf->platform_limit = pr->performance_platform_limit;
+
+               ret = HYPERVISOR_platform_op(&op);
+               break;
+
+       case PROCESSOR_PM_INIT:
+               /* px normal init */
+               perf->flags = XEN_PX_PPC | XEN_PX_PCT |
+                             XEN_PX_PSS | XEN_PX_PSD;
+
+               /* ppc */
+               perf->platform_limit = pr->performance_platform_limit;
+
+               /* pct */
+               xen_convert_pct_reg(&perf->control_register, &px->control_register);
+               xen_convert_pct_reg(&perf->status_register, &px->status_register);
+
+               /* pss */
+               perf->state_count = px->state_count;
+               states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL);
+               if (!states)
+                       return -ENOMEM;
+               xen_convert_pss_states(states, px->states, px->state_count);
+               set_xen_guest_handle(perf->states, states);
+
+               /* psd */
+               pdomain = &px->domain_info;
+               xen_convert_psd_pack(&perf->domain_info, pdomain);
+               if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL)
+                       perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+               else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY)
+                       perf->shared_type = CPUFREQ_SHARED_TYPE_ANY;
+               else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL)
+                       perf->shared_type = CPUFREQ_SHARED_TYPE_HW;
+               else {
+                       ret = -ENODEV;
+                       kfree(states);
+                       break;
+               }
+
+               ret = HYPERVISOR_platform_op(&op);
+               kfree(states);
+               break;
+
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+static int xen_tx_notifier(struct acpi_processor *pr, int action)
+{
+       return -EINVAL; /* stub: every request is rejected */
+}
+
+/*
+ * Forward an ACPI CPU hotplug @event for @pr to Xen (only HOTPLUG_TYPE_ADD
+ * reaches the hypervisor).  Returns the hypercall result or a negative errno.
+ */
+static int xen_hotplug_notifier(struct acpi_processor *pr, int event)
+{
+       int ret = -EINVAL;
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+       acpi_status status = 0;
+       acpi_object_type type;
+       int apic_id, device_decl = 0;   /* signed: failure is tested as "< 0" */
+       unsigned long long pxm;
+       xen_platform_op_t op;
+
+       status = acpi_get_type(pr->handle, &type);
+       if (ACPI_FAILURE(status)) {
+               pr_warn("can't get object type for acpi_id %#x\n",
+                       pr->acpi_id);
+               return -ENXIO;
+       }
+
+       if (type == ACPI_TYPE_DEVICE) {
+               device_decl = 1;
+       } else if (type != ACPI_TYPE_PROCESSOR) {
+               pr_warn("unsupported object type %#x for acpi_id %#x\n",
+                       type, pr->acpi_id);
+               return -EOPNOTSUPP;
+       }
+
+       /* NOTE(review): ~device_decl is negative; its meaning is not shown here */
+       apic_id = acpi_get_cpuid(pr->handle, ~device_decl, pr->acpi_id);
+       if (apic_id < 0) {
+               pr_warn("can't get apic_id for acpi_id %#x\n", pr->acpi_id);
+               return -ENODATA;
+       }
+
+       status = acpi_evaluate_integer(pr->handle, "_PXM", NULL, &pxm);
+       if (ACPI_FAILURE(status)) {
+               pr_warn("can't get pxm for acpi_id %#x\n", pr->acpi_id);
+               return -ENODATA;
+       }
+
+       switch (event) {
+       case HOTPLUG_TYPE_ADD:
+               op.cmd = XENPF_cpu_hotadd;
+               op.u.cpu_add.apic_id = apic_id;
+               op.u.cpu_add.acpi_id = pr->acpi_id;
+               op.u.cpu_add.pxm = pxm;
+               ret = HYPERVISOR_platform_op(&op);
+               break;
+       case HOTPLUG_TYPE_REMOVE:
+               pr_warn("Xen doesn't support CPU hot remove\n");
+               ret = -EOPNOTSUPP;
+               break;
+       }
+#endif
+
+       return ret;
+}
+
+static struct processor_extcntl_ops xen_extcntl_ops = {
+       .hotplug                = xen_hotplug_notifier,
+};     /* pm_ops[] entries are assigned in init_extcntl() */
+
+/* Pass the sleep request to the hypervisor: 1 on success, -1 on failure. */
+static int xen_sleep(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)
+{
+       int rc;
+
+       rc = acpi_notify_hypervisor_state(sleep_state, pm1a_ctrl, pm1b_ctrl);
+       if (rc)
+               pr_err("ACPI: Hypervisor failure [%d]\n", rc);
+
+       return rc ? -1 : 1;
+}
+
+static int __init init_extcntl(void)
+{
+       unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8;
+
+#ifndef CONFIG_ACPI_HOTPLUG_CPU
+       if (!pmbits)            /* no PM bits set: leave the ops unregistered */
+               return 0;
+#endif
+       if (pmbits & XEN_PROCESSOR_PM_CX)
+               xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier;
+       if (pmbits & XEN_PROCESSOR_PM_PX)
+               xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier;
+       if (pmbits & XEN_PROCESSOR_PM_TX)
+               xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier;
+       /* With CPU hotplug configured this is reached even when pmbits is 0. */
+       processor_extcntl_ops = &xen_extcntl_ops;
+
+       acpi_os_set_prepare_sleep(xen_sleep);
+
+       return 0;
+}
+arch_initcall(init_extcntl);
+
+unsigned int cpufreq_quick_get(unsigned int cpu)
+{
+       xen_platform_op_t op;   /* only cmd and the vcpu field are set here */
+       /* Returns the freq field after the hypercall, or 0 when it fails. */
+       op.cmd = XENPF_get_cpu_freq;
+       op.u.get_cpu_freq.vcpu = cpu;
+       return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0;
+}
+
+unsigned int cpufreq_quick_get_max(unsigned int cpu)
+{
+       xen_platform_op_t op;   /* only cmd and the vcpu field are set here */
+       /* Returns the freq field after the hypercall, or 0 when it fails. */
+       op.cmd = XENPF_get_cpu_freq_max;
+       op.u.get_cpu_freq.vcpu = cpu;
+       return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0;
+}
+EXPORT_SYMBOL(cpufreq_quick_get_max);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c

index be16854..1bd8529 100644 (file)
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -15,6 +15,10 @@ static u32 *flush_words;
  const struct pci_device_id amd_nb_misc_ids[] = {
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+#ifdef CONFIG_XEN
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
+       { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */
+#endif
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
         {}
  };
@@ -150,6 +154,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
         return res;
  }
  
+#ifndef CONFIG_XEN
  int amd_get_subcaches(int cpu)
  {
         struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
@@ -204,6 +209,7 @@ int amd_set_subcaches(int cpu, int mask)
  
         return 0;
  }
+#endif
  
  static int amd_cache_gart(void)
  {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile

index 0ae0323..f30b902 100644 (file)
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -25,3 +25,7 @@ obj-$(CONFIG_X86_ES7000)      += es7000_32.o
  
  # For 32bit, probe_32 need to be listed last
  obj-$(CONFIG_X86_LOCAL_APIC)   += probe_$(BITS).o
+
+probe_64-$(CONFIG_XEN)         := probe_32.o
+
+disabled-obj-$(CONFIG_XEN)     := apic_%.o
diff --git a/arch/x86/kernel/apic/apic-xen.c b/arch/x86/kernel/apic/apic-xen.c

new file mode 100644 (file)

index 0000000..6b0603c
--- /dev/null
+++ b/arch/x86/kernel/apic/apic-xen.c
@@ -0,0 +1,69 @@
+/*
+ *     Local APIC handling stubs
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+
+#include <asm/smp.h>
+#include <asm/proto.h>
+#include <asm/apic.h>
+
+unsigned int num_processors;
+
+/*
+ * Debug level, exported for io_apic.c
+ */
+unsigned int apic_verbosity;
+
+/* Have we found an MP table */
+int smp_found_config;
+
+/*
+ * Handler for the "apic" early parameter.
+ *
+ * "apic=debug" and "apic=verbose" select the matching apic_verbosity
+ * level; any other value is rejected with a warning and -EINVAL.
+ * A bare "apic" (no argument) clears skip_ioapic_setup on 64-bit and
+ * is an error elsewhere.
+ */
+static int __init apic_set_verbosity(char *arg)
+{
+       if (arg == NULL) {
+#ifdef CONFIG_X86_64
+               skip_ioapic_setup = 0;
+               return 0;
+#else
+               return -EINVAL;
+#endif
+       }
+
+       if (!strcmp(arg, "debug")) {
+               apic_verbosity = APIC_DEBUG;
+               return 0;
+       }
+
+       if (!strcmp(arg, "verbose")) {
+               apic_verbosity = APIC_VERBOSE;
+               return 0;
+       }
+
+       pr_warning("APIC Verbosity level %s not recognised"
+               " use apic=verbose or apic=debug\n", arg);
+       return -EINVAL;
+}
+early_param("apic", apic_set_verbosity);
+
+/* Stub: the profiling timer multiplier cannot be changed here; always -EINVAL. */
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+
+#ifndef CONFIG_SMP
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ *
+ * In this stub only the IO-APIC part is done: with CONFIG_X86_IO_APIC,
+ * setup_IO_APIC() is called when an MP configuration was found, IO-APIC
+ * setup has not been disabled and at least one IO-APIC is known.
+ * Always returns 0.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+# ifdef CONFIG_X86_64
+       /* 64-bit only: forget the IO-APICs when they are not being set up */
+       else
+               nr_ioapics = 0;
+# endif
+#endif
+
+       return 0;
+}
+#endif
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c

index 0cdec70..6e8a05c 100644 (file)
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -153,7 +153,7 @@ static void bigsmp_send_IPI_all(int vector)
  
  static int dmi_bigsmp; /* can be set by dmi scanners */
  
-static int hp_ht_bigsmp(const struct dmi_system_id *d)
+static int force_bigsmp_apic(const struct dmi_system_id *d)
  {
         printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident);
         dmi_bigsmp = 1;
@@ -163,17 +163,41 @@ static int hp_ht_bigsmp(const struct dmi_system_id *d)
  
  
  static const struct dmi_system_id bigsmp_dmi_table[] = {
-       { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+       { force_bigsmp_apic, "HP ProLiant DL760 G2",
                 {       DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
                         DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
                 }
         },
  
-       { hp_ht_bigsmp, "HP ProLiant DL740",
+       { force_bigsmp_apic, "HP ProLiant DL740",
                 {       DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
                         DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
                 }
         },
+
+       { force_bigsmp_apic, "IBM x260 / x366 / x460",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "-[ZT"),
+               }
+       },
+
+       { force_bigsmp_apic, "IBM x3800 / x3850 / x3950",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "-[ZU"),
+               }
+       },
+
+       { force_bigsmp_apic, "IBM x3800 / x3850 / x3950",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "-[ZS"),
+               }
+       },
+
+       { force_bigsmp_apic, "IBM x3850 M2 / x3950 M2",
+               {       DMI_MATCH(DMI_BIOS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "-[A3"),
+               }
+       },
         { } /* NULL entry stops DMI scanning */
  };
  
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c

index 31cb9ae..8773f2c 100644 (file)
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -26,6 +26,10 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
  #endif
  
  #ifdef arch_trigger_all_cpu_backtrace
+#ifdef CONFIG_XEN
+#include <asm/ipi.h>
+#endif
+
  /* For reliability, we're prepared to waste bits here. */
  static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
  
@@ -46,7 +50,11 @@ void arch_trigger_all_cpu_backtrace(void)
         cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
  
         printk(KERN_INFO "sending NMI to all CPUs:\n");
+#ifndef CONFIG_XEN
         apic->send_IPI_all(NMI_VECTOR);
+#else /* this works even without CONFIG_X86_LOCAL_APIC */
+       xen_send_IPI_all(NMI_VECTOR);
+#endif
  
         /* Wait for up to 10 seconds for all CPUs to do the backtrace */
         for (i = 0; i < 10 * 1000; i++) {
diff --git a/arch/x86/kernel/apic/io_apic-xen.c b/arch/x86/kernel/apic/io_apic-xen.c

new file mode 100644 (file)

index 0000000..cad202f
--- /dev/null
+++ b/arch/x86/kernel/apic/io_apic-xen.c
@@ -0,0 +1,4292 @@
+/*
+ *     Intel IO-APIC support for multi-Pentium hosts.
+ *
+ *     Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ *
+ *     Many thanks to Stig Venaas for trying out countless experimental
+ *     patches and reporting/debugging problems patiently!
+ *
+ *     (c) 1999, Multiple IO-APIC support, developed by
+ *     Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ *     further tested and cleaned up by Zach Brown <zab@redhat.com>
+ *     and Ingo Molnar <mingo@redhat.com>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively
+ *     Paul Diefenbaugh        :       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/syscore_ops.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h>     /* time_after() */
+#include <linux/slab.h>
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+#include <linux/bootmem.h>
+
+#include <asm/idle.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/setup.h>
+#include <asm/hw_irq.h>
+
+#include <asm/apic.h>
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/evtchn.h>
+
+/*
+ * Fake i8259: a minimal legacy_pic stand-in.  Only nr_legacy_irqs and
+ * ->make_irq are provided; make_irq merely clears the IRQ's bit in
+ * io_apic_irqs.  The legacy_pic macro below makes the code in this
+ * file use this instance.
+ */
+static void make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1UL<<irq); }
+static const struct legacy_pic xen_legacy_pic = {
+       .nr_legacy_irqs = NR_IRQS_LEGACY,
+       .make_irq = make_8259A_irq
+};
+#define legacy_pic (&xen_legacy_pic)
+
+unsigned long io_apic_irqs;
+#endif /* CONFIG_XEN */
+
+#define __apicdebuginit(type) static type __init
+
+#define for_each_irq_pin(entry, head) \
+       for (entry = head; entry; entry = entry->next)
+
+#ifndef CONFIG_XEN
+static void            __init __ioapic_init_mappings(void);
+
+static unsigned int    __io_apic_read  (unsigned int apic, unsigned int reg);
+static void            __io_apic_write (unsigned int apic, unsigned int reg, unsigned int val);
+static void            __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
+
+static struct io_apic_ops io_apic_ops = {
+       .init   = __ioapic_init_mappings,
+       .read   = __io_apic_read,
+       .write  = __io_apic_write,
+       .modify = __io_apic_modify,
+};
+
+/* Replace the whole IO-APIC accessor table with a copy of *@ops. */
+void __init set_io_apic_ops(const struct io_apic_ops *ops)
+{
+       io_apic_ops = *ops;
+}
+#endif
+
+/*
+ *      Is the SiS APIC rmw bug present ?
+ *      -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+#ifndef CONFIG_XEN
+static DEFINE_RAW_SPINLOCK(vector_lock);
+#endif
+
+static struct ioapic {
+       /*
+        * # of IRQ routing registers
+        */
+       int nr_registers;
+#ifndef CONFIG_XEN
+       /*
+        * Saved state during suspend/resume, or while enabling intr-remap.
+        */
+       struct IO_APIC_route_entry *saved_registers;
+#endif
+       /* I/O APIC config */
+       struct mpc_ioapic mp_config;
+       /* IO APIC gsi routing info */
+       struct mp_ioapic_gsi  gsi_config;
+       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
+
+#define mpc_ioapic_ver(ioapic_idx)     ioapics[ioapic_idx].mp_config.apicver
+
+/* Return the APIC ID recorded in the MP config of IO-APIC @ioapic_idx (not range-checked). */
+int mpc_ioapic_id(int ioapic_idx)
+{
+       return ioapics[ioapic_idx].mp_config.apicid;
+}
+
+/* Return the address recorded in the MP config of IO-APIC @ioapic_idx (not range-checked). */
+unsigned int mpc_ioapic_addr(int ioapic_idx)
+{
+       return ioapics[ioapic_idx].mp_config.apicaddr;
+}
+
+/* Return a pointer to the GSI routing info of IO-APIC @ioapic_idx (not range-checked). */
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+{
+       return &ioapics[ioapic_idx].gsi_config;
+}
+
+int nr_ioapics;
+
+/* The one past the highest gsi number used */
+u32 gsi_top;
+
+/* MP IRQ source entries */
+struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+#ifndef CONFIG_XEN
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+#endif
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
+int skip_ioapic_setup;
+
+/**
+ * _disable_ioapic_support() - disables ioapic support at runtime
+ *
+ * Sets skip_ioapic_setup; with CONFIG_PCI it additionally sets
+ * noioapicquirk to 1 and noioapicreroute to -1.
+ */
+static void __init _disable_ioapic_support(void)
+{
+#ifdef CONFIG_PCI
+       noioapicquirk = 1;
+       noioapicreroute = -1;
+#endif
+       skip_ioapic_setup = 1;
+}
+
+/* "noapic" early parameter: @str is ignored, IO-APIC support is disabled; returns 0. */
+static int __init parse_noapic(char *str)
+{
+       /* disable IO-APIC */
+       _disable_ioapic_support();
+       return 0;
+}
+early_param("noapic", parse_noapic);
+
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+                                struct io_apic_irq_attr *attr);
+
+/*
+ * Record one MP interrupt source entry; called from the mpparse/acpi/sfi
+ * code.  An entry that is byte-for-byte identical to one already stored
+ * is ignored.  Panics as soon as the table holds MAX_IRQ_SOURCES entries.
+ */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+       int idx;
+
+       apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+               m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+               m->srcbusirq, m->dstapic, m->dstirq);
+
+       /* nothing to do if this exact entry is already known */
+       for (idx = 0; idx < mp_irq_entries; idx++)
+               if (memcmp(&mp_irqs[idx], m, sizeof(*m)) == 0)
+                       return;
+
+       memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+       mp_irq_entries++;
+       if (mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!!\n");
+}
+
+#ifndef CONFIG_XEN
+struct irq_pin_list {
+       int apic, pin;
+       struct irq_pin_list *next;
+};
+
+/* Allocate one zeroed irq_pin_list element on @node; NULL on allocation failure. */
+static struct irq_pin_list *alloc_irq_pin_list(int node)
+{
+       return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
+}
+
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
+
+/*
+ * Early IRQ setup for the IO-APIC code:
+ *  - if there are no legacy IRQs, mark every IRQ as IO-APIC routed;
+ *  - allocate the per-IO-APIC buffers used to save routing entries;
+ *  - attach the static irq_cfgx[] entries to the first IRQs and give the
+ *    legacy ones their initial vector on CPU 0.
+ * Always returns 0.
+ */
+int __init arch_early_irq_init(void)
+{
+       struct irq_cfg *cfg;
+       int count, node, i;
+
+       if (!legacy_pic->nr_legacy_irqs)
+               io_apic_irqs = ~0UL;
+
+       /* a failed allocation is only reported; saved_registers stays NULL */
+       for (i = 0; i < nr_ioapics; i++) {
+               ioapics[i].saved_registers =
+                       kzalloc(sizeof(struct IO_APIC_route_entry) *
+                               ioapics[i].nr_registers, GFP_KERNEL);
+               if (!ioapics[i].saved_registers)
+                       pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+       }
+
+       cfg = irq_cfgx;
+       count = ARRAY_SIZE(irq_cfgx);
+       node = cpu_to_node(0);
+
+       /* Make sure the legacy interrupts are marked in the bitmap */
+       irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
+
+       for (i = 0; i < count; i++) {
+               irq_set_chip_data(i, &cfg[i]);
+               /* NOTE(review): the results of these two allocations are not checked */
+               zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
+               zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
+               /*
+                * For legacy IRQ's, start with assigning irq0 to irq15 to
+                * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
+                */
+               if (i < legacy_pic->nr_legacy_irqs) {
+                       cfg[i].vector = IRQ0_VECTOR + i;
+                       cpumask_set_cpu(0, cfg[i].domain);
+               }
+       }
+
+       return 0;
+}
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+       return irq_get_chip_data(irq);
+}
+
+/*
+ * Allocate a zeroed irq_cfg on @node together with its two cpumasks
+ * (domain and old_domain).  Returns NULL, with everything released
+ * again, if any of the three allocations fails.  @irq is not used here.
+ */
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+       struct irq_cfg *new_cfg = kzalloc_node(sizeof(*new_cfg), GFP_KERNEL, node);
+
+       if (!new_cfg)
+               return NULL;
+
+       if (zalloc_cpumask_var_node(&new_cfg->domain, GFP_KERNEL, node)) {
+               if (zalloc_cpumask_var_node(&new_cfg->old_domain,
+                                           GFP_KERNEL, node))
+                       return new_cfg;
+
+               /* second mask failed: drop the first one again */
+               free_cpumask_var(new_cfg->domain);
+       }
+
+       kfree(new_cfg);
+       return NULL;
+}
+
+/*
+ * Detach @cfg from IRQ @at (chip data is reset to NULL) and free it
+ * together with both of its cpumasks.  A NULL @cfg is a no-op.
+ */
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
+{
+       if (cfg) {
+               irq_set_chip_data(at, NULL);
+               free_cpumask_var(cfg->domain);
+               free_cpumask_var(cfg->old_domain);
+               kfree(cfg);
+       }
+}
+
+/*
+ * Make sure IRQ @at has both a descriptor and an irq_cfg attached.
+ *
+ * Returns the irq_cfg now attached to the IRQ, or NULL if the
+ * descriptor could not be allocated (for a reason other than -EEXIST)
+ * or the irq_cfg allocation failed.
+ */
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+       int res = irq_alloc_desc_at(at, node);
+       struct irq_cfg *cfg;
+
+       if (res < 0) {
+               if (res != -EEXIST)
+                       return NULL;
+               /* descriptor already there: reuse its cfg if it has one */
+               cfg = irq_get_chip_data(at);
+               if (cfg)
+                       return cfg;
+       }
+
+       cfg = alloc_irq_cfg(at, node);
+       if (cfg)
+               irq_set_chip_data(at, cfg);
+       else
+               /* NOTE(review): also reached when the descriptor pre-existed (-EEXIST) */
+               irq_free_desc(at);
+       return cfg;
+}
+
+static int alloc_irq_from(unsigned int from, int node)
+{
+       return irq_alloc_desc_from(from, node);
+}
+
+static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
+{
+       free_irq_cfg(at, cfg);
+       irq_free_desc(at);
+}
+
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+       return io_apic_ops.read(apic, reg);
+}
+
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       io_apic_ops.write(apic, reg, value);
+}
+
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       io_apic_ops.modify(apic, reg, value);
+}
+
+
+/*
+ * Layout of the IO-APIC register window as used by the accessors in
+ * this file: a write to ->index selects a register, ->data reads or
+ * writes the selected register, and ->eoi is written with a vector by
+ * io_apic_eoi().  The unused[] members are padding between them.
+ */
+struct io_apic {
+       unsigned int index;
+       unsigned int unused[3];
+       unsigned int data;
+       unsigned int unused2[11];
+       unsigned int eoi;
+};
+
+/*
+ * Return the virtual address of the register window of IO-APIC @idx:
+ * the fixmap slot FIX_IO_APIC_BASE_0 + idx, plus the bits of the
+ * IO-APIC's address selected by ~PAGE_MASK (presumably the offset
+ * within the page -- confirm against PAGE_MASK's definition).
+ */
+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
+{
+       return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+               + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
+}
+
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(vector, &io_apic->eoi);
+}
+
+static unsigned int __io_apic_read(unsigned int apic, unsigned int reg)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+       writel(reg, &io_apic->index);
+       return readl(&io_apic->data);
+}
+
+static void __io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+       writel(reg, &io_apic->index);
+       writel(value, &io_apic->data);
+}
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ *
+ * Note that sis_apic_bug starts out as -1 ("don't know"), which is
+ * non-zero, so the index register is also rewritten until the flag
+ * has explicitly been set to 0.
+ */
+static void __io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+       if (sis_apic_bug)
+               writel(reg, &io_apic->index);
+       writel(value, &io_apic->data);
+}
+
+/*
+ * Report whether any pin attached to @cfg still has the remote IRR bit
+ * set in the low word of its routing entry.  The pins are scanned under
+ * ioapic_lock; scanning stops at the first hit.
+ */
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+{
+       struct irq_pin_list *pin_entry;
+       unsigned long irq_flags;
+       bool pending = false;
+
+       raw_spin_lock_irqsave(&ioapic_lock, irq_flags);
+       for_each_irq_pin(pin_entry, cfg->irq_2_pin) {
+               int pin = pin_entry->pin;
+               unsigned int rte_low = io_apic_read(pin_entry->apic,
+                                                   0x10 + pin*2);
+
+               /* Is the remote IRR bit set? */
+               if (rte_low & IO_APIC_REDIR_REMOTE_IRR) {
+                       pending = true;
+                       break;
+               }
+       }
+       raw_spin_unlock_irqrestore(&ioapic_lock, irq_flags);
+
+       return pending;
+}
+#else /* !CONFIG_XEN */
+/*
+ * Xen variant: read IO-APIC register @reg of IO-APIC @apic through the
+ * PHYSDEVOP_apic_read hypercall; the IO-APIC is identified by the
+ * address from its MP config.
+ *
+ * NOTE(review): if the hypercall fails, its return code is returned in
+ * place of a register value, so callers cannot tell the two apart.
+ */
+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+       struct physdev_apic apic_op;
+       int ret;
+
+       apic_op.apic_physbase = mpc_ioapic_addr(apic);
+       apic_op.reg = reg;
+       ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
+       if (ret)
+               return ret;
+       return apic_op.value;
+}
+
+/*
+ * Xen variant: write @value to register @reg of IO-APIC @apic through
+ * the PHYSDEVOP_apic_write hypercall.  A failing hypercall only
+ * triggers a WARN_ON; nothing is reported to the caller.
+ */
+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       struct physdev_apic apic_op;
+
+       apic_op.apic_physbase = mpc_ioapic_addr(apic);
+       apic_op.reg = reg;
+       apic_op.value = value;
+       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
+}
+
+#define io_apic_modify io_apic_write
+#endif /* !CONFIG_XEN */
+
+union entry_union {
+       struct { u32 w1, w2; };
+       struct IO_APIC_route_entry entry;
+};
+
+#ifndef CONFIG_XEN
+/*
+ * Read the routing entry of @pin on IO-APIC @apic as two 32-bit words
+ * (registers 0x10 + 2*pin and 0x11 + 2*pin).  Takes no lock itself.
+ */
+static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
+{
+       union entry_union eu;
+
+       eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+       eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+
+       return eu.entry;
+}
+
+/* Locked wrapper: read the routing entry of @pin on @apic while holding ioapic_lock. */
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+       struct IO_APIC_route_entry rte;
+       unsigned long irq_flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, irq_flags);
+       rte = __ioapic_read_entry(apic, pin);
+       raw_spin_unlock_irqrestore(&ioapic_lock, irq_flags);
+
+       return rte;
+}
+#endif
+
+/*
+ * When we write a new IO APIC routing entry, we need to write the high
+ * word first! If the mask bit in the low word is clear, we will enable
+ * the interrupt, and we need to make sure the entry is fully populated
+ * before that happens.
+ *
+ * Takes no lock itself; ioapic_write_entry() is the locked wrapper.
+ */
+static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       union entry_union eu = {{0, 0}};
+
+       eu.entry = e;
+       /* high word (0x11 + 2*pin) first, then the low word (0x10 + 2*pin) */
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+}
+
+/* Locked wrapper: write routing entry @e to @pin on @apic while holding ioapic_lock. */
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       __ioapic_write_entry(apic, pin, e);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * When we mask an IO APIC routing entry, we need to write the low
+ * word first, in order to set the mask bit before we change the
+ * high bits!
+ *
+ * The entry written has only the mask bit set, so every other field of
+ * the routing entry is cleared.  Done under ioapic_lock.
+ */
+static void ioapic_mask_entry(int apic, int pin)
+{
+       unsigned long flags;
+       union entry_union eu = { .entry.mask = 1 };
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+       io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them.
+ *
+ * Append (@apic, @pin) to the pin list of @cfg unless that pair is
+ * already on it.  Returns 0 on success (including the duplicate case)
+ * and -ENOMEM if a new list element cannot be allocated on @node.
+ */
+static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+       struct irq_pin_list **link;
+       struct irq_pin_list *fresh;
+
+       /* walk to the tail, bailing out on a duplicate */
+       for (link = &cfg->irq_2_pin; *link; link = &(*link)->next)
+               if ((*link)->apic == apic && (*link)->pin == pin)
+                       return 0;
+
+       fresh = alloc_irq_pin_list(node);
+       if (!fresh) {
+               printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
+                               node, apic, pin);
+               return -ENOMEM;
+       }
+
+       fresh->apic = apic;
+       fresh->pin = pin;
+       *link = fresh;
+
+       return 0;
+}
+
+/* Like __add_pin_to_irq_node(), but a failure to add the pin is fatal (panic). */
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+{
+       if (__add_pin_to_irq_node(cfg, node, apic, pin))
+               panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
+}
+
+/*
+ * Reroute an IRQ to a different pin: the first list element of @cfg
+ * matching (@oldapic, @oldpin) is changed to (@newapic, @newpin).  If
+ * no element matches, the new pair is simply added to the list.
+ */
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+                                          int oldapic, int oldpin,
+                                          int newapic, int newpin)
+{
+       struct irq_pin_list *cur;
+
+       for_each_irq_pin(cur, cfg->irq_2_pin) {
+               if (cur->apic != oldapic || cur->pin != oldpin)
+                       continue;
+
+               /* every one is different, right? */
+               cur->apic = newapic;
+               cur->pin = newpin;
+               return;
+       }
+
+       /* old apic/pin didn't exist, so just add new ones */
+       add_pin_to_irq_node(cfg, node, newapic, newpin);
+}
+
+/*
+ * Read-modify-write the low word of the routing entry behind @entry:
+ * the value read is ANDed with @mask_and, ORed with @mask_or and
+ * written back with io_apic_modify().  If @final is non-NULL it is
+ * called with @entry afterwards.  Takes no lock itself.
+ */
+static void __io_apic_modify_irq(struct irq_pin_list *entry,
+                                int mask_and, int mask_or,
+                                void (*final)(struct irq_pin_list *entry))
+{
+       unsigned int reg, pin;
+
+       pin = entry->pin;
+       reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+       reg &= mask_and;
+       reg |= mask_or;
+       io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+       if (final)
+               final(entry);
+}
+
+static void io_apic_modify_irq(struct irq_cfg *cfg,
+                              int mask_and, int mask_or,
+                              void (*final)(struct irq_pin_list *entry))
+{
+       struct irq_pin_list *entry;
+
+       for_each_irq_pin(entry, cfg->irq_2_pin)
+               __io_apic_modify_irq(entry, mask_and, mask_or, final);
+}
+
+static void io_apic_sync(struct irq_pin_list *entry)
+{
+       /*
+        * Synchronize the IO-APIC and the CPU by doing
+        * a dummy read from the IO-APIC
+        */
+       struct io_apic __iomem *io_apic;
+
+       io_apic = io_apic_base(entry->apic);
+       readl(&io_apic->data);
+}
+
+static void mask_ioapic(struct irq_cfg *cfg)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void mask_ioapic_irq(struct irq_data *data)
+{
+       mask_ioapic(data->chip_data);
+}
+
+static void __unmask_ioapic(struct irq_cfg *cfg)
+{
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
+
+static void unmask_ioapic(struct irq_cfg *cfg)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       __unmask_ioapic(cfg);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_ioapic_irq(struct irq_data *data)
+{
+       unmask_ioapic(data->chip_data);
+}
+
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ *     0Xh     82489DX
+ *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
+ *     30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ *
+ * For IO-APIC's with EOI register, we use that to do an explicit EOI.
+ * Otherwise, we simulate the EOI message manually by changing the trigger
+ * mode to edge and then back to level, with RTE being masked during this.
+ *
+ * @cfg may be NULL, in which case @vector is always used for the
+ * explicit EOI.  The unlocked entry accessors are used here, and both
+ * callers in this file invoke this function with ioapic_lock held.
+ */
+static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
+{
+       if (mpc_ioapic_ver(apic) >= 0x20) {
+               /*
+                * Intr-remapping uses pin number as the virtual vector
+                * in the RTE. Actual vector is programmed in
+                * intr-remapping table entry. Hence for the io-apic
+                * EOI we use the pin number.
+                */
+               if (cfg && irq_remapped(cfg))
+                       io_apic_eoi(apic, pin);
+               else
+                       io_apic_eoi(apic, vector);
+       } else {
+               struct IO_APIC_route_entry entry, entry1;
+
+               entry = entry1 = __ioapic_read_entry(apic, pin);
+
+               /*
+                * Mask the entry and change the trigger mode to edge.
+                */
+               entry1.mask = 1;
+               entry1.trigger = IOAPIC_EDGE;
+
+               __ioapic_write_entry(apic, pin, entry1);
+
+               /*
+                * Restore the previous level triggered entry.
+                */
+               __ioapic_write_entry(apic, pin, entry);
+       }
+}
+
+/*
+ * Issue an EOI, using cfg->vector, on every pin attached to @cfg while
+ * holding ioapic_lock.  @irq is not used in the body.
+ */
+static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+       struct irq_pin_list *entry;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       for_each_irq_pin(entry, cfg->irq_2_pin)
+               __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * Reset the routing entry of @pin on IO-APIC @apic to "masked, all
+ * other bits clear".  Pins in SMI delivery mode are left untouched.
+ * If the remote-IRR bit is set, an EOI is issued first; an error is
+ * logged when the bit is still set at the end.
+ */
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+       struct IO_APIC_route_entry entry;
+
+       /* Check delivery_mode to be sure we're not clearing an SMI pin */
+       entry = ioapic_read_entry(apic, pin);
+       if (entry.delivery_mode == dest_SMI)
+               return;
+
+       /*
+        * Make sure the entry is masked and re-read the contents to check
+        * if it is a level triggered pin and if the remote-IRR is set.
+        */
+       if (!entry.mask) {
+               entry.mask = 1;
+               ioapic_write_entry(apic, pin, entry);
+               entry = ioapic_read_entry(apic, pin);
+       }
+
+       if (entry.irr) {
+               unsigned long flags;
+
+               /*
+                * Make sure the trigger mode is set to level. Explicit EOI
+                * doesn't clear the remote-IRR if the trigger mode is not
+                * set to level.
+                */
+               if (!entry.trigger) {
+                       entry.trigger = IOAPIC_LEVEL;
+                       ioapic_write_entry(apic, pin, entry);
+               }
+
+               /* no irq_cfg at hand here, hence the NULL cfg argument */
+               raw_spin_lock_irqsave(&ioapic_lock, flags);
+               __eoi_ioapic_pin(apic, pin, entry.vector, NULL);
+               raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+       }
+
+       /*
+        * Clear the rest of the bits in the IO-APIC RTE except for the mask
+        * bit.
+        */
+       ioapic_mask_entry(apic, pin);
+       entry = ioapic_read_entry(apic, pin);
+       if (entry.irr)
+               printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n",
+                      mpc_ioapic_id(apic), pin);
+}
+
+/* Run clear_IO_APIC_pin() on every pin of every known IO-APIC. */
+static void clear_IO_APIC (void)
+{
+       int ioapic_idx;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               int pin_idx;
+
+               for (pin_idx = 0; pin_idx < ioapics[ioapic_idx].nr_registers;
+                    pin_idx++)
+                       clear_IO_APIC_pin(ioapic_idx, pin_idx);
+       }
+}
+#else
+#define add_pin_to_irq_node(cfg, node, apic, pin)
+#define __add_pin_to_irq_node(cfg, node, apic, pin) 0
+#endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries[MAX_PIRQS] = {
+       [0 ... MAX_PIRQS - 1] = -1
+};
+
+/*
+ * Handler for the "pirq=" boot option: parse a list of integers from
+ * @str and store up to MAX_PIRQS of them in pirq_entries[], filling the
+ * array from its end towards its start.  Always returns 1.
+ */
+static int __init ioapic_pirq_setup(char *str)
+{
+       int ints[MAX_PIRQS+1];
+       int nr, n;
+
+       get_options(str, ARRAY_SIZE(ints), ints);
+
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "PIRQ redirection, working around broken MP-BIOS.\n");
+
+       /* ints[0] is used as the number of values, capped at MAX_PIRQS */
+       nr = ints[0] < MAX_PIRQS ? ints[0] : MAX_PIRQS;
+
+       for (n = 0; n < nr; n++) {
+               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                               "... PIRQ%d -> IRQ %d\n", n, ints[n+1]);
+               /*
+                * PIRQs are mapped upside down, usually.
+                */
+               pirq_entries[MAX_PIRQS-n-1] = ints[n+1];
+       }
+
+       return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifndef CONFIG_XEN
+/*
+ * Save the routing entries of all IO-APICs into their saved_registers
+ * buffers.  IO-APICs without such a buffer are skipped; in that case
+ * -ENOMEM is returned after the remaining ones have been saved,
+ * otherwise 0.
+ */
+int save_ioapic_entries(void)
+{
+       int ioapic_idx, pin_idx;
+       int rc = 0;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               struct ioapic *ia = &ioapics[ioapic_idx];
+
+               if (!ia->saved_registers) {
+                       rc = -ENOMEM;
+                       continue;
+               }
+
+               for (pin_idx = 0; pin_idx < ia->nr_registers; pin_idx++)
+                       ia->saved_registers[pin_idx] =
+                               ioapic_read_entry(ioapic_idx, pin_idx);
+       }
+
+       return rc;
+}
+
+/*
+ * Mask all IO APIC entries: every saved entry that is not already
+ * masked is written back to the hardware with its mask bit set.  The
+ * saved copy itself is left unchanged.  IO-APICs without a save buffer
+ * are skipped.
+ */
+void mask_ioapic_entries(void)
+{
+       int ioapic_idx, pin_idx;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               struct ioapic *ia = &ioapics[ioapic_idx];
+
+               if (!ia->saved_registers)
+                       continue;
+
+               for (pin_idx = 0; pin_idx < ia->nr_registers; pin_idx++) {
+                       struct IO_APIC_route_entry rte;
+
+                       rte = ia->saved_registers[pin_idx];
+                       if (rte.mask)
+                               continue;
+
+                       rte.mask = 1;
+                       ioapic_write_entry(ioapic_idx, pin_idx, rte);
+               }
+       }
+}
+
+/*
+ * Restore the IO APIC entries which were saved in the ioapic structure
+ * by writing every saved routing entry back.  IO-APICs without a save
+ * buffer are skipped.  Always returns 0.
+ */
+int restore_ioapic_entries(void)
+{
+       int ioapic_idx, pin_idx;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               struct ioapic *ia = &ioapics[ioapic_idx];
+
+               if (!ia->saved_registers)
+                       continue;
+
+               for (pin_idx = 0; pin_idx < ia->nr_registers; pin_idx++)
+                       ioapic_write_entry(ioapic_idx, pin_idx,
+                                          ia->saved_registers[pin_idx]);
+       }
+
+       return 0;
+}
+#endif /* !CONFIG_XEN */
+
+/*
+ * Find the IRQ entry number of a certain pin: returns the index of the
+ * first mp_irqs[] entry of the given @type whose destination pin is
+ * @pin and whose destination APIC is either IO-APIC @ioapic_idx or
+ * MP_APIC_ALL.  Returns -1 if there is none.
+ */
+static int find_irq_entry(int ioapic_idx, int pin, int type)
+{
+       int idx;
+
+       for (idx = 0; idx < mp_irq_entries; idx++) {
+               const struct mpc_intsrc *src = &mp_irqs[idx];
+
+               if (src->irqtype != type)
+                       continue;
+               if (src->dstapic != mpc_ioapic_id(ioapic_idx) &&
+                   src->dstapic != MP_APIC_ALL)
+                       continue;
+               if (src->dstirq == pin)
+                       return idx;
+       }
+
+       return -1;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ *
+ * Only entries on a non-PCI bus with interrupt type @type are
+ * considered.  Returns the destination pin of the first match, or -1.
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+       int n;
+
+       for (n = 0; n < mp_irq_entries; n++) {
+               if (!test_bit(mp_irqs[n].srcbus, mp_bus_not_pci))
+                       continue;
+               if (mp_irqs[n].irqtype != type)
+                       continue;
+               if (mp_irqs[n].srcbusirq == irq)
+                       return mp_irqs[n].dstirq;
+       }
+
+       return -1;
+}
+
+/*
+ * Find the IO-APIC to which IRQ[irq] (ISA) is connected.
+ *
+ * Locates the first non-PCI MP-table entry of type @type for @irq and
+ * returns the index of the IO-APIC whose id equals that entry's
+ * destination, or -1 if either lookup fails.
+ */
+static int __init find_isa_irq_apic(int irq, int type)
+{
+       int n, ioapic_idx;
+
+       for (n = 0; n < mp_irq_entries; n++) {
+               if (test_bit(mp_irqs[n].srcbus, mp_bus_not_pci) &&
+                   mp_irqs[n].irqtype == type &&
+                   mp_irqs[n].srcbusirq == irq)
+                       break;
+       }
+
+       /* no MP-table entry describes this ISA irq */
+       if (n >= mp_irq_entries)
+               return -1;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               if (mpc_ioapic_id(ioapic_idx) == mp_irqs[n].dstapic)
+                       return ioapic_idx;
+       }
+
+       return -1;
+}
+#endif
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ *
+ * Returns the ELCR bit for @irq, read from I/O port 0x4d0 + irq / 8.
+ * An @irq outside the legacy PIC's range is reported (verbose only)
+ * and yields 0.
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+       unsigned int port;
+
+       if (irq >= legacy_pic->nr_legacy_irqs) {
+               apic_printk(APIC_VERBOSE, KERN_INFO
+                               "Broken MPtable reports ISA irq %d\n", irq);
+               return 0;
+       }
+
+       port = 0x4d0 + (irq >> 3);
+       return (inb(port) >> (irq & 7)) & 1;
+}
+
+#endif
+
+/* ISA interrupts are always polarity zero (active high) and edge
+ * triggered (trigger zero) when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx)       (0)
+#define default_ISA_polarity(idx)      (0)
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * triggered depending on the ELCR value.  If an interrupt is listed as
+ * EISA conforming in the MP table, its trigger type must be read from
+ * the ELCR. */
+
+#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].srcbusirq))
+#define default_EISA_polarity(idx)     default_ISA_polarity(idx)
+
+/* PCI interrupts are always polarity one (active low) and level
+ * triggered (trigger one) when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx)       (1)
+#define default_PCI_polarity(idx)      (1)
+
+/* MCA interrupts are always polarity zero and level triggered
+ * (trigger one) when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx)       (1)
+#define default_MCA_polarity(idx)      default_ISA_polarity(idx)
+
+/*
+ * Determine IRQ line polarity (high active or low active) of MP-table
+ * interrupt entry @idx from the low two bits of its irqflag.
+ *
+ * Returns 0 for high active, 1 for low active.
+ */
+static int irq_polarity(int idx)
+{
+       int bus = mp_irqs[idx].srcbus;
+
+       switch (mp_irqs[idx].irqflag & 3) {
+       case 0: /* conforms, ie. bus-type dependent polarity */
+               if (test_bit(bus, mp_bus_not_pci))
+                       return default_ISA_polarity(idx);
+               return default_PCI_polarity(idx);
+       case 1: /* high active */
+               return 0;
+       case 3: /* low active */
+               return 1;
+       case 2: /* reserved */
+       default: /* invalid */
+               printk(KERN_WARNING "broken BIOS!!\n");
+               return 1;
+       }
+}
+
+static int irq_trigger(int idx)
+{
+       int bus = mp_irqs[idx].srcbus;
+       int trigger;
+
+       /*
+        * Determine IRQ trigger mode (edge or level sensitive):
+        */
+       switch ((mp_irqs[idx].irqflag>>2) & 3)
+       {
+               case 0: /* conforms, ie. bus-type dependent */
+                       if (test_bit(bus, mp_bus_not_pci))
+                               trigger = default_ISA_trigger(idx);
+                       else
+                               trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+                       switch (mp_bus_id_to_type[bus]) {
+                               case MP_BUS_ISA: /* ISA pin */
+                               {
+                                       /* set before the switch */
+                                       break;
+                               }
+                               case MP_BUS_EISA: /* EISA pin */
+                               {
+                                       trigger = default_EISA_trigger(idx);
+                                       break;
+                               }
+                               case MP_BUS_PCI: /* PCI pin */
+                               {
+                                       /* set before the switch */
+                                       break;
+                               }
+                               case MP_BUS_MCA: /* MCA pin */
+                               {
+                                       trigger = default_MCA_trigger(idx);
+                                       break;
+                               }
+                               default:
+                               {
+                                       printk(KERN_WARNING "broken BIOS!!\n");
+                                       trigger = 1;
+                                       break;
+                               }
+                       }
+#endif
+                       break;
+               case 1: /* edge */
+               {
+                       trigger = 0;
+                       break;
+               }
+               case 2: /* reserved */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       trigger = 1;
+                       break;
+               }
+               case 3: /* level */
+               {
+                       trigger = 1;
+                       break;
+               }
+               default: /* invalid */
+               {
+                       printk(KERN_WARNING "broken BIOS!!\n");
+                       trigger = 0;
+                       break;
+               }
+       }
+       return trigger;
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+       int irq;
+       int bus = mp_irqs[idx].srcbus;
+       struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+
+       /*
+        * Debugging check, we are in big trouble if this message pops up!
+        */
+       if (mp_irqs[idx].dstirq != pin)
+               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+       if (test_bit(bus, mp_bus_not_pci)) {
+               irq = mp_irqs[idx].srcbusirq;
+       } else {
+               u32 gsi = gsi_cfg->gsi_base + pin;
+
+               if (gsi >= NR_IRQS_LEGACY)
+                       irq = gsi;
+               else
+                       irq = gsi_top + gsi;
+       }
+
+#ifdef CONFIG_X86_32
+       /*
+        * PCI IRQ command line redirection. Yes, limits are hardcoded.
+        */
+       if ((pin >= 16) && (pin <= 23)) {
+               if (pirq_entries[pin-16] != -1) {
+                       if (!pirq_entries[pin-16]) {
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               "disabling PIRQ%d\n", pin-16);
+                       } else {
+                               irq = pirq_entries[pin-16];
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               "using PIRQ%d -> IRQ %d\n",
+                                               pin-16, irq);
+                       }
+               }
+       }
+#endif
+
+       return irq;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ *
+ * Looks up the MP-table interrupt entry for (@bus, @slot, @pin) and
+ * returns its IRQ number, filling @irq_attr with the IO-APIC index,
+ * pin, trigger and polarity of the entry used.  If only bus and slot
+ * match, the first such entry is returned as a best guess.  Returns -1
+ * when @bus is not a PCI bus or nothing matches.
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       int ioapic_idx, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG,
+                   "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+                   bus, slot, pin);
+       if (test_bit(bus, mp_bus_not_pci)) {
+               apic_printk(APIC_VERBOSE,
+                           "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].srcbus;
+
+               for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+                       if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+                           mp_irqs[i].dstapic == MP_APIC_ALL)
+                               break;
+
+               /*
+                * The entry names an IO-APIC that is not registered: skip
+                * it instead of handing an out-of-range IO-APIC index to
+                * pin_2_irq() and to the caller through @irq_attr.
+                */
+               if (ioapic_idx >= nr_ioapics)
+                       continue;
+
+               if (!test_bit(lbus, mp_bus_not_pci) &&
+                   !mp_irqs[i].irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+
+                       /* on the first IO-APIC only IO-APIC routed irqs count */
+                       if (!(ioapic_idx || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       /* low two bits of srcbusirq hold the PCI pin */
+                       if (pin == (mp_irqs[i].srcbusirq & 3)) {
+                               set_io_apic_irq_attr(irq_attr, ioapic_idx,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               return irq;
+                       }
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0) {
+                               set_io_apic_irq_attr(irq_attr, ioapic_idx,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               best_guess = irq;
+                       }
+               }
+       }
+       return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#ifndef CONFIG_XEN
+void lock_vector_lock(void)
+{
+       /* Used to ensure the online set of cpus does not change
+        * during assign_irq_vector.  Takes vector_lock; release it
+        * with unlock_vector_lock().
+        */
+       raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+       raw_spin_unlock(&vector_lock);  /* counterpart of lock_vector_lock() */
+}
+
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+       /*
+        * Pick a vector for @irq that is free on every online cpu of an
+        * allocation domain reachable through @mask, and record vector
+        * and domain in @cfg.
+        *
+        * Returns 0 on success (also when the vector already held serves
+        * an online cpu in @mask), -EBUSY while a move is still flagged
+        * in @cfg, -ENOMEM if the scratch cpumask cannot be allocated and
+        * -ENOSPC when no candidate cpu yields a free vector.
+        * assign_irq_vector() calls this with vector_lock held.
+        *
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+       static int current_offset = VECTOR_OFFSET_START % 8;
+       unsigned int old_vector;
+       int cpu, err;
+       cpumask_var_t tmp_mask;
+
+       if (cfg->move_in_progress)
+               return -EBUSY;
+
+       if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+               return -ENOMEM;
+
+       old_vector = cfg->vector;
+       if (old_vector) {
+               cpumask_and(tmp_mask, mask, cpu_online_mask);
+               cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+               if (!cpumask_empty(tmp_mask)) {        /* current domain already covers a requested online cpu */
+                       free_cpumask_var(tmp_mask);
+                       return 0;
+               }
+       }
+
+       /* Only try and allocate irqs on cpus of @mask that are online */
+       err = -ENOSPC;
+       for_each_cpu_and(cpu, mask, cpu_online_mask) {
+               int new_cpu;
+               int vector, offset;
+
+               apic->vector_allocation_domain(cpu, tmp_mask);  /* tmp_mask now holds the candidate domain for this cpu */
+
+               vector = current_vector;        /* resume after the last vector handed out */
+               offset = current_offset;
+next:
+               vector += 8;
+               if (vector >= first_system_vector) {
+                       /* If out of vectors on large boxen, must share them. */
+                       offset = (offset + 1) % 8;
+                       vector = FIRST_EXTERNAL_VECTOR + offset;
+               }
+               if (unlikely(current_vector == vector))        /* back at the starting vector: nothing free, try the next cpu */
+                       continue;
+
+               if (test_bit(vector, used_vectors))
+                       goto next;
+
+#ifdef CONFIG_KDB
+               if (vector == KDBENTER_VECTOR)
+                       goto next;
+#endif /* CONFIG_KDB */
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+                       if (per_cpu(vector_irq, new_cpu)[vector] != -1)        /* taken on some cpu of the domain */
+                               goto next;
+               /* Found one! */
+               current_vector = vector;
+               current_offset = offset;
+               if (old_vector) {
+                       cfg->move_in_progress = 1;      /* old domain is kept in old_domain until the move is cleaned up */
+                       cpumask_copy(cfg->old_domain, cfg->domain);
+               }
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+                       per_cpu(vector_irq, new_cpu)[vector] = irq;
+               cfg->vector = vector;
+               cpumask_copy(cfg->domain, tmp_mask);
+               err = 0;
+               break;
+       }
+       free_cpumask_var(tmp_mask);
+       return err;
+}
+
+/*
+ * Locked wrapper around __assign_irq_vector(): runs it between
+ * raw_spin_lock_irqsave() and raw_spin_unlock_irqrestore() on
+ * vector_lock and passes its return value through.
+ */
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+       unsigned long irqflags;
+       int ret;
+
+       raw_spin_lock_irqsave(&vector_lock, irqflags);
+       ret = __assign_irq_vector(irq, cfg, mask);
+       raw_spin_unlock_irqrestore(&vector_lock, irqflags);
+
+       return ret;
+}
+
+/*
+ * Release the vector held by @irq: clear its vector_irq slot on every
+ * online cpu of cfg->domain, then reset cfg->vector and cfg->domain.
+ * If a move was in progress, the first slot still pointing at @irq is
+ * also cleared on each online cpu of cfg->old_domain.
+ *
+ * BUGs if @cfg holds no vector.
+ */
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
+{
+       int cpu, vec;
+
+       BUG_ON(!cfg->vector);
+
+       vec = cfg->vector;
+       for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+               per_cpu(vector_irq, cpu)[vec] = -1;
+
+       cfg->vector = 0;
+       cpumask_clear(cfg->domain);
+
+       if (unlikely(cfg->move_in_progress)) {
+               for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+                       for (vec = FIRST_EXTERNAL_VECTOR; vec < NR_VECTORS;
+                            vec++) {
+                               if (per_cpu(vector_irq, cpu)[vec] == irq) {
+                                       per_cpu(vector_irq, cpu)[vec] = -1;
+                                       break;
+                               }
+                       }
+               }
+               cfg->move_in_progress = 0;
+       }
+}
+
+/*
+ * Initialize the vector_irq table of cpu @cpu from the irq_cfg of every
+ * active irq, then drop entries whose irq does not have @cpu in its
+ * domain.
+ */
+void __setup_vector_irq(int cpu)
+{
+       struct irq_cfg *cfg;
+       int irq, vector;
+
+       /*
+        * Holding vector_lock keeps vector assignments running in
+        * parallel on another cpu from interfering while the initial
+        * vector to irq mappings are set up.
+        */
+       raw_spin_lock(&vector_lock);
+
+       /* Mark the inuse vectors */
+       for_each_active_irq(irq) {
+               cfg = irq_get_chip_data(irq);
+               if (!cfg)
+                       continue;
+
+               /*
+                * A legacy IRQ still handled by the legacy PIC gets this
+                * cpu added to its irq_cfg's domain.
+                */
+               if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
+                       cpumask_set_cpu(cpu, cfg->domain);
+
+               if (cpumask_test_cpu(cpu, cfg->domain)) {
+                       vector = cfg->vector;
+                       per_cpu(vector_irq, cpu)[vector] = irq;
+               }
+       }
+
+       /* Mark the free vectors */
+       for (vector = 0; vector < NR_VECTORS; ++vector) {
+               irq = per_cpu(vector_irq, cpu)[vector];
+               if (irq >= 0) {
+                       cfg = irq_cfg(irq);
+                       if (!cpumask_test_cpu(cpu, cfg->domain))
+                               per_cpu(vector_irq, cpu)[vector] = -1;
+               }
+       }
+
+       raw_spin_unlock(&vector_lock);
+}
+
+static struct irq_chip ioapic_chip;
+
+#ifdef CONFIG_X86_32
+/*
+ * Look up the trigger mode of @irq by scanning every pin of every
+ * IO-APIC for an mp_INT entry that maps to @irq.
+ * Returns the entry's irq_trigger() value, or 0 (edge) if none maps.
+ */
+static inline int IO_APIC_irq_trigger(int irq)
+{
+       int ioapic_idx, pin;
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+                       int idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+
+                       if (idx == -1)
+                               continue;
+                       if (irq == pin_2_irq(idx, ioapic_idx, pin))
+                               return irq_trigger(idx);
+               }
+       }
+
+       /* nonexistent IRQs are edge default */
+       return 0;
+}
+#else
+/* Without CONFIG_X86_32 every irq is reported as level triggered. */
+static inline int IO_APIC_irq_trigger(int irq)
+{
+       return 1;
+}
+#endif
+
+/*
+ * Install ioapic_chip and the matching flow handler for @irq.
+ *
+ * The irq is treated as level triggered when @trigger is IOAPIC_LEVEL,
+ * or IOAPIC_AUTO with IO_APIC_irq_trigger() reporting level; IRQ_LEVEL
+ * is set or cleared accordingly.  For a remapped irq the handler choice
+ * follows "@trigger != 0" instead.
+ */
+static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
+                                unsigned long trigger)
+{
+       bool level, fasteoi;
+
+       level = (trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+               trigger == IOAPIC_LEVEL;
+
+       if (level)
+               irq_set_status_flags(irq, IRQ_LEVEL);
+       else
+               irq_clear_status_flags(irq, IRQ_LEVEL);
+       fasteoi = level;
+
+       if (irq_remapped(cfg)) {
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+               irq_remap_modify_chip_defaults(&ioapic_chip);
+               fasteoi = trigger != 0;
+       }
+
+       if (fasteoi)
+               irq_set_chip_and_handler_name(irq, &ioapic_chip,
+                                             handle_fasteoi_irq, "fasteoi");
+       else
+               irq_set_chip_and_handler_name(irq, &ioapic_chip,
+                                             handle_edge_irq, "edge");
+}
+
+
+/*
+ * Build an interrupt-remapping format routing entry for @irq.
+ *
+ * Allocates an IRTE on the iommu mapped to the IO-APIC named in @attr,
+ * fills it from @vector and @destination, and makes *@entry refer to
+ * it by index.  Level triggered entries are created masked.
+ *
+ * Returns 0 on success, -ENODEV if no iommu maps the IO-APIC and
+ * -ENOMEM if no IRTE can be allocated.
+ */
+static int setup_ir_ioapic_entry(int irq,
+                             struct IR_IO_APIC_route_entry *entry,
+                             unsigned int destination, int vector,
+                             struct io_apic_irq_attr *attr)
+{
+       int ioapic_id = mpc_ioapic_id(attr->ioapic);
+       struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id);
+       struct irte irte;
+       int irte_idx;
+
+       if (!iommu) {
+               pr_warn("No mapping iommu for ioapic %d\n", ioapic_id);
+               return -ENODEV;
+       }
+
+       irte_idx = alloc_irte(iommu, irq, 1);
+       if (irte_idx < 0) {
+               pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id);
+               return -ENOMEM;
+       }
+
+       prepare_irte(&irte, vector, destination);
+       /* Set source-id of interrupt request */
+       set_ioapic_sid(&irte, ioapic_id);
+       modify_irte(irq, &irte);
+
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+               "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+               "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+               "Avail:%X Vector:%02X Dest:%08X "
+               "SID:%04X SQ:%X SVT:%X)\n",
+               attr->ioapic, irte.present, irte.fpd, irte.dst_mode,
+               irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+               irte.avail, irte.vector, irte.dest_id,
+               irte.sid, irte.sq, irte.svt);
+
+       memset(entry, 0, sizeof(*entry));
+
+       /* remappable format: the entry carries the IRTE index */
+       entry->format   = 1;
+       entry->index    = irte_idx & 0x7fff;
+       entry->index2   = (irte_idx >> 15) & 0x1;
+       entry->zero     = 0;
+       /*
+        * IO-APIC RTE will be configured with virtual vector.
+        * irq handler will do the explicit EOI to the io-apic.
+        */
+       entry->vector   = attr->ioapic_pin;
+       entry->trigger  = attr->trigger;
+       entry->polarity = attr->polarity;
+
+       /*
+        * Level triggered irqs start out masked, edge triggered ones
+        * enabled (IRQ_DELAYED_DISABLE is used for those).
+        */
+       entry->mask     = attr->trigger ? 1 : 0;
+
+       return 0;
+}
+#else /* !CONFIG_XEN */
+#define __clear_irq_vector(irq, cfg) ((void)0)
+#define ioapic_register_intr(irq, cfg, trigger) evtchn_register_pirq(irq)
+#endif
+
+static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
+                              unsigned int destination, int vector,
+                              struct io_apic_irq_attr *attr)
+{
+#ifndef CONFIG_XEN
+       if (intr_remapping_enabled)
+               return setup_ir_ioapic_entry(irq,
+                        (struct IR_IO_APIC_route_entry *)entry,
+                        destination, vector, attr);
+#endif
+
+       memset(entry, 0, sizeof(*entry));
+
+       entry->delivery_mode = apic->irq_delivery_mode;
+       entry->dest_mode     = apic->irq_dest_mode;
+       entry->dest          = destination;
+       entry->vector        = vector;
+       entry->mask          = 0;                       /* enable IRQ */
+       entry->trigger       = attr->trigger;
+       entry->polarity      = attr->polarity;
+
+       /*
+        * Mask level triggered irqs.
+        * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+        */
+       if (attr->trigger)
+               entry->mask = 1;
+
+       return 0;
+}
+
+static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
+                               struct io_apic_irq_attr *attr)
+{      /* Route @irq through the IO-APIC pin described by @attr. */
+       struct IO_APIC_route_entry entry;
+       unsigned int dest;
+
+       if (!IO_APIC_IRQ(irq))
+               return; /* nothing to program for this irq */
+#ifndef CONFIG_XEN
+       /*
+        * For legacy irqs, cfg->domain starts with cpu 0 for legacy
+        * controllers like 8259. Now that IO-APIC can handle this irq, update
+        * the cfg->domain.
+        */
+       if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
+               apic->vector_allocation_domain(0, cfg->domain);
+#else
+       /*
+        * For legacy IRQs we may get here before trigger mode and polarity
+        * get obtained, but Xen refuses to set those through
+        * PHYSDEVOP_setup_gsi more than once (perhaps even at all).
+        * Hence legacy IRQs only take the hypercall path once their pin
+        * is flagged in pin_programmed.
+        */
+       if (irq >= legacy_pic->nr_legacy_irqs
+           || test_bit(attr->ioapic_pin,
+                       ioapics[attr->ioapic].pin_programmed)) {
+               struct physdev_setup_gsi setup_gsi = {
+                       .gsi = irq,
+                       .triggering = attr->trigger,
+                       .polarity = attr->polarity
+               };
+               struct physdev_map_pirq map_pirq = {
+                       .domid = DOMID_SELF,
+                       .type = MAP_PIRQ_TYPE_GSI,
+                       .index = irq,
+                       .pirq = irq
+               };
+
+               switch (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
+                                             &setup_gsi)) {
+               case -EEXIST:
+                       if (irq < legacy_pic->nr_legacy_irqs)
+                               break;  /* legacy irq: continue with the generic path below */
+                       /* fall through */
+               case 0:
+                       evtchn_register_pirq(irq);
+                       if (HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq,
+                                                 &map_pirq) == 0) {
+                               /* fake (for init_IO_APIC_traps()): */
+                               cfg->vector = irq;
+                               return;
+                       }
+               }       /* any other result also continues with the generic path */
+       }
+#endif
+
+       if (assign_irq_vector(irq, cfg, apic->target_cpus()))
+               return; /* no vector assigned: the pin is left unprogrammed */
+
+#ifndef CONFIG_XEN
+       dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+#else
+       dest = 0; /* meaningless */
+#endif
+
+       apic_printk(APIC_VERBOSE,KERN_DEBUG
+                   "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
+                   "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+                   attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
+                   cfg->vector, irq, attr->trigger, attr->polarity, dest);
+
+       if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
+               pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
+                       mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
+               __clear_irq_vector(irq, cfg);   /* undo the vector assignment (expands to nothing under CONFIG_XEN) */
+
+               return;
+       }
+
+       ioapic_register_intr(irq, cfg, attr->trigger);
+#ifndef CONFIG_XEN
+       if (irq < legacy_pic->nr_legacy_irqs)
+               legacy_pic->mask(irq);  /* legacy PIC line is masked before the IO-APIC entry is written */
+#endif
+
+       ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
+}
+
+/*
+ * Report whether a pin has no MP-table entry, i.e. whether @idx (the
+ * result of find_irq_entry() for @pin of @ioapic_idx) is -1.  An
+ * unconnected pin is logged at verbose level.
+ */
+static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
+{
+       bool unconnected = (idx == -1);
+
+       if (unconnected) {
+               apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
+                           mpc_ioapic_id(ioapic_idx), pin);
+       }
+
+       return unconnected;
+}
+
+static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
+{
+       int idx, node = cpu_to_node(0);
+       struct io_apic_irq_attr attr;
+       unsigned int pin, irq;
+
+       for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+               idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+               if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
+                       continue;
+
+               irq = pin_2_irq(idx, ioapic_idx, pin);
+
+               if ((ioapic_idx > 0) && (irq > 16))
+                       continue;
+
+#ifdef CONFIG_XEN
+               if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+                       continue;
+#else
+               /*
+                * Skip the timer IRQ if there's a quirk handler
+                * installed and if it returns 1:
+                */
+               if (apic->multi_timer_check &&
+                   apic->multi_timer_check(ioapic_idx, irq))
+                       continue;
+#endif
+
+               set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+                                    irq_polarity(idx));
+
+               io_apic_setup_irq_pin(irq, node, &attr);
+       }
+}
+
+/* Run __io_apic_setup_irqs() over every registered IO-APIC, in order. */
+static void __init setup_IO_APIC_irqs(void)
+{
+       unsigned int ioapic_idx = 0;
+
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+       while (ioapic_idx < nr_ioapics) {
+               __io_apic_setup_irqs(ioapic_idx);
+               ioapic_idx++;
+       }
+}
+
+/*
+ * for the gsit that is not in first ioapic
+ * but could not use acpi_register_gsi()
+ * like some special sci in IBM x3330
+ */
+void setup_IO_APIC_irq_extra(u32 gsi)
+{
+       int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
+       struct io_apic_irq_attr attr;
+
+       /*
+        * Convert 'gsi' to 'ioapic.pin'.
+        */
+       ioapic_idx = mp_find_ioapic(gsi);
+       if (ioapic_idx < 0)
+               return;
+
+       pin = mp_find_ioapic_pin(ioapic_idx, gsi);
+       idx = find_irq_entry(ioapic_idx, pin, mp_INT);
+       if (idx == -1)
+               return;
+
+       irq = pin_2_irq(idx, ioapic_idx, pin);
+#ifdef CONFIG_XEN
+       if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+               return;
+#endif
+
+       /* Only handle the non legacy irqs on secondary ioapics */
+       if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
+               return;
+
+       set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
+                            irq_polarity(idx));
+
+       io_apic_setup_irq_pin_once(irq, node, &attr);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Set up the timer pin, possibly with the 8259A-master behind.
+ *
+ * Writes an unmasked, edge triggered, polarity zero routing entry
+ * delivering @vector to pin @pin of IO-APIC @ioapic_idx, and installs
+ * the edge flow handler for irq 0.  Does nothing when interrupt
+ * remapping is enabled.
+ */
+static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
+                                        unsigned int pin, int vector)
+{
+       struct IO_APIC_route_entry rte;
+
+       if (intr_remapping_enabled)
+               return;
+
+       memset(&rte, 0, sizeof(rte));
+
+       /*
+        * We use logical delivery to get the timer IRQ
+        * to the first CPU.
+        */
+       rte.dest_mode = apic->irq_dest_mode;
+       rte.mask = 0;                   /* don't mask IRQ for edge */
+       rte.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
+       rte.delivery_mode = apic->irq_delivery_mode;
+       rte.polarity = 0;
+       rte.trigger = 0;
+       rte.vector = vector;
+
+       /*
+        * The timer IRQ doesn't have to know that behind the
+        * scene we may have a 8259A-master in AEOI mode ...
+        */
+       irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+                                     "edge");
+
+       /* Add it to the IO-APIC irq-routing table: */
+       ioapic_write_entry(ioapic_idx, pin, rte);
+}
+
+__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+{
+       int i;
+       union IO_APIC_reg_00 reg_00;
+       union IO_APIC_reg_01 reg_01;
+       union IO_APIC_reg_02 reg_02;
+       union IO_APIC_reg_03 reg_03;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(ioapic_idx, 0);
+       reg_01.raw = io_apic_read(ioapic_idx, 1);
+       if (reg_01.bits.version >= 0x10)
+               reg_02.raw = io_apic_read(ioapic_idx, 2);
+       if (reg_01.bits.version >= 0x20)
+               reg_03.raw = io_apic_read(ioapic_idx, 3);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       printk("\n");
+       printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
+       printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
+       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
+
+       printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+       printk(KERN_DEBUG ".......     : max redirection entries: %02X\n",
+               reg_01.bits.entries);
+
+       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
+       printk(KERN_DEBUG ".......     : IO APIC version: %02X\n",
+               reg_01.bits.version);
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+        * but the value of reg_02 is read as the previous read register
+        * value, so ignore it if reg_02 == reg_01.
+        */
+       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+               printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
+       }
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+        * or reg_03, but the value of reg_0[23] is read as the previous read
+        * register value, so ignore it if reg_03 == reg_0[12].
+        */
+       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+           reg_03.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+               printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
+       }
+
+       printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+       if (intr_remapping_enabled) {
+               printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+                       " Pol Stat Indx2 Zero Vect:\n");
+       } else {
+               printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+                       " Stat Dmod Deli Vect:\n");
+       }
+
+       for (i = 0; i <= reg_01.bits.entries; i++) {
+               if (intr_remapping_enabled) {
+                       struct IO_APIC_route_entry entry;
+                       struct IR_IO_APIC_route_entry *ir_entry;
+
+                       entry = ioapic_read_entry(ioapic_idx, i);
+                       ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+                       printk(KERN_DEBUG " %02x %04X ",
+                               i,
+                               ir_entry->index
+                       );
+                       printk("%1d   %1d    %1d    %1d   %1d   "
+                               "%1d    %1d     %X    %02X\n",
+                               ir_entry->format,
+                               ir_entry->mask,
+                               ir_entry->trigger,
+                               ir_entry->irr,
+                               ir_entry->polarity,
+                               ir_entry->delivery_status,
+                               ir_entry->index2,
+                               ir_entry->zero,
+                               ir_entry->vector
+                       );
+               } else {
+                       struct IO_APIC_route_entry entry;
+
+                       entry = ioapic_read_entry(ioapic_idx, i);
+                       printk(KERN_DEBUG " %02x %02X  ",
+                               i,
+                               entry.dest
+                       );
+                       printk("%1d    %1d    %1d   %1d   %1d    "
+                               "%1d    %1d    %02X\n",
+                               entry.mask,
+                               entry.trigger,
+                               entry.irr,
+                               entry.polarity,
+                               entry.delivery_status,
+                               entry.dest_mode,
+                               entry.delivery_mode,
+                               entry.vector
+                       );
+               }
+       }
+}
+
+/*
+ * Dump a summary of all IO-APICs: the number of MP interrupt sources, the
+ * register count of each IO-APIC, the per-chip dump produced by
+ * print_IO_APIC(), and finally the "apic:pin" routing of every active IRQ
+ * whose irq chip is ioapic_chip.
+ */
+__apicdebuginit(void) print_IO_APICs(void)
+{
+       int ioapic_idx;
+       struct irq_cfg *cfg;
+       unsigned int irq;
+       struct irq_chip *chip;
+
+       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+                      mpc_ioapic_id(ioapic_idx),
+                      ioapics[ioapic_idx].nr_registers);
+
+       /*
+        * We are a bit conservative about what we expect.  We have to
+        * know about every hardware change ASAP.
+        */
+       printk(KERN_INFO "testing the IO APIC.......................\n");
+
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+               print_IO_APIC(ioapic_idx);
+
+       printk(KERN_DEBUG "IRQ to pin mappings:\n");
+       for_each_active_irq(irq) {
+               struct irq_pin_list *entry;
+
+               /* Only IRQs handled by the IO-APIC chip are of interest. */
+               chip = irq_get_chip(irq);
+               if (chip != &ioapic_chip)
+                       continue;
+
+               cfg = irq_get_chip_data(irq);
+               if (!cfg)
+                       continue;
+               entry = cfg->irq_2_pin;
+               if (!entry)
+                       continue;
+               printk(KERN_DEBUG "IRQ%d ", irq);
+               /*
+                * The pin list and the final newline extend the "IRQ%d "
+                * line started above, so tag them as continuations
+                * explicitly, the same way print_APIC_field() does.
+                */
+               for_each_irq_pin(entry, cfg->irq_2_pin)
+                       printk(KERN_CONT "-> %d:%d", entry->apic, entry->pin);
+               printk(KERN_CONT "\n");
+       }
+
+       printk(KERN_INFO ".................................... done.\n");
+}
+
+/*
+ * Print the eight 32-bit local APIC registers that start at offset @base and
+ * are spaced 0x10 apart, as a single line of hex digits.
+ */
+__apicdebuginit(void) print_APIC_field(int base)
+{
+       int offset;
+
+       /* Open the line with the log level only; the words follow below. */
+       printk(KERN_DEBUG);
+
+       for (offset = 0; offset < 8 * 0x10; offset += 0x10)
+               printk(KERN_CONT "%08x", apic_read(base + offset));
+
+       printk(KERN_CONT "\n");
+}
+
+/*
+ * Dump the local APIC registers of the CPU this function runs on.
+ * print_local_APICs() passes it to smp_call_function_single(), hence the
+ * unused @dummy argument.
+ */
+__apicdebuginit(void) print_local_APIC(void *dummy)
+{
+       unsigned int idx, val, ver, maxlvt, eilvt_count;
+       u64 icr;
+
+       printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+               smp_processor_id(), hard_smp_processor_id());
+
+       val = apic_read(APIC_ID);
+       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", val, read_apic_id());
+
+       val = apic_read(APIC_LVR);
+       printk(KERN_INFO "... APIC VERSION: %08x\n", val);
+       ver = GET_APIC_VERSION(val);
+       maxlvt = lapic_get_maxlvt();
+
+       val = apic_read(APIC_TASKPRI);
+       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", val,
+              val & APIC_TPRI_MASK);
+
+       /* Integrated APIC, i.e. not an 82489DX. */
+       if (APIC_INTEGRATED(ver)) {
+               if (!APIC_XAPIC(ver)) {
+                       val = apic_read(APIC_ARBPRI);
+                       printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", val,
+                              val & APIC_ARBPRI_MASK);
+               }
+               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n",
+                      apic_read(APIC_PROCPRI));
+       }
+
+       /*
+        * Remote read supported only in the 82489DX and local APIC for
+        * Pentium processors.
+        */
+       if (!APIC_INTEGRATED(ver) || maxlvt == 3)
+               printk(KERN_DEBUG "... APIC RRR: %08x\n", apic_read(APIC_RRR));
+
+       printk(KERN_DEBUG "... APIC LDR: %08x\n", apic_read(APIC_LDR));
+       if (!x2apic_enabled())
+               printk(KERN_DEBUG "... APIC DFR: %08x\n", apic_read(APIC_DFR));
+       printk(KERN_DEBUG "... APIC SPIV: %08x\n", apic_read(APIC_SPIV));
+
+       printk(KERN_DEBUG "... APIC ISR field:\n");
+       print_APIC_field(APIC_ISR);
+       printk(KERN_DEBUG "... APIC TMR field:\n");
+       print_APIC_field(APIC_TMR);
+       printk(KERN_DEBUG "... APIC IRR field:\n");
+       print_APIC_field(APIC_IRR);
+
+       /* Integrated APIC, i.e. not an 82489DX. */
+       if (APIC_INTEGRATED(ver)) {
+               /* Write the ESR before reading it: Pentium erratum 3AP. */
+               if (maxlvt > 3)
+                       apic_write(APIC_ESR, 0);
+
+               printk(KERN_DEBUG "... APIC ESR: %08x\n", apic_read(APIC_ESR));
+       }
+
+       icr = apic_icr_read();
+       printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+       printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+       printk(KERN_DEBUG "... APIC LVTT: %08x\n", apic_read(APIC_LVTT));
+
+       /* PC is LVT#4. */
+       if (maxlvt > 3)
+               printk(KERN_DEBUG "... APIC LVTPC: %08x\n",
+                      apic_read(APIC_LVTPC));
+
+       printk(KERN_DEBUG "... APIC LVT0: %08x\n", apic_read(APIC_LVT0));
+       printk(KERN_DEBUG "... APIC LVT1: %08x\n", apic_read(APIC_LVT1));
+
+       /* ERR is LVT#3. */
+       if (maxlvt > 2)
+               printk(KERN_DEBUG "... APIC LVTERR: %08x\n",
+                      apic_read(APIC_LVTERR));
+
+       printk(KERN_DEBUG "... APIC TMICT: %08x\n", apic_read(APIC_TMICT));
+       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", apic_read(APIC_TMCCT));
+       printk(KERN_DEBUG "... APIC TDCR: %08x\n", apic_read(APIC_TDCR));
+
+       if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+               val = apic_read(APIC_EFEAT);
+               /* Bits 23:16 of EFEAT bound the EILVT registers dumped below. */
+               eilvt_count = (val >> 16) & 0xff;
+               printk(KERN_DEBUG "... APIC EFEAT: %08x\n", val);
+               printk(KERN_DEBUG "... APIC ECTRL: %08x\n",
+                      apic_read(APIC_ECTRL));
+               for (idx = 0; idx < eilvt_count; idx++)
+                       printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", idx,
+                              apic_read(APIC_EILVTn(idx)));
+       }
+       printk("\n");
+}
+
+/*
+ * Run print_local_APIC() on the online CPUs, stopping at the first CPU
+ * numbered @maxcpu or higher.  A @maxcpu of zero prints nothing.
+ */
+__apicdebuginit(void) print_local_APICs(int maxcpu)
+{
+       int cpu;
+
+       if (maxcpu != 0) {
+               preempt_disable();
+               for_each_online_cpu(cpu) {
+                       if (cpu >= maxcpu)
+                               break;
+                       /* Last argument 1: wait until the remote dump is done. */
+                       smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+               }
+               preempt_enable();
+       }
+}
+
+/*
+ * Dump the IMR, IRR, ISR and ELCR values of the legacy PIC pair as 16-bit
+ * words (high byte from the 0xa0/0xa1/0x4d1 port, low byte from the
+ * 0x20/0x21/0x4d0 port).  Does nothing when there are no legacy IRQs.
+ */
+__apicdebuginit(void) print_PIC(void)
+{
+       unsigned int imr, irr, isr, elcr;
+       unsigned long flags;
+
+       if (!legacy_pic->nr_legacy_irqs)
+               return;
+
+       printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+       raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+       imr = inb(0xa1) << 8 | inb(0x21);
+       printk(KERN_DEBUG "... PIC  IMR: %04x\n", imr);
+
+       irr = inb(0xa0) << 8 | inb(0x20);
+       printk(KERN_DEBUG "... PIC  IRR: %04x\n", irr);
+
+       /*
+        * Command 0x0b makes the following reads return what is reported as
+        * the ISR; command 0x0a afterwards switches back to the mode used
+        * for the IRR read above (presumably the 8259A read-register
+        * select - confirm against the data sheet).
+        */
+       outb(0x0b,0xa0);
+       outb(0x0b,0x20);
+       isr = inb(0xa0) << 8 | inb(0x20);
+       outb(0x0a,0xa0);
+       outb(0x0a,0x20);
+
+       raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       printk(KERN_DEBUG "... PIC  ISR: %04x\n", isr);
+
+       /* The ELCR ports are read without holding i8259A_lock. */
+       elcr = inb(0x4d1) << 8 | inb(0x4d0);
+       printk(KERN_DEBUG "... PIC ELCR: %04x\n", elcr);
+}
+
+/* Limit handed to print_local_APICs() by print_ICs(); defaults to 1. */
+static int __initdata show_lapic = 1;
+
+/*
+ * Parse the "show_lapic=" boot parameter.  "all" selects CONFIG_NR_CPUS,
+ * a non-negative number selects that value, and anything else leaves
+ * show_lapic unchanged.  Always returns 1.
+ */
+static __init int setup_show_lapic(char *arg)
+{
+       int requested = -1;
+
+       if (!strcmp(arg, "all")) {
+               show_lapic = CONFIG_NR_CPUS;
+               return 1;
+       }
+
+       get_option(&arg, &requested);
+       if (requested >= 0)
+               show_lapic = requested;
+
+       return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+/*
+ * Late initcall that dumps the interrupt controller state: the PIC, then the
+ * local APICs (limited by show_lapic) and the IO-APICs.  Nothing is printed
+ * when apic_verbosity is APIC_QUIET.  Always returns 0.
+ */
+__apicdebuginit(int) print_ICs(void)
+{
+       if (apic_verbosity != APIC_QUIET) {
+               print_PIC();
+
+               /* don't print out if apic is not there */
+               if (cpu_has_apic || apic_from_smp_config()) {
+                       print_local_APICs(show_lapic);
+                       print_IO_APICs();
+               }
+       }
+
+       return 0;
+}
+
+late_initcall(print_ICs);
+
+
+/*
+ * IO-APIC and pin the i8259 is connected to in external-interrupt (ExtINT)
+ * mode, or -1/-1 while no such connection is known.
+ */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+/*
+ * Locate the pin the i8259 is wired to, first by scanning the IO-APIC
+ * redirection entries and then by consulting the MP table, record it in
+ * ioapic_i8259, and finally clear all IO-APICs.  Does nothing when there
+ * are no legacy IRQs.
+ */
+void __init enable_IO_APIC(void)
+{
+       int mpt_apic, mpt_pin;
+       int ioapic_idx, pin;
+       int found = 0;
+
+       if (!legacy_pic->nr_legacy_irqs)
+               return;
+
+       /*
+        * See if any of the pins is in ExtINT mode: an entry that is
+        * unmasked and in ExtINT delivery mode marks the pin where the
+        * i8259 is connected.  The search stops at the first match.
+        */
+       for (ioapic_idx = 0; !found && ioapic_idx < nr_ioapics; ioapic_idx++) {
+               for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
+                       struct IO_APIC_route_entry entry;
+
+                       entry = ioapic_read_entry(ioapic_idx, pin);
+                       if (entry.mask != 0 ||
+                           entry.delivery_mode != dest_ExtINT)
+                               continue;
+
+                       ioapic_i8259.apic = ioapic_idx;
+                       ioapic_i8259.pin  = pin;
+                       found = 1;
+                       break;
+               }
+       }
+
+       /*
+        * Look at what the MP table reports for ExtINT.  If the scan above
+        * found nothing, the i8259 is probably not connected to an IO-APIC,
+        * but give the MP table a chance anyway.
+        */
+       mpt_pin  = find_isa_irq_pin(0, mp_ExtINT);
+       mpt_apic = find_isa_irq_apic(0, mp_ExtINT);
+
+       /* Trust the MP table if nothing is set up in the hardware. */
+       if (ioapic_i8259.pin == -1 && mpt_pin >= 0) {
+               printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+               ioapic_i8259.pin  = mpt_pin;
+               ioapic_i8259.apic = mpt_apic;
+       }
+
+       /* Complain if the MP table and the hardware disagree. */
+       if (mpt_pin >= 0 && ioapic_i8259.pin >= 0 &&
+           (ioapic_i8259.apic != mpt_apic || ioapic_i8259.pin != mpt_pin))
+               printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+
+       /*
+        * Do not trust the IO-APIC being empty at bootup
+        */
+       clear_IO_APIC();
+}
+
+/*
+ * Clear the IO-APICs and, when legacy IRQs exist, put the interrupt
+ * hardware back into a virtual wire mode so that legacy interrupts can
+ * still be delivered.
+ *
+ * Not an __init function: the reboot code needs it too.
+ */
+void disable_IO_APIC(void)
+{
+       int virt_wire_via_ioapic;
+
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       if (!legacy_pic->nr_legacy_irqs)
+               return;
+
+       /*
+        * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+        * virtual wire mode so legacy interrupts can be delivered.
+        *
+        * With interrupt-remapping, for now we will use virtual wire A mode,
+        * as virtual wire B is a little complex (it needs both the IOAPIC
+        * RTE and the interrupt-remapping table entry to be configured).
+        * As this gets called during crash dump, keep this simple for now.
+        */
+       virt_wire_via_ioapic = !intr_remapping_enabled &&
+                              ioapic_i8259.pin != -1;
+
+       if (virt_wire_via_ioapic) {
+               struct IO_APIC_route_entry entry;
+
+               /*
+                * Every field not set below stays zero: mask (enabled),
+                * trigger (edge), irr, polarity (high), delivery_status,
+                * dest_mode (physical) and vector.
+                */
+               memset(&entry, 0, sizeof(entry));
+               entry.delivery_mode   = dest_ExtINT; /* ExtInt */
+               entry.dest            = read_apic_id();
+
+               /*
+                * Add it to the IO-APIC irq-routing table:
+                */
+               ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+       }
+
+       /*
+        * Use virtual wire A mode when interrupt remapping is enabled.
+        */
+       if (cpu_has_apic || apic_from_smp_config())
+               disconnect_bsp_APIC(virt_wire_via_ioapic);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Set the IO-APIC physical IDs based on the values stored in the MPC
+ * table.
+ *
+ * For every IO-APIC the ID recorded in the MP configuration is validated
+ * (it must be below get_physical_broadcast() and not already present in
+ * the physical ID map), fixed up if necessary, propagated to the mp_irqs[]
+ * routing entries that referenced the old ID, and finally written to the
+ * IO-APIC ID register when it differs from what the register holds.
+ * Panics when no free ID is left for a conflicting IO-APIC.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
+ */
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
+{
+       union IO_APIC_reg_00 reg_00;
+       physid_mask_t phys_id_present_map;
+       int ioapic_idx;
+       int i;
+       unsigned char old_id;
+       unsigned long flags;
+
+       /*
+        * This is broken; anything with a real cpu count has to
+        * circumvent this idiocy regardless.
+        */
+       apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+
+       /*
+        * Set the IOAPIC ID to the value stored in the MPC table.
+        */
+       for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+               /* Read the current value of register 0 (it holds the ID) */
+               raw_spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(ioapic_idx, 0);
+               raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               old_id = mpc_ioapic_id(ioapic_idx);
+
+               if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+                               ioapic_idx, mpc_ioapic_id(ioapic_idx));
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               reg_00.bits.ID);
+                       ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
+               }
+
+               /*
+                * Sanity check, is the ID really free? Every APIC in a
+                * system must have a unique ID or we get lots of nice
+                * 'stuck on smp_invalidate_needed IPI wait' messages.
+                *
+                * On a clash the lowest ID not yet set in the map is
+                * taken instead; otherwise the ID is merged into the map.
+                */
+               if (apic->check_apicid_used(&phys_id_present_map,
+                                           mpc_ioapic_id(ioapic_idx))) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+                               ioapic_idx, mpc_ioapic_id(ioapic_idx));
+                       for (i = 0; i < get_physical_broadcast(); i++)
+                               if (!physid_isset(i, phys_id_present_map))
+                                       break;
+                       if (i >= get_physical_broadcast())
+                               panic("Max APIC ID exceeded!\n");
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               i);
+                       physid_set(i, phys_id_present_map);
+                       ioapics[ioapic_idx].mp_config.apicid = i;
+               } else {
+                       physid_mask_t tmp;
+                       apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx),
+                                                   &tmp);
+                       apic_printk(APIC_VERBOSE, "Setting %d in the "
+                                       "phys_id_present_map\n",
+                                       mpc_ioapic_id(ioapic_idx));
+                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
+               }
+
+               /*
+                * We need to adjust the IRQ routing table
+                * if the ID changed: every mp_irqs[] entry that targeted
+                * the old ID is redirected to the new one.
+                */
+               if (old_id != mpc_ioapic_id(ioapic_idx))
+                       for (i = 0; i < mp_irq_entries; i++)
+                               if (mp_irqs[i].dstapic == old_id)
+                                       mp_irqs[i].dstapic
+                                               = mpc_ioapic_id(ioapic_idx);
+
+               /*
+                * Update the ID register according to the right value
+                * from the MPC table if they are different.
+                */
+               if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
+                       continue;
+
+               apic_printk(APIC_VERBOSE, KERN_INFO
+                       "...changing IO-APIC physical APIC ID to %d ...",
+                       mpc_ioapic_id(ioapic_idx));
+
+               reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+               raw_spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(ioapic_idx, 0, reg_00.raw);
+               raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /*
+                * Sanity check: read the ID register back and verify that
+                * the write took effect.
+                *
+                * NOTE(review): the failure message below carries no KERN_
+                * level and reads as the end of the APIC_VERBOSE line
+                * printed above; confirm how it should be logged when that
+                * line is suppressed.
+                */
+               raw_spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(ioapic_idx, 0);
+               raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+               if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
+                       printk("could not set ID!\n");
+               else
+                       apic_printk(APIC_VERBOSE, " ok.\n");
+       }
+}
+
+/*
+ * Program the IO-APIC IDs from the MPC table via
+ * setup_ioapic_ids_from_mpc_nocheck(), but only when the IO-APICs were not
+ * described by ACPI, the boot CPU is an Intel part and the boot CPU's APIC
+ * is not an xAPIC.
+ */
+void __init setup_ioapic_ids_from_mpc(void)
+{
+       if (acpi_ioapic)
+               return;
+
+       /*
+        * Don't check I/O APIC IDs for xAPIC systems.  They have
+        * no meaning without the serial APIC bus.
+        */
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+               return;
+       if (APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               return;
+
+       setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+/* Set to 1 by the "no_timer_check" boot option; read by timer_irq_works(). */
+int no_timer_check __initdata;
+
+/*
+ * Handler for the "no_timer_check" boot option.  The option takes no value,
+ * so @s is ignored.  Always returns 1.
+ */
+static int __init notimercheck(char *s)
+{
+       no_timer_check = 1;
+
+       return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *     - timer IRQ defaults to IO-APIC IRQ
+ *     - if this function detects that timer IRQs are defunct, then we fall
+ *       back to ISA timer IRQs
+ *
+ * Returns 1 when the check is disabled by no_timer_check or when jiffies
+ * advanced by more than four during a delay of about ten ticks, else 0.
+ */
+static int __init timer_irq_works(void)
+{
+       unsigned long start = jiffies;
+       unsigned long flags;
+
+       if (no_timer_check)
+               return 1;
+
+       /* Let ten ticks pass with interrupts enabled... */
+       local_save_flags(flags);
+       local_irq_enable();
+       mdelay((10 * 1000) / HZ);
+       local_irq_restore(flags);
+
+       /*
+        * Expect a few ticks at least, to be sure some possible
+        * glue logic does not lock up after one or two first
+        * ticks in a non-ExtINT mode.  Also the local APIC
+        * might have cached one ExtINT interrupt.  Finally, at
+        * least one tick may be lost due to delays.
+        *
+        * time_after() keeps the comparison safe across a jiffies wrap.
+        */
+       return time_after(jiffies, start + 4) ? 1 : 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge.  If it is already asserted for some
+ * reason, we need to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
+ *
+ * For a legacy IRQ the line is masked at the legacy PIC and its pending
+ * state sampled; the IO-APIC entry is unmasked in every case.
+ */
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+       unsigned int pending = 0;
+       unsigned long flags;
+       int irq = data->irq;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       if (irq < legacy_pic->nr_legacy_irqs) {
+               legacy_pic->mask(irq);
+               pending = legacy_pic->irq_pending(irq) ? 1 : 0;
+       }
+       __unmask_ioapic(data->chip_data);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return pending;
+}
+
+/*
+ * Retrigger the interrupt described by @data by sending its vector as an
+ * IPI to the first CPU of the irq's domain mask, under vector_lock.
+ * Always returns 1.
+ */
+static int ioapic_retrigger_irq(struct irq_data *data)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int target_cpu;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&vector_lock, flags);
+       target_cpu = cpumask_first(cfg->domain);
+       apic->send_IPI_mask(cpumask_of(target_cpu), cfg->vector);
+       raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+       return 1;
+}
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+#ifdef CONFIG_SMP
+/*
+ * Send IRQ_MOVE_CLEANUP_VECTOR to every online CPU of cfg->old_domain and
+ * then clear cfg->move_in_progress.
+ */
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+       cpumask_var_t cleanup_mask;
+
+       if (likely(alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               free_cpumask_var(cleanup_mask);
+       } else {
+               unsigned int cpu;
+
+               /* No temporary mask available: send the IPIs one by one. */
+               for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask)
+                       apic->send_IPI_mask(cpumask_of(cpu), IRQ_MOVE_CLEANUP_VECTOR);
+       }
+       cfg->move_in_progress = 0;
+}
+
+/*
+ * Point every IO-APIC pin of @cfg at destination @dest and at the vector
+ * currently stored in @cfg.  @irq is not used.  ioapic_set_affinity()
+ * calls this with ioapic_lock held.
+ */
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+       struct irq_pin_list *pin_entry;
+       u8 vector = cfg->vector;
+
+       for_each_irq_pin(pin_entry, cfg->irq_2_pin) {
+               int ioapic_idx = pin_entry->apic;
+               /* Register index of the low word of this pin's entry. */
+               int rte_low = 0x10 + pin_entry->pin*2;
+               unsigned int rte;
+
+               /*
+                * With interrupt-remapping, destination information comes
+                * from interrupt-remapping table entry, so the high word
+                * is only written for entries that are not remapped.
+                */
+               if (!irq_remapped(cfg))
+                       io_apic_write(ioapic_idx, rte_low + 1, dest);
+
+               /* Replace only the vector bits of the low word. */
+               rte = io_apic_read(ioapic_idx, rte_low);
+               rte &= ~IO_APIC_REDIR_VECTOR_MASK;
+               rte |= vector;
+               io_apic_modify(ioapic_idx, rte_low, rte);
+       }
+}
+
+/*
+ * On success, copies @mask into data->affinity, stores the result of
+ * ->cpu_mask_to_apicid_and() for it in *@dest_id and returns 0.
+ * Returns -1 and leaves data->affinity untouched when @mask contains no
+ * online CPU or when no vector can be assigned.
+ */
+int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                         unsigned int *dest_id)
+{
+       struct irq_cfg *cfg = data->chip_data;
+
+       if (!cpumask_intersects(mask, cpu_online_mask) ||
+           assign_irq_vector(data->irq, cfg, mask))
+               return -1;
+
+       cpumask_copy(data->affinity, mask);
+       *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
+
+       return 0;
+}
+
+/*
+ * Change the affinity of an IO-APIC interrupt: pick vector and destination
+ * through __ioapic_set_affinity() and, on success, reprogram the IO-APIC
+ * pins, all under ioapic_lock.  @force is not used.  Returns the result of
+ * __ioapic_set_affinity().
+ */
+static int
+ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                   bool force)
+{
+       unsigned long flags;
+       unsigned int dest;
+       int ret;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       ret = __ioapic_set_affinity(data, mask, &dest);
+       if (ret == 0) {
+               /* Only the high 8 bits are valid. */
+               __target_IO_APIC_irq(data->irq, SET_APIC_LOGICAL_ID(dest),
+                                    data->chip_data);
+       }
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return ret;
+}
+
+#ifdef CONFIG_IRQ_REMAP
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For both level and edge triggered, irq migration is a simple atomic
+ * update (of vector and cpu destination) of the IRTE followed by a flush
+ * of the hardware cache.
+ *
+ * For level triggered, we eliminate the io-apic RTE modification (with the
+ * updated vector information), by using a virtual vector (io-apic pin number).
+ * The real vector that is used for interrupting the cpu comes from
+ * the interrupt-remapping table entry.
+ *
+ * As the migration is a simple atomic update of the IRTE, the same mechanism
+ * is used to migrate MSI irq's in the presence of interrupt-remapping.
+ *
+ * Returns 0 on success, -EINVAL when @mask contains no online CPU and
+ * -EBUSY when the IRTE cannot be read or no vector can be assigned.
+ * @force is not used.
+ */
+static int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                      bool force)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int irq = data->irq;
+       unsigned int dest_apicid;
+       struct irte irte;
+
+       if (!cpumask_intersects(mask, cpu_online_mask))
+               return -EINVAL;
+
+       if (get_irte(irq, &irte) || assign_irq_vector(irq, cfg, mask))
+               return -EBUSY;
+
+       dest_apicid = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
+
+       /*
+        * Atomically updates the IRTE with the new destination, vector
+        * and flushes the interrupt entry cache.
+        */
+       irte.vector = cfg->vector;
+       irte.dest_id = IRTE_DEST(dest_apicid);
+       modify_irte(irq, &irte);
+
+       /*
+        * After this point, all the interrupts will start arriving
+        * at the new destination. So, time to cleanup the previous
+        * vector allocation.
+        */
+       if (cfg->move_in_progress)
+               send_cleanup_vector(cfg);
+
+       cpumask_copy(data->affinity, mask);
+
+       return 0;
+}
+
+#else
+/*
+ * Variant used when CONFIG_IRQ_REMAP is not set: there is nothing to
+ * migrate, so all arguments are ignored and 0 is returned.
+ */
+static inline int
+ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                      bool force)
+{
+       return 0;
+}
+#endif
+
+/*
+ * Handler for the IRQ_MOVE_CLEANUP_VECTOR IPI.
+ *
+ * Walks this CPU's vector_irq[] table from FIRST_EXTERNAL_VECTOR upwards
+ * and releases (sets to -1) every entry whose irq no longer uses that
+ * vector on this CPU.  An entry is left alone when the irq has no
+ * descriptor, when its migration is still in progress, when the vector is
+ * the irq's current one on this CPU, or when the vector is still pending
+ * in the local APIC IRR; in the last case the IPI is re-sent to this CPU
+ * so that the cleanup is retried later.
+ */
+asmlinkage void smp_irq_move_cleanup_interrupt(void)
+{
+       unsigned vector, me;
+
+       ack_APIC_irq();
+       irq_enter();
+       exit_idle();
+
+       me = smp_processor_id();
+       for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+               unsigned int irq;
+               unsigned int irr;
+               struct irq_desc *desc;
+               struct irq_cfg *cfg;
+               irq = __this_cpu_read(vector_irq[vector]);
+
+               /* -1 marks a vector that is not assigned to any irq. */
+               if (irq == -1)
+                       continue;
+
+               desc = irq_to_desc(irq);
+               if (!desc)
+                       continue;
+
+               cfg = irq_cfg(irq);
+               raw_spin_lock(&desc->lock);
+
+               /*
+                * Check if the irq migration is in progress. If so, we
+                * haven't received the cleanup request yet for this irq.
+                */
+               if (cfg->move_in_progress)
+                       goto unlock;
+
+               /* The vector is the one this irq currently uses here. */
+               if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+                       goto unlock;
+
+               irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+               /*
+                * Check if the vector that needs to be cleaned up is
+                * registered at the cpu's IRR. If so, then this is not
+                * the best time to clean it up. Let's clean it up in the
+                * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+                * to myself.
+                *
+                * The bit mask is built from an unsigned 1: for bit 31 a
+                * signed "1 << 31" would overflow int, which is undefined.
+                */
+               if (irr & (1U << (vector % 32))) {
+                       apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+                       goto unlock;
+               }
+               __this_cpu_write(vector_irq[vector], -1);
+unlock:
+               raw_spin_unlock(&desc->lock);
+       }
+
+       irq_exit();
+}
+
+/*
+ * Finish a pending vector migration of @cfg: if a move is in progress and
+ * @vector is the irq's current vector on a CPU of its current domain (the
+ * CPU running this code), send the cleanup IPIs for the old domain.
+ */
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+       unsigned this_cpu;
+
+       if (likely(!cfg->move_in_progress))
+               return;
+
+       this_cpu = smp_processor_id();
+
+       if (vector != cfg->vector)
+               return;
+
+       if (cpumask_test_cpu(this_cpu, cfg->domain))
+               send_cleanup_vector(cfg);
+}
+
+/*
+ * Complete a pending move of @cfg, taking the vector from the current
+ * interrupt registers (the complement of orig_ax).
+ */
+static void irq_complete_move(struct irq_cfg *cfg)
+{
+       unsigned vector = ~get_irq_regs()->orig_ax;
+
+       __irq_complete_move(cfg, vector);
+}
+
+/*
+ * Complete a pending move of @irq using the vector stored in its irq_cfg.
+ * Does nothing when the irq has no chip data.
+ */
+void irq_force_complete_move(int irq)
+{
+       struct irq_cfg *cfg = irq_get_chip_data(irq);
+
+       if (cfg)
+               __irq_complete_move(cfg, cfg->vector);
+}
+#else
+/* Variant used when CONFIG_SMP is not set: nothing to complete. */
+static inline void irq_complete_move(struct irq_cfg *cfg)
+{
+}
+#endif
+
+/*
+ * Acknowledge an edge-triggered interrupt: complete a pending vector move,
+ * let irq_move_irq() process the irq, then ack the local APIC.
+ */
+static void ack_apic_edge(struct irq_data *data)
+{
+       irq_complete_move(data->chip_data);
+
+       irq_move_irq(data);
+
+       ack_APIC_irq();
+}
+
+atomic_t irq_mis_count;
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+/*
+ * Mask the IO-APIC entry of @cfg when an affinity change is pending for
+ * @data.  Returns true if the entry was masked, false otherwise.
+ */
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+       if (likely(!irqd_is_setaffinity_pending(data)))
+               return false;
+
+       /* If we are moving the irq we need to mask it */
+       mask_ioapic(cfg);
+
+       return true;
+}
+
+/*
+ * Counterpart of ioapic_irqd_mask(): when @masked is true, migrate the irq
+ * if no level ack is still pending, then unmask the IO-APIC entry again.
+ */
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+                                     struct irq_cfg *cfg, bool masked)
+{
+       if (likely(!masked))
+               return;
+
+       /*
+        * Only migrate the irq if the ack has been received.
+        *
+        * On rare occasions the broadcast level triggered ack gets
+        * delayed going to ioapics, and if we reprogram the
+        * vector while Remote IRR is still set the irq will never
+        * fire again.
+        *
+        * To prevent this scenario we read the Remote IRR bit
+        * of the ioapic.  This has two effects.
+        * - On any sane system the read of the ioapic will
+        *   flush writes (and acks) going to the ioapic from
+        *   this cpu.
+        * - We get to see if the ACK has actually been delivered.
+        *
+        * Based on failed experiments of reprogramming the
+        * ioapic entry from outside of irq context starting
+        * with masking the ioapic entry and then polling until
+        * Remote IRR was clear before reprogramming the
+        * ioapic I don't trust the Remote IRR bit to be
+        * completely accurate.
+        *
+        * However there appears to be no other way to plug
+        * this race, so if the Remote IRR bit is not
+        * accurate and is causing problems then it is a hardware bug
+        * and you can go talk to the chipset vendor about it.
+        */
+       if (!io_apic_level_ack_pending(cfg))
+               irq_move_masked_irq(data);
+
+       unmask_ioapic(cfg);
+}
+#else
+/* Without CONFIG_GENERIC_PENDING_IRQ: never masks, always returns false. */
+static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+{
+       return false;
+}
+/* Without CONFIG_GENERIC_PENDING_IRQ: nothing was masked, nothing to undo. */
+static inline void ioapic_irqd_unmask(struct irq_data *data,
+                                     struct irq_cfg *cfg, bool masked)
+{
+}
+#endif
+
+/*
+ * irq_eoi handler of ioapic_chip.  Calls irq_complete_move(), masks the
+ * pin if an affinity change is pending, acks the local APIC and, when the
+ * sampled TMR bit for the vector is clear, issues an explicit I/O APIC
+ * EOI.  Any mask taken at the start is undone at the end.
+ */
+static void ack_apic_level(struct irq_data *data)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       int i, irq = data->irq;
+       unsigned long v;
+       bool masked;
+
+       irq_complete_move(cfg);
+       masked = ioapic_irqd_mask(data, cfg);
+
+       /*
+        * It appears there is an erratum which affects at least version 0x11
+        * of I/O APIC (that's the 82093AA and cores integrated into various
+        * chipsets).  Under certain conditions a level-triggered interrupt is
+        * erroneously delivered as edge-triggered one but the respective IRR
+        * bit gets set nevertheless.  As a result the I/O unit expects an EOI
+        * message but it will never arrive and further interrupts are blocked
+        * from the source.  The exact reason is so far unknown, but the
+        * phenomenon was observed when two consecutive interrupt requests
+        * from a given source get delivered to the same CPU and the source is
+        * temporarily disabled in between.
+        *
+        * A workaround is to simulate an EOI message manually.  We achieve it
+        * by setting the trigger mode to edge and then to level when the edge
+        * trigger mode gets detected in the TMR of a local APIC for a
+        * level-triggered interrupt.  We mask the source for the time of the
+        * operation to prevent an edge-triggered interrupt escaping meanwhile.
+        * The idea is from Manfred Spraul.  --macro
+        *
+        * Also in the case when cpu goes offline, fixup_irqs() will forward
+        * any unhandled interrupt on the offlined cpu to the new cpu
+        * destination that is handling the corresponding interrupt. This
+        * interrupt forwarding is done via IPI's. Hence, in this case also
+        * level-triggered io-apic interrupt will be seen as an edge
+        * interrupt in the IRR. And we can't rely on the cpu's EOI
+        * to be broadcasted to the IO-APIC's which will clear the remoteIRR
+        * corresponding to the level-triggered interrupt. Hence on IO-APIC's
+        * supporting EOI register, we do an explicit EOI to clear the
+        * remote IRR and on IO-APIC's which don't have an EOI register,
+        * we use the above logic (mask+edge followed by unmask+level) from
+        * Manfred Spraul to clear the remote IRR.
+        */
+       i = cfg->vector;
+       /* Sample the TMR word that holds this vector's bit before the ack. */
+       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+       /*
+        * We must acknowledge the irq before we move it or the acknowledge will
+        * not propagate properly.
+        */
+       ack_APIC_irq();
+
+       /*
+        * Tail end of clearing remote IRR bit (either by delivering the EOI
+        * message via io-apic EOI register write or simulating it using
+        * mask+edge followed by unmask+level logic) manually when the
+        * level triggered interrupt is seen as the edge triggered interrupt
+        * at the cpu.
+        */
+       if (!(v & (1 << (i & 0x1f)))) {
+               atomic_inc(&irq_mis_count);
+
+               eoi_ioapic_irq(irq, cfg);
+       }
+
+       ioapic_irqd_unmask(data, cfg, masked);
+}
+
+#ifdef CONFIG_IRQ_REMAP
+/* irq_ack callback installed by irq_remap_modify_chip_defaults(). */
+static void ir_ack_apic_edge(struct irq_data *data)
+{
+       ack_APIC_irq();
+}
+
+/*
+ * irq_eoi callback installed by irq_remap_modify_chip_defaults(): acks the
+ * local APIC and then always issues an explicit I/O APIC EOI for the irq.
+ */
+static void ir_ack_apic_level(struct irq_data *data)
+{
+       ack_APIC_irq();
+       eoi_ioapic_irq(data->irq, data->chip_data);
+}
+
+/* irq_print_chip callback: prints the chip name prefixed with " IR-". */
+static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
+{
+       seq_printf(p, " IR-%s", data->chip->name);
+}
+
+/*
+ * Switch @chip over to the ir_* callbacks defined above (and, on SMP, to
+ * ir_ioapic_set_affinity).  The chip is modified in place, so every irq
+ * that uses the same irq_chip object sees the new callbacks.
+ */
+static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+{
+       chip->irq_print_chip = ir_print_prefix;
+       chip->irq_ack = ir_ack_apic_edge;
+       chip->irq_eoi = ir_ack_apic_level;
+
+#ifdef CONFIG_SMP
+       chip->irq_set_affinity = ir_ioapic_set_affinity;
+#endif
+}
+#endif /* CONFIG_IRQ_REMAP */
+
+/*
+ * irq_chip named "IO-APIC": ack_apic_edge() is its irq_ack callback and
+ * ack_apic_level() its irq_eoi callback.
+ */
+static struct irq_chip ioapic_chip __read_mostly = {
+       .name                   = "IO-APIC",
+       .irq_startup            = startup_ioapic_irq,
+       .irq_mask               = mask_ioapic_irq,
+       .irq_unmask             = unmask_ioapic_irq,
+       .irq_ack                = ack_apic_edge,
+       .irq_eoi                = ack_apic_level,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = ioapic_set_affinity,
+#endif
+       .irq_retrigger          = ioapic_retrigger_irq,
+};
+#endif /* !CONFIG_XEN */
+
+/*
+ * Walk all active irqs and, for each I/O APIC irq that has chip data but
+ * no vector assigned, fall back to the legacy PIC (for legacy irq numbers)
+ * or to no_irq_chip.  Under CONFIG_XEN only irqs inside the PIRQ range
+ * [PIRQ_BASE, PIRQ_BASE + nr_pirqs) are considered.
+ */
+static inline void init_IO_APIC_traps(void)
+{
+       struct irq_cfg *cfg;
+       unsigned int irq;
+
+       /*
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       for_each_active_irq(irq) {
+#ifdef CONFIG_XEN
+               if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+                       continue;
+#endif
+               cfg = irq_get_chip_data(irq);
+               if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
+                       /*
+                        * Hmm.. We don't have an entry for this,
+                        * so default to an old-fashioned 8259
+                        * interrupt if we can..
+                        */
+                       if (irq < legacy_pic->nr_legacy_irqs)
+                               legacy_pic->make_irq(irq);
+                       else
+                               /* Strange. Oh, well.. */
+                               irq_set_chip(irq, &no_irq_chip);
+               }
+       }
+}
+
+#ifndef CONFIG_XEN
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+/* irq_mask callback of lapic_chip: set APIC_LVT_MASKED in LVT0. */
+static void mask_lapic_irq(struct irq_data *data)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+/* irq_unmask callback of lapic_chip: clear APIC_LVT_MASKED in LVT0. */
+static void unmask_lapic_irq(struct irq_data *data)
+{
+       unsigned long v;
+
+       v = apic_read(APIC_LVT0);
+       apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+/* irq_ack callback of lapic_chip: just calls ack_APIC_irq(). */
+static void ack_lapic_irq(struct irq_data *data)
+{
+       ack_APIC_irq();
+}
+
+/* irq_chip installed by lapic_register_intr(); masks and unmasks via LVT0. */
+static struct irq_chip lapic_chip __read_mostly = {
+       .name           = "local-APIC",
+       .irq_mask       = mask_lapic_irq,
+       .irq_unmask     = unmask_lapic_irq,
+       .irq_ack        = ack_lapic_irq,
+};
+
+/* Clear IRQ_LEVEL on @irq and install lapic_chip with handle_edge_irq. */
+static void lapic_register_intr(int irq)
+{
+       irq_clear_status_flags(irq, IRQ_LEVEL);
+       irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
+                                     "edge");
+}
+
+/*
+ * This looks a bit hackish but it's about the only way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ */
+static inline void __init unlock_ExtINT_logic(void)
+{
+       int apic, pin, i;
+       struct IO_APIC_route_entry entry0, entry1;
+       unsigned char save_control, save_freq_select;
+
+       /* Locate the I/O APIC pin and APIC index for ISA irq 8. */
+       pin  = find_isa_irq_pin(8, mp_INT);
+       if (pin == -1) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+       apic = find_isa_irq_apic(8, mp_INT);
+       if (apic == -1) {
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       /* Save the current routing entry; it is written back at the end. */
+       entry0 = ioapic_read_entry(apic, pin);
+       clear_IO_APIC_pin(apic, pin);
+
+       /* Temporarily route the pin as an unmasked ExtINT to this CPU. */
+       memset(&entry1, 0, sizeof(entry1));
+
+       entry1.dest_mode = 0;                   /* physical delivery */
+       entry1.mask = 0;                        /* unmask IRQ now */
+       entry1.dest = hard_smp_processor_id();
+       entry1.delivery_mode = dest_ExtINT;
+       entry1.polarity = entry0.polarity;
+       entry1.trigger = 0;
+       entry1.vector = 0;
+
+       ioapic_write_entry(apic, pin, entry1);
+
+       /* Save the RTC settings, then change the rate select and set RTC_PIE. */
+       save_control = CMOS_READ(RTC_CONTROL);
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+                  RTC_FREQ_SELECT);
+       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+       /*
+        * Poll for at most 100 iterations of 10 ms each; every time RTC_PF
+        * is seen the counter drops by an extra 10, ending the loop sooner.
+        */
+       i = 100;
+       while (i-- > 0) {
+               mdelay(10);
+               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+                       i -= 10;
+       }
+
+       /* Restore the RTC registers and the original routing entry. */
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+       clear_IO_APIC_pin(apic, pin);
+
+       ioapic_write_entry(apic, pin, entry0);
+}
+
+/* Set by the "disable_timer_pin_1" early parameter; read in check_timer(). */
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+       /* The argument value is ignored; presence of the option is enough. */
+       disable_timer_pin_1 = 1;
+       return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+/* Set to 1 by check_timer() when the timer works via the 8259A route. */
+int timer_through_8259 __initdata;
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
+ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ *
+ * The attempts are made in this order, stopping at the first one for
+ * which timer_irq_works() succeeds:
+ *   1. IRQ0 through the I/O APIC pin (pin1),
+ *   2. IRQ0 through the 8259A behind the I/O APIC (pin2),
+ *   3. IRQ0 as a Virtual Wire interrupt via LVT0 in fixed mode,
+ *   4. IRQ0 as an ExtINT interrupt via LVT0.
+ * If none works the function panics.  With interrupt remapping enabled
+ * it panics as soon as step 1 is impossible or fails.
+ *
+ * FIXME: really need to revamp this for all platforms.
+ */
+static inline void __init check_timer(void)
+{
+       struct irq_cfg *cfg = irq_get_chip_data(0);
+       int node = cpu_to_node(0);
+       int apic1, pin1, apic2, pin2;
+       unsigned long flags;
+       int no_pin1 = 0;
+
+       local_irq_save(flags);
+
+       /*
+        * get/set the timer IRQ vector:
+        */
+       legacy_pic->mask(0);
+       assign_irq_vector(0, cfg, apic->target_cpus());
+
+       /*
+        * As IRQ0 is to be enabled in the 8259A, the virtual
+        * wire has to be disabled in the local APIC.  Also
+        * timer interrupts need to be acknowledged manually in
+        * the 8259A for the i82489DX when using the NMI
+        * watchdog as that APIC treats NMIs as level-triggered.
+        * The AEOI mode will finish them in the 8259A
+        * automatically.
+        */
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       legacy_pic->init(1);
+
+       pin1  = find_isa_irq_pin(0, mp_INT);
+       apic1 = find_isa_irq_apic(0, mp_INT);
+       pin2  = ioapic_i8259.pin;
+       apic2 = ioapic_i8259.apic;
+
+       apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+                   "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+                   cfg->vector, apic1, pin1, apic2, pin2);
+
+       /*
+        * Some BIOS writers are clueless and report the ExtINTA
+        * I/O APIC input from the cascaded 8259A as the timer
+        * interrupt input.  So just in case, if only one pin
+        * was found above, try it both directly and through the
+        * 8259A.
+        */
+       if (pin1 == -1) {
+               if (intr_remapping_enabled)
+                       panic("BIOS bug: timer not connected to IO-APIC");
+               pin1 = pin2;
+               apic1 = apic2;
+               no_pin1 = 1;
+       } else if (pin2 == -1) {
+               pin2 = pin1;
+               apic2 = apic1;
+       }
+
+       if (pin1 != -1) {
+               /*
+                * Ok, does IRQ0 through the IOAPIC work?
+                */
+               if (no_pin1) {
+                       add_pin_to_irq_node(cfg, node, apic1, pin1);
+                       setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+               } else {
+                       /* For edge trigger, setup_ioapic_irq already
+                        * leaves it unmasked, so we only need to unmask
+                        * if it is level-triggered.
+                        * Do we really have a level-triggered timer?
+                        */
+                       int idx;
+                       idx = find_irq_entry(apic1, pin1, mp_INT);
+                       if (idx != -1 && irq_trigger(idx))
+                               unmask_ioapic(cfg);
+               }
+               if (timer_irq_works()) {
+                       /*
+                        * NOTE(review): this clears pin1 on I/O APIC index 0
+                        * rather than on apic1 -- confirm that is intended.
+                        */
+                       if (disable_timer_pin_1 > 0)
+                               clear_IO_APIC_pin(0, pin1);
+                       goto out;
+               }
+               if (intr_remapping_enabled)
+                       panic("timer doesn't work through Interrupt-remapped IO-APIC");
+               local_irq_disable();
+               clear_IO_APIC_pin(apic1, pin1);
+               if (!no_pin1)
+                       apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
+                                   "8254 timer not connected to IO-APIC\n");
+
+               apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+                           "(IRQ0) through the 8259A ...\n");
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "..... (found apic %d pin %d) ...\n", apic2, pin2);
+               /*
+                * legacy devices should be connected to IO APIC #0
+                */
+               replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
+               setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+               legacy_pic->unmask(0);
+               if (timer_irq_works()) {
+                       apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+                       timer_through_8259 = 1;
+                       goto out;
+               }
+               /*
+                * Cleanup, just in case ...
+                */
+               local_irq_disable();
+               legacy_pic->mask(0);
+               clear_IO_APIC_pin(apic2, pin2);
+               apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+       }
+
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as Virtual Wire IRQ...\n");
+
+       lapic_register_intr(0);
+       apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
+       legacy_pic->unmask(0);
+
+       if (timer_irq_works()) {
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+               goto out;
+       }
+       local_irq_disable();
+       legacy_pic->mask(0);
+       apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+
+       apic_printk(APIC_QUIET, KERN_INFO
+                   "...trying to set up timer as ExtINT IRQ...\n");
+
+       legacy_pic->init(0);
+       legacy_pic->make_irq(0);
+       apic_write(APIC_LVT0, APIC_DM_EXTINT);
+
+       unlock_ExtINT_logic();
+
+       if (timer_irq_works()) {
+               apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+               goto out;
+       }
+       local_irq_disable();
+       apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+       if (x2apic_preenabled)
+               apic_printk(APIC_QUIET, KERN_INFO
+                           "Perhaps problem with the pre-enabled x2apic mode\n"
+                           "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+               "report.  Then try booting with the 'noapic' option.\n");
+out:
+       local_irq_restore(flags);
+}
+#else
+/* CONFIG_XEN build: check_timer() expands to a no-op. */
+#define check_timer() ((void)0)
+#endif
+
+/*
+ * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
+ * to devices.  However there may be an I/O APIC pin available for
+ * this interrupt regardless.  The pin may be left unconnected, but
+ * typically it will be reused as an ExtINT cascade interrupt for
+ * the master 8259A.  In the MPS case such a pin will normally be
+ * reported as an ExtINT interrupt in the MP table.  With ACPI
+ * there is no provision for ExtINT interrupts, and in the absence
+ * of an override it would be treated as an ordinary ISA I/O APIC
+ * interrupt, that is edge-triggered and unmasked by default.  We
+ * used to do this, but it caused problems on some systems because
+ * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
+ * the same ExtINT cascade interrupt to drive the local APIC of the
+ * bootstrap processor.  Therefore we refrain from routing IRQ2 to
+ * the I/O APIC in all cases now.  No actual device should request
+ * it anyway.  --macro
+ *
+ * PIC_IRQS is the bitmask holding only the cascade irq bit; its
+ * complement is stored in io_apic_irqs by setup_IO_APIC() when legacy
+ * irqs are present.
+ */
+#define PIC_IRQS       (1UL << PIC_CASCADE_IR)
+
+/*
+ * Boot-time I/O APIC setup: computes io_apic_irqs (everything except
+ * PIC_IRQS when legacy irqs exist, otherwise all bits set), sets up the
+ * irq routing, fixes up irqs left without a vector and, when legacy irqs
+ * exist, runs check_timer().  The I/O APIC ID setup and sync_Arb_IDs()
+ * are skipped under CONFIG_XEN.
+ */
+void __init setup_IO_APIC(void)
+{
+
+       /*
+        * calling enable_IO_APIC() is moved to setup_local_APIC for BP
+        */
+       io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+
+       apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+       /*
+         * Set up IO-APIC IRQ routing.
+         */
+#ifndef CONFIG_XEN
+       x86_init.mpparse.setup_ioapic_ids();
+
+       sync_Arb_IDs();
+#endif
+       setup_IO_APIC_irqs();
+       init_IO_APIC_traps();
+       if (legacy_pic->nr_legacy_irqs)
+               check_timer();
+}
+
+/*
+ *      Called after all the initialization is done. If we didn't find any
+ *      APIC bugs then we can allow the modify fast path
+ *
+ *      A sis_apic_bug value of -1 is replaced by 0 here.  Under
+ *      CONFIG_X86_XEN the initial domain also passes the resulting state
+ *      to the hypervisor as a platform quirk.  Always returns 0.
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+       if (sis_apic_bug == -1)
+               sis_apic_bug = 0;
+#ifdef CONFIG_X86_XEN
+       if (is_initial_xendomain()) {
+               struct xen_platform_op op = { .cmd = XENPF_platform_quirk };
+               op.u.platform_quirk.quirk_id = sis_apic_bug ?
+                       QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL;
+               /* The hypercall result is deliberately discarded via VOID(). */
+               VOID(HYPERVISOR_platform_op(&op));
+       }
+#endif
+       return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+#ifndef CONFIG_XEN
+/*
+ * Under ioapic_lock, read register 0 of I/O APIC @ioapic_idx and rewrite
+ * its ID field if it differs from mpc_ioapic_id(@ioapic_idx).
+ */
+static void resume_ioapic_id(int ioapic_idx)
+{
+       unsigned long flags;
+       union IO_APIC_reg_00 reg_00;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(ioapic_idx, 0);
+       if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
+               reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+               io_apic_write(ioapic_idx, 0, reg_00.raw);
+       }
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * syscore resume callback: fix up the ID of every I/O APIC (highest index
+ * first), then restore the saved routing entries.
+ */
+static void ioapic_resume(void)
+{
+       int ioapic_idx;
+
+       for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+               resume_ioapic_id(ioapic_idx);
+
+       restore_ioapic_entries();
+}
+
+/* Suspend saves the I/O APIC entries; resume goes through ioapic_resume(). */
+static struct syscore_ops ioapic_syscore_ops = {
+       .suspend = save_ioapic_entries,
+       .resume = ioapic_resume,
+};
+
+/* Device initcall that registers ioapic_syscore_ops.  Always returns 0. */
+static int __init ioapic_init_ops(void)
+{
+       register_syscore_ops(&ioapic_syscore_ops);
+
+       return 0;
+}
+
+device_initcall(ioapic_init_ops);
+
+/*
+ * Dynamic irq allocation and deallocation
+ */
+
+/*
+ * Allocate an irq through alloc_irq_from(), starting from
+ * max(@from, nr_irqs_gsi) on @node, then allocate its irq_cfg and assign
+ * a vector under vector_lock.  Returns the irq number on success and 0
+ * on any failure; on failure whatever was allocated is released with
+ * free_irq_at().
+ */
+unsigned int create_irq_nr(unsigned int from, int node)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int ret = 0;
+       int irq;
+
+       if (from < nr_irqs_gsi)
+               from = nr_irqs_gsi;
+
+       irq = alloc_irq_from(from, node);
+       if (irq < 0)
+               return 0;
+       cfg = alloc_irq_cfg(irq, node);
+       if (!cfg) {
+               free_irq_at(irq, NULL);
+               return 0;
+       }
+
+       raw_spin_lock_irqsave(&vector_lock, flags);
+       /* ret stays 0 unless __assign_irq_vector() returns 0 (success). */
+       if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
+               ret = irq;
+       raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+       if (ret) {
+               irq_set_chip_data(irq, cfg);
+               irq_clear_status_flags(irq, IRQ_NOREQUEST);
+       } else {
+               free_irq_at(irq, cfg);
+       }
+       return ret;
+}
+
+/*
+ * Wrapper around create_irq_nr() that starts at nr_irqs_gsi and uses the
+ * node of CPU 0.  Returns the new irq number, or -1 when create_irq_nr()
+ * returned 0.
+ */
+int create_irq(void)
+{
+       int node = cpu_to_node(0);
+       unsigned int irq_want;
+       int irq;
+
+       irq_want = nr_irqs_gsi;
+       irq = create_irq_nr(irq_want, node);
+
+       if (irq == 0)
+               irq = -1;
+
+       return irq;
+}
+
+/*
+ * Tear down @irq: mark it IRQ_NOREQUEST|IRQ_NOPROBE, free its IRTE if the
+ * irq is remapped, clear its vector under vector_lock and release the irq
+ * together with its irq_cfg.
+ */
+void destroy_irq(unsigned int irq)
+{
+       struct irq_cfg *cfg = irq_get_chip_data(irq);
+       unsigned long flags;
+
+       irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
+
+       if (irq_remapped(cfg))
+               free_irte(irq);
+       raw_spin_lock_irqsave(&vector_lock, flags);
+       __clear_irq_vector(irq, cfg);
+       raw_spin_unlock_irqrestore(&vector_lock, flags);
+       free_irq_at(irq, cfg);
+}
+#endif /* !CONFIG_XEN */
+
+/*
+ * MSI message composition
+ */
+#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN)
+/*
+ * Assign a vector to @irq and fill @msg with the matching MSI address and
+ * data.
+ *
+ * @pdev:    PCI device used for the IRTE source-id when the irq is
+ *           remapped; when NULL, @hpet_id is used instead.
+ * @irq:     irq to compose the message for.
+ * @msg:     output message.
+ * @hpet_id: only read in the remapped case with @pdev == NULL.
+ *
+ * Returns -ENXIO when the APIC is disabled, the error from
+ * assign_irq_vector() if that fails, and 0 otherwise.
+ */
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+                          struct msi_msg *msg, u8 hpet_id)
+{
+       struct irq_cfg *cfg;
+       int err;
+       unsigned dest;
+
+       if (disable_apic)
+               return -ENXIO;
+
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, apic->target_cpus());
+       if (err)
+               return err;
+
+       dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
+
+       if (irq_remapped(cfg)) {
+               struct irte irte;
+               int ir_index;
+               u16 sub_handle;
+
+               ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+               BUG_ON(ir_index == -1);
+
+               prepare_irte(&irte, cfg->vector, dest);
+
+               /* Set source-id of interrupt request */
+               if (pdev)
+                       set_msi_sid(&irte, pdev);
+               else
+                       set_hpet_sid(&irte, hpet_id);
+
+               modify_irte(irq, &irte);
+
+               /* Remapped format: the message carries the IRTE index and sub-handle. */
+               msg->address_hi = MSI_ADDR_BASE_HI;
+               msg->data = sub_handle;
+               msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+                                 MSI_ADDR_IR_SHV |
+                                 MSI_ADDR_IR_INDEX1(ir_index) |
+                                 MSI_ADDR_IR_INDEX2(ir_index);
+       } else {
+               /* Non-remapped format: destination and vector are encoded directly. */
+               if (x2apic_enabled())
+                       msg->address_hi = MSI_ADDR_BASE_HI |
+                                         MSI_ADDR_EXT_DEST_ID(dest);
+               else
+                       msg->address_hi = MSI_ADDR_BASE_HI;
+
+               msg->address_lo =
+                       MSI_ADDR_BASE_LO |
+                       ((apic->irq_dest_mode == 0) ?
+                               MSI_ADDR_DEST_MODE_PHYSICAL:
+                               MSI_ADDR_DEST_MODE_LOGICAL) |
+                       ((apic->irq_delivery_mode != dest_LowestPrio) ?
+                               MSI_ADDR_REDIRECTION_CPU:
+                               MSI_ADDR_REDIRECTION_LOWPRI) |
+                       MSI_ADDR_DEST_ID(dest);
+
+               msg->data =
+                       MSI_DATA_TRIGGER_EDGE |
+                       MSI_DATA_LEVEL_ASSERT |
+                       ((apic->irq_delivery_mode != dest_LowestPrio) ?
+                               MSI_DATA_DELIVERY_FIXED:
+                               MSI_DATA_DELIVERY_LOWPRI) |
+                       MSI_DATA_VECTOR(cfg->vector);
+       }
+       return err;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * irq_set_affinity callback of msi_chip.  Returns -1 if
+ * __ioapic_set_affinity() fails; otherwise replaces the vector and the
+ * destination ID in the cached MSI message, writes it back and returns 0.
+ */
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       struct msi_msg msg;
+       unsigned int dest;
+
+       if (__ioapic_set_affinity(data, mask, &dest))
+               return -1;
+
+       __get_cached_msi_msg(data->msi_desc, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(cfg->vector);
+       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+       __write_msi_msg(data->msi_desc, &msg);
+
+       return 0;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ *
+ * Note: setup_msi_irq() passes this object to
+ * irq_remap_modify_chip_defaults() for remapped irqs, which rewrites
+ * some of its callbacks in place.
+ */
+static struct irq_chip msi_chip = {
+       .name                   = "PCI-MSI",
+       .irq_unmask             = unmask_msi_irq,
+       .irq_mask               = mask_msi_irq,
+       .irq_ack                = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = msi_set_affinity,
+#endif
+       .irq_retrigger          = ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ *
+ * Returns the index returned by alloc_irte() on success, -ENOENT when no
+ * remapping unit is found for the device and -ENOSPC when the allocation
+ * fails.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
+{
+       struct intel_iommu *iommu;
+       int index;
+
+       iommu = map_dev_to_ir(dev);
+       if (!iommu) {
+               printk(KERN_ERR
+                      "Unable to map PCI %s to iommu\n", pci_name(dev));
+               return -ENOENT;
+       }
+
+       index = alloc_irte(iommu, irq, nvec);
+       if (index < 0) {
+               printk(KERN_ERR
+                      "Unable to allocate %d IRTE for PCI %s\n", nvec,
+                      pci_name(dev));
+               return -ENOSPC;
+       }
+       return index;
+}
+
+/*
+ * Compose and write the MSI message for @irq, attach @msidesc and install
+ * msi_chip with the edge flow handler.  For remapped irqs the irq is also
+ * flagged IRQ_MOVE_PCNTXT and msi_chip is switched to the remapping
+ * callbacks.  Returns 0 on success or the negative error from
+ * msi_compose_msg().
+ */
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+{
+       struct irq_chip *chip = &msi_chip;
+       struct msi_msg msg;
+       int ret;
+
+       /* hpet_id is not used for PCI devices, so -1 is passed. */
+       ret = msi_compose_msg(dev, irq, &msg, -1);
+       if (ret < 0)
+               return ret;
+
+       irq_set_msi_desc(irq, msidesc);
+       write_msi_msg(irq, &msg);
+
+       if (irq_remapped(irq_get_chip_data(irq))) {
+               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+               irq_remap_modify_chip_defaults(chip);
+       }
+
+       irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+
+       dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
+       return 0;
+}
+
+/*
+ * Create and set up one irq for every entry on @dev's msi_list.
+ *
+ * Returns 1 when more than one plain MSI vector is requested, -1 when an
+ * irq cannot be created, a negative error when IRTE setup or
+ * setup_msi_irq() fails, and 0 on success.
+ *
+ * NOTE(review): the error path only destroys the irq created in the
+ * failing iteration; presumably the caller tears down the irqs set up in
+ * earlier iterations -- confirm.
+ */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+       int node, ret, sub_handle, index = 0;
+       unsigned int irq, irq_want;
+       struct msi_desc *msidesc;
+       struct intel_iommu *iommu = NULL;
+
+       /* x86 doesn't support multiple MSI yet */
+       if (type == PCI_CAP_ID_MSI && nvec > 1)
+               return 1;
+
+       node = dev_to_node(&dev->dev);
+       irq_want = nr_irqs_gsi;
+       sub_handle = 0;
+       list_for_each_entry(msidesc, &dev->msi_list, list) {
+               irq = create_irq_nr(irq_want, node);
+               if (irq == 0)
+                       return -1;
+               /* Start the next search just above the irq obtained here. */
+               irq_want = irq + 1;
+               if (!intr_remapping_enabled)
+                       goto no_ir;
+
+               if (!sub_handle) {
+                       /*
+                        * allocate the consecutive block of IRTE's
+                        * for 'nvec'
+                        */
+                       index = msi_alloc_irte(dev, irq, nvec);
+                       if (index < 0) {
+                               ret = index;
+                               goto error;
+                       }
+               } else {
+                       iommu = map_dev_to_ir(dev);
+                       if (!iommu) {
+                               ret = -ENOENT;
+                               goto error;
+                       }
+                       /*
+                        * setup the mapping between the irq and the IRTE
+                        * base index, the sub_handle pointing to the
+                        * appropriate interrupt remap table entry.
+                        */
+                       set_irte_irq(irq, iommu, index, sub_handle);
+               }
+no_ir:
+               ret = setup_msi_irq(dev, msidesc, irq);
+               if (ret < 0)
+                       goto error;
+               sub_handle++;
+       }
+       return 0;
+
+error:
+       destroy_irq(irq);
+       return ret;
+}
+
+/* Tear down a single MSI irq; simply forwards to destroy_irq(). */
+void native_teardown_msi_irq(unsigned int irq)
+{
+       destroy_irq(irq);
+}
+
+#ifdef CONFIG_DMAR_TABLE
+#ifdef CONFIG_SMP
+/*
+ * irq_set_affinity callback of dmar_msi_type.  Returns -1 if
+ * __ioapic_set_affinity() fails; otherwise rereads the DMAR MSI message,
+ * replaces vector and destination (including the extended destination ID
+ * in address_hi), writes it back and returns 0.
+ */
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+                     bool force)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int dest, irq = data->irq;
+       struct msi_msg msg;
+
+       if (__ioapic_set_affinity(data, mask, &dest))
+               return -1;
+
+       dmar_msi_read(irq, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(cfg->vector);
+       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+       msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+
+       dmar_msi_write(irq, &msg);
+
+       return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+/* irq_chip installed by arch_setup_dmar_msi(). */
+static struct irq_chip dmar_msi_type = {
+       .name                   = "DMAR_MSI",
+       .irq_unmask             = dmar_msi_unmask,
+       .irq_mask               = dmar_msi_mask,
+       .irq_ack                = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = dmar_msi_set_affinity,
+#endif
+       .irq_retrigger          = ioapic_retrigger_irq,
+};
+
+/*
+ * Compose the MSI message for @irq (no PCI device, hpet_id passed as -1),
+ * write it with dmar_msi_write() and install dmar_msi_type with the edge
+ * flow handler.  Returns 0 on success or the negative error from
+ * msi_compose_msg().
+ */
+int arch_setup_dmar_msi(unsigned int irq)
+{
+       int ret;
+       struct msi_msg msg;
+
+       ret = msi_compose_msg(NULL, irq, &msg, -1);
+       if (ret < 0)
+               return ret;
+       dmar_msi_write(irq, &msg);
+       irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+                                     "edge");
+       return 0;
+}
+#endif
+
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+/*
+ * irq_set_affinity callback of hpet_msi_type.  Returns -1 if
+ * __ioapic_set_affinity() fails; otherwise rereads the HPET MSI message
+ * through data->handler_data, replaces vector and destination ID, writes
+ * it back and returns 0.
+ */
+static int hpet_msi_set_affinity(struct irq_data *data,
+                                const struct cpumask *mask, bool force)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       struct msi_msg msg;
+       unsigned int dest;
+
+       if (__ioapic_set_affinity(data, mask, &dest))
+               return -1;
+
+       hpet_msi_read(data->handler_data, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(cfg->vector);
+       msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+       msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+       hpet_msi_write(data->handler_data, &msg);
+
+       return 0;
+}
+
+#endif /* CONFIG_SMP */
+
+/*
+ * irq_chip installed by arch_setup_hpet_msi(), which may first rewrite
+ * some callbacks in place through irq_remap_modify_chip_defaults().
+ */
+static struct irq_chip hpet_msi_type = {
+       .name = "HPET_MSI",
+       .irq_unmask = hpet_msi_unmask,
+       .irq_mask = hpet_msi_mask,
+       .irq_ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .irq_set_affinity = hpet_msi_set_affinity,
+#endif
+       .irq_retrigger = ioapic_retrigger_irq,
+};
+
+/*
+ * Set up @irq as the MSI interrupt of the HPET block identified by @id.
+ * With interrupt remapping enabled one IRTE is allocated first.  The irq
+ * is always flagged IRQ_MOVE_PCNTXT.  Returns 0 on success, -1 when the
+ * remapping unit lookup or IRTE allocation fails, or the negative error
+ * from msi_compose_msg().
+ */
+int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+       struct irq_chip *chip = &hpet_msi_type;
+       struct msi_msg msg;
+       int ret;
+
+       if (intr_remapping_enabled) {
+               struct intel_iommu *iommu = map_hpet_to_ir(id);
+               int index;
+
+               if (!iommu)
+                       return -1;
+
+               index = alloc_irte(iommu, irq, 1);
+               if (index < 0)
+                       return -1;
+       }
+
+       ret = msi_compose_msg(NULL, irq, &msg, id);
+       if (ret < 0)
+               return ret;
+
+       hpet_msi_write(irq_get_handler_data(irq), &msg);
+       irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+       if (irq_remapped(irq_get_chip_data(irq)))
+               irq_remap_modify_chip_defaults(chip);
+
+       irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+       return 0;
+}
+#endif
+
+#endif /* CONFIG_PCI_MSI */
+/*
+ * Hypertransport interrupt support
+ */
+#ifdef CONFIG_HT_IRQ
+
+#ifdef CONFIG_SMP
+
+/*
+ * Rewrite the HyperTransport irq message of @irq so that it carries
+ * @vector and @dest: the old vector and destination ID fields are cleared
+ * in both address words before the new values are OR-ed in.
+ */
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+       struct ht_irq_msg msg;
+       fetch_ht_irq_msg(irq, &msg);
+
+       msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+       msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+       msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+       msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+       write_ht_irq_msg(irq, &msg);
+}
+
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+       struct irq_cfg *cfg = data->chip_data;
+       unsigned int dest;
+
+       if (__ioapic_set_affinity(data, mask, &dest))
+               return -1;
+
+       target_ht_irq(data->irq, dest, cfg->vector);
+       return 0;
+}
+
+#endif
+
+static struct irq_chip ht_irq_chip = {
+       .name                   = "PCI-HT",
+       .irq_mask               = mask_ht_irq,
+       .irq_unmask             = unmask_ht_irq,
+       .irq_ack                = ack_apic_edge,
+#ifdef CONFIG_SMP
+       .irq_set_affinity       = ht_set_affinity,
+#endif
+       .irq_retrigger          = ioapic_retrigger_irq,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+       struct irq_cfg *cfg;
+       int err;
+
+       if (disable_apic)
+               return -ENXIO;
+
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, apic->target_cpus());
+       if (!err) {
+               struct ht_irq_msg msg;
+               unsigned dest;
+
+               dest = apic->cpu_mask_to_apicid_and(cfg->domain,
+                                                   apic->target_cpus());
+
+               msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+               msg.address_lo =
+                       HT_IRQ_LOW_BASE |
+                       HT_IRQ_LOW_DEST_ID(dest) |
+                       HT_IRQ_LOW_VECTOR(cfg->vector) |
+                       ((apic->irq_dest_mode == 0) ?
+                               HT_IRQ_LOW_DM_PHYSICAL :
+                               HT_IRQ_LOW_DM_LOGICAL) |
+                       HT_IRQ_LOW_RQEOI_EDGE |
+                       ((apic->irq_delivery_mode != dest_LowestPrio) ?
+                               HT_IRQ_LOW_MT_FIXED :
+                               HT_IRQ_LOW_MT_ARBITRATED) |
+                       HT_IRQ_LOW_IRQ_MASKED;
+
+               write_ht_irq_msg(irq, &msg);
+
+               irq_set_chip_and_handler_name(irq, &ht_irq_chip,
+                                             handle_edge_irq, "edge");
+
+               dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
+       }
+       return err;
+}
+#endif /* CONFIG_HT_IRQ */
+
+static int
+io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
+{
+       struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
+       int ret;
+
+       if (!cfg)
+               return -EINVAL;
+       ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
+       if (!ret)
+               setup_ioapic_irq(irq, cfg, attr);
+       return ret;
+}
+
+int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+                              struct io_apic_irq_attr *attr)
+{
+       unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
+       int ret;
+
+       /* Avoid redundant programming */
+       if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
+               pr_debug("Pin %d-%d already programmed\n",
+                        mpc_ioapic_id(ioapic_idx), pin);
+               return 0;
+       }
+       ret = io_apic_setup_irq_pin(irq, node, attr);
+       if (!ret)
+               set_bit(pin, ioapics[ioapic_idx].pin_programmed);
+       return ret;
+}
+
+static int __init io_apic_get_redir_entries(int ioapic)
+{
+       union IO_APIC_reg_01    reg_01;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       reg_01.raw = io_apic_read(ioapic, 1);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       /* Register 1 reports the highest redirection-entry index
+        * supported, which is one less than the total number of redir
+        * entries.
+        */
+       return reg_01.bits.entries + 1;
+}
+
+#ifndef CONFIG_XEN
+static void __init probe_nr_irqs_gsi(void)
+{
+       int nr;
+
+       nr = gsi_top + NR_IRQS_LEGACY;
+       if (nr > nr_irqs_gsi)
+               nr_irqs_gsi = nr;
+
+       printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
+}
+
+int get_nr_irqs_gsi(void)
+{
+       return nr_irqs_gsi;
+}
+
+int __init arch_probe_nr_irqs(void)
+{
+       int nr;
+
+       if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+               nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+       nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+       /*
+        * for MSI and HT dyn irq
+        */
+       nr += nr_irqs_gsi * 16;
+#endif
+       if (nr < nr_irqs)
+               nr_irqs = nr;
+
+       return NR_IRQS_LEGACY;
+}
+#endif /* CONFIG_XEN */
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+                           struct io_apic_irq_attr *irq_attr)
+{
+       int node;
+
+#ifdef CONFIG_XEN
+       if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) {
+               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n",
+                           irq_attr->ioapic, irq);
+               return -EINVAL;
+       }
+#endif
+       if (!IO_APIC_IRQ(irq)) {
+               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                           irq_attr->ioapic);
+               return -EINVAL;
+       }
+
+       node = dev ? dev_to_node(dev) : cpu_to_node(0);
+
+       return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+}
+
+#ifdef CONFIG_X86_32
+#ifndef CONFIG_XEN
+static int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+       union IO_APIC_reg_00 reg_00;
+       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+       physid_mask_t tmp;
+       unsigned long flags;
+       int i = 0;
+
+       /*
+        * The P4 platform supports up to 256 APIC IDs on two separate APIC
+        * buses (one for LAPICs, one for IOAPICs), where predecessors only
+        * supported up to 16 on one shared APIC bus.
+        *
+        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+        *      advantage of new APIC bus architecture.
+        */
+
+       if (physids_empty(apic_id_map))
+               apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(ioapic, 0);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       if (apic_id >= get_physical_broadcast()) {
+               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
+               apic_id = reg_00.bits.ID;
+       }
+
+       /*
+        * Every APIC in a system must have a unique ID or we get lots of nice
+        * 'stuck on smp_invalidate_needed IPI wait' messages.
+        */
+       if (apic->check_apicid_used(&apic_id_map, apic_id)) {
+
+               for (i = 0; i < get_physical_broadcast(); i++) {
+                       if (!apic->check_apicid_used(&apic_id_map, i))
+                               break;
+               }
+
+               if (i == get_physical_broadcast())
+                       panic("Max apic_id exceeded!\n");
+
+               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+                       "trying %d\n", ioapic, apic_id, i);
+
+               apic_id = i;
+       }
+
+       apic->apicid_to_cpu_present(apic_id, &tmp);
+       physids_or(apic_id_map, apic_id_map, tmp);
+
+       if (reg_00.bits.ID != apic_id) {
+               reg_00.bits.ID = apic_id;
+
+               raw_spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(ioapic, 0, reg_00.raw);
+               reg_00.raw = io_apic_read(ioapic, 0);
+               raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /* Sanity check: read back the ID and report at error level */
+               if (reg_00.bits.ID != apic_id) {
+                       pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+                       return -1;
+               }
+       }
+
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+       return apic_id;
+}
+#endif
+
+static u8 __init io_apic_unique_id(u8 id)
+{
+#ifndef CONFIG_XEN
+       if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+           !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               return io_apic_get_unique_id(nr_ioapics, id);
+       else
+#endif
+               return id;
+}
+#else
+static u8 __init io_apic_unique_id(u8 id)
+{
+       int i;
+       DECLARE_BITMAP(used, 256);
+
+       bitmap_zero(used, 256);
+       for (i = 0; i < nr_ioapics; i++) {
+               __set_bit(mpc_ioapic_id(i), used);
+       }
+       if (!test_bit(id, used))
+               return id;
+       return find_first_zero_bit(used, 256);
+}
+#endif
+
+static int __init io_apic_get_version(int ioapic)
+{
+       union IO_APIC_reg_01    reg_01;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&ioapic_lock, flags);
+       reg_01.raw = io_apic_read(ioapic, 1);
+       raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return reg_01.bits.version;
+}
+
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
+{
+       int ioapic, pin, idx;
+
+       if (skip_ioapic_setup)
+               return -1;
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0)
+               return -1;
+
+       pin = mp_find_ioapic_pin(ioapic, gsi);
+       if (pin < 0)
+               return -1;
+
+       idx = find_irq_entry(ioapic, pin, mp_INT);
+       if (idx < 0)
+               return -1;
+
+       *trigger = irq_trigger(idx);
+       *polarity = irq_polarity(idx);
+       return 0;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * This function is currently only a helper for the i386 SMP boot process,
+ * where we need to reprogram the ioredtbls to cater for the CPUs which have
+ * come online, so the mask in all cases should simply be apic->target_cpus().
+ */
+#ifdef CONFIG_SMP
+void __init setup_ioapic_dest(void)
+{
+       int pin, ioapic, irq, irq_entry;
+       const struct cpumask *mask;
+       struct irq_data *idata;
+
+       if (skip_ioapic_setup == 1)
+               return;
+
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
+       for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+               irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+               if (irq_entry == -1)
+                       continue;
+               irq = pin_2_irq(irq_entry, ioapic, pin);
+
+               if ((ioapic > 0) && (irq > 16))
+                       continue;
+
+               idata = irq_get_irq_data(irq);
+
+               /*
+                * Honour affinities which have been set in early boot
+                */
+               if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
+                       mask = idata->affinity;
+               else
+                       mask = apic->target_cpus();
+
+               if (intr_remapping_enabled)
+                       ir_ioapic_set_affinity(idata, mask, false);
+               else
+                       ioapic_set_affinity(idata, mask, false);
+       }
+
+}
+#endif
+
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+
+static struct resource *ioapic_resources;
+
+static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+{
+       unsigned long n;
+       struct resource *res;
+       char *mem;
+       int i;
+
+       if (nr_ioapics <= 0)
+               return NULL;
+
+       n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+       n *= nr_ioapics;
+
+       mem = alloc_bootmem(n);
+       res = (void *)mem;
+
+       mem += sizeof(struct resource) * nr_ioapics;
+
+       for (i = 0; i < nr_ioapics; i++) {
+               res[i].name = mem;
+               res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+               mem += IOAPIC_RESOURCE_NAME_SIZE;
+       }
+
+       ioapic_resources = res;
+
+       return res;
+}
+
+void __init ioapic_and_gsi_init(void)
+{
+       io_apic_ops.init();
+}
+
+static void __init __ioapic_init_mappings(void)
+{
+       unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+       struct resource *ioapic_res;
+       int i;
+
+       ioapic_res = ioapic_setup_resources(nr_ioapics);
+       for (i = 0; i < nr_ioapics; i++) {
+               if (smp_found_config) {
+                       ioapic_phys = mpc_ioapic_addr(i);
+#ifdef CONFIG_X86_32
+                       if (!ioapic_phys) {
+                               printk(KERN_ERR
+                                      "WARNING: bogus zero IO-APIC "
+                                      "address found in MPTABLE, "
+                                      "disabling IO/APIC support!\n");
+                               smp_found_config = 0;
+                               skip_ioapic_setup = 1;
+                               goto fake_ioapic_page;
+                       }
+#endif
+               } else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
+                       ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
+                       ioapic_phys = __pa(ioapic_phys);
+               }
+               set_fixmap_nocache(idx, ioapic_phys);
+               apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
+                       __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
+                       ioapic_phys);
+               idx++;
+
+               ioapic_res->start = ioapic_phys;
+               ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
+               ioapic_res++;
+       }
+
+       probe_nr_irqs_gsi();
+}
+
+void __init ioapic_insert_resources(void)
+{
+       int i;
+       struct resource *r = ioapic_resources;
+
+       if (!r) {
+               if (nr_ioapics > 0)
+                       printk(KERN_ERR
+                               "IO APIC resources couldn't be allocated.\n");
+               return;
+       }
+
+       for (i = 0; i < nr_ioapics; i++) {
+               insert_resource(&iomem_resource, r);
+               r++;
+       }
+}
+#endif /* !CONFIG_XEN */
+
+int mp_find_ioapic(u32 gsi)
+{
+       int i;
+
+       if (nr_ioapics == 0)
+               return -1;
+
+       /* Find the IOAPIC whose [gsi_base, gsi_end] range holds this GSI. */
+       for (i = 0; i < nr_ioapics; i++) {
+               struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+               if ((gsi >= gsi_cfg->gsi_base)
+                   && (gsi <= gsi_cfg->gsi_end))
+                       return i;
+       }
+
+       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %u\n", gsi);
+       return -1;
+}
+
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
+{
+       struct mp_ioapic_gsi *gsi_cfg;
+
+       if (WARN_ON(ioapic < 0))
+               return -1;
+
+       gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+       if (WARN_ON(gsi < gsi_cfg->gsi_base || gsi > gsi_cfg->gsi_end))
+               return -1;
+
+       return gsi - gsi_cfg->gsi_base;
+}
+
+static __init int bad_ioapic(unsigned long address)
+{
+       if (nr_ioapics >= MAX_IO_APICS) {
+               pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+                       MAX_IO_APICS, nr_ioapics);
+               return 1;
+       }
+       if (!address) {
+               pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
+               return 1;
+       }
+       return 0;
+}
+
+static __init int bad_ioapic_register(int idx)
+{
+       union IO_APIC_reg_00 reg_00;
+       union IO_APIC_reg_01 reg_01;
+       union IO_APIC_reg_02 reg_02;
+
+       reg_00.raw = io_apic_read(idx, 0);
+       reg_01.raw = io_apic_read(idx, 1);
+       reg_02.raw = io_apic_read(idx, 2);
+
+       if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
+               pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
+                       mpc_ioapic_addr(idx));
+               return 1;
+       }
+
+       return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+       int idx = 0;
+       int entries;
+       struct mp_ioapic_gsi *gsi_cfg;
+
+       if (bad_ioapic(address))
+               return;
+
+       idx = nr_ioapics;
+
+       ioapics[idx].mp_config.type = MP_IOAPIC;
+       ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+       ioapics[idx].mp_config.apicaddr = address;
+
+#ifndef CONFIG_XEN
+       set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+#endif
+
+       if (bad_ioapic_register(idx)) {
+#ifndef CONFIG_XEN
+               clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+#endif
+               return;
+       }
+
+       ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+       ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
+
+       /*
+        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+        */
+       entries = io_apic_get_redir_entries(idx);
+       gsi_cfg = mp_ioapic_gsi_routing(idx);
+       gsi_cfg->gsi_base = gsi_base;
+       gsi_cfg->gsi_end = gsi_base + entries - 1;
+
+       /*
+        * The number of IO-APIC IRQ registers (== #pins):
+        */
+       ioapics[idx].nr_registers = entries;
+
+       if (gsi_cfg->gsi_end >= gsi_top)
+               gsi_top = gsi_cfg->gsi_end + 1;
+
+       pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
+               idx, mpc_ioapic_id(idx),
+               mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+               gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+
+       nr_ioapics++;
+}
+
+#ifdef CONFIG_X86_MRST
+/* Enable IOAPIC early just for system timer */
+void __init pre_init_apic_IRQ0(void)
+{
+       struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+
+       printk(KERN_INFO "Early APIC setup for system timer0\n");
+#ifndef CONFIG_SMP
+       physid_set_mask_of_physid(boot_cpu_physical_apicid,
+                                        &phys_cpu_present_map);
+#endif
+       setup_local_APIC();
+
+       io_apic_setup_irq_pin(0, 0, &attr);
+       irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
+                                     "edge");
+}
+#endif
diff --git a/arch/x86/kernel/apic/ipi-xen.c b/arch/x86/kernel/apic/ipi-xen.c

new file mode 100644 (file)

index 0000000..a3ee607
--- /dev/null
+++ b/arch/x86/kernel/apic/ipi-xen.c
@@ -0,0 +1,43 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+#ifdef CONFIG_SMP
+#include <xen/evtchn.h>
+
+void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
+{
+       unsigned int cpu, this_cpu = smp_processor_id();
+
+       WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
+       for_each_cpu_and(cpu, cpumask, cpu_online_mask)
+               if (cpu != this_cpu)
+                       notify_remote_via_ipi(vector, cpu);
+}
+
+void xen_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+       unsigned int cpu;
+
+       WARN_ON(!cpumask_subset(cpumask, cpu_online_mask));
+       for_each_cpu_and(cpu, cpumask, cpu_online_mask)
+               notify_remote_via_ipi(vector, cpu);
+}
+
+void xen_send_IPI_allbutself(int vector)
+{
+       xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
+}
+
+void xen_send_IPI_all(int vector)
+{
+       xen_send_IPI_mask(cpu_online_mask, vector);
+}
+
+void xen_send_IPI_self(int vector)
+{
+       notify_remote_via_ipi(vector, smp_processor_id());
+}
+#endif
diff --git a/arch/x86/kernel/apic/probe_32-xen.c b/arch/x86/kernel/apic/probe_32-xen.c

new file mode 100644 (file)

index 0000000..8602fa9
--- /dev/null
+++ b/arch/x86/kernel/apic/probe_32-xen.c
@@ -0,0 +1,57 @@
+/*
+ * Default generic APIC driver. This handles up to 8 CPUs.
+ *
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic x86 APIC driver probe layer.
+ */
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <asm/fixmap.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+
+#include <linux/smp.h>
+#include <asm/ipi.h>
+
+#include <linux/interrupt.h>
+#include <asm/acpi.h>
+#include <asm/e820.h>
+
+static int xen_phys_pkg_id(int cpuid_apic, int index_msb)
+{
+       return cpuid_apic;
+}
+
+static struct apic apic_xen = {
+
+       .name                           = "default",
+
+       .irq_delivery_mode              = dest_LowestPrio,
+       /* logical delivery broadcast to all CPUs: */
+       .irq_dest_mode                  = 1,
+
+       .target_cpus                    = default_target_cpus,
+
+       .phys_pkg_id                    = xen_phys_pkg_id,
+
+#ifdef CONFIG_SMP
+       .send_IPI_mask                  = xen_send_IPI_mask,
+       .send_IPI_mask_allbutself       = xen_send_IPI_mask_allbutself,
+       .send_IPI_allbutself            = xen_send_IPI_allbutself,
+       .send_IPI_all                   = xen_send_IPI_all,
+       .send_IPI_self                  = xen_send_IPI_self,
+#endif
+};
+
+struct apic *apic = &apic_xen;
+EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c

index ff2c1b9..e14711d 100644 (file)
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -240,7 +240,7 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
                 if (!(*drv)->mps_oem_check(mpc, oem, productid))
                         continue;
  
-               if (!cmdline_apic) {
+               if (!cmdline_apic && apic == &apic_default) {
                         apic = *drv;
                         printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                apic->name);
@@ -260,7 +260,7 @@ int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
                 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
                         continue;
  
-               if (!cmdline_apic) {
+               if (!cmdline_apic && apic == &apic_default) {
                         apic = *drv;
                         printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                apic->name);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c

index 459e78c..aa941a8 100644 (file)
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -372,10 +372,12 @@ static struct {
         unsigned long   offset;
         unsigned short  segment;
  } apm_bios_entry;
+#ifdef CONFIG_APM_CPU_IDLE
  static int clock_slowed;
  static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
  static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
  static int set_pm_idle;
+#endif
  static int suspends_pending;
  static int standbys_pending;
  static int ignore_sys_suspend;
@@ -804,6 +806,7 @@ static int set_system_power_state(u_short state)
         return set_power_state(APM_DEVICE_ALL, state);
  }
  
+#ifdef CONFIG_APM_CPU_IDLE
  /**
   *     apm_do_idle     -       perform power saving
   *
@@ -963,6 +966,7 @@ recalc:
  
         local_irq_enable();
  }
+#endif
  
  /**
   *     apm_power_off   -       ask the BIOS to power off
@@ -1871,12 +1875,14 @@ static int __init apm_setup(char *str)
                 if ((strncmp(str, "bounce-interval=", 16) == 0) ||
                     (strncmp(str, "bounce_interval=", 16) == 0))
                         bounce_interval = simple_strtol(str + 16, NULL, 0);
+#ifdef CONFIG_APM_CPU_IDLE
                 if ((strncmp(str, "idle-threshold=", 15) == 0) ||
                     (strncmp(str, "idle_threshold=", 15) == 0))
                         idle_threshold = simple_strtol(str + 15, NULL, 0);
                 if ((strncmp(str, "idle-period=", 12) == 0) ||
                     (strncmp(str, "idle_period=", 12) == 0))
                         idle_period = simple_strtol(str + 12, NULL, 0);
+#endif
                 invert = (strncmp(str, "no-", 3) == 0) ||
                         (strncmp(str, "no_", 3) == 0);
                 if (invert)
@@ -2379,6 +2385,7 @@ static int __init apm_init(void)
         if (misc_register(&apm_device))
                 printk(KERN_WARNING "apm: Could not register misc device.\n");
  
+#ifdef CONFIG_APM_CPU_IDLE
         if (HZ != 100)
                 idle_period = (idle_period * HZ) / 100;
         if (idle_threshold < 100) {
@@ -2386,6 +2393,7 @@ static int __init apm_init(void)
                 pm_idle  = apm_cpu_idle;
                 set_pm_idle = 1;
         }
+#endif
  
         return 0;
  }
@@ -2394,6 +2402,7 @@ static void __exit apm_exit(void)
  {
         int error;
  
+#ifdef CONFIG_APM_CPU_IDLE
         if (set_pm_idle) {
                 pm_idle = original_pm_idle;
                 /*
@@ -2403,6 +2412,7 @@ static void __exit apm_exit(void)
                  */
                 cpu_idle_wait();
         }
+#endif
         if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
             && (apm_info.connection_version > 0x0100)) {
                 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
@@ -2439,12 +2449,14 @@ MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
  module_param(realmode_power_off, bool, 0444);
  MODULE_PARM_DESC(realmode_power_off,
                 "Switch to real mode before powering off");
+#ifdef CONFIG_APM_CPU_IDLE
  module_param(idle_threshold, int, 0444);
  MODULE_PARM_DESC(idle_threshold,
         "System idle percentage above which to make APM BIOS idle calls");
  module_param(idle_period, int, 0444);
  MODULE_PARM_DESC(idle_period,
         "Period (in sec/100) over which to caculate the idle percentage");
+#endif
  module_param(smp, bool, 0444);
  MODULE_PARM_DESC(smp,
         "Set this to enable APM use on an SMP platform. Use with caution on older systems");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c

index 68de2dc..3d01fce 100644 (file)
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -17,7 +17,7 @@
  #include <asm/bootparam.h>
  #include <asm/suspend.h>
  
-#ifdef CONFIG_XEN
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
  #include <xen/interface/xen.h>
  #endif
  
@@ -55,7 +55,7 @@ void common(void) {
         OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
  #endif
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
         BLANK();
         OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
         OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c

index 85d98ab..cf6cc38 100644 (file)
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,7 +1,9 @@
  #include <asm/ucontext.h>
  
+#ifdef CONFIG_LGUEST_GUEST
  #include <linux/lguest.h>
  #include "../../../drivers/lguest/lg.h"
+#endif
  
  #define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
  static char syscalls[] = {
@@ -60,9 +62,19 @@ void foo(void)
         OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
         BLANK();
  
+#ifndef CONFIG_X86_NO_TSS
         /* Offset from the sysenter stack to tss.sp0 */
-       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+       DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
                  sizeof(struct tss_struct));
+#else
+       /* sysenter stack points directly to sp0 */
+       DEFINE(SYSENTER_stack_sp0, 0);
+#endif
+
+#ifdef CONFIG_XEN
+       BLANK();
+       OFFSET(XEN_START_mfn_list, start_info, mfn_list);
+#endif
  
  #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
         BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c

index 1b4754f..125373b 100644 (file)
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -76,8 +76,10 @@ int main(void)
         BLANK();
  #undef ENTRY
  
+#ifndef CONFIG_X86_NO_TSS
         OFFSET(TSS_ist, tss_struct, x86_tss.ist);
         BLANK();
+#endif
  
         DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
         DEFINE(NR_syscalls, sizeof(syscalls_64));
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile

index 6ab6aa2..fefe4b9 100644 (file)
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,6 +40,9 @@ obj-$(CONFIG_MTRR)                    += mtrr/
  
  obj-$(CONFIG_X86_LOCAL_APIC)           += perfctr-watchdog.o perf_event_amd_ibs.o
  
+disabled-obj-$(CONFIG_XEN) := hypervisor.o mshyperv.o perfctr-watchdog.o \
+                             perf_event.o perf_event_%.o sched.o vmware.o
+
  quiet_cmd_mkcapflags = MKCAP   $@
        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
  
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c

index 146bb62..5fab796 100644 (file)
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -334,7 +334,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
  int amd_get_nb_id(int cpu)
  {
         int id = 0;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         id = per_cpu(cpu_llc_id, cpu);
  #endif
         return id;
@@ -458,8 +458,10 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+#ifndef CONFIG_XEN
                 if (!check_tsc_unstable())
                         sched_clock_stable = 1;
+#endif
         }
  
  #ifdef CONFIG_X86_64
@@ -471,7 +473,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
                     (c->x86_model == 8 && c->x86_mask >= 8))
                         set_cpu_cap(c, X86_FEATURE_K6_MTRR);
  #endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) && !defined(CONFIG_XEN)
         /* check CPU config space for extended APIC ID */
         if (cpu_has_apic && c->x86 >= 0xf) {
                 unsigned int val;
@@ -484,7 +486,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
  
  static void __cpuinit init_amd(struct cpuinfo_x86 *c)
  {
+#ifndef CONFIG_XEN
         u32 dummy;
+#endif
  
  #ifdef CONFIG_SMP
         unsigned long long value;
@@ -529,18 +533,26 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                         u64 val;
  
                         clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+#ifndef CONFIG_XEN
                         if (!rdmsrl_amd_safe(0xc001100d, &val)) {
                                 val &= ~(1ULL << 32);
                                 wrmsrl_amd_safe(0xc001100d, val);
                         }
+#else
+                       pr_warning("Long-mode LAHF feature wrongly enabled - "
+                                  "hypervisor update needed\n");
+                       (void)&val;
+#endif
                 }
  
         }
         if (c->x86 >= 0x10)
                 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
  
+#ifndef CONFIG_XEN
         /* get apicid instead of initial apic id from cpuid */
         c->apicid = hard_smp_processor_id();
+#endif
  #else
  
         /*
@@ -634,6 +646,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 fam10h_check_enable_mmcfg();
         }
  
+#ifndef CONFIG_XEN
         if (c == &boot_cpu_data && c->x86 >= 0xf) {
                 unsigned long long tseg;
  
@@ -653,6 +666,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 }
         }
  #endif
+#endif
  
         /*
          * Family 0x12 and above processors have APIC timer
@@ -661,6 +675,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
         if (c->x86 > 0x11)
                 set_cpu_cap(c, X86_FEATURE_ARAT);
  
+#ifndef CONFIG_XEN
         /*
          * Disable GART TLB Walk Errors on Fam10h. We do this here
          * because this is always needed when GART is enabled, even in a
@@ -684,6 +699,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
         }
  
         rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+#endif
  }
  
  #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c

index 46674fb..09087bd 100644 (file)
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -17,6 +17,7 @@
  #include <asm/paravirt.h>
  #include <asm/alternative.h>
  
+#ifndef CONFIG_XEN
  static int __init no_halt(char *s)
  {
         WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
@@ -25,6 +26,7 @@ static int __init no_halt(char *s)
  }
  
  __setup("no-hlt", no_halt);
+#endif
  
  static int __init no_387(char *s)
  {
@@ -84,13 +86,16 @@ static void __init check_fpu(void)
  
         kernel_fpu_end();
  
+#ifndef CONFIG_XEN
         boot_cpu_data.fdiv_bug = fdiv_bug;
         if (boot_cpu_data.fdiv_bug)
                 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
+#endif
  }
  
  static void __init check_hlt(void)
  {
+#ifndef CONFIG_XEN
         if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
                 return;
  
@@ -104,6 +109,7 @@ static void __init check_hlt(void)
         halt();
         halt();
         printk(KERN_CONT "OK.\n");
+#endif
  }
  
  /*
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c

index 04f0fe5..25a2cda 100644 (file)
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
@@ -20,6 +20,7 @@ void __init check_bugs(void)
  #endif
         alternative_instructions();
  
+#ifndef CONFIG_XEN
         /*
          * Make sure the first 2MB area is not mapped by huge pages
          * There are typically fixed size MTRRs in there and overlapping
@@ -30,4 +31,5 @@ void __init check_bugs(void)
          */
         if (!direct_gbpages)
                 set_memory_4k((unsigned long)__va(0), 1);
+#endif
  }
diff --git a/arch/x86/kernel/cpu/common-xen.c b/arch/x86/kernel/cpu/common-xen.c

new file mode 100644 (file)

index 0000000..38d9f48
--- /dev/null
+++ b/arch/x86/kernel/cpu/common-xen.c
@@ -0,0 +1,1427 @@
+#include <linux/bootmem.h>
+#include <linux/linkage.h>
+#include <linux/bitops.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#include <asm/stackprotector.h>
+#include <asm/perf_event.h>
+#include <asm/mmu_context.h>
+#include <asm/archrandom.h>
+#include <asm/hypervisor.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <asm/sections.h>
+#include <linux/topology.h>
+#include <linux/cpumask.h>
+#include <asm/pgtable.h>
+#include <linux/atomic.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
+#include <asm/mtrr.h>
+#include <linux/numa.h>
+#include <asm/asm.h>
+#include <asm/cpu.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/uv/uv.h>
+#endif
+
+#ifdef CONFIG_XEN
+#include <xen/interface/callback.h>
+#endif
+
+#include "cpu.h"
+
+/* all of these masks are initialized in setup_cpu_local_masks() */
+cpumask_var_t cpu_initialized_mask;
+#ifndef CONFIG_XEN
+cpumask_var_t cpu_callout_mask;
+cpumask_var_t cpu_callin_mask;
+
+/* representing cpus for which sibling maps can be computed */
+cpumask_var_t cpu_sibling_setup_mask;
+#endif
+
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+       alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+#ifndef CONFIG_XEN
+       alloc_bootmem_cpumask_var(&cpu_callin_mask);
+       alloc_bootmem_cpumask_var(&cpu_callout_mask);
+       alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+#endif
+}
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+       cpu_detect_cache_sizes(c);
+#else
+       /* Not much we can do here... */
+       /* Check if at least it has cpuid */
+       if (c->cpuid_level == -1) {
+               /* No cpuid. It must be an ancient CPU */
+               if (c->x86 == 4)
+                       strcpy(c->x86_model_id, "486");
+               else if (c->x86 == 3)
+                       strcpy(c->x86_model_id, "386");
+       }
+#endif
+}
+
+static const struct cpu_dev __cpuinitconst default_cpu = {
+       .c_init         = default_init,
+       .c_vendor       = "Unknown",
+       .c_x86_vendor   = X86_VENDOR_UNKNOWN,
+};
+
+static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_X86_64
+       /*
+        * We need valid kernel segments for data and code in long mode too
+        * IRET will check the segment types  kkeil 2000/10/28
+        * Also sysret mandates a special GDT layout
+        *
+        * TLS descriptors are currently at a different place compared to i386.
+        * Hopefully nobody expects them at a fixed place (Wine?)
+        */
+       [GDT_ENTRY_KERNEL32_CS]         = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+       [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+       [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+       [GDT_ENTRY_DEFAULT_USER32_CS]   = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
+       [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
+       [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+#else
+       [GDT_ENTRY_KERNEL_CS]           = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
+       [GDT_ENTRY_KERNEL_DS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+       [GDT_ENTRY_DEFAULT_USER_CS]     = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
+       [GDT_ENTRY_DEFAULT_USER_DS]     = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+#ifndef CONFIG_XEN
+       /*
+        * Segments used for calling PnP BIOS have byte granularity.
+        * The code segments and data segments have fixed 64k limits,
+        * the transfer segment sizes are set at run time.
+        */
+       /* 32-bit code */
+       [GDT_ENTRY_PNPBIOS_CS32]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+       /* 16-bit code */
+       [GDT_ENTRY_PNPBIOS_CS16]        = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_DS]          = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_TS1]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+       /* 16-bit data */
+       [GDT_ENTRY_PNPBIOS_TS2]         = GDT_ENTRY_INIT(0x0092, 0, 0),
+       /*
+        * The APM segments have byte granularity and their bases
+        * are set at run time.  All have 64k limits.
+        */
+       /* 32-bit code */
+       [GDT_ENTRY_APMBIOS_BASE]        = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
+       /* 16-bit code */
+       [GDT_ENTRY_APMBIOS_BASE+1]      = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
+       /* data */
+       [GDT_ENTRY_APMBIOS_BASE+2]      = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
+
+       [GDT_ENTRY_ESPFIX_SS]           = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+#endif
+       [GDT_ENTRY_PERCPU]              = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+       GDT_STACK_CANARY_INIT
+#endif
+} };
+EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+
+static int __init x86_xsave_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+       return 1;
+}
+__setup("noxsave", x86_xsave_setup);
+
+static int __init x86_xsaveopt_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+       return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
+#ifdef CONFIG_X86_32
+static int cachesize_override __cpuinitdata = -1;
+
+static int __init cachesize_setup(char *str)
+{
+       get_option(&str, &cachesize_override);
+       return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+static int __init x86_fxsr_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_FXSR);
+       setup_clear_cpu_cap(X86_FEATURE_XMM);
+       return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+static int __init x86_sep_setup(char *s)
+{
+       setup_clear_cpu_cap(X86_FEATURE_SEP);
+       return 1;
+}
+__setup("nosep", x86_sep_setup);
+#endif
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+       u32 f1, f2;
+
+       /*
+        * Cyrix and IDT cpus allow disabling of CPUID
+        * so the code below may return different results
+        * when it is executed before and after enabling
+        * the CPUID. Add "volatile" to not allow gcc to
+        * optimize the subsequent calls to this function.
+        */
+       asm volatile ("pushfl           \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "movl %0, %1      \n\t"
+                     "xorl %2, %0      \n\t"
+                     "pushl %0         \n\t"
+                     "popfl            \n\t"
+                     "pushfl           \n\t"
+                     "popl %0          \n\t"
+                     "popfl            \n\t"
+
+                     : "=&r" (f1), "=&r" (f2)
+                     : "ir" (flag));
+
+       return ((f1^f2) & flag) != 0;
+}
+
+/* Probe for the CPUID instruction */
+static int __cpuinit have_cpuid_p(void)
+{
+       return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+static int disable_x86_serial_nr __cpuinitdata = 1;
+
+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+       unsigned long lo, hi;
+
+       if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+               return;
+
+       /* Disable processor serial number: */
+
+       rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       lo |= 0x200000;
+       wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+
+       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       clear_cpu_cap(c, X86_FEATURE_PN);
+
+       /* Disabling the serial number may affect the cpuid level */
+       c->cpuid_level = cpuid_eax(0);
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+       disable_x86_serial_nr = 0;
+       return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+static inline int flag_is_changeable_p(u32 flag)
+{
+       return 1;
+}
+/* Probe for the CPUID instruction */
+static inline int have_cpuid_p(void)
+{
+       return 1;
+}
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static int disable_smep __cpuinitdata;
+static __init int setup_disable_smep(char *arg)
+{
+       disable_smep = 1;
+       return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+{
+       if (cpu_has(c, X86_FEATURE_SMEP)) {
+               if (unlikely(disable_smep)) {
+                       setup_clear_cpu_cap(X86_FEATURE_SMEP);
+                       clear_in_cr4(X86_CR4_SMEP);
+               } else
+                       set_in_cr4(X86_CR4_SMEP);
+       }
+}
+
+/*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software.  Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+       u32 feature;
+       u32 level;
+};
+
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+       { X86_FEATURE_MWAIT,            0x00000005 },
+       { X86_FEATURE_DCA,              0x00000009 },
+       { X86_FEATURE_XSAVE,            0x0000000d },
+       { 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+       const struct cpuid_dependent_feature *df;
+
+       for (df = cpuid_dependent_features; df->feature; df++) {
+               /* Features this CPU does not report need no filtering. */
+               if (!cpu_has(c, df->feature))
+                       continue;
+               /*
+                * Note: cpuid_level is set to -1 if unavailable, but
+                * extended_cpuid_level is set to 0 if unavailable
+                * and the legitimate extended levels are all negative
+                * when signed; hence the weird messing around with
+                * signs here...
+                */
+               if (!((s32)df->level < 0 ?
+                    (u32)df->level > (u32)c->extended_cpuid_level :
+                    (s32)df->level > (s32)c->cpuid_level))
+                       continue;
+               /* Required level is missing: drop the feature, warn if asked. */
+               clear_cpu_cap(c, df->feature);
+               if (!warn)
+                       continue;
+
+               printk(KERN_WARNING
+                      "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+                               x86_cap_flags[df->feature], df->level);
+       }
+}
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table is only used if init_<vendor>() below doesn't set the name;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this
+ * isn't used
+ */
+
+/* Look up CPU names by table lookup. */
+static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
+{
+       const struct cpu_model_info *info;
+
+       if (c->x86_model >= 16)
+               return NULL;    /* Range check */
+
+       if (!this_cpu)
+               return NULL;
+
+       info = this_cpu->c_models;
+
+       while (info && info->family) {
+               if (info->family == c->x86)
+                       return info->model_names[c->x86_model];
+               info++;
+       }
+       return NULL;            /* Not found */
+}
+
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+
+void __ref load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+       static bool done;
+
+       if (!done) {
+               done = true;
+               adjust_boot_vcpu_info();
+       }
+#endif
+#ifdef CONFIG_X86_32
+       loadsegment(fs, __KERNEL_PERCPU);
+#else
+       loadsegment(gs, 0);
+#ifndef CONFIG_XEN
+       wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#else
+       if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
+                       (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)))
+               BUG();
+#endif
+#endif
+       load_stack_canary_segment();
+}
+
+/*
+ * Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one.
+ */
+void switch_to_new_gdt(int cpu)
+{
+       struct desc_ptr gdt_descr;
+       unsigned long va, frames[16];
+       int f;
+
+       gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+       gdt_descr.size = GDT_SIZE - 1;
+
+       for (va = gdt_descr.address, f = 0;
+            va < gdt_descr.address + gdt_descr.size;
+            va += PAGE_SIZE, f++) {
+               frames[f] = arbitrary_virt_to_mfn(va);
+               make_page_readonly((void *)va,
+                                  XENFEAT_writable_descriptor_tables);
+       }
+       if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
+               BUG();
+
+       /* Reload the per-cpu base */
+
+       load_percpu_segment(cpu);
+}
+
+static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+{
+       unsigned int *v;
+       char *p, *q;
+
+       if (c->extended_cpuid_level < 0x80000004)
+               return;
+
+       v = (unsigned int *)c->x86_model_id;
+       cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+       cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+       cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+       c->x86_model_id[48] = 0;
+
+       /*
+        * Intel chips right-justify this string for some dumb reason;
+        * undo that brain damage:
+        */
+       p = q = &c->x86_model_id[0];
+       while (*p == ' ')
+               p++;
+       if (p != q) {
+               while (*p)
+                       *q++ = *p++;
+               while (q <= &c->x86_model_id[48])
+                       *q++ = '\0';    /* Zero-pad the rest */
+       }
+}
+
+void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+{
+       unsigned int n, dummy, ebx, ecx, edx, l2size;
+
+       n = c->extended_cpuid_level;
+       /* Leaf 0x80000005: seed x86_cache_size from ECX[31:24] + EDX[31:24]. */
+       if (n >= 0x80000005) {
+               cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
+               c->x86_cache_size = (ecx>>24) + (edx>>24);
+#ifdef CONFIG_X86_64
+               /* On K8 L1 TLB is inclusive, so don't count it */
+               c->x86_tlbsize = 0;
+#endif
+       }
+
+       if (n < 0x80000006)     /* Some chips just have a large L1. */
+               return;
+       /* Leaf 0x80000006: ECX[31:16] is taken as the L2 size. */
+       cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
+       l2size = ecx >> 16;
+
+#ifdef CONFIG_X86_64
+       c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
+       /* do processor-specific cache resizing */
+       if (this_cpu->c_size_cache)
+               l2size = this_cpu->c_size_cache(c, l2size);
+
+       /* Allow user to override all this if necessary. */
+       if (cachesize_override != -1)
+               l2size = cachesize_override;
+
+       if (l2size == 0)
+               return;         /* Again, no L2 cache is possible */
+#endif
+
+       c->x86_cache_size = l2size;
+}
+
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+       u32 eax, ebx, ecx, edx;
+       int index_msb, core_bits;
+       static bool printed;
+
+       if (!cpu_has(c, X86_FEATURE_HT))
+               return;
+
+       if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+               goto out;
+
+       if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
+               return;
+
+       cpuid(1, &eax, &ebx, &ecx, &edx);
+
+       smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+       if (smp_num_siblings == 1) {
+               printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
+               goto out;
+       }
+
+       if (smp_num_siblings <= 1)
+               goto out;
+
+       index_msb = get_count_order(smp_num_siblings);
+       c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
+
+       smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+       index_msb = get_count_order(smp_num_siblings);
+
+       core_bits = get_count_order(c->x86_max_cores);
+
+       c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
+                                      ((1 << core_bits) - 1);
+
+out:
+       if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
+               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+                      c->phys_proc_id);
+               printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                      c->cpu_core_id);
+               printed = 1;
+       }
+#endif
+}
+
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+{
+       char *v = c->x86_vendor_id;
+       int i;
+
+       for (i = 0; i < X86_VENDOR_NUM; i++) {
+               if (!cpu_devs[i])
+                       break;
+
+               if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+                   (cpu_devs[i]->c_ident[1] &&
+                    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+
+                       this_cpu = cpu_devs[i];
+                       c->x86_vendor = this_cpu->c_x86_vendor;
+                       return;
+               }
+       }
+
+       printk_once(KERN_ERR
+                       "CPU: vendor_id '%s' unknown, using generic init.\n" \
+                       "CPU: Your system may be unstable.\n", v);
+
+       c->x86_vendor = X86_VENDOR_UNKNOWN;
+       this_cpu = &default_cpu;
+}
+
+void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
+{
+       /* Get vendor name */
+       cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
+             (unsigned int *)&c->x86_vendor_id[0],
+             (unsigned int *)&c->x86_vendor_id[8],
+             (unsigned int *)&c->x86_vendor_id[4]);
+
+       c->x86 = 4;
+       /* Intel-defined flags: level 0x00000001 */
+       if (c->cpuid_level >= 0x00000001) {
+               u32 junk, tfms, cap0, misc;
+
+               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+               c->x86 = (tfms >> 8) & 0xf;
+               c->x86_model = (tfms >> 4) & 0xf;
+               c->x86_mask = tfms & 0xf;
+
+               if (c->x86 == 0xf)
+                       c->x86 += (tfms >> 20) & 0xff;
+               if (c->x86 >= 0x6)
+                       c->x86_model += ((tfms >> 16) & 0xf) << 4;
+
+               if (cap0 & (1<<19)) {
+                       c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+                       c->x86_cache_alignment = c->x86_clflush_size;
+               }
+       }
+}
+
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+{
+       u32 tfms, xlvl;
+       u32 ebx;
+
+       /* Intel-defined flags: level 0x00000001 */
+       if (c->cpuid_level >= 0x00000001) {
+               u32 capability, excap;
+
+               cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+               c->x86_capability[0] = capability;
+               c->x86_capability[4] = excap;
+       }
+
+       /* Additional Intel-defined flags: level 0x00000007 */
+       if (c->cpuid_level >= 0x00000007) {
+               u32 eax, ebx, ecx, edx;
+
+               cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+               c->x86_capability[9] = ebx;
+       }
+
+       /* AMD-defined flags: level 0x80000001 */
+       xlvl = cpuid_eax(0x80000000);
+       c->extended_cpuid_level = xlvl;
+
+       if ((xlvl & 0xffff0000) == 0x80000000) {
+               if (xlvl >= 0x80000001) {
+                       c->x86_capability[1] = cpuid_edx(0x80000001);
+                       c->x86_capability[6] = cpuid_ecx(0x80000001);
+               }
+       }
+
+       if (c->extended_cpuid_level >= 0x80000008) {
+               u32 eax = cpuid_eax(0x80000008);
+
+               c->x86_virt_bits = (eax >> 8) & 0xff;
+               c->x86_phys_bits = eax & 0xff;
+       }
+#ifdef CONFIG_X86_32
+       else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
+               c->x86_phys_bits = 36;
+#endif
+
+       if (c->extended_cpuid_level >= 0x80000007)
+               c->x86_power = cpuid_edx(0x80000007);
+
+       init_scattered_cpuid_features(c);
+}
+
+static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+       int i;
+
+       /*
+        * First of all, decide if this is a 486 or higher
+        * It's a 486 if we can modify the AC flag
+        */
+       if (flag_is_changeable_p(X86_EFLAGS_AC))
+               c->x86 = 4;
+       else
+               c->x86 = 3;
+
+       for (i = 0; i < X86_VENDOR_NUM; i++)
+               if (cpu_devs[i] && cpu_devs[i]->c_identify) {
+                       c->x86_vendor_id[0] = 0;
+                       cpu_devs[i]->c_identify(c);
+                       if (c->x86_vendor_id[0]) {
+                               get_cpu_vendor(c);
+                               break;
+                       }
+               }
+#endif
+}
+
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the BP.  Don't add code here
+ * that is supposed to run on all CPUs.
+ */
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+       c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
+#else
+       c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
+#endif
+       c->x86_cache_alignment = c->x86_clflush_size;
+
+       memset(&c->x86_capability, 0, sizeof c->x86_capability);
+       c->extended_cpuid_level = 0;
+
+       if (!have_cpuid_p())
+               identify_cpu_without_cpuid(c);
+
+       /* Cyrix could have cpuid enabled via c_identify() */
+       if (!have_cpuid_p())
+               return;
+
+       cpu_detect(c);
+
+       get_cpu_vendor(c);
+
+       get_cpu_cap(c);
+#ifdef CONFIG_XEN
+       if (!cpu_has_xsave)
+               x86_xsave_setup(NULL);
+#endif
+
+       if (this_cpu->c_early_init)
+               this_cpu->c_early_init(c);
+
+       c->cpu_index = 0;
+       filter_cpuid_features(c, false);
+
+       setup_smep(c);
+
+       if (this_cpu->c_bsp_init)
+               this_cpu->c_bsp_init(c);
+}
+
+void __init early_cpu_init(void)
+{
+       const struct cpu_dev *const *cdev;
+       int count = 0;
+
+#ifdef CONFIG_PROCESSOR_SELECT
+       printk(KERN_INFO "KERNEL supported cpus:\n");
+#endif
+
+       for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+               const struct cpu_dev *cpudev = *cdev;
+
+               if (count >= X86_VENDOR_NUM)
+                       break;
+               cpu_devs[count] = cpudev;
+               count++;
+
+#ifdef CONFIG_PROCESSOR_SELECT
+               {
+                       unsigned int j;
+
+                       for (j = 0; j < 2; j++) {
+                               if (!cpudev->c_ident[j])
+                                       continue;
+                               printk(KERN_INFO "  %s %s\n", cpudev->c_vendor,
+                                       cpudev->c_ident[j]);
+                       }
+               }
+#endif
+       }
+       early_identify_cpu(&boot_cpu_data);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+       clear_cpu_cap(c, X86_FEATURE_NOPL);
+#else
+       set_cpu_cap(c, X86_FEATURE_NOPL);
+#endif
+}
+
+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+{
+       c->extended_cpuid_level = 0;
+
+       if (!have_cpuid_p())
+               identify_cpu_without_cpuid(c);
+
+       /* Cyrix could have cpuid enabled via c_identify() */
+       if (!have_cpuid_p())
+               return;
+
+       cpu_detect(c);
+
+       get_cpu_vendor(c);
+
+       get_cpu_cap(c);
+
+#ifndef CONFIG_XEN
+       if (c->cpuid_level >= 0x00000001) {
+               c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+               c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+# else
+               c->apicid = c->initial_apicid;
+# endif
+#endif
+               c->phys_proc_id = c->initial_apicid;
+       }
+#endif
+
+       setup_smep(c);
+
+       get_model_name(c); /* Default name */
+
+       detect_nopl(c);
+}
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+{
+       int i;
+
+       c->loops_per_jiffy = loops_per_jiffy;
+       c->x86_cache_size = -1;
+       c->x86_vendor = X86_VENDOR_UNKNOWN;
+       c->x86_model = c->x86_mask = 0; /* So far unknown... */
+       c->x86_vendor_id[0] = '\0'; /* Unset */
+       c->x86_model_id[0] = '\0';  /* Unset */
+#ifndef CONFIG_XEN
+       c->x86_max_cores = 1;
+       c->x86_coreid_bits = 0;
+#endif
+#ifdef CONFIG_X86_64
+       c->x86_clflush_size = 64;
+       c->x86_phys_bits = 36;
+       c->x86_virt_bits = 48;
+#else
+       c->cpuid_level = -1;    /* CPUID not detected */
+       c->x86_clflush_size = 32;
+       c->x86_phys_bits = 32;
+       c->x86_virt_bits = 32;
+#endif
+       c->x86_cache_alignment = c->x86_clflush_size;
+       memset(&c->x86_capability, 0, sizeof c->x86_capability);
+       if (boot_cpu_has(X86_FEATURE_SYSCALL32))
+               set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+
+       generic_identify(c);
+
+       if (this_cpu->c_identify)
+               this_cpu->c_identify(c);
+
+       /* Clear/Set all flags overridden by options, after probe */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+       c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+#endif
+
+       /*
+        * Vendor-specific initialization.  In this section we
+        * canonicalize the feature flags, meaning if there are
+        * features a certain CPU supports which CPUID doesn't
+        * tell us, CPUID claiming incorrect flags, or other bugs,
+        * we handle them here.
+        *
+        * At the end of this section, c->x86_capability better
+        * indicate the features this CPU genuinely supports!
+        */
+       if (this_cpu->c_init)
+               this_cpu->c_init(c);
+
+       /* Disable the PN if appropriate */
+       squash_the_stupid_serial_number(c);
+
+       /*
+        * The vendor-specific functions might have changed features.
+        * Now we do "generic changes."
+        */
+
+       /* Filter out anything that depends on CPUID levels we don't have */
+       filter_cpuid_features(c, true);
+
+       /* If the model name is still unset, do table lookup. */
+       if (!c->x86_model_id[0]) {
+               const char *p;
+               p = table_lookup_model(c);
+               if (p)
+                       strcpy(c->x86_model_id, p);
+               else
+                       /* Last resort... */
+                       sprintf(c->x86_model_id, "%02x/%02x",
+                               c->x86, c->x86_model);
+       }
+
+#ifdef CONFIG_X86_64
+       detect_ht(c);
+#endif
+
+       init_hypervisor(c);
+       x86_init_rdrand(c);
+
+       /*
+        * Clear/Set all flags overridden by options; this needs to be
+        * done before the SMP all-CPUs capability AND that follows.
+        */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
+       /*
+        * On SMP, boot_cpu_data holds the common feature set between
+        * all CPUs; so make sure that we indicate which features are
+        * common between the CPUs.  The first time this routine gets
+        * executed, c == &boot_cpu_data.
+        */
+       if (c != &boot_cpu_data) {
+               /* AND the already accumulated flags with these */
+               for (i = 0; i < NCAPINTS; i++)
+                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+       }
+
+       /* Init Machine Check Exception if available. */
+       mcheck_cpu_init(c);
+
+       select_idle_routine(c);
+
+#ifdef CONFIG_NUMA
+       numa_add_cpu(smp_processor_id());
+#endif
+}
+
+#ifdef CONFIG_X86_64
+static void vgetcpu_set_mode(void)
+{
+       if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+               vgetcpu_mode = VGETCPU_RDTSCP;
+       else
+               vgetcpu_mode = VGETCPU_LSL;
+}
+#endif
+
+void __init identify_boot_cpu(void)
+{
+       identify_cpu(&boot_cpu_data);
+       init_amd_e400_c1e_mask();
+#ifdef CONFIG_X86_32
+       sysenter_setup();
+       enable_sep_cpu();
+#else
+       vgetcpu_set_mode();
+#endif
+}
+
+#ifdef CONFIG_XEN
+void set_perf_event_pending(void) {}
+#endif
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+       BUG_ON(c == &boot_cpu_data);
+       identify_cpu(c);
+#ifdef CONFIG_X86_32
+       enable_sep_cpu();
+#endif
+       mtrr_ap_init();
+}
+
+struct msr_range {
+       unsigned        min;
+       unsigned        max;
+};
+
+static const struct msr_range msr_range_array[] __cpuinitconst = {
+       { 0x00000000, 0x00000418},
+       { 0xc0000000, 0xc000040b},
+       { 0xc0010000, 0xc0010142},
+       { 0xc0011000, 0xc001103b},
+};
+
+static void __cpuinit __print_cpu_msr(void)
+{
+       unsigned index_min, index_max;
+       unsigned index;
+       u64 val;
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+               index_min = msr_range_array[i].min;
+               index_max = msr_range_array[i].max;
+
+               for (index = index_min; index < index_max; index++) {
+                       if (rdmsrl_amd_safe(index, &val))
+                               continue;
+                       printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+               }
+       }
+}
+
+static int show_msr __cpuinitdata;
+
+static __init int setup_show_msr(char *arg)
+{
+       /* get_option() stores nothing for an empty arg; start from 0. */
+       int num = 0;
+
+       get_option(&arg, &num);
+       if (num > 0)
+               show_msr = num;
+       return 1;
+}
+__setup("show_msr=", setup_show_msr);
+
+static __init int setup_noclflush(char *arg)
+{
+       setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+       return 1;
+}
+__setup("noclflush", setup_noclflush);
+
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+{
+       const char *vendor = NULL;
+
+       if (c->x86_vendor < X86_VENDOR_NUM) {
+               vendor = this_cpu->c_vendor;
+       } else {
+               if (c->cpuid_level >= 0)
+                       vendor = c->x86_vendor_id;
+       }
+
+       if (vendor && !strstr(c->x86_model_id, vendor))
+               printk(KERN_CONT "%s ", vendor);
+
+       if (c->x86_model_id[0])
+               printk(KERN_CONT "%s", c->x86_model_id);
+       else
+               printk(KERN_CONT "%d86", c->x86);
+
+       if (c->x86_mask || c->cpuid_level >= 0)
+               printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+       else
+               printk(KERN_CONT "\n");
+
+       print_cpu_msr(c);
+}
+
+void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+{
+       if (c->cpu_index < show_msr)
+               __print_cpu_msr();
+}
+
+static __init int setup_disablecpuid(char *arg)
+{
+       int bit;
+
+       /* Reject a missing, negative or too-large capability bit number. */
+       if (!get_option(&arg, &bit) || bit < 0 || bit >= NCAPINTS*32)
+               return 0;
+
+       setup_clear_cpu_cap(bit);
+       return 1;
+}
+__setup("clearcpuid=", setup_disablecpuid);
+
+#ifdef CONFIG_X86_64
+#ifndef CONFIG_X86_NO_IDT
+struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+                                   (unsigned long) nmi_idt_table };
+#endif
+
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
+                    irq_stack_union) __aligned(PAGE_SIZE);
+
+void xen_switch_pt(void)
+{
+#ifdef CONFIG_XEN
+       xen_pt_switch(init_level4_pgt);
+#endif
+}
+
+/*
+ * The following four percpu variables are hot.  Align current_task to
+ * cacheline size such that all four fall in the same cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+       &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
+
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+       init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+void __cpuinit syscall_init(void)
+{
+#ifndef CONFIG_XEN
+       /*
+        * LSTAR and STAR live in a bit strange symbiosis.
+        * They both write to the same internal register. STAR allows to
+        * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+        */
+       wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
+       wrmsrl(MSR_LSTAR, system_call);
+       wrmsrl(MSR_CSTAR, ignore_sysret);
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+       syscall32_cpu_init();
+#elif defined(CONFIG_XEN)
+       struct callback_register cb = {
+               .type = CALLBACKTYPE_syscall32,
+               .address = (unsigned long)ignore_sysret
+       };
+
+       if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb))
+               pr_warning("Unable to register CSTAR stub\n");
+       cb.type = CALLBACKTYPE_sysenter;
+       if (HYPERVISOR_callback_op(CALLBACKOP_register, &cb))
+               pr_warning("Unable to register SEP stub\n");
+#endif
+
+#ifndef CONFIG_XEN
+       /* Flags to clear on syscall */
+       wrmsrl(MSR_SYSCALL_MASK,
+              X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+#endif
+}
+
+unsigned long kernel_eflags;
+
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
+#ifndef CONFIG_X86_NO_IDT
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+       return __get_cpu_var(debug_stack_usage) ||
+               (addr <= __get_cpu_var(debug_stack_addr) &&
+                addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+}
+
+void debug_stack_set_zero(void)
+{
+       load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+       load_idt((const struct desc_ptr *)&idt_descr);
+}
+#endif
+
+#else  /* CONFIG_X86_64 */
+
+DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#endif
+
+/* Make sure %fs and %gs are initialized properly in idle threads */
+struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
+{
+       memset(regs, 0, sizeof(struct pt_regs));
+       regs->fs = __KERNEL_PERCPU;
+       regs->gs = __KERNEL_STACK_CANARY;
+
+       return regs;
+}
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Zero the six debug registers db0-db3, db6 and db7; db4 and db5 are
+ * left untouched.
+ */
+static void clear_all_debug_regs(void)
+{
+       int reg;
+
+       for (reg = 0; reg < 8; reg++) {
+               if (reg == 4 || reg == 5)
+                       continue;
+
+               set_debugreg(0, reg);
+       }
+}
+
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ */
+static void dbg_restore_debug_regs(void)
+{
+       if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+               arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64 bit
+ */
+#ifdef CONFIG_X86_64
+
+void __cpuinit cpu_init(void)
+{
+#ifndef CONFIG_X86_NO_TSS
+       struct orig_ist *oist;
+       struct tss_struct *t;
+       unsigned long v;
+       int i;
+#endif
+       struct task_struct *me;
+       int cpu;
+
+       cpu = stack_smp_processor_id();
+       /* CPU 0 is initialised in head64.c */
+       if (cpu != 0)
+               xen_switch_pt();
+#ifndef CONFIG_X86_NO_TSS
+       t = &per_cpu(init_tss, cpu);
+       oist = &per_cpu(orig_ist, cpu);
+#endif
+
+#ifdef CONFIG_NUMA
+       if (cpu != 0 && percpu_read(numa_node) == 0 &&
+           early_cpu_to_node(cpu) != NUMA_NO_NODE)
+               set_numa_node(early_cpu_to_node(cpu));
+#endif
+
+       me = current;
+
+       if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
+               panic("CPU#%d already initialized!\n", cpu);
+
+       pr_debug("Initializing CPU#%d\n", cpu);
+
+       clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+       /*
+        * Initialize the per-CPU GDT with the boot GDT,
+        * and set up the GDT descriptor:
+        */
+
+       switch_to_new_gdt(cpu);
+       loadsegment(fs, 0);
+
+#ifndef CONFIG_X86_NO_IDT
+       load_idt((const struct desc_ptr *)&idt_descr);
+#endif
+
+       memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+       syscall_init();
+
+       wrmsrl(MSR_FS_BASE, 0);
+       wrmsrl(MSR_KERNEL_GS_BASE, 0);
+       barrier();
+
+       x86_configure_nx();
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (cpu != 0)
+               enable_x2apic();
+#endif
+
+#ifndef CONFIG_X86_NO_TSS
+       /*
+        * set up and load the per-CPU TSS
+        */
+       if (!oist->ist[0]) {
+               char *estacks = per_cpu(exception_stacks, cpu);
+
+               for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+                       estacks += exception_stack_sizes[v];
+                       oist->ist[v] = t->x86_tss.ist[v] =
+                                       (unsigned long)estacks;
+#ifndef CONFIG_X86_NO_IDT
+                       if (v == DEBUG_STACK-1)
+                               per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
+#endif
+               }
+       }
+
+       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+
+       /*
+        * <= is required because the CPU will access up to
+        * 8 bits beyond the end of the IO permission bitmap.
+        */
+       for (i = 0; i <= IO_BITMAP_LONGS; i++)
+               t->io_bitmap[i] = ~0UL;
+#endif
+
+       atomic_inc(&init_mm.mm_count);
+       me->active_mm = &init_mm;
+       BUG_ON(me->mm);
+       enter_lazy_tlb(&init_mm, me);
+
+       load_sp0(t, &current->thread);
+#ifndef CONFIG_X86_NO_TSS
+       set_tss_desc(cpu, t);
+       load_TR_desc();
+#endif
+       load_LDT(&init_mm.context);
+
+       clear_all_debug_regs();
+       dbg_restore_debug_regs();
+
+       fpu_init();
+       xsave_init();
+
+#ifndef CONFIG_XEN
+       raw_local_save_flags(kernel_eflags);
+#else
+       asm ("pushfq; popq %0" : "=rm" (kernel_eflags));
+       if (raw_irqs_disabled())
+               kernel_eflags &= ~X86_EFLAGS_IF;
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (is_uv_system())
+               uv_cpu_init();
+#endif
+}
+
+#else
+
+void __cpuinit cpu_init(void)
+{
+       int cpu = smp_processor_id();
+       struct task_struct *curr = current;
+#ifndef CONFIG_X86_NO_TSS
+       struct tss_struct *t = &per_cpu(init_tss, cpu);
+#endif
+       struct thread_struct *thread = &curr->thread;
+
+       if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
+               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+               for (;;)
+                       local_irq_enable();
+       }
+
+       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+       if (cpu_has_vme || cpu_has_de)
+               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+       switch_to_new_gdt(cpu);
+
+       /*
+        * Set up and load the per-CPU TSS and LDT
+        */
+       atomic_inc(&init_mm.mm_count);
+       curr->active_mm = &init_mm;
+       BUG_ON(curr->mm);
+       enter_lazy_tlb(&init_mm, curr);
+
+       load_sp0(t, thread);
+
+       load_LDT(&init_mm.context);
+
+#ifndef CONFIG_X86_NO_TSS
+       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+#endif
+
+#ifdef CONFIG_DOUBLEFAULT
+       /* Set up doublefault TSS pointer in the GDT */
+       __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+#endif
+
+       clear_all_debug_regs();
+       dbg_restore_debug_regs();
+
+       fpu_init();
+       xsave_init();
+}
+#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c

index 3e6ff6c..c55ce62 100644 (file)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -36,10 +36,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
  
                 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
+#ifndef CONFIG_XEN
                         misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
                         wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                         c->cpuid_level = cpuid_eax(0);
                         get_cpu_cap(c);
+#else
+                       pr_warning("CPUID levels are restricted -"
+                                  " update hypervisor\n");
+#endif
                 }
         }
  
@@ -47,6 +52,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                 (c->x86 == 0x6 && c->x86_model >= 0x0e))
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
  
+#ifndef CONFIG_XEN
         if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
                 unsigned lower_word;
  
@@ -69,6 +75,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
                 clear_cpu_cap(c, X86_FEATURE_PSE);
         }
+#endif
  
  #ifdef CONFIG_X86_64
         set_cpu_cap(c, X86_FEATURE_SYSENTER32);
@@ -93,8 +100,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
         if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+#ifndef CONFIG_XEN
                 if (!check_tsc_unstable())
                         sched_clock_stable = 1;
+#endif
         }
  
         /*
@@ -238,9 +247,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
                 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
                 if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
                         printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
+#ifndef CONFIG_XEN
                         printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
                         lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
                         wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+#else
+                       pr_warning("CPU: Hypervisor update needed\n");
+#endif
                 }
         }
  
@@ -285,6 +298,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
  }
  #endif
  
+#ifndef CONFIG_XEN
  static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_NUMA
@@ -357,6 +371,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
                         set_cpu_cap(c, X86_FEATURE_VPID);
         }
  }
+#endif
  
  static void __cpuinit init_intel(struct cpuinfo_x86 *c)
  {
@@ -440,6 +455,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                 set_cpu_cap(c, X86_FEATURE_P3);
  #endif
  
+#ifndef CONFIG_XEN
         if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
                 /*
                  * let's use the legacy cpuid vector 0x1 and 0x4 for topology
@@ -456,6 +472,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
  
         if (cpu_has(c, X86_FEATURE_VMX))
                 detect_vmx_virtcap(c);
+#endif
  
         /*
          * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c

index b8f3653..d57b832 100644 (file)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -279,8 +279,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
         eax->split.type = types[leaf];
         eax->split.level = levels[leaf];
         eax->split.num_threads_sharing = 0;
+#ifndef CONFIG_XEN
         eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
-
+#endif
  
         if (assoc == 0xffff)
                 eax->split.is_fully_associative = 1;
@@ -298,7 +299,7 @@ struct _cache_attr {
                          unsigned int);
  };
  
-#ifdef CONFIG_AMD_NB
+#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
  
  /*
   * L3 cache descriptors
@@ -578,8 +579,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
         unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
         unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
         unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
-       unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
  #ifdef CONFIG_X86_HT
+       unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
         unsigned int cpu = c->cpu_index;
  #endif
  
@@ -613,16 +614,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
                                         break;
                                 case 2:
                                         new_l2 = this_leaf.size/1024;
+#ifdef CONFIG_X86_HT
                                         num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
                                         index_msb = get_count_order(num_threads_sharing);
                                         l2_id = c->apicid >> index_msb;
+#endif
                                         break;
                                 case 3:
                                         new_l3 = this_leaf.size/1024;
+#ifdef CONFIG_X86_HT
                                         num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
                                         index_msb = get_count_order(
                                                         num_threads_sharing);
                                         l3_id = c->apicid >> index_msb;
+#endif
                                         break;
                                 default:
                                         break;
@@ -723,7 +728,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
  static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
  #define CPUID4_INFO_IDX(x, y)  (&((per_cpu(ici_cpuid4_info, x))[y]))
  
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
  
  static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
  {
@@ -982,7 +987,7 @@ static struct attribute *default_attrs[] = {
         NULL
  };
  
-#ifdef CONFIG_AMD_NB
+#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
  static struct attribute ** __cpuinit amd_l3_attrs(void)
  {
         static struct attribute **attrs;
@@ -1128,7 +1133,7 @@ static int __cpuinit cache_add_dev(struct device *dev)
                 this_leaf = CPUID4_INFO_IDX(cpu, i);
  
                 ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
+#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN)
                 if (this_leaf->base.nb)
                         ktype_cache.default_attrs = amd_l3_attrs();
  #endif
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile

index bb34b03..21e0a8a 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -3,6 +3,7 @@ obj-y                           =  mce.o mce-severity.o
  obj-$(CONFIG_X86_ANCIENT_MCE)  += winchip.o p5.o
  obj-$(CONFIG_X86_MCE_INTEL)    += mce_intel.o
  obj-$(CONFIG_X86_MCE_AMD)      += mce_amd.o
+obj-$(CONFIG_X86_XEN_MCE)      += mce_dom0.o
  obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
  obj-$(CONFIG_X86_MCE_INJECT)   += mce-inject.o
  
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c

index fc4beb3..abdbadd 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -93,6 +93,7 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
         return NMI_HANDLED;
  }
  
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
  static void mce_irq_ipi(void *info)
  {
         int cpu = smp_processor_id();
@@ -104,6 +105,7 @@ static void mce_irq_ipi(void *info)
                 raise_exception(m, NULL);
         }
  }
+#endif
  
  /* Inject mce on current CPU */
  static int raise_local(void)
@@ -151,7 +153,7 @@ static void raise_mce(struct mce *m)
         if (context == MCJ_CTX_RANDOM)
                 return;
  
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
         if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
                 unsigned long start;
                 int cpu;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index 11c9166..769363c 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -118,8 +118,10 @@ void mce_setup(struct mce *m)
         m->time = get_seconds();
         m->cpuvendor = boot_cpu_data.x86_vendor;
         m->cpuid = cpuid_eax(1);
+#ifndef CONFIG_XEN
         m->socketid = cpu_data(m->extcpu).phys_proc_id;
         m->apicid = cpu_data(m->extcpu).initial_apicid;
+#endif
         rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
  }
  
@@ -266,9 +268,14 @@ static void print_mce(struct mce *m)
          * Note this output is parsed by external tools and old fields
          * should not be changed.
          */
+#ifndef CONFIG_XEN
         pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
                 cpu_data(m->extcpu).microcode);
+#else
+       pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+               m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
+#endif
  
         /*
          * Print out human-readable details about the MCE error,
@@ -1236,8 +1243,15 @@ void mce_log_therm_throt_event(__u64 status)
   * Periodic polling timer for "silent" machine check errors.  If the
   * poller finds an MCE, poll 2x faster.  When the poller finds no more
   * errors, poll 2x slower (up to check_interval seconds).
+ *
+ * We will disable polling in DOM0 since all CMCI/Polling
+ * mechanism will be done in XEN for Intel CPUs
   */
+#if defined (CONFIG_X86_XEN_MCE)
+static int check_interval = 0; /* disable polling */
+#else
  static int check_interval = 5 * 60; /* 5 minutes */
+#endif
  
  static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
  static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1410,6 +1424,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
  
         /* This should be disabled by the BIOS, but isn't always */
         if (c->x86_vendor == X86_VENDOR_AMD) {
+#ifndef CONFIG_XEN
                 if (c->x86 == 15 && banks > 4) {
                         /*
                          * disable GART TBL walk error reporting, which
@@ -1418,6 +1433,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
                          */
                         clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
                 }
+#endif
                 if (c->x86 <= 17 && mce_bootlog < 0) {
                         /*
                          * Lots of broken BIOS around that don't clear them
@@ -1490,6 +1506,7 @@ static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
  
  static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
  {
+#ifndef CONFIG_X86_64_XEN
         switch (c->x86_vendor) {
         case X86_VENDOR_INTEL:
                 mce_intel_feature_init(c);
@@ -1500,6 +1517,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
         default:
                 break;
         }
+#endif
  }
  
  static void __mcheck_cpu_init_timer(void)
@@ -2288,6 +2306,16 @@ static __init int mcheck_init_device(void)
         /* register character device /dev/mcelog */
         misc_register(&mce_chrdev_device);
  
+#ifdef CONFIG_X86_XEN_MCE
+       if (is_initial_xendomain()) {
+               /* Register vIRQ handler for MCE LOG processing */
+               extern int bind_virq_for_mce(void);
+
+               printk(KERN_DEBUG "MCE: bind virq for DOM0 logging\n");
+               bind_virq_for_mce();
+       }
+#endif
+
         return err;
  }
  device_initcall(mcheck_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_dom0.c b/arch/x86/kernel/cpu/mcheck/mce_dom0.c

new file mode 100644 (file)

index 0000000..b6d5c3e
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce_dom0.c
@@ -0,0 +1,185 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <xen/interface/xen.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+#include <asm/hypercall.h>
+#include <asm/mce.h>
+
+static xen_mc_logical_cpu_t *g_physinfo;
+static unsigned int ncpus;
+
+/* Log each bank record of a fetched mc_info; returns 0, or -1 on failure. */
+static int convert_log(struct mc_info *mi)
+{
+       struct mcinfo_common *mic = NULL;
+       struct mcinfo_global *mc_global;
+       struct mcinfo_bank *mc_bank;
+       struct mce m;
+       unsigned int i;
+
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+       if (mic == NULL)
+       {
+               pr_err("DOM0_MCE_LOG: global data is NULL\n");
+               return -1;
+       }
+
+       mce_setup(&m);
+       mc_global = (struct mcinfo_global*)mic;
+       m.mcgstatus = mc_global->mc_gstatus;
+       m.apicid = mc_global->mc_apicid;
+
+       for (i = 0; i < ncpus; i++)
+               if (g_physinfo[i].mc_apicid == m.apicid)
+                       break;
+       /* No match leaves i == ncpus, one past the end of g_physinfo. */
+       if (WARN_ON_ONCE(i == ncpus))
+               return -1;
+       m.socketid = mc_global->mc_socketid;
+       m.cpu = m.extcpu = g_physinfo[i].mc_cpunr;
+       m.cpuvendor = (__u8)g_physinfo[i].mc_vendor;
+
+       x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK);
+       do
+       {
+               if (mic == NULL || mic->size == 0)
+                       break;
+               if (mic->type == MC_TYPE_BANK)
+               {
+                       mc_bank = (struct mcinfo_bank*)mic;
+                       m.misc = mc_bank->mc_misc;
+                       m.status = mc_bank->mc_status;
+                       m.addr = mc_bank->mc_addr;
+                       m.tsc = mc_bank->mc_tsc;
+                       m.bank = mc_bank->mc_bank;
+                       printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n",
+                                               m.cpu, m.bank, m.addr, m.status);
+                       /* log this record */
+                       mce_log(&m);
+               }
+               mic = x86_mcinfo_next(mic);
+       }while (1);
+
+       return 0;
+}
+
+static struct mc_info *g_mi;
+
+/* Dom0 MCE vIRQ handler: fetches and logs physical MCE error info. */
+
+static irqreturn_t mce_dom0_interrupt(int irq, void *dev_id)
+{
+       xen_mc_t mc_op;
+       int result = 0;
+
+       printk(KERN_DEBUG "MCE_DOM0_LOG: enter dom0 mce vIRQ handler\n");
+       mc_op.cmd = XEN_MC_fetch;
+       set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi);
+urgent:
+       mc_op.u.mc_fetch.flags = XEN_MC_URGENT;
+       result = HYPERVISOR_mca(&mc_op);
+       if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+                       mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+       {
+               printk(KERN_DEBUG "MCE_DOM0_LOG: No more urgent data\n");
+               goto nonurgent;
+       }
+       else
+       {
+               result = convert_log(g_mi);
+               if (result) {
+                       pr_err("MCE_DOM0_LOG: Log conversion failed\n");
+                       goto end;
+               }
+               /* The fetched telemetry is logged; ack it so that its refcnt
+                * is decremented and the entry released. The entry was
+                * reserved, and its refcnt incremented, when it was filled.
+                */
+               mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK;
+               result = HYPERVISOR_mca(&mc_op);
+
+               goto urgent;
+       }
+nonurgent:
+       mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT;
+       result = HYPERVISOR_mca(&mc_op);
+       if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA ||
+                       mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED)
+       {
+               printk(KERN_DEBUG "MCE_DOM0_LOG: No more nonurgent data\n");
+               goto end;
+       }
+       else
+       {
+               result = convert_log(g_mi);
+               if (result) {
+                       pr_err("MCE_DOM0_LOG: Log conversion failed\n");
+                       goto end;
+               }
+               /* The fetched telemetry is logged; ack it so that its refcnt
+                * is decremented and the entry released. The entry was
+                * reserved, and its refcnt incremented, when it was filled.
+                */
+               mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK;
+               result = HYPERVISOR_mca(&mc_op);
+
+               goto nonurgent;
+       }
+end:
+       return IRQ_HANDLED;
+}
+
+/* Allocate the MCE buffers, fetch physical CPU info and bind VIRQ_MCA. */
+int __init bind_virq_for_mce(void)
+{
+       int ret;
+       xen_mc_t mc_op;
+
+       g_mi = kmalloc(sizeof(*g_mi), GFP_KERNEL);
+       if (!g_mi)
+               return -ENOMEM;
+
+       /* fetch physical CPU count */
+       mc_op.cmd = XEN_MC_physcpuinfo;
+       set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, NULL);
+       ret = HYPERVISOR_mca(&mc_op);
+       if (ret) {
+               pr_err("MCE: Failed to get physical CPU count\n");
+               kfree(g_mi);
+               return ret;
+       }
+
+       /* fetch CPU physical info; kcalloc checks ncpus * size for overflow */
+       ncpus = mc_op.u.mc_physcpuinfo.ncpus;
+       g_physinfo = kcalloc(ncpus, sizeof(*g_physinfo), GFP_KERNEL);
+       if (!g_physinfo) {
+               kfree(g_mi);
+               return -ENOMEM;
+       }
+       set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo);
+       ret = HYPERVISOR_mca(&mc_op);
+       if (ret) {
+               pr_err("MCE: Failed to get physical CPUs' info\n");
+               kfree(g_mi);
+               kfree(g_physinfo);
+               return ret;
+       }
+
+       ret  = bind_virq_to_irqhandler(VIRQ_MCA, 0,
+               mce_dom0_interrupt, 0, "mce", NULL);
+       if (ret < 0) {
+               pr_err("MCE: Failed to bind vIRQ for Dom0\n");
+               kfree(g_mi);
+               kfree(g_physinfo);
+               return ret;
+       }
+
+       /* Log the machine checks left over from the previous reset. */
+       mce_dom0_interrupt(VIRQ_MCA, NULL);
+
+       return 0;
+}
+
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile

index ad9e5ed..b854116 100644 (file)
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
  obj-y          := main.o if.o generic.o cleanup.o
  obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
  
+obj-$(CONFIG_XEN) := main.o if.o
diff --git a/arch/x86/kernel/cpu/mtrr/main-xen.c b/arch/x86/kernel/cpu/mtrr/main-xen.c

new file mode 100644 (file)

index 0000000..013e120
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
@@ -0,0 +1,326 @@
+#define DEBUG
+
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+
+#include <asm/mtrr.h>
+#include "mtrr.h"
+
+static DEFINE_MUTEX(mtrr_mutex);
+
+/*
+ * Read one memory-type range from the hypervisor (XENPF_read_memtype).
+ *
+ * @reg:  index of the range to query
+ * @base: receives the first machine frame number (mfn) of the range
+ * @size: receives the number of frames (nr_mfns) in the range
+ * @type: receives the memory type of the range
+ *
+ * If the hypercall fails, the result area is zeroed, so all three outputs
+ * are reported as 0.
+ */
+void generic_get_mtrr(unsigned int reg, unsigned long *base,
+                     unsigned long *size, mtrr_type * type)
+{
+       struct xen_platform_op op;
+
+       op.cmd = XENPF_read_memtype;
+       op.u.read_memtype.reg = reg;
+       /* A failed query is handed back to the caller as an all-zero range. */
+       if (unlikely(HYPERVISOR_platform_op(&op)))
+               memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype));
+
+       *size = op.u.read_memtype.nr_mfns;
+       *base = op.u.read_memtype.mfn;
+       *type = op.u.read_memtype.type;
+}
+
+/* MTRR operations table for this file: only the read accessor is filled in. */
+const struct mtrr_ops generic_mtrr_ops = {
+       .use_intel_if      = 1,
+       .get               = generic_get_mtrr,
+};
+
+/* Operations in use; initialised to the table above. */
+const struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+/* Number of ranges counted by set_num_var_ranges(); 0 until mtrr_init() ran. */
+unsigned int num_var_ranges;
+/* Per-range use counts: raised in mtrr_add_page(), lowered in mtrr_del_page(). */
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
+
+/* Masked MSR_K8_TOP_MEM2 value stored by mtrr_bp_init(); 0 when not set. */
+static u64 tom2;
+
+/*
+ * Count the memory-type ranges the hypervisor exposes by issuing
+ * XENPF_read_memtype for increasing register numbers until a query fails.
+ *
+ * The count is capped at MTRR_MAX_VAR_RANGES: num_var_ranges bounds the
+ * accesses to mtrr_usage_table[], which has exactly that many entries, so a
+ * larger value would let init_table() write past the end of the table.
+ */
+static void __init set_num_var_ranges(void)
+{
+       struct xen_platform_op op;
+
+       for (num_var_ranges = 0; num_var_ranges < MTRR_MAX_VAR_RANGES;
+            num_var_ranges++) {
+               op.cmd = XENPF_read_memtype;
+               op.u.read_memtype.reg = num_var_ranges;
+               if (HYPERVISOR_platform_op(&op) != 0)
+                       break;
+       }
+}
+
+/* Zero the use count of every range counted in num_var_ranges. */
+static void __init init_table(void)
+{
+       const int nr_regs = num_var_ranges;
+       int reg = 0;
+
+       while (reg < nr_regs)
+               mtrr_usage_table[reg++] = 0;
+}
+
+/*
+ * Ask the hypervisor to add a memory-type range (XENPF_add_memtype).
+ *
+ * @base:      passed to the hypervisor as the first frame (mfn) of the range
+ * @size:      passed to the hypervisor as the frame count (nr_mfns)
+ * @type:      memory type to request
+ * @increment: when true, bump the use count of the register that was assigned
+ *
+ * Returns the register number reported by the hypervisor on success, or the
+ * error value returned by the hypercall.
+ */
+int mtrr_add_page(unsigned long base, unsigned long size,
+                 unsigned int type, bool increment)
+{
+       int error;
+       struct xen_platform_op op;
+
+       mutex_lock(&mtrr_mutex);
+
+       op.cmd = XENPF_add_memtype;
+       op.u.add_memtype.mfn     = base;
+       op.u.add_memtype.nr_mfns = size;
+       op.u.add_memtype.type    = type;
+       error = HYPERVISOR_platform_op(&op);
+       if (error) {
+               mutex_unlock(&mtrr_mutex);
+               /* Only negative error values are expected from the hypercall. */
+               BUG_ON(error > 0);
+               return error;
+       }
+
+       /* The use count is updated while mtrr_mutex is still held. */
+       if (increment)
+               ++mtrr_usage_table[op.u.add_memtype.reg];
+
+       mutex_unlock(&mtrr_mutex);
+
+       return op.u.add_memtype.reg;
+}
+
+/*
+ * Check that @base and @size are both multiples of PAGE_SIZE.
+ * Returns 0 if they are; otherwise logs the values, dumps the stack and
+ * returns -1.
+ */
+static int mtrr_check(unsigned long base, unsigned long size)
+{
+       const unsigned long offset_bits = PAGE_SIZE - 1;
+
+       /* A low bit set in either value means it is not page aligned. */
+       if (!((base | size) & offset_bits))
+               return 0;
+
+       pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
+       pr_debug("mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
+       dump_stack();
+       return -1;
+}
+
+/*
+ * Byte-granular front end to mtrr_add_page(): rejects unaligned arguments
+ * with -EINVAL, otherwise converts @base and @size to page units and
+ * forwards the request.
+ */
+int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+            bool increment)
+{
+       unsigned long first_page, nr_pages;
+
+       if (mtrr_check(base, size) != 0)
+               return -EINVAL;
+
+       first_page = base >> PAGE_SHIFT;
+       nr_pages = size >> PAGE_SHIFT;
+       return mtrr_add_page(first_page, nr_pages, type, increment);
+}
+EXPORT_SYMBOL(mtrr_add);
+
+/*
+ * Drop one reference to a memory-type range and delete it from the
+ * hypervisor (XENPF_del_memtype) once its use count reaches zero.
+ *
+ * @reg:  register to release, or a negative value to look the register up
+ *        by matching @base and @size against the existing ranges
+ * @base: first frame of the range (only used for the lookup)
+ * @size: frame count of the range (only used for the lookup)
+ *
+ * Returns the register number on success, -EINVAL if no matching range was
+ * found, the register is out of range or its use count is already zero, or
+ * the error value returned by the hypercall.
+ */
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+       unsigned i;
+       mtrr_type ltype;
+       unsigned long lbase, lsize;
+       int error = -EINVAL;
+       struct xen_platform_op op;
+
+       mutex_lock(&mtrr_mutex);
+
+       if (reg < 0) {
+               /*  Search for existing MTRR  */
+               for (i = 0; i < num_var_ranges; ++i) {
+                       mtrr_if->get(i, &lbase, &lsize, &ltype);
+                       if (lbase == base && lsize == size) {
+                               reg = i;
+                               break;
+                       }
+               }
+               if (reg < 0) {
+                       pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
+                                base, size);
+                       goto out;
+               }
+       }
+       /*
+        * reg is non-negative here. A caller-supplied value must not index
+        * past the end of mtrr_usage_table[].
+        */
+       if ((unsigned int)reg >= MTRR_MAX_VAR_RANGES) {
+               pr_warning("mtrr: register: %d too big\n", reg);
+               goto out;
+       }
+       if (mtrr_usage_table[reg] < 1) {
+               pr_warning("mtrr: reg: %d has count=0\n", reg);
+               goto out;
+       }
+       /* Only the last user actually removes the range. */
+       if (--mtrr_usage_table[reg] < 1) {
+               op.cmd = XENPF_del_memtype;
+               op.u.del_memtype.handle = 0;
+               op.u.del_memtype.reg    = reg;
+               error = HYPERVISOR_platform_op(&op);
+               if (error) {
+                       /* Only negative error values are expected. */
+                       BUG_ON(error > 0);
+                       goto out;
+               }
+       }
+       error = reg;
+ out:
+       mutex_unlock(&mtrr_mutex);
+       return error;
+}
+
+/*
+ * Byte-granular front end to mtrr_del_page(): rejects unaligned arguments
+ * with -EINVAL, otherwise converts @base and @size to page units and
+ * forwards the request.
+ */
+int mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+       unsigned long first_page, nr_pages;
+
+       if (mtrr_check(base, size) != 0)
+               return -EINVAL;
+
+       first_page = base >> PAGE_SHIFT;
+       nr_pages = size >> PAGE_SHIFT;
+       return mtrr_del_page(reg, first_page, nr_pages);
+}
+EXPORT_SYMBOL(mtrr_del);
+
+/*
+ * Returns the effective MTRR type for the region [start, end), where end
+ * is exclusive on entry.
+ *
+ * Outside the initial Xen domain MTRR_TYPE_WRBACK is returned without
+ * consulting the hypervisor.
+ *
+ * Error returns:
+ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
+ * - 0xFF - when MTRR is not enabled (no variable ranges were detected)
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+       int i, error;
+       u64 start_mfn, end_mfn, base_mfn, top_mfn;
+       u8 prev_match, curr_match;
+       struct xen_platform_op op;
+
+       if (!is_initial_xendomain())
+               return MTRR_TYPE_WRBACK;
+
+       if (!num_var_ranges)
+               return 0xFF;
+
+       start_mfn = start >> PAGE_SHIFT;
+       /* Make end inclusive end, instead of exclusive */
+       end_mfn = --end >> PAGE_SHIFT;
+
+       /* Look in fixed ranges. Just return the type as per start */
+       if (start_mfn < 0x100) {
+#if 0//todo
+               op.cmd = XENPF_read_memtype;
+               op.u.read_memtype.reg = ???;
+               error = HYPERVISOR_platform_op(&op);
+               if (!error)
+                       return op.u.read_memtype.type;
+#endif
+               /* Fixed ranges are not queried yet; report uncachable. */
+               return MTRR_TYPE_UNCACHABLE;
+       }
+
+       /*
+        * Look in variable ranges
+        * Look for multiple ranges matching this address and pick type
+        * as per MTRR precedence
+        */
+       prev_match = 0xFF;
+       for (i = 0; i < num_var_ranges; ++i) {
+               op.cmd = XENPF_read_memtype;
+               op.u.read_memtype.reg = i;
+               error = HYPERVISOR_platform_op(&op);
+
+               /* Skip ranges that cannot be read or are empty. */
+               if (error || !op.u.read_memtype.nr_mfns)
+                       continue;
+
+               base_mfn = op.u.read_memtype.mfn;
+               top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1;
+
+               /* No overlap with the requested region. */
+               if (base_mfn > end_mfn || start_mfn > top_mfn) {
+                       continue;
+               }
+
+               /* Partial overlap: the region straddles this range's edge. */
+               if (base_mfn > start_mfn || end_mfn > top_mfn) {
+                       return 0xFE;
+               }
+
+               curr_match = op.u.read_memtype.type;
+               /* First covering range: remember its type and keep looking. */
+               if (prev_match == 0xFF) {
+                       prev_match = curr_match;
+                       continue;
+               }
+
+               /* Uncachable wins over any other type. */
+               if (prev_match == MTRR_TYPE_UNCACHABLE ||
+                   curr_match == MTRR_TYPE_UNCACHABLE) {
+                       return MTRR_TYPE_UNCACHABLE;
+               }
+
+               /* Write-back combined with write-through yields write-through. */
+               if ((prev_match == MTRR_TYPE_WRBACK &&
+                    curr_match == MTRR_TYPE_WRTHROUGH) ||
+                   (prev_match == MTRR_TYPE_WRTHROUGH &&
+                    curr_match == MTRR_TYPE_WRBACK)) {
+                       prev_match = MTRR_TYPE_WRTHROUGH;
+                       curr_match = MTRR_TYPE_WRTHROUGH;
+               }
+
+               /* Any other mismatch is treated as uncachable. */
+               if (prev_match != curr_match) {
+                       return MTRR_TYPE_UNCACHABLE;
+               }
+       }
+
+       /* With tom2 set, a region wholly in [4GB, tom2) is write-back. */
+       if (tom2) {
+               if (start >= (1ULL<<32) && (end < tom2))
+                       return MTRR_TYPE_WRBACK;
+       }
+
+       if (prev_match != 0xFF)
+               return prev_match;
+
+#if 0//todo
+       op.cmd = XENPF_read_def_memtype;
+       error = HYPERVISOR_platform_op(&op);
+       if (!error)
+               return op.u.read_def_memtype.type;
+#endif
+       /* The default type is not queried yet; report uncachable. */
+       return MTRR_TYPE_UNCACHABLE;
+}
+
+/*
+ * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
+ * for memory >4GB. Check for that here.
+ * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
+ * apply to are wrong, but so far we don't know of any such case in the wild.
+ *
+ * Returns 1 only in the initial Xen domain, on an AMD CPU of family 0xf to
+ * 0x11, when MSR_K8_SYSCFG is readable and has both bits below set in its
+ * low word; returns 0 in every other case.
+ */
+#define Tom2Enabled (1U << 21)
+#define Tom2ForceMemTypeWB (1U << 22)
+
+static int __init _amd_special_default_mtrr(void)
+{
+       u32 l, h;
+
+       if (!is_initial_xendomain())
+               return 0;
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+               return 0;
+       if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
+               return 0;
+       /* In case some hypervisor doesn't pass SYSCFG through */
+       if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+               return 0;
+       /*
+        * Memory between 4GB and top of mem is forced WB by this magic bit.
+        * Reserved before K8RevF, but should be zero there.
+        */
+       if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
+                (Tom2Enabled | Tom2ForceMemTypeWB))
+               return 1;
+       return 0;
+}
+
+/*
+ * Boot-processor MTRR setup: when the AMD "force WB above 4GB" feature is
+ * active, record the second top-of-memory address in tom2 for later use by
+ * mtrr_type_lookup().
+ */
+void __init mtrr_bp_init(void)
+{
+       if (_amd_special_default_mtrr()) {
+               /* TOP_MEM2 */
+               rdmsrl(MSR_K8_TOP_MEM2, tom2);
+               /* Keep address bits 47:23; the other bits are not address. */
+               tom2 &= 0xffffff800000ULL;
+       }
+}
+
+/* Secondary-processor MTRR hook; intentionally does nothing in this file. */
+void mtrr_ap_init(void)
+{
+}
+
+/*
+ * Initcall: in the initial Xen domain, on a CPU advertising any of the
+ * MTRR-style features, count the available ranges and clear their use
+ * counts. Returns -ENODEV when either precondition is not met, else 0.
+ */
+static int __init mtrr_init(void)
+{
+       struct cpuinfo_x86 *cpu = &boot_cpu_data;
+       bool has_mtrr_like;
+
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       has_mtrr_like = cpu_has(cpu, X86_FEATURE_MTRR) ||
+                       cpu_has(cpu, X86_FEATURE_K6_MTRR) ||
+                       cpu_has(cpu, X86_FEATURE_CYRIX_ARR) ||
+                       cpu_has(cpu, X86_FEATURE_CENTAUR_MCR);
+       if (!has_mtrr_like)
+               return -ENODEV;
+
+       set_num_var_ranges();
+       init_table();
+
+       return 0;
+}
+
+subsys_initcall(mtrr_init);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index bb8e034..7c8dbeb 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1730,6 +1730,17 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
   * callchain support
   */
  
+/* stacktrace_ops .warning_symbol callback: the warning is discarded. */
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       /* Ignore warnings */
+}
+
+/* stacktrace_ops .warning callback: the warning is discarded. */
+static void backtrace_warning(void *data, char *msg)
+{
+       /* Ignore warnings */
+}
+
  static int backtrace_stack(void *data, char *name)
  {
         return 0;
@@ -1743,6 +1754,8 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
  }
  
  static const struct stacktrace_ops backtrace_ops = {
+       .warning                = backtrace_warning,
+       .warning_symbol         = backtrace_warning_symbol,
         .stack                  = backtrace_stack,
         .address                = backtrace_address,
         .walk_stack             = print_context_stack_bp,
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c

index 8022c66..845e3bd 100644 (file)
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -10,7 +10,7 @@
  static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
                               unsigned int cpu)
  {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         if (c->x86_max_cores * smp_num_siblings > 1) {
                 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
                 seq_printf(m, "siblings\t: %d\n",
@@ -32,18 +32,22 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
          */
         int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
         seq_printf(m,
+#ifndef CONFIG_XEN
                    "fdiv_bug\t: %s\n"
                    "hlt_bug\t\t: %s\n"
                    "f00f_bug\t: %s\n"
                    "coma_bug\t: %s\n"
+#endif
                    "fpu\t\t: %s\n"
                    "fpu_exception\t: %s\n"
                    "cpuid level\t: %d\n"
                    "wp\t\t: %s\n",
+#ifndef CONFIG_XEN
                    c->fdiv_bug ? "yes" : "no",
                    c->hlt_works_ok ? "no" : "yes",
                    c->f00f_bug ? "yes" : "no",
                    c->coma_bug ? "yes" : "no",
+#endif
                    c->hard_math ? "yes" : "no",
                    fpu_exception ? "yes" : "no",
                    c->cpuid_level,
@@ -83,8 +87,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
                 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
         else
                 seq_printf(m, "stepping\t: unknown\n");
+#ifndef CONFIG_XEN
         if (c->microcode)
                 seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
+#endif
  
         if (cpu_has(c, X86_FEATURE_TSC)) {
                 unsigned int freq = cpufreq_quick_get(cpu);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c

index addf9e8..a1c53de 100644 (file)
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
                 { X86_FEATURE_XSAVEOPT,         CR_EAX, 0, 0x0000000d, 1 },
                 { X86_FEATURE_CPB,              CR_EDX, 9, 0x80000007, 0 },
                 { X86_FEATURE_HW_PSTATE,        CR_EDX, 7, 0x80000007, 0 },
+#ifndef CONFIG_XEN
                 { X86_FEATURE_NPT,              CR_EDX, 0, 0x8000000a, 0 },
                 { X86_FEATURE_LBRV,             CR_EDX, 1, 0x8000000a, 0 },
                 { X86_FEATURE_SVML,             CR_EDX, 2, 0x8000000a, 0 },
@@ -51,6 +52,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
                 { X86_FEATURE_DECODEASSISTS,    CR_EDX, 7, 0x8000000a, 0 },
                 { X86_FEATURE_PAUSEFILTER,      CR_EDX,10, 0x8000000a, 0 },
                 { X86_FEATURE_PFTHRESHOLD,      CR_EDX,12, 0x8000000a, 0 },
+#endif
                 { 0, 0, 0, 0, 0 }
         };
  
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c

index 4397e98..dc581ec 100644 (file)
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -28,7 +28,7 @@
   */
  void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
  {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
         unsigned int eax, ebx, ecx, edx, sub_index;
         unsigned int ht_mask_width, core_plus_mask_width;
         unsigned int core_select_mask, core_level_siblings;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c

index 1b81839..f65a2aa 100644 (file)
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,12 +17,18 @@
  #include <linux/sysfs.h>
  
  #include <asm/stacktrace.h>
+#include <linux/unwind.h>
  
  
  int panic_on_unrecovered_nmi;
  int panic_on_io_nmi;
  unsigned int code_bytes = 64;
  int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+/*
+ * Unwinder mode consulted by try_stack_unwind(). With CONFIG_STACK_UNWIND
+ * it defaults to 1 and can be changed with the "call_trace" early
+ * parameter ("old" = -1, "both" = 0, "newfallback" = 1, "new" = 2);
+ * without it, it is the constant -1.
+ */
+#ifdef CONFIG_STACK_UNWIND
+static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
  static int die_counter;
  
  void printk_address(unsigned long address, int reliable)
@@ -65,6 +71,69 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
  { }
  #endif
  
+/*
+ * Walk the stack with unwind(), starting from @info, and report every
+ * frame's PC through ops->address() as a reliable entry.
+ *
+ * Returns the number of frames reported, or -1 if the starting frame is
+ * already in user mode. Without CONFIG_STACK_UNWIND nothing is walked and 0
+ * is returned.
+ */
+int asmlinkage dump_trace_unwind(struct unwind_frame_info *info,
+                     const struct stacktrace_ops *ops, void *data)
+{
+       int n = 0;
+#ifdef CONFIG_STACK_UNWIND
+       unsigned long sp = UNW_SP(info);
+
+       if (arch_unw_user_mode(info))
+               return -1;
+       while (unwind(info) == 0 && UNW_PC(info)) {
+               n++;
+               ops->address(data, UNW_PC(info), 1);
+               if (arch_unw_user_mode(info))
+                       break;
+               /*
+                * Stop if the stack pointer moved to a lower address while
+                * staying within the same page as the previous frame.
+                */
+               if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
+                   && sp > UNW_SP(info))
+                       break;
+               sp = UNW_SP(info);
+       }
+#endif
+       return n;
+}
+
+/*
+ * Try to produce a backtrace with the unwinder before the caller falls back
+ * to its own stack walk.
+ *
+ * Returns non-zero (-1) when the caller should not do its own walk, and 0
+ * when it should continue; in the "stuck" case *stack and *bp are updated so
+ * that walk resumes where the unwinder stopped. Without CONFIG_STACK_UNWIND
+ * this always returns 0.
+ */
+int try_stack_unwind(struct task_struct *task, struct pt_regs *regs,
+                    unsigned long **stack, unsigned long *bp,
+                    const struct stacktrace_ops *ops, void *data)
+{
+#ifdef CONFIG_STACK_UNWIND
+       int unw_ret = 0;
+       struct unwind_frame_info info;
+       /* A negative call_trace disables the unwinder entirely. */
+       if (call_trace < 0)
+               return 0;
+
+       /* Pick the frame-info initialiser that fits what we were given. */
+       if (regs) {
+               if (unwind_init_frame_info(&info, task, regs) == 0)
+                       unw_ret = dump_trace_unwind(&info, ops, data);
+       } else if (task == current)
+               unw_ret = unwind_init_running(&info, dump_trace_unwind, ops, data);
+#ifdef CONFIG_SMP
+       /* No unwind is attempted for a task that is currently on a CPU. */
+       else if (task->on_cpu)
+               /* nothing */;
+#endif
+       else if (unwind_init_blocked(&info, task) == 0)
+               unw_ret = dump_trace_unwind(&info, ops, data);
+       if (unw_ret > 0) {
+               /* call_trace == 1: unwinder stopped before reaching user mode. */
+               if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+                       ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+                                           UNW_PC(&info));
+                       if (UNW_SP(&info) >= PAGE_OFFSET) {
+                               ops->warning(data, "Leftover inexact backtrace:\n");
+                               /* Let the caller continue from this point. */
+                               *stack = (void *)UNW_SP(&info);
+                               *bp = UNW_FP(&info);
+                               return 0;
+                       }
+               } else if (call_trace >= 1)
+                       return -1;
+               ops->warning(data, "Full inexact backtrace again:\n");
+       } else
+               ops->warning(data, "Inexact backtrace:\n");
+#endif
+       return 0;
+}
+
  /*
   * x86-64 can have up to three kernel stacks:
   * process stack
@@ -138,6 +207,20 @@ print_context_stack_bp(struct thread_info *tinfo,
  }
  EXPORT_SYMBOL_GPL(print_context_stack_bp);
  
+
+/*
+ * stacktrace_ops .warning_symbol callback: print the prefix string passed
+ * in @data, then @msg with @symbol resolved by print_symbol(), then a
+ * newline.
+ */
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       /* Never use @data itself as the format string. */
+       printk("%s", (char *)data);
+       print_symbol(msg, symbol);
+       printk("\n");
+}
+
+/*
+ * stacktrace_ops .warning callback: print the prefix string passed in @data
+ * followed by @msg and a newline.
+ */
+static void print_trace_warning(void *data, char *msg)
+{
+       printk("%s%s\n", (char *)data, msg);
+}
+
  static int print_trace_stack(void *data, char *name)
  {
         printk("%s <%s> ", (char *)data, name);
@@ -155,6 +238,8 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
  }
  
  static const struct stacktrace_ops print_trace_ops = {
+       .warning                = print_trace_warning,
+       .warning_symbol         = print_trace_warning_symbol,
         .stack                  = print_trace_stack,
         .address                = print_trace_address,
         .walk_stack             = print_context_stack,
@@ -327,3 +412,21 @@ static int __init code_bytes_setup(char *s)
         return 1;
  }
  __setup("code_bytes=", code_bytes_setup);
+
+#ifdef CONFIG_STACK_UNWIND
+/*
+ * Parse the "call_trace" early parameter and set call_trace accordingly:
+ * "old" -> -1, "both" -> 0, "newfallback" -> 1, "new" -> 2.
+ * Returns -EINVAL when no value was given, 0 otherwise; an unrecognised
+ * value leaves call_trace unchanged.
+ */
+static int __init call_trace_setup(char *s)
+{
+       if (!s)
+               return -EINVAL;
+       if (strcmp(s, "old") == 0)
+               call_trace = -1;
+       else if (strcmp(s, "both") == 0)
+               call_trace = 0;
+       else if (strcmp(s, "newfallback") == 0)
+               call_trace = 1;
+       else if (strcmp(s, "new") == 0)
+               call_trace = 2;
+       return 0;
+}
+early_param("call_trace", call_trace_setup);
+#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c

index 88ec912..18d0eb6 100644 (file)
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -26,6 +26,10 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
         if (!task)
                 task = current;
  
+       bp = stack_frame(task, regs);
+       if (try_stack_unwind(task, regs, &stack, &bp, ops, data))
+               return;
+
         if (!stack) {
                 unsigned long dummy;
  
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c

index 17107bd..d9b4f99 100644 (file)
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -14,12 +14,14 @@
  #include <linux/bug.h>
  #include <linux/nmi.h>
  
+#include <linux/unwind.h>
  #include <asm/stacktrace.h>
  
  
  #define N_EXCEPTION_STACKS_END \
                 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
  
+#ifndef CONFIG_X86_NO_TSS
  static char x86_stack_ids[][8] = {
                 [ DEBUG_STACK-1                 ]       = "#DB",
                 [ NMI_STACK-1                   ]       = "NMI",
@@ -31,10 +33,12 @@ static char x86_stack_ids[][8] = {
                   N_EXCEPTION_STACKS_END        ]       = "#DB[?]"
  #endif
  };
+#endif
  
  static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                                          unsigned *usedp, char **idp)
  {
+#ifndef CONFIG_X86_NO_TSS
         unsigned k;
  
         /*
@@ -94,6 +98,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                 }
  #endif
         }
+#endif /* CONFIG_X86_NO_TSS */
         return NULL;
  }
  
@@ -126,6 +131,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
         if (!task)
                 task = current;
  
+       bp = stack_frame(task, regs);
+       if (try_stack_unwind(task, regs, &stack, &bp, ops, data)) {
+               put_cpu();
+               return;
+       }
+
         if (!stack) {
                 if (regs)
                         stack = (unsigned long *)regs->sp;
diff --git a/arch/x86/kernel/e820-xen.c b/arch/x86/kernel/e820-xen.c

new file mode 100644 (file)

index 0000000..481afb7
--- /dev/null
+++ b/arch/x86/kernel/e820-xen.c
@@ -0,0 +1,1291 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ *
+ *  Getting sanitize_e820_map() in sync with i386 version by applying change:
+ *  -  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *     Alex Achenbach <xela@slit.de>, December 2002.
+ *  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/acpi.h>
+#include <linux/firmware-map.h>
+#include <linux/memblock.h>
+#include <linux/sort.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/setup.h>
+#include <xen/interface/memory.h>
+
+/*
+ * The e820 map is the map that gets modified e.g. with command line parameters
+ * and that is also registered with modifications in the kernel resource tree
+ * with the iomem_resource as parent.
+ *
+ * The e820_saved is directly saved after the BIOS-provided memory map is
+ * copied. It doesn't get modified afterwards. It's registered for the
+ * /sys/firmware/memmap interface.
+ *
+ * That memory map is not modified and is used as base for kexec. The kexec'd
+ * kernel should get the same memory map as the firmware provides. Then the
+ * user can e.g. boot the original kernel with mem=1G while still booting the
+ * next kernel with full memory.
+ */
+/*
+ * Storage for the maps, depending on configuration:
+ *  - without CONFIG_XEN: e820 and e820_saved are separate objects;
+ *  - CONFIG_XEN_PRIVILEGED_GUEST: machine_e820 is a separate object and
+ *    e820_saved is an alias for it;
+ *  - other CONFIG_XEN builds: machine_e820 and e820_saved both alias e820.
+ */
+struct e820map e820;
+#if !defined(CONFIG_XEN)
+struct e820map e820_saved;
+#elif defined(CONFIG_XEN_PRIVILEGED_GUEST)
+struct e820map machine_e820;
+# define e820_saved machine_e820
+#else
+# define machine_e820 e820
+# define e820_saved e820
+#endif
+
+/* For PCI or other memory-mapped resources */
+/* NOTE(review): 0xaeedbabe looks like a placeholder initial value - confirm. */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type. @end is exclusive; a @type of 0 matches entries of any type.
+ *
+ * With CONFIG_XEN the machine map (machine_e820) is consulted instead of
+ * e820, and 0 is returned outside the initial domain.
+ *
+ * Returns 1 if an overlapping entry of the requested type exists, else 0.
+ */
+int
+e820_any_mapped(u64 start, u64 end, unsigned type)
+{
+       int i;
+
+       /* The two configurations differ only in which map the loop walks. */
+#ifndef CONFIG_XEN
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+#else
+       if (!is_initial_xendomain())
+               return 0;
+       for (i = 0; i < machine_e820.nr_map; ++i) {
+               const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+               if (type && ei->type != type)
+                       continue;
+               /* Skip entries that lie entirely outside the range. */
+               if (ei->addr >= end || ei->addr + ei->size <= start)
+                       continue;
+               return 1;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(e820_any_mapped);
+
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ * @end is exclusive; a @type of 0 matches entries of any type.
+ *
+ * With CONFIG_XEN the machine map (machine_e820) is consulted instead of
+ * e820, and 0 is returned outside the initial domain.
+ *
+ * Returns 1 if the whole range is covered, else 0.
+ *
+ * Note: this function only works correctly if the e820 table is sorted and
+ * non-overlapping, which is the case
+ */
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
+{
+       int i;
+
+       /* The two configurations differ only in which map the loop walks. */
+#ifndef CONFIG_XEN
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+#else
+       if (!is_initial_xendomain())
+               return 0;
+       for (i = 0; i < machine_e820.nr_map; ++i) {
+               const struct e820entry *ei = &machine_e820.map[i];
+#endif
+
+               if (type && ei->type != type)
+                       continue;
+               /* is the region (part) in overlap with the current region ?*/
+               if (ei->addr >= end || ei->addr + ei->size <= start)
+                       continue;
+
+               /* if the region is at the beginning of <start,end> we move
+                * start to the end of the region since it's ok until there
+                */
+               if (ei->addr <= start)
+                       start = ei->addr + ei->size;
+               /*
+                * if start is now at or beyond end, we're done, full
+                * coverage
+                */
+               if (start >= end)
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * Append a region to the given e820 map. If the map has no free slot left,
+ * an error is logged and the region is dropped.
+ */
+static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
+                                        int type)
+{
+       int slot = e820x->nr_map;
+       struct e820entry *entry;
+
+       if (slot >= ARRAY_SIZE(e820x->map)) {
+               printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+               return;
+       }
+
+       entry = &e820x->map[slot];
+       entry->addr = start;
+       entry->size = size;
+       entry->type = type;
+       e820x->nr_map++;
+}
+
+/* Append a region to the kernel's own map (e820); see __e820_add_region(). */
+void __init e820_add_region(u64 start, u64 size, int type)
+{
+       __e820_add_region(&e820, start, size, type);
+}
+
+/*
+ * Print a short tag for an e820 region type as a continuation of the
+ * current log line; unknown types are printed numerically.
+ */
+static void __init e820_print_type(u32 type)
+{
+       const char *label;
+
+       switch (type) {
+       case E820_RAM:
+       case E820_RESERVED_KERN:
+               label = "(usable)";
+               break;
+       case E820_RESERVED:
+               label = "(reserved)";
+               break;
+       case E820_ACPI:
+               label = "(ACPI data)";
+               break;
+       case E820_NVS:
+               label = "(ACPI NVS)";
+               break;
+       case E820_UNUSABLE:
+               label = "(unusable)";
+               break;
+       default:
+               printk(KERN_CONT "type %u", type);
+               return;
+       }
+
+       printk(KERN_CONT "%s", label);
+}
+
+/*
+ * Log every entry of the given map, one line each: the @who tag, the start
+ * address, the end address (start + size) and the region type.
+ */
+static void __init _e820_print_map(const struct e820map *e820, const char *who)
+{
+       const struct e820entry *ent = e820->map;
+       const struct e820entry *const stop = ent + e820->nr_map;
+
+       for (; ent < stop; ent++) {
+               printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
+                      (unsigned long long) ent->addr,
+                      (unsigned long long) (ent->addr + ent->size));
+               e820_print_type(ent->type);
+               printk(KERN_CONT "\n");
+       }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ *     Visually we're performing the following
+ *     (1,2,3,4 = memory types)...
+ *
+ *     Sample memory map (w/overlaps):
+ *        ____22__________________
+ *        ______________________4_
+ *        ____1111________________
+ *        _44_____________________
+ *        11111111________________
+ *        ____________________33__
+ *        ___________44___________
+ *        __________33333_________
+ *        ______________22________
+ *        ___________________2222_
+ *        _________111111111______
+ *        _____________________11_
+ *        _________________4______
+ *
+ *     Sanitized equivalent (no overlap):
+ *        1_______________________
+ *        _44_____________________
+ *        ___1____________________
+ *        ____22__________________
+ *        ______11________________
+ *        _________1______________
+ *        __________3_____________
+ *        ___________44___________
+ *        _____________33_________
+ *        _______________2________
+ *        ________________1_______
+ *        _________________4______
+ *        ___________________2____
+ *        ____________________33__
+ *        ______________________4_
+ */
+/*
+ * One change point: the start or the end address of a BIOS map entry.
+ * sanitize_e820_map() records two of these per non-empty entry and sorts
+ * them by address.
+ */
+struct change_member {
+       struct e820entry *pbios; /* pointer to original bios entry */
+       unsigned long long addr; /* address for this change point */
+};
+
+/*
+ * sort() comparison callback for the change_point[] array, whose elements
+ * are pointers to struct change_member.
+ *
+ * Orders by address first. For equal addresses, a change point that marks
+ * the end of its region sorts after one that marks the start.
+ */
+static int __init cpcompare(const void *a, const void *b)
+{
+       struct change_member * const *lhs_slot = a;
+       struct change_member * const *rhs_slot = b;
+       const struct change_member *lhs = *lhs_slot;
+       const struct change_member *rhs = *rhs_slot;
+       int lhs_is_end, rhs_is_end;
+
+       if (lhs->addr > rhs->addr)
+               return 1;
+       if (lhs->addr < rhs->addr)
+               return -1;
+
+       /* A change point whose address differs from its entry's start is an end. */
+       lhs_is_end = lhs->addr != lhs->pbios->addr;
+       rhs_is_end = rhs->addr != rhs->pbios->addr;
+       return lhs_is_end - rhs_is_end;
+}
+
+/*
+ * Rewrite a BIOS-style memory map so that it contains no overlapping
+ * entries.
+ *
+ * biosmap:    entries to clean up; overwritten in place on success
+ * max_nr_map: capacity of biosmap, in entries
+ * pnr_map:    in: number of valid entries, out: number of entries kept
+ *
+ * Every start and end address of a non-empty entry becomes a "change
+ * point".  The points are sorted by address and walked in order while
+ * tracking which input entries cover the current position.  Where
+ * several entries overlap, the numerically largest type is used.
+ *
+ * Returns 0 when the map was rewritten (or, with CONFIG_XEN, when it
+ * holds exactly one entry and is left untouched).  Returns -1, leaving
+ * biosmap and *pnr_map unmodified, when there are fewer than two entries
+ * or when some entry's addr + size wraps around.
+ */
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+                            u32 *pnr_map)
+{
+       static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+       static struct change_member *change_point[2*E820_X_MAX] __initdata;
+       static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+       static struct e820entry new_bios[E820_X_MAX] __initdata;
+       unsigned long current_type, last_type;
+       unsigned long long last_addr;
+       int chgidx;
+       int overlap_entries;
+       int new_bios_entry;
+       int old_nr, new_nr, chg_nr;
+       int i;
+
+       /* if there's only one memory region, don't bother */
+#ifdef CONFIG_XEN
+       if (*pnr_map == 1)
+               return 0;
+#endif
+       if (*pnr_map < 2)
+               return -1;
+
+       old_nr = *pnr_map;
+       /* the caller must not claim more entries than biosmap can hold */
+       BUG_ON(old_nr > max_nr_map);
+
+       /* bail out if we find any unreasonable addresses in bios map */
+       for (i = 0; i < old_nr; i++)
+               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+                       return -1;
+
+       /* create pointers for initial change-point information (for sorting) */
+       for (i = 0; i < 2 * old_nr; i++)
+               change_point[i] = &change_point_list[i];
+
+       /* record all known change-points (starting and ending addresses),
+          omitting those that are for empty memory regions */
+       chgidx = 0;
+       for (i = 0; i < old_nr; i++)    {
+               if (biosmap[i].size != 0) {
+                       change_point[chgidx]->addr = biosmap[i].addr;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+                       change_point[chgidx]->addr = biosmap[i].addr +
+                               biosmap[i].size;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+               }
+       }
+       chg_nr = chgidx;
+
+       /* sort change-point list by memory addresses (low -> high) */
+       sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
+
+       /* create a new bios memory map, removing overlaps */
+       overlap_entries = 0;     /* number of entries in the overlap table */
+       new_bios_entry = 0;      /* index for creating new bios map entries */
+       last_type = 0;           /* start with undefined memory type */
+       last_addr = 0;           /* start with 0 as last starting address */
+
+       /* loop through change-points, determining effect on the new bios map */
+       for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+               /* keep track of all overlapping bios entries */
+               /* a point whose address equals its entry's start opens that entry */
+               if (change_point[chgidx]->addr ==
+                   change_point[chgidx]->pbios->addr) {
+                       /*
+                        * add map entry to overlap list (> 1 entry
+                        * implies an overlap)
+                        */
+                       overlap_list[overlap_entries++] =
+                               change_point[chgidx]->pbios;
+               } else {
+                       /*
+                        * remove entry from list (order independent,
+                        * so swap with last)
+                        */
+                       for (i = 0; i < overlap_entries; i++) {
+                               if (overlap_list[i] ==
+                                   change_point[chgidx]->pbios)
+                                       overlap_list[i] =
+                                               overlap_list[overlap_entries-1];
+                       }
+                       overlap_entries--;
+               }
+               /*
+                * if there are overlapping entries, decide which
+                * "type" to use (larger value takes precedence --
+                * 1=usable, 2,3,4,4+=unusable)
+                */
+               current_type = 0;
+               for (i = 0; i < overlap_entries; i++)
+                       if (overlap_list[i]->type > current_type)
+                               current_type = overlap_list[i]->type;
+               /*
+                * continue building up new bios map based on this
+                * information
+                */
+               if (current_type != last_type)  {
+                       /* close the entry that was open up to this address */
+                       if (last_type != 0)      {
+                               new_bios[new_bios_entry].size =
+                                       change_point[chgidx]->addr - last_addr;
+                               /*
+                                * move forward only if the new size
+                                * was non-zero
+                                */
+                               if (new_bios[new_bios_entry].size != 0)
+                                       /*
+                                        * no more space left for new
+                                        * bios entries ?
+                                        */
+                                       if (++new_bios_entry >= max_nr_map)
+                                               break;
+                       }
+                       /* open a new entry of the winning type at this address */
+                       if (current_type != 0)  {
+                               new_bios[new_bios_entry].addr =
+                                       change_point[chgidx]->addr;
+                               new_bios[new_bios_entry].type = current_type;
+                               last_addr = change_point[chgidx]->addr;
+                       }
+                       last_type = current_type;
+               }
+       }
+       /* retain count for new bios entries */
+       new_nr = new_bios_entry;
+
+       /* copy new bios mapping into original location */
+       memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+       *pnr_map = new_nr;
+
+       return 0;
+}
+
+/*
+ * Hand nr_map entries, starting at biosmap, to e820_add_region().
+ *
+ * Returns 0 once every entry has been passed on, or -1 as soon as an
+ * entry whose addr + size wraps around 64 bits is met.  Entries that
+ * precede the offending one have already been added by then.
+ */
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+       struct e820entry *entry;
+
+       for (entry = biosmap; nr_map; entry++, nr_map--) {
+               u64 region_start = entry->addr;
+               u64 region_size = entry->size;
+               u32 region_type = entry->type;
+
+               /* Overflow in 64 bits? Ignore the memory map. */
+               if (region_start > region_start + region_size)
+                       return -1;
+
+               e820_add_region(region_start, region_size, region_type);
+       }
+       return 0;
+}
+
+/*
+ * Append a memory map to the global e820 table after a minimal sanity
+ * check on the entry count.
+ *
+ * Without CONFIG_XEN a map with fewer than two entries is rejected
+ * with -1.  With CONFIG_XEN a single entry is acceptable and anything
+ * below one entry is treated as a bug.  Otherwise the result of
+ * __append_e820_map() is returned.
+ */
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+#ifdef CONFIG_XEN
+       BUG_ON(nr_map < 1);
+#else
+       /* Only one memory region (or negative)? Ignore it */
+       if (nr_map < 2)
+               return -1;
+#endif
+
+       return __append_e820_map(biosmap, nr_map);
+}
+
+/*
+ * Change the type of the part of [start, start + size) that is currently
+ * of old_type to new_type in the map e820x.
+ *
+ * size is clamped so that start + size does not exceed ULLONG_MAX.
+ * Entries of a different type are left alone.  old_type must differ from
+ * new_type (BUG otherwise).
+ *
+ * Returns the number of bytes whose type was changed.
+ */
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
+                                       u64 size, unsigned old_type,
+                                       unsigned new_type)
+{
+       u64 end;
+       unsigned int i;
+       u64 real_updated_size = 0;
+
+       BUG_ON(old_type == new_type);
+
+       /* keep start + size from wrapping around */
+       if (size > (ULLONG_MAX - start))
+               size = ULLONG_MAX - start;
+
+       end = start + size;
+       printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
+                      (unsigned long long) start,
+                      (unsigned long long) end);
+       e820_print_type(old_type);
+       printk(KERN_CONT " ==> ");
+       e820_print_type(new_type);
+       printk(KERN_CONT "\n");
+
+       /*
+        * nr_map is re-read on every pass, so entries appended by
+        * __e820_add_region() below are presumably visited as well --
+        * NOTE(review): confirm against __e820_add_region().
+        */
+       for (i = 0; i < e820x->nr_map; i++) {
+               struct e820entry *ei = &e820x->map[i];
+               u64 final_start, final_end;
+               u64 ei_end;
+
+               if (ei->type != old_type)
+                       continue;
+
+               ei_end = ei->addr + ei->size;
+               /* totally covered by new range? */
+               if (ei->addr >= start && ei_end <= end) {
+                       ei->type = new_type;
+                       real_updated_size += ei->size;
+                       continue;
+               }
+
+               /* new range is totally covered? */
+               if (ei->addr < start && ei_end > end) {
+                       /* split in three: head stays in ei, middle and tail are added */
+                       __e820_add_region(e820x, start, size, new_type);
+                       __e820_add_region(e820x, end, ei_end - end, ei->type);
+                       ei->size = start - ei->addr;
+                       real_updated_size += size;
+                       continue;
+               }
+
+               /* partially covered */
+               final_start = max(start, ei->addr);
+               final_end = min(end, ei_end);
+               if (final_start >= final_end)
+                       continue;
+
+               __e820_add_region(e820x, final_start, final_end - final_start,
+                                 new_type);
+
+               real_updated_size += final_end - final_start;
+
+               /*
+                * left range could be head or tail, so need to update
+                * size at first.
+                */
+               ei->size -= final_end - final_start;
+               /* overlap was at the tail of ei: its start address is unchanged */
+               if (ei->addr < final_start)
+                       continue;
+               /* overlap was at the head of ei: what remains begins at final_end */
+               ei->addr = final_end;
+       }
+       return real_updated_size;
+}
+
+/*
+ * Retype [start, start + size) from old_type to new_type in the global
+ * e820 map.  Returns whatever __e820_update_range() reports.
+ */
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
+                            unsigned new_type)
+{
+       u64 updated;
+
+       updated = __e820_update_range(&e820, start, size, old_type, new_type);
+
+       return updated;
+}
+
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
+static u64 __init e820_update_range_saved(u64 start, u64 size,
+                                         unsigned old_type, unsigned new_type)
+{
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return 0;
+       return __e820_update_range(&machine_e820, phys_to_machine(start),
+                                  size, old_type, new_type);
+#else
+       return __e820_update_range(&e820_saved, start, size, old_type,
+                                    new_type);
+#endif
+}
+#endif
+
+/*
+ * Make the global e820 map not cover [start, start + size).
+ *
+ * size is clamped so that start + size does not exceed ULLONG_MAX.
+ * When checktype is non-zero only entries of old_type are affected;
+ * otherwise every entry is.
+ *
+ * An entry lying completely inside the range is zeroed in place (it is
+ * not compacted out of the array here).  An entry that strictly contains
+ * the range keeps its head and gets its tail re-added as a new region.
+ * A partially overlapping entry is shrunk.
+ *
+ * Returns the number of bytes removed.
+ */
+u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
+                            int checktype)
+{
+       int i;
+       u64 end;
+       u64 real_removed_size = 0;
+
+       /* keep start + size from wrapping around */
+       if (size > (ULLONG_MAX - start))
+               size = ULLONG_MAX - start;
+
+       end = start + size;
+       printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
+                      (unsigned long long) start,
+                      (unsigned long long) end);
+       if (checktype)
+               e820_print_type(old_type);
+       printk(KERN_CONT "\n");
+
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+               u64 final_start, final_end;
+               u64 ei_end;
+
+               if (checktype && ei->type != old_type)
+                       continue;
+
+               ei_end = ei->addr + ei->size;
+               /* totally covered? */
+               if (ei->addr >= start && ei_end <= end) {
+                       real_removed_size += ei->size;
+                       memset(ei, 0, sizeof(struct e820entry));
+                       continue;
+               }
+
+               /* new range is totally covered? */
+               if (ei->addr < start && ei_end > end) {
+                       /* keep the head in ei, re-add the tail as its own region */
+                       e820_add_region(end, ei_end - end, ei->type);
+                       ei->size = start - ei->addr;
+                       real_removed_size += size;
+                       continue;
+               }
+
+               /* partially covered */
+               final_start = max(start, ei->addr);
+               final_end = min(end, ei_end);
+               if (final_start >= final_end)
+                       continue;
+               real_removed_size += final_end - final_start;
+
+               /*
+                * left range could be head or tail, so need to update
+                * size at first.
+                */
+               ei->size -= final_end - final_start;
+               /* overlap was at the tail of ei: its start address is unchanged */
+               if (ei->addr < final_start)
+                       continue;
+               /* overlap was at the head of ei: what remains begins at final_end */
+               ei->addr = final_end;
+       }
+       return real_removed_size;
+}
+
+/*
+ * Re-sanitize the global e820 map and, if that succeeds, print the
+ * result as the "modified" map.  A non-zero return from
+ * sanitize_e820_map() leaves e820.nr_map as it was and prints nothing.
+ */
+void __init update_e820(void)
+{
+       u32 count = e820.nr_map;
+
+       if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &count) != 0)
+               return;
+
+       e820.nr_map = count;
+       printk(KERN_INFO "modified physical RAM map:\n");
+       _e820_print_map(&e820, "modified");
+}
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
+/*
+ * Re-sanitize e820_saved.  The entry count is written back only when
+ * sanitize_e820_map() returns 0.
+ */
+static void __init update_e820_saved(void)
+{
+       u32 count = e820_saved.nr_map;
+
+       if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
+                             &count) != 0)
+               return;
+
+       e820_saved.nr_map = count;
+}
+#endif
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+#define MAX_GAP_END 0x100000000ull
+/*
+ * Search for a gap in the e820 memory space from start_addr to end_addr.
+ */
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+               unsigned long start_addr, unsigned long long end_addr)
+{
+       unsigned long long last;
+       int i = e820.nr_map;
+       int found = 0;
+
+       last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+#ifdef CONFIG_X86_64
+       if (start_addr >= MAX_GAP_END)
+               last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits);
+#endif
+
+       while (--i >= 0) {
+               unsigned long long start = e820.map[i].addr;
+               unsigned long long end = start + e820.map[i].size;
+
+               if (end < start_addr)
+                       continue;
+
+               /*
+                * Since "last" is at most 4GB, we know we'll
+                * fit in 32 bits if this condition is true
+                */
+               if (last > end) {
+                       unsigned long gap = last - end;
+
+                       if (gap >= *gapsize) {
+                               *gapsize = gap;
+                               *gapstart = end;
+                               found = 1;
+                       }
+               }
+               if (start < last)
+                       last = start;
+       }
+       return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space.  We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS left enough space.
+ *
+ * On x86-64, if nothing is found below MAX_GAP_END the search is
+ * repeated above it.  The start of the chosen gap (or the built-in
+ * default if no gap was found) is stored in pci_mem_start.
+ */
+__init void e820_setup_gap(void)
+{
+       /* default start, and the smallest gap worth taking */
+       unsigned long gapstart = 0x10000000;
+       unsigned long gapsize = 0x400000;
+       int found;
+
+       found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+
+#ifdef CONFIG_X86_64
+       if (!found) {
+               printk(KERN_ERR
+       "PCI: Warning: Cannot find a gap in the 32bit address range\n"
+       "PCI: Unassigned devices with 32bit resource registers may break!\n");
+               found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0);
+               WARN_ON(!found);
+       }
+#endif
+
+       /*
+        * e820_reserve_resources_late protect stolen RAM already
+        */
+       pci_mem_start = gapstart;
+
+       printk(KERN_INFO
+              "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
+              pci_mem_start, gapstart, gapsize);
+}
+
+#undef e820
+
+#ifndef CONFIG_XEN
+/**
+ * Because of the size limitation of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via
+ * boot_params.e820_map; the others arrive in a SETUP_E820_EXT node of
+ * the struct setup_data linked list, which is parsed here.
+ *
+ * The entries in sdata->data are appended to the global e820 map, the
+ * map is sanitized again, and the result is printed as "extended".
+ * Return values of the append and sanitize steps are not checked.
+ */
+void __init parse_e820_ext(struct setup_data *sdata)
+{
+       struct e820entry *extmap = (struct e820entry *)(sdata->data);
+       int entries = sdata->len / sizeof(struct e820entry);
+
+       __append_e820_map(extmap, entries);
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+       printk(KERN_INFO "extended physical RAM map:\n");
+       _e820_print_map(&e820, "extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+       (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ *
+ * Scanning stops after the first entry that ends at or beyond limit_pfn.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+       unsigned long prev_end_pfn;
+       int idx;
+
+       prev_end_pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+       for (idx = 1; idx < e820.nr_map; idx++) {
+               struct e820entry *entry = &e820.map[idx];
+
+               /* hole between the previous entry and this one */
+               if (PFN_UP(entry->addr) > prev_end_pfn)
+                       register_nosave_region(prev_end_pfn,
+                                              PFN_UP(entry->addr));
+
+               prev_end_pfn = PFN_DOWN(entry->addr + entry->size);
+
+               /* the entry itself, unless it is RAM of either kind */
+               if (!(entry->type == E820_RAM ||
+                     entry->type == E820_RESERVED_KERN))
+                       register_nosave_region(PFN_UP(entry->addr),
+                                              prev_end_pfn);
+
+               if (prev_end_pfn >= limit_pfn)
+                       break;
+       }
+}
+#endif
+
+#ifdef CONFIG_ACPI
+/**
+ * Mark ACPI NVS memory regions, so that we can save/restore them during
+ * hibernation and the subsequent resume.
+ *
+ * Every E820_NVS entry of the global map is passed to
+ * acpi_nvs_register(); its return value is not checked.  Always
+ * returns 0.
+ */
+static int __init e820_mark_nvs_memory(void)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *entry = &e820.map[idx];
+
+               if (entry->type != E820_NVS)
+                       continue;
+
+               acpi_nvs_register(entry->addr, entry->size);
+       }
+
+       return 0;
+}
+core_initcall(e820_mark_nvs_memory);
+#endif
+#endif
+
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
+/*
+ * Allocate size bytes with the given alignment from memblock and mark
+ * the range as reserved in the saved memory map.
+ *
+ * With CONFIG_XEN:
+ *  - only the initial domain allocates; other domains get 0 back;
+ *  - size is rounded up to PAGE_SIZE << get_order(size) and align is
+ *    raised to at least PAGE_SIZE;
+ *  - after a successful allocation one of the *_create_contiguous_region()
+ *    helpers is called on the range, chosen by where the address lies
+ *    relative to max_pfn_mapped and the initial mapping; a non-zero
+ *    result from it makes this function return 0.
+ *    NOTE(review): the last argument (32) is presumably an address
+ *    width in bits -- confirm against the helpers.
+ *
+ * Returns the allocated address, or 0 on failure.
+ */
+u64 __init early_reserve_e820(u64 size, u64 align)
+{
+       u64 addr;
+#ifdef CONFIG_XEN
+       unsigned int order = get_order(size);
+       int rc;
+       unsigned long max_initmap_pfn;
+
+       if (!is_initial_xendomain())
+               return 0;
+       size = PAGE_SIZE << order;
+       if (align < PAGE_SIZE)
+               align = PAGE_SIZE;
+#endif
+       addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+       if (addr) {
+               /* record the allocation as reserved in the saved map, then re-sanitize it */
+               e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
+               printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+               update_e820_saved();
+       }
+#ifdef CONFIG_XEN
+       /* this else pairs with the "if (addr)" above */
+       else
+               return 0;
+       max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base))
+                                      + xen_start_info->nr_pt_frames
+                                      + 1 + (1 << (19 - PAGE_SHIFT)),
+                               1UL << (22 - PAGE_SHIFT));
+#ifdef CONFIG_X86_32
+       if ((addr >> PAGE_SHIFT)
+           < max(max_initmap_pfn, max_pfn_mapped))
+               rc = xen_create_contiguous_region((unsigned long)__va(addr),
+                                                 order, 32);
+#else
+       if ((addr >> PAGE_SHIFT) < max_pfn_mapped)
+               rc = xen_create_contiguous_region((unsigned long)__va(addr),
+                                                 order, 32);
+       else if ((addr >> PAGE_SHIFT) < max_initmap_pfn)
+               rc = xen_create_contiguous_region(__START_KERNEL_map + addr,
+                                                 order, 32);
+#endif
+       /* shared final branch of whichever if-chain was compiled in above */
+       else
+               rc = early_create_contiguous_region(addr >> PAGE_SHIFT,
+                                                   order, 32);
+       if (rc)
+               return 0;
+#endif
+
+       return addr;
+}
+#endif
+
+/*
+ * MAX_ARCH_PFN: upper bound on page frame numbers for this build.
+ * Every expansion is fully parenthesized so the macro can be used
+ * inside a larger expression without operator-precedence surprises.
+ */
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+#  define MAX_ARCH_PFN         (1ULL<<(40-PAGE_SHIFT))
+# else
+#  define MAX_ARCH_PFN         (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN (MAXMEM>>PAGE_SHIFT)
+#endif
+
+/*
+ * Find the highest page frame number we have available.
+ *
+ * Only entries of the given type are considered.  Entries starting at
+ * or above limit_pfn are ignored; the first entry that crosses
+ * limit_pfn ends the scan with limit_pfn as the result.  The result is
+ * capped at MAX_ARCH_PFN and logged before it is returned.
+ */
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+{
+       unsigned long max_arch_pfn = MAX_ARCH_PFN;
+       unsigned long last_pfn = 0;
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *entry = &e820.map[idx];
+               unsigned long first_pfn, past_pfn;
+
+               if (entry->type != type)
+                       continue;
+
+               first_pfn = entry->addr >> PAGE_SHIFT;
+               past_pfn = (entry->addr + entry->size) >> PAGE_SHIFT;
+
+               if (first_pfn >= limit_pfn)
+                       continue;
+               if (past_pfn > limit_pfn) {
+                       last_pfn = limit_pfn;
+                       break;
+               }
+               last_pfn = max(last_pfn, past_pfn);
+       }
+
+       last_pfn = min(last_pfn, max_arch_pfn);
+
+       printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
+                        last_pfn, max_arch_pfn);
+       return last_pfn;
+}
+/* Highest E820_RAM page frame number, limited only by MAX_ARCH_PFN. */
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+       unsigned long end_pfn = e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+
+       return end_pfn;
+}
+
+/* Highest E820_RAM page frame number below the 4GB boundary. */
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+       unsigned long end_pfn = e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+
+       return end_pfn;
+}
+
+/*
+ * Report a fatal early-boot error through both early_printk() and
+ * panic().
+ *
+ * msg is passed as a "%s" argument rather than as the format string
+ * itself, so a '%' in the text can never be interpreted as a conversion
+ * specifier.
+ */
+static void early_panic(char *msg)
+{
+       early_printk("%s", msg);
+       panic("%s", msg);
+}
+
+static int userdef __initdata;
+
+/*
+ * Handler for the "mem=" early parameter.
+ *
+ * Without CONFIG_XEN, "mem=nopentium" is special: on 32-bit it clears
+ * the PSE CPU capability (disabling the 4MB page tables), elsewhere it
+ * is rejected with a warning.
+ *
+ * Any other value is parsed as a size.  RAM above that size is removed
+ * from the e820 map; if the map then ends below the requested size, the
+ * last entry is extended (when it is RAM) or a new RAM region is added
+ * to make up the difference.
+ *
+ * Returns 0 on success, -EINVAL for a missing or zero size.
+ */
+static int __init parse_memopt(char *p)
+{
+       struct e820entry *tail;
+       u64 mem_size, current_end;
+
+       if (!p)
+               return -EINVAL;
+
+#ifndef CONFIG_XEN
+       if (strcmp(p, "nopentium") == 0) {
+#ifdef CONFIG_X86_32
+               setup_clear_cpu_cap(X86_FEATURE_PSE);
+               return 0;
+#else
+               printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
+               return -EINVAL;
+#endif
+       }
+#endif
+
+       userdef = 1;
+       mem_size = memparse(p, &p);
+       /* don't remove all of memory when handling "mem={invalid}" param */
+       if (!mem_size)
+               return -EINVAL;
+#ifdef CONFIG_XEN
+       /*
+        * A little less than 2% of available memory are needed for page
+        * tables, p2m map, and mem_map. Hence the maximum amount of memory
+        * we can potentially balloon up to can in no case exceed about 50
+        * times of what we've been given initially. Since even with that we
+        * won't be able to boot (due to various calculations done based on
+        * the total number of pages) we further restrict this to factor 32.
+        */
+       if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) {
+               u64 cap_pages = (u64)xen_start_info->nr_pages << 5;
+
+               pr_warn("mem=%Luk is invalid for an initial"
+                       " allocation of %luk, using %Luk\n",
+                       (unsigned long long)mem_size >> 10,
+                       xen_start_info->nr_pages << (PAGE_SHIFT - 10),
+                       (unsigned long long)cap_pages << (PAGE_SHIFT - 10));
+               mem_size = cap_pages << PAGE_SHIFT;
+       }
+#endif
+       e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+
+       tail = &e820.map[e820.nr_map - 1];
+       current_end = tail->addr + tail->size;
+       if (current_end >= mem_size)
+               return 0;
+
+       /*
+        * The e820 map ends before our requested size so
+        * extend the final entry to the requested address.
+        */
+       if (tail->type == E820_RAM)
+               tail->size = mem_size - tail->addr;
+       else
+               e820_add_region(current_end, mem_size - current_end, E820_RAM);
+
+       return 0;
+}
+early_param("mem", parse_memopt);
+
+#ifndef CONFIG_XEN
+/*
+ * Handler for the "memmap=" early parameter.
+ *
+ *   memmap=exactmap     discard the current map (entry count set to 0)
+ *   memmap=size@start   add a RAM region
+ *   memmap=size#start   add an ACPI region
+ *   memmap=size$start   add a reserved region
+ *   memmap=size         remove RAM above size
+ *
+ * Returns 0 on success and -EINVAL when no size could be parsed or when
+ * characters are left over after the parsed value.
+ */
+static int __init parse_memmap_opt(char *p)
+{
+       char *oldp;
+       u64 start_at, mem_size;
+       int region_type;
+
+       if (!p)
+               return -EINVAL;
+
+       if (strncmp(p, "exactmap", 8) == 0) {
+#ifdef CONFIG_CRASH_DUMP
+               /*
+                * If we are doing a crash dump, we still need to know
+                * the real mem size before original memory map is
+                * reset.
+                */
+               saved_max_pfn = e820_end_of_ram_pfn();
+#endif
+               e820.nr_map = 0;
+               userdef = 1;
+               return 0;
+       }
+
+       oldp = p;
+       mem_size = memparse(p, &p);
+       if (p == oldp)
+               return -EINVAL;
+
+       userdef = 1;
+       switch (*p) {
+       case '@':
+               region_type = E820_RAM;
+               break;
+       case '#':
+               region_type = E820_ACPI;
+               break;
+       case '$':
+               region_type = E820_RESERVED;
+               break;
+       default:
+               /* plain size: trim RAM above it */
+               e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+               return *p == '\0' ? 0 : -EINVAL;
+       }
+
+       start_at = memparse(p+1, &p);
+       e820_add_region(start_at, mem_size, region_type);
+
+       return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+#endif
+
+/*
+ * If the command line modified the memory map (userdef is set),
+ * sanitize the map once more and print it as the "user" map.  A
+ * negative result from sanitize_e820_map() is fatal.
+ */
+void __init finish_e820_parsing(void)
+{
+       u32 nr;
+
+       if (!userdef)
+               return;
+
+       nr = e820.nr_map;
+       if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+               early_panic("Invalid user supplied memory map");
+       e820.nr_map = nr;
+
+       printk(KERN_INFO "user-defined physical RAM map:\n");
+       _e820_print_map(&e820, "user");
+}
+
+/*
+ * Map an e820 type code to the resource name used for it.  Types that
+ * have no dedicated name are reported as "reserved".
+ */
+static inline const char *e820_type_to_string(int e820_type)
+{
+       const char *name = "reserved";
+
+       switch (e820_type) {
+       case E820_RAM:
+       case E820_RESERVED_KERN:
+               name = "System RAM";
+               break;
+       case E820_ACPI:
+               name = "ACPI Tables";
+               break;
+       case E820_NVS:
+               name = "ACPI Non-volatile Storage";
+               break;
+       case E820_UNUSABLE:
+               name = "Unusable memory";
+               break;
+       default:
+               break;
+       }
+
+       return name;
+}
+
+#ifdef CONFIG_XEN
+#define e820 machine_e820
+#endif
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ *
+ * One struct resource per map entry is allocated and remembered in
+ * e820_res.  An entry whose end address does not fit resource_size_t
+ * leaves its slot untouched.  Reserved entries starting at or above 1MB
+ * are filled in but not inserted here.  Every entry of e820_saved is
+ * additionally passed to firmware_map_add_early().
+ */
+static struct resource __initdata *e820_res;
+void __init e820_reserve_resources(void)
+{
+       struct resource *res;
+       u64 end;
+       int i;
+
+       res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
+       e820_res = res;
+       for (i = 0; i < e820.nr_map; i++, res++) {
+               struct e820entry *entry = &e820.map[i];
+
+               end = entry->addr + entry->size - 1;
+               if (end != (resource_size_t)end)
+                       continue;
+
+               res->name = e820_type_to_string(entry->type);
+               res->start = entry->addr;
+               res->end = end;
+               res->flags = IORESOURCE_MEM;
+
+               /*
+                * don't register the region that could be conflicted with
+                * pci device BAR resource and insert them later in
+                * pcibios_resource_survey()
+                */
+               if (entry->type == E820_RESERVED && res->start >= (1ULL<<20))
+                       continue;
+
+               if (entry->type != E820_NVS)
+                       res->flags |= IORESOURCE_BUSY;
+               insert_resource(&iomem_resource, res);
+       }
+
+       for (i = 0; i < e820_saved.nr_map; i++) {
+               struct e820entry *saved = &e820_saved.map[i];
+               u64 saved_last = saved->addr + saved->size - 1;
+
+               firmware_map_add_early(saved->addr, saved_last,
+                       e820_type_to_string(saved->type));
+       }
+}
+
+/*
+ * How much should we pad RAM ending depending on where it is?
+ *
+ *   below 1MB      -> 64kB
+ *   1MB .. 16MB    -> 1MB
+ *   16MB and above -> 64MB
+ */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+       unsigned long megabytes = pos >> 20;
+
+       if (megabytes >= 16)
+               return 64*1024*1024;
+
+       if (megabytes)
+               return 1024*1024;
+
+       return 64*1024;
+}
+
+#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
+
+/*
+ * Second resource-registration pass.
+ *
+ * First, every resource prepared in e820_res that has no parent yet and
+ * has a non-zero end is inserted with insert_resource_expand_to_fit().
+ * Then, for each RAM entry, the space between its end and the next
+ * ram_alignment() boundary is reserved as "RAM buffer".
+ */
+void __init e820_reserve_resources_late(void)
+{
+       int i;
+       struct resource *res;
+
+       res = e820_res;
+       for (i = 0; i < e820.nr_map; i++) {
+               if (!res->parent && res->end)
+                       insert_resource_expand_to_fit(&iomem_resource, res);
+               res++;
+       }
+
+       /*
+        * Try to bump up RAM regions to reasonable boundaries to
+        * avoid stolen RAM:
+        */
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *entry = &e820.map[i];
+               u64 start, end;
+
+               if (entry->type != E820_RAM)
+                       continue;
+               start = entry->addr + entry->size;
+               end = round_up(start, ram_alignment(start)) - 1;
+               if (end > MAX_RESOURCE_SIZE)
+                       end = MAX_RESOURCE_SIZE;
+               if (start >= end)
+                       continue;
+               /* terminate the line: nothing after this continues the message */
+               printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx\n",
+                              start, end);
+               reserve_region_with_split(&iomem_resource, start, end,
+                                         "RAM buffer");
+       }
+}
+
+#undef e820
+
+char *__init default_machine_specific_memory_setup(void)
+{
+       int rc, nr_map;
+       unsigned long maxmem;
+       struct xen_memory_map memmap;
+       static struct e820entry __initdata map[E820MAX];
+
+       memmap.nr_entries = E820MAX;
+       set_xen_guest_handle(memmap.buffer, map);
+
+       rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+       if (rc == -ENOSYS) {
+               memmap.nr_entries = 1;
+               map[0].addr = 0ULL;
+               map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages);
+               /* 8MB slack (to balance backend allocations). */
+               map[0].size += 8ULL << 20;
+               map[0].type = E820_RAM;
+               rc = 0;
+       }
+       BUG_ON(rc);
+
+       nr_map = memmap.nr_entries;
+       sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map);
+
+       if (append_e820_map(map, nr_map) < 0)
+               BUG();
+
+#ifdef CONFIG_XEN
+       /* See the comment in parse_memopt(). */
+       for (maxmem = rc = 0; rc < e820.nr_map; ++rc)
+               if (e820.map[rc].type == E820_RAM)
+                       maxmem += e820.map[rc].size >> PAGE_SHIFT;
+       if (is_initial_xendomain()) {
+               domid_t domid = DOMID_SELF;
+
+               rc = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+               if (rc > 0 && maxmem > rc)
+                       maxmem = rc;
+       }
+       if ((maxmem >> 5) > xen_start_info->nr_pages) {
+               unsigned long long size = (u64)xen_start_info->nr_pages << 5;
+
+               pr_warn("maxmem of %luM is invalid for an initial"
+                       " allocation of %luM, using %LuM\n",
+                       maxmem >> (20 - PAGE_SHIFT),
+                       xen_start_info->nr_pages >> (20 - PAGE_SHIFT),
+                       size >> (20 - PAGE_SHIFT));
+               size <<= PAGE_SHIFT;
+               e820_remove_range(size, ULLONG_MAX - size, E820_RAM, 1);
+       }
+
+       if (is_initial_xendomain()) {
+               memmap.nr_entries = E820MAX;
+               set_xen_guest_handle(memmap.buffer, machine_e820.map);
+
+               if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
+                       BUG();
+               machine_e820.nr_map = memmap.nr_entries;
+       }
+#endif
+
+       return "Xen";
+}
+
+/*
+ * Run the platform memory-setup hook and print the resulting maps.
+ *
+ * Unless built as an unprivileged Xen guest: the initial Xen domain
+ * prints the machine memory map, while every other case copies e820
+ * into e820_saved.  The physical RAM map is printed in all cases,
+ * labelled with the string returned by the hook.
+ */
+void __init setup_memory_map(void)
+{
+       char *who;
+
+       who = x86_init.resources.memory_setup();
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
+#ifdef CONFIG_XEN
+       if (is_initial_xendomain()) {
+               printk(KERN_INFO "Xen-provided machine memory map:\n");
+               _e820_print_map(&machine_e820, "BIOS");
+       } else
+#endif
+               /* with CONFIG_XEN this is the body of the else above; otherwise it is unconditional */
+               memcpy(&e820_saved, &e820, sizeof(struct e820map));
+#endif
+       printk(KERN_INFO "Xen-provided physical RAM map:\n");
+       _e820_print_map(&e820, who);
+}
+
+void __init memblock_x86_fill(void)
+{
+       int i;
+       u64 end;
+
+       /*
+        * EFI may have more than 128 entries
+        * We are safe to enable resizing, beause memblock_x86_fill()
+        * is rather later for x86
+        */
+       memblock_allow_resize();
+
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *ei = &e820.map[i];
+
+               end = ei->addr + ei->size;
+               if (end != (resource_size_t)end)
+                       continue;
+
+               if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+                       continue;
+
+               memblock_add(ei->addr, ei->size);
+       }
+
+#ifdef CONFIG_XEN
+       if (max_pfn > xen_start_info->nr_pages)
+               memblock_reserve(PFN_PHYS(xen_start_info->nr_pages),
+                                PFN_PHYS(max_pfn - xen_start_info->nr_pages));
+#endif
+
+       memblock_dump_all();
+}
+
+/*
+ * Tell the page allocator how many pages below MAX_DMA_PFN are already
+ * in use: the number of memory pages below that limit minus the number
+ * of those that memblock still reports as free.  Compiled to an empty
+ * function unless this is a 64-bit, non-Xen build.
+ */
+void __init memblock_find_dma_reserve(void)
+{
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+       u64 dma_pages = 0, dma_free_pages = 0;
+       unsigned long start_pfn, end_pfn;
+       phys_addr_t start, end;
+       int i;
+       u64 u;
+
+       /*
+        * need to find out used area below MAX_DMA_PFN
+        * need to use memblock to get free size in [0, MAX_DMA_PFN]
+        * at first, and assume boot_mem will not take below MAX_DMA_PFN
+        */
+       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+               start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+               end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+               dma_pages += end_pfn - start_pfn;
+       }
+
+       for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+               start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+               end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+               /* rounding inwards can leave nothing of a small free range */
+               if (end_pfn <= start_pfn)
+                       continue;
+               dma_free_pages += end_pfn - start_pfn;
+       }
+
+       set_dma_reserve(dma_pages - dma_free_pages);
+#endif
+}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c

index 62d61e9..664d1a7 100644 (file)
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -935,7 +935,8 @@ void __init e820_reserve_resources(void)
                  * pcibios_resource_survey()
                  */
                 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
-                       res->flags |= IORESOURCE_BUSY;
+                       if (e820.map[i].type != E820_NVS)
+                               res->flags |= IORESOURCE_BUSY;
                         insert_resource(&iomem_resource, res);
                 }
                 res++;
diff --git a/arch/x86/kernel/early_printk-xen.c b/arch/x86/kernel/early_printk-xen.c

new file mode 100644 (file)

index 0000000..ea02752
--- /dev/null
+++ b/arch/x86/kernel/early_printk-xen.c
@@ -0,0 +1,291 @@
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+#include <asm/fcntl.h>
+#include <asm/setup.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+#include <asm/mrst.h>
+#include <asm/pgtable.h>
+#include <linux/usb/ehci_def.h>
+
+#ifndef CONFIG_XEN
+/* Simple VGA output */
+#define VGABASE                (__ISA_IO_base + 0xb8000)
+
+static int max_ypos = 25, max_xpos = 80;
+static int current_ypos = 25, current_xpos;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+       char c;
+       int  i, k, j;
+
+       while ((c = *str++) != '\0' && n-- > 0) {
+               if (current_ypos >= max_ypos) {
+                       /* scroll 1 line up */
+                       for (k = 1, j = 0; k < max_ypos; k++, j++) {
+                               for (i = 0; i < max_xpos; i++) {
+                                       writew(readw(VGABASE+2*(max_xpos*k+i)),
+                                              VGABASE + 2*(max_xpos*j + i));
+                               }
+                       }
+                       for (i = 0; i < max_xpos; i++)
+                               writew(0x720, VGABASE + 2*(max_xpos*j + i));
+                       current_ypos = max_ypos-1;
+               }
+#ifdef CONFIG_KGDB_KDB
+               if (c == '\b') {
+                       if (current_xpos > 0)
+                               current_xpos--;
+               } else if (c == '\r') {
+                       current_xpos = 0;
+               } else
+#endif
+               if (c == '\n') {
+                       current_xpos = 0;
+                       current_ypos++;
+               } else if (c != '\r')  {
+                       writew(((0x7 << 8) | (unsigned short) c),
+                              VGABASE + 2*(max_xpos*current_ypos +
+                                               current_xpos++));
+                       if (current_xpos >= max_xpos) {
+                               current_xpos = 0;
+                               current_ypos++;
+                       }
+               }
+       }
+}
+
+static struct console early_vga_console = {
+       .name =         "earlyvga",
+       .write =        early_vga_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+static int early_serial_base = 0x3f8;  /* ttyS0 */
+
+#define XMTRDY          0x20
+
+#define DLAB           0x80
+
+#define TXR             0       /*  Transmit register (WRITE) */
+#define RXR             0       /*  Receive register  (READ)  */
+#define IER             1       /*  Interrupt Enable          */
+#define IIR             2       /*  Interrupt ID              */
+#define FCR             2       /*  FIFO control              */
+#define LCR             3       /*  Line control              */
+#define MCR             4       /*  Modem control             */
+#define LSR             5       /*  Line Status               */
+#define MSR             6       /*  Modem Status              */
+#define DLL             0       /*  Divisor Latch Low         */
+#define DLH             1       /*  Divisor latch High        */
+
+static int early_serial_putc(unsigned char ch)
+{
+       unsigned timeout = 0xffff;      /* max LSR polls before giving up */
+
+       while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+               cpu_relax();
+       outb(ch, early_serial_base + TXR);      /* sent even on timeout */
+       return timeout ? 0 : -1;        /* -1: poll budget ran out */
+}
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+       while (*s && n-- > 0) {         /* stop at NUL or after n chars */
+               if (*s == '\n')         /* emit CR before each LF */
+                       early_serial_putc('\r');
+               early_serial_putc(*s);
+               s++;
+       }
+}
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+       unsigned char c;
+       unsigned divisor;
+       unsigned baud = DEFAULT_BAUD;
+       char *e;
+
+       if (*s == ',')
+               ++s;
+
+       if (*s) {
+               unsigned port;
+               if (!strncmp(s, "0x", 2)) {
+                       early_serial_base = simple_strtoul(s, &e, 16);
+               } else {
+                       static const int __initconst bases[] = { 0x3f8, 0x2f8 };
+
+                       if (!strncmp(s, "ttyS", 4))
+                               s += 4;
+                       port = simple_strtoul(s, &e, 10);
+                       if (port > 1 || s == e)
+                               port = 0;
+                       early_serial_base = bases[port];
+               }
+               s += strcspn(s, ",");
+               if (*s == ',')
+                       s++;
+       }
+
+       outb(0x3, early_serial_base + LCR);     /* 8n1 */
+       outb(0, early_serial_base + IER);       /* no interrupt */
+       outb(0, early_serial_base + FCR);       /* no fifo */
+       outb(0x3, early_serial_base + MCR);     /* DTR + RTS */
+
+       if (*s) {
+               baud = simple_strtoul(s, &e, 0);
+               if (baud == 0 || s == e)
+                       baud = DEFAULT_BAUD;
+       }
+
+       divisor = 115200 / baud;
+       c = inb(early_serial_base + LCR);
+       outb(c | DLAB, early_serial_base + LCR);
+       outb(divisor & 0xff, early_serial_base + DLL);
+       outb((divisor >> 8) & 0xff, early_serial_base + DLH);
+       outb(c & ~DLAB, early_serial_base + LCR);
+}
+
+#else /* CONFIG_XEN */
+
+static void
+early_serial_write(struct console *con, const char *s, unsigned count)
+{
+       int n;
+
+       while (count > 0) {
+               n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
+               if (n <= 0)
+                       break;
+               count -= n;
+               s += n;
+       }
+} 
+
+static __init void early_serial_init(char *s)
+{
+}
+
+/*
+ * No early VGA console on Xen, as we do not have convenient ISA-space
+ * mappings. Someone should fix this for domain 0. For now, use fake serial.
+ */
+#define early_vga_console early_serial_console
+
+#endif
+
+static struct console early_serial_console = {
+       .name =         "earlyser",
+       .write =        early_serial_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/* Direct interface for emergencies */
+static struct console *early_console = &early_vga_console;
+static int __initdata early_console_initialized;
+
+asmlinkage void early_printk(const char *fmt, ...)
+{
+       char buf[512];          /* formatted output is bounded by this size */
+       int n;
+       va_list ap;
+
+       va_start(ap, fmt);
+       n = vscnprintf(buf, sizeof(buf), fmt, ap);
+       early_console->write(early_console, buf, n);    /* direct write hook */
+       va_end(ap);
+}
+
+static inline void early_console_register(struct console *con, int keep_early)
+{
+       if (early_console->index != -1) {
+               printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
+                      con->name);
+               return;
+       }
+       early_console = con;
+       if (keep_early)
+               early_console->flags &= ~CON_BOOT;
+       else
+               early_console->flags |= CON_BOOT;
+       register_console(early_console);
+}
+
+static int __init setup_early_printk(char *buf)
+{
+       int keep;
+
+       if (!buf)
+               return 0;
+
+       if (early_console_initialized)
+               return 0;
+       early_console_initialized = 1;
+
+       keep = (strstr(buf, "keep") != NULL);
+
+       while (*buf != '\0') {
+               if (!strncmp(buf, "serial", 6)) {
+                       buf += 6;
+                       early_serial_init(buf);
+                       early_console_register(&early_serial_console, keep);
+                       if (!strncmp(buf, ",ttyS", 5))
+                               buf += 5;
+               }
+               if (!strncmp(buf, "ttyS", 4)) {
+                       early_serial_init(buf + 4);
+                       early_console_register(&early_serial_console, keep);
+               }
+#ifndef CONFIG_XEN
+               if (!strncmp(buf, "vga", 3) &&
+                   boot_params.screen_info.orig_video_isVGA == 1) {
+                       max_xpos = boot_params.screen_info.orig_video_cols;
+                       max_ypos = boot_params.screen_info.orig_video_lines;
+                       current_ypos = boot_params.screen_info.orig_y;
+#else
+               if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) {
+#endif
+                       early_console_register(&early_vga_console, keep);
+               }
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+               if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
+                       early_console_register(&early_dbgp_console, keep);
+#endif
+#ifdef CONFIG_HVC_XEN
+               if (!strncmp(buf, "xen", 3))
+                       early_console_register(&xenboot_console, keep);
+#endif
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
+               if (!strncmp(buf, "mrst", 4)) {
+                       mrst_early_console_init();
+                       early_console_register(&early_mrst_console, keep);
+               }
+
+               if (!strncmp(buf, "hsu", 3)) {
+                       hsu_early_console_init(buf + 3);
+                       early_console_register(&early_hsu_console, keep);
+               }
+#endif
+               buf++;
+       }
+       return 0;
+}
+
+early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/entry_32-xen.S b/arch/x86/kernel/entry_32-xen.S

new file mode 100644 (file)

index 0000000..35a899c
--- /dev/null
+++ b/arch/x86/kernel/entry_32-xen.S
@@ -0,0 +1,1725 @@
+/*
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'syscall_exit':
+ *     ptrace needs to have all regs on the stack.
+ *     if the order here is changed, it needs to be
+ *     updated in fork.c:copy_process, signal.c:do_signal,
+ *     ptrace.c and ptrace.h
+ *
+ *      0(%esp) - %ebx
+ *      4(%esp) - %ecx
+ *      8(%esp) - %edx
+ *       C(%esp) - %esi
+ *     10(%esp) - %edi
+ *     14(%esp) - %ebp
+ *     18(%esp) - %eax
+ *     1C(%esp) - %ds
+ *     20(%esp) - %es
+ *     24(%esp) - %fs
+ *     28(%esp) - %gs          saved iff !CONFIG_X86_32_LAZY_GS
+ *     2C(%esp) - orig_eax
+ *     30(%esp) - %eip
+ *     34(%esp) - %cs
+ *     38(%esp) - %eflags
+ *     3C(%esp) - %oldesp
+ *     40(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/dwarf2.h>
+#include <asm/processor-flags.h>
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <xen/interface/xen.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386                (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE           0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit syscall_trace_entry
+#define sysexit_audit  syscall_exit_work
+#endif
+
+       .section .entry.text, "ax"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+/* Pseudo-eflags. */
+NMI_MASK       = 0x80000000
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+#define preempt_stop(clobbers)
+#define resume_kernel          restore_all
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+       testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
+       jz 1f
+       TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc.  Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+       pushl_cfi $0
+.endm
+.macro POP_GS pop=0
+       addl $(4 + \pop), %esp
+       CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else  /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+       pushl_cfi %gs
+       /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98:    popl_cfi %gs
+       /*CFI_RESTORE gs*/
+  .if \pop <> 0
+       add $\pop, %esp
+       CFI_ADJUST_CFA_OFFSET -\pop
+  .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99:    movl $0, (%esp)
+       jmp 98b
+.section __ex_table, "a"
+       .align 4
+       .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98:    mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99:    movl $0, PT_GS(%esp)
+       jmp 98b
+.section __ex_table, "a"
+       .align 4
+       .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+       movl %gs, \reg
+       /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+       movl \reg, PT_GS(%esp)
+       /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+       movl $(__KERNEL_STACK_CANARY), \reg
+       movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+       cld
+       PUSH_GS
+       pushl_cfi %fs
+       /*CFI_REL_OFFSET fs, 0;*/
+       pushl_cfi %es
+       /*CFI_REL_OFFSET es, 0;*/
+       pushl_cfi %ds
+       /*CFI_REL_OFFSET ds, 0;*/
+       pushl_cfi %eax
+       CFI_REL_OFFSET eax, 0
+       pushl_cfi %ebp
+       CFI_REL_OFFSET ebp, 0
+       pushl_cfi %edi
+       CFI_REL_OFFSET edi, 0
+       pushl_cfi %esi
+       CFI_REL_OFFSET esi, 0
+       pushl_cfi %edx
+       CFI_REL_OFFSET edx, 0
+       pushl_cfi %ecx
+       CFI_REL_OFFSET ecx, 0
+       pushl_cfi %ebx
+       CFI_REL_OFFSET ebx, 0
+       movl $(__USER_DS), %edx
+       movl %edx, %ds
+       movl %edx, %es
+       movl $(__KERNEL_PERCPU), %edx
+       movl %edx, %fs
+       SET_KERNEL_GS %edx
+.endm
+
+.macro RESTORE_INT_REGS
+       popl_cfi %ebx
+       CFI_RESTORE ebx
+       popl_cfi %ecx
+       CFI_RESTORE ecx
+       popl_cfi %edx
+       CFI_RESTORE edx
+       popl_cfi %esi
+       CFI_RESTORE esi
+       popl_cfi %edi
+       CFI_RESTORE edi
+       popl_cfi %ebp
+       CFI_RESTORE ebp
+       popl_cfi %eax
+       CFI_RESTORE eax
+.endm
+
+.macro RESTORE_REGS pop=0
+       RESTORE_INT_REGS
+1:     popl_cfi %ds
+       /*CFI_RESTORE ds;*/
+2:     popl_cfi %es
+       /*CFI_RESTORE es;*/
+3:     popl_cfi %fs
+       /*CFI_RESTORE fs;*/
+       POP_GS \pop
+.pushsection .fixup, "ax"
+4:     movl $0, (%esp)
+       jmp 1b
+5:     movl $0, (%esp)
+       jmp 2b
+6:     movl $0, (%esp)
+       jmp 3b
+.section __ex_table, "a"
+       .align 4
+       .long 1b, 4b
+       .long 2b, 5b
+       .long 3b, 6b
+.popsection
+       POP_GS_EX
+.endm
+
+.macro RING0_INT_FRAME
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA esp, 3*4
+       /*CFI_OFFSET cs, -2*4;*/
+       CFI_OFFSET eip, -3*4
+.endm
+
+.macro RING0_EC_FRAME
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA esp, 4*4
+       /*CFI_OFFSET cs, -2*4;*/
+       CFI_OFFSET eip, -3*4
+.endm
+
+.macro RING0_PTREGS_FRAME
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
+       /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
+       CFI_OFFSET eip, PT_EIP-PT_OLDESP
+       /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
+       /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
+       CFI_OFFSET eax, PT_EAX-PT_OLDESP
+       CFI_OFFSET ebp, PT_EBP-PT_OLDESP
+       CFI_OFFSET edi, PT_EDI-PT_OLDESP
+       CFI_OFFSET esi, PT_ESI-PT_OLDESP
+       CFI_OFFSET edx, PT_EDX-PT_OLDESP
+       CFI_OFFSET ecx, PT_ECX-PT_OLDESP
+       CFI_OFFSET ebx, PT_EBX-PT_OLDESP
+.endm
+
+ENTRY(ret_from_fork)
+       CFI_STARTPROC
+       pushl_cfi %eax
+       call schedule_tail
+       GET_THREAD_INFO(%ebp)
+       popl_cfi %eax
+       pushl_cfi $0x0202               # Reset kernel eflags
+       popfl_cfi
+       jmp syscall_exit
+       CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * Interrupt exit functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+       # userspace resumption stub bypassing syscall exit tracing
+       ALIGN
+       RING0_PTREGS_FRAME
+ret_from_exception:
+       preempt_stop(CLBR_ANY)
+ret_from_intr:
+       GET_THREAD_INFO(%ebp)
+resume_userspace_sig:
+#ifdef CONFIG_VM86
+       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS and CS
+       movb PT_CS(%esp), %al
+       andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+       /*
+        * We can be coming here from a syscall done in the kernel space,
+        * e.g. a failed kernel_execve().
+        */
+       movl PT_CS(%esp), %eax
+       andl $SEGMENT_RPL_MASK, %eax
+#endif
+       cmpl $USER_RPL, %eax
+       jb resume_kernel                # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+       LOCKDEP_SYS_EXIT
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
+                                       # int/exception return?
+       jne work_pending
+       jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
+       jnz restore_all
+need_resched:
+       movl TI_flags(%ebp), %ecx       # need_resched set ?
+       testb $_TIF_NEED_RESCHED, %cl
+       jz restore_all
+       testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
+       jz restore_all
+       call preempt_schedule_irq
+       jmp need_resched
+END(resume_kernel)
+#endif
+       CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
+
+       # sysenter call handler stub
+ENTRY(ia32_sysenter_target)
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA esp, 0
+       CFI_REGISTER esp, ebp
+       movl SYSENTER_stack_sp0(%esp),%esp
+sysenter_past_esp:
+       /*
+        * Interrupts are disabled here, but we can't trace it until
+        * enough kernel state to call TRACE_IRQS_OFF can be called - but
+        * we immediately enable interrupts at that point anyway.
+        */
+       pushl_cfi $__USER_DS
+       /*CFI_REL_OFFSET ss, 0*/
+       pushl_cfi %ebp
+       CFI_REL_OFFSET esp, 0
+       pushfl_cfi
+       orl $X86_EFLAGS_IF, (%esp)
+       pushl_cfi $__USER_CS
+       /*CFI_REL_OFFSET cs, 0*/
+       /*
+        * Push current_thread_info()->sysenter_return to the stack.
+        * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+        * pushed above; +8 corresponds to copy_thread's esp0 setting.
+        */
+       pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+       CFI_REL_OFFSET eip, 0
+
+       pushl_cfi %eax
+       SAVE_ALL
+       ENABLE_INTERRUPTS(CLBR_NONE)
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+       cmpl $__PAGE_OFFSET-3,%ebp
+       jae syscall_fault
+1:     movl (%ebp),%ebp
+       movl %ebp,PT_EBP(%esp)
+.section __ex_table,"a"
+       .align 4
+       .long 1b,syscall_fault
+.previous
+
+       GET_THREAD_INFO(%ebp)
+
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+       jnz sysenter_audit
+sysenter_do_call:
+       cmpl $(NR_syscalls), %eax
+       jae syscall_badsys
+       call *sys_call_table(,%eax,4)
+       movl %eax,PT_EAX(%esp)
+       LOCKDEP_SYS_EXIT
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       testl $_TIF_ALLWORK_MASK, %ecx
+       jne sysexit_audit
+sysenter_exit:
+/* if something modifies registers it must also disable sysexit */
+       movl PT_EIP(%esp), %edx
+       movl PT_OLDESP(%esp), %ecx
+       xorl %ebp,%ebp
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+       GET_VCPU_INFO
+#endif
+       TRACE_IRQS_ON
+1:     mov  PT_FS(%esp), %fs
+       PTGS_TO_GS
+       ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       jnz syscall_trace_entry
+       addl $4,%esp
+       CFI_ADJUST_CFA_OFFSET -4
+       /* %esi already in 8(%esp)         6th arg: 4th syscall arg */
+       /* %edx already in 4(%esp)         5th arg: 3rd syscall arg */
+       /* %ecx already in 0(%esp)         4th arg: 2nd syscall arg */
+       movl %ebx,%ecx                  /* 3rd arg: 1st syscall arg */
+       movl %eax,%edx                  /* 2nd arg: syscall number */
+       movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
+       call __audit_syscall_entry
+       pushl_cfi %ebx
+       movl PT_EAX(%esp),%eax          /* reload syscall number */
+       jmp sysenter_do_call
+
+sysexit_audit:
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+       jne syscall_exit_work
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_ANY)
+       movl %eax,%edx          /* second arg, syscall return value */
+       cmpl $-MAX_ERRNO,%eax   /* is it an error ? */
+       setbe %al               /* 1 if so, 0 if not */
+       movzbl %al,%eax         /* zero-extend that */
+       call __audit_syscall_exit
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+       jne syscall_exit_work
+       movl PT_EAX(%esp),%eax  /* reload syscall return value */
+       jmp sysenter_exit
+#endif
+
+       CFI_ENDPROC
+.pushsection .fixup,"ax"
+2:     movl $0,PT_FS(%esp)
+       jmp 1b
+.section __ex_table,"a"
+       .align 4
+       .long 1b,2b
+.popsection
+       PTGS_TO_GS_EX
+ENDPROC(ia32_sysenter_target)
+
+       # pv sysenter call handler stub
+ENTRY(ia32pv_sysenter_target)
+       RING0_INT_FRAME
+       movl $__USER_DS,16(%esp)
+       movl %ebp,12(%esp)
+       movl $__USER_CS,4(%esp)
+       addl $4,%esp
+       CFI_ADJUST_CFA_OFFSET -4
+       /* +4*4 is SS:ESP,EFLAGS,CS. +8 is esp0 setting. */
+       pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+       cmpl $__PAGE_OFFSET-3,%ebp
+       jae syscall_fault
+1:     movl (%ebp),%ebp
+.section __ex_table,"a"
+       .align 4
+       .long 1b,syscall_fault
+.previous
+       jmp system_call
+       CFI_ENDPROC
+ENDPROC(ia32pv_sysenter_target)
+
+/*
+ * syscall stub including irq exit should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
+       # system call handler stub
+ENTRY(system_call)
+       RING0_INT_FRAME                 # can't unwind into user space anyway
+       pushl_cfi %eax                  # save orig_eax
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+                                       # system call tracing in operation / emulation
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+       jnz syscall_trace_entry
+       cmpl $(NR_syscalls), %eax
+       jae syscall_badsys
+syscall_call:
+       call *sys_call_table(,%eax,4)
+       movl %eax,PT_EAX(%esp)          # store the return value
+syscall_exit:
+       LOCKDEP_SYS_EXIT
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       testl $_TIF_ALLWORK_MASK, %ecx  # current->work
+       jne syscall_exit_work
+
+restore_all:
+       TRACE_IRQS_IRET
+restore_all_notrace:
+#ifndef CONFIG_XEN
+       movl PT_EFLAGS(%esp), %eax      # mix EFLAGS, SS and CS
+       # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+       # are returning to the kernel.
+       # See comments in process.c:copy_thread() for details.
+       movb PT_OLDSS(%esp), %ah
+       movb PT_CS(%esp), %al
+       andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+       cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+       CFI_REMEMBER_STATE
+       je ldt_ss                       # returning to user-space with LDT SS
+restore_nocheck:
+#else
+restore_nocheck:
+       movl PT_EFLAGS(%esp), %eax
+       testl $(X86_EFLAGS_VM|NMI_MASK), %eax
+       CFI_REMEMBER_STATE
+       jnz hypervisor_iret
+       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
+       GET_VCPU_INFO
+       andb evtchn_upcall_mask(%esi),%al
+       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
+       CFI_REMEMBER_STATE
+       jnz restore_all_enable_events   #        != 0 => enable event delivery
+#endif
+       RESTORE_REGS 4                  # skip orig_eax/error_code
+irq_return:
+       INTERRUPT_RETURN
+.section .fixup,"ax"
+ENTRY(iret_exc)
+       pushl $0                        # no error code
+       pushl $do_iret_error
+       jmp error_code
+.previous
+.section __ex_table,"a"
+       .align 4
+       .long irq_return,iret_exc
+.previous
+
+       CFI_RESTORE_STATE
+#ifndef CONFIG_XEN
+ldt_ss:
+       larl PT_OLDSS(%esp), %eax
+       jnz restore_nocheck
+       testl $0x00400000, %eax         # returning to 32bit stack?
+       jnz restore_nocheck             # allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+       /*
+        * The kernel can't run on a non-flat stack if paravirt mode
+        * is active.  Rather than try to fixup the high bits of
+        * ESP, bypass this code entirely.  This may break DOSemu
+        * and/or Wine support in a paravirt VM, although the option
+        * is still available to implement the setting of the high
+        * 16-bits in the INTERRUPT_RETURN paravirt-op.
+        */
+       cmpl $0, pv_info+PARAVIRT_enabled
+       jne restore_nocheck
+#endif
+
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+       mov %esp, %edx                  /* load kernel esp */
+       mov PT_OLDESP(%esp), %eax       /* load userspace esp */
+       mov %dx, %ax                    /* eax: new kernel esp */
+       sub %eax, %edx                  /* offset (low word is 0) */
+       shr $16, %edx
+       mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+       mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+       pushl_cfi $__ESPFIX_SS
+       pushl_cfi %eax                  /* new kernel esp */
+       /* Disable interrupts, but do not irqtrace this section: we
+        * will soon execute iret and the tracer was already set to
+        * the irqstate after the iret */
+       DISABLE_INTERRUPTS(CLBR_EAX)
+       lss (%esp), %esp                /* switch to espfix segment */
+       CFI_ADJUST_CFA_OFFSET -8
+       jmp restore_nocheck
+#else
+        ALIGN
+restore_all_enable_events:
+       TRACE_IRQS_ON
+       __ENABLE_INTERRUPTS
+scrit: /**** START OF CRITICAL REGION ****/
+       __TEST_PENDING
+       jnz  14f                        # process more events if necessary...
+       RESTORE_REGS 4
+1:     INTERRUPT_RETURN
+.section __ex_table,"a"
+       .align 4
+       .long 1b,iret_exc
+.previous
+14:    __DISABLE_INTERRUPTS
+       TRACE_IRQS_OFF
+ecrit:  /**** END OF CRITICAL REGION ****/
+       jmp  .Ldo_upcall
+
+       CFI_RESTORE_STATE
+hypervisor_iret:
+       andl $~NMI_MASK, PT_EFLAGS(%esp)
+       RESTORE_REGS 4
+       jmp  hypercall_page + (__HYPERVISOR_iret * 32)
+#endif
+       CFI_ENDPROC
+ENDPROC(system_call)
+
+       # perform work that needs to be done immediately before resumption
+       ALIGN
+       RING0_PTREGS_FRAME              # can't unwind into user space anyway
+work_pending:
+       testb $_TIF_NEED_RESCHED, %cl
+       jz work_notifysig
+work_resched:
+       call schedule
+       LOCKDEP_SYS_EXIT
+       DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       TRACE_IRQS_OFF
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
+                                       # than syscall tracing?
+       jz restore_all
+       testb $_TIF_NEED_RESCHED, %cl
+       jnz work_resched
+
+work_notifysig:                                # deal with pending signals and
+                                       # notify-resume requests
+#ifdef CONFIG_VM86
+       testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+       movl %esp, %eax
+       jne work_notifysig_v86          # returning to kernel-space or
+                                       # vm86-space
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp resume_userspace_sig
+
+       ALIGN
+work_notifysig_v86:
+       pushl_cfi %ecx                  # save ti_flags for do_notify_resume
+       call save_v86_state             # %eax contains pt_regs pointer
+       popl_cfi %ecx
+       movl %eax, %esp
+#else
+       movl %esp, %eax
+#endif
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp resume_userspace_sig
+END(work_pending)
+
+       # perform syscall entry tracing
+       ALIGN
+syscall_trace_entry:
+       movl $-ENOSYS,PT_EAX(%esp)
+       movl %esp, %eax
+       call syscall_trace_enter
+       /* What it returned is what we'll actually use.  */
+       cmpl $(NR_syscalls), %eax
+       jnae syscall_call
+       jmp syscall_exit
+END(syscall_trace_entry)
+
+       # perform syscall exit tracing
+       ALIGN
+syscall_exit_work:
+       testl $_TIF_WORK_SYSCALL_EXIT, %ecx
+       jz work_pending
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_ANY)     # could let syscall_trace_leave() call
+                                       # schedule() instead
+       movl %esp, %eax
+       call syscall_trace_leave
+       jmp resume_userspace
+END(syscall_exit_work)
+       CFI_ENDPROC
+
+       RING0_INT_FRAME                 # can't unwind into user space anyway
+syscall_fault:
+       GET_THREAD_INFO(%ebp)
+       movl $-EFAULT,PT_EAX(%esp)
+       jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+       movl $-ENOSYS,PT_EAX(%esp)
+       jmp resume_userspace
+END(syscall_badsys)
+       CFI_ENDPROC
+/*
+ * End of kprobes section
+ */
+       .popsection
+
+/*
+ * System calls that need a pt_regs pointer.
+ */
+#define PTREGSCALL0(name) \
+ENTRY(ptregs_##name) ;  \
+       leal 4(%esp),%eax; \
+       jmp sys_##name; \
+ENDPROC(ptregs_##name)
+
+#define PTREGSCALL1(name) \
+ENTRY(ptregs_##name) ; \
+       leal 4(%esp),%edx; \
+       movl (PT_EBX+4)(%esp),%eax; \
+       jmp sys_##name; \
+ENDPROC(ptregs_##name)
+
+#define PTREGSCALL2(name) \
+ENTRY(ptregs_##name) ; \
+       leal 4(%esp),%ecx; \
+       movl (PT_ECX+4)(%esp),%edx; \
+       movl (PT_EBX+4)(%esp),%eax; \
+       jmp sys_##name; \
+ENDPROC(ptregs_##name)
+
+#define PTREGSCALL3(name) \
+ENTRY(ptregs_##name) ; \
+       CFI_STARTPROC; \
+       leal 4(%esp),%eax; \
+       pushl_cfi %eax; \
+       movl PT_EDX(%eax),%ecx; \
+       movl PT_ECX(%eax),%edx; \
+       movl PT_EBX(%eax),%eax; \
+       call sys_##name; \
+       addl $4,%esp; \
+       CFI_ADJUST_CFA_OFFSET -4; \
+       ret; \
+       CFI_ENDPROC; \
+ENDPROC(ptregs_##name)
+
+PTREGSCALL1(iopl)
+PTREGSCALL0(fork)
+PTREGSCALL0(vfork)
+PTREGSCALL3(execve)
+PTREGSCALL2(sigaltstack)
+PTREGSCALL0(sigreturn)
+PTREGSCALL0(rt_sigreturn)
+PTREGSCALL2(vm86)
+PTREGSCALL1(vm86old)
+
+/* Clone is an oddball.  The 4th arg is in %edi */
+ENTRY(ptregs_clone)
+       CFI_STARTPROC
+       leal 4(%esp),%eax
+       pushl_cfi %eax
+       pushl_cfi PT_EDI(%eax)
+       movl PT_EDX(%eax),%ecx
+       movl PT_ECX(%eax),%edx
+       movl PT_EBX(%eax),%eax
+       call sys_clone
+       addl $8,%esp
+       CFI_ADJUST_CFA_OFFSET -8
+       ret
+       CFI_ENDPROC
+ENDPROC(ptregs_clone)
+
+#ifndef CONFIG_XEN
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back from the ESPFIX stack to the normal zero-based stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT, switches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+       /* fixup the stack */
+       mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+       mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+       shl $16, %eax
+       addl %esp, %eax                 /* the adjusted stack pointer */
+       pushl_cfi $__KERNEL_DS
+       pushl_cfi %eax
+       lss (%esp), %esp                /* switch to the normal stack segment */
+       CFI_ADJUST_CFA_OFFSET -8
+.endm
+.macro UNWIND_ESPFIX_STACK
+       movl %ss, %eax
+       /* see if on espfix stack */
+       cmpw $__ESPFIX_SS, %ax
+       jne 27f
+       movl $__KERNEL_DS, %eax
+       movl %eax, %ds
+       movl %eax, %es
+       /* switch to normal stack */
+       FIXUP_ESPFIX_STACK
+27:
+.endm
+
+/*
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
+ */
+.section .init.rodata,"a"
+ENTRY(interrupt)
+.section .entry.text, "ax"
+       .p2align 5
+       .p2align CONFIG_X86_L1_CACHE_SHIFT
+ENTRY(irq_entries_start)
+       RING0_INT_FRAME
+vector=FIRST_EXTERNAL_VECTOR
+.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+       .balign 32
+  .rept        7
+    .if vector < NR_VECTORS
+      .if vector <> FIRST_EXTERNAL_VECTOR
+       CFI_ADJUST_CFA_OFFSET -4
+      .endif
+1:     pushl_cfi $(~vector+0x80)       /* Note: always in signed byte range */
+      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
+       jmp 2f
+      .endif
+      .previous
+       .long 1b
+      .section .entry.text, "ax"
+vector=vector+1
+    .endif
+  .endr
+2:     jmp common_interrupt
+.endr
+END(irq_entries_start)
+
+.previous
+END(interrupt)
+.previous
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+       .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+       addl $-0x80,(%esp)      /* Adjust vector into the [-256,-1] range */
+       SAVE_ALL
+       TRACE_IRQS_OFF
+       movl %esp,%eax
+       call do_IRQ
+       jmp ret_from_intr
+ENDPROC(common_interrupt)
+       CFI_ENDPROC
+
+/*
+ *  Irq entries should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
+#define BUILD_INTERRUPT3(name, nr, fn) \
+ENTRY(name)                            \
+       RING0_INT_FRAME;                \
+       pushl_cfi $~(nr);               \
+       SAVE_ALL;                       \
+       TRACE_IRQS_OFF                  \
+       movl %esp,%eax;                 \
+       call fn;                        \
+       jmp ret_from_intr;              \
+       CFI_ENDPROC;                    \
+ENDPROC(name)
+
+#define BUILD_INTERRUPT(name, nr)      BUILD_INTERRUPT3(name, nr, smp_##name)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
+
+#else
+#define UNWIND_ESPFIX_STACK
+
+       .pushsection .kprobes.text, "ax"
+
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+#
+# The sysexit critical region is slightly different. sysexit
+# atomically removes the entire stack frame. If we interrupt in the
+# critical region we know that the entire frame is present and correct
+# so we can simply throw away the new one.
+ENTRY(hypervisor_callback)
+       RING0_INT_FRAME
+       pushl_cfi %eax
+       SAVE_ALL
+       movl PT_CS(%esp),%ecx
+       movl PT_EIP(%esp),%eax
+       andl $SEGMENT_RPL_MASK,%ecx
+       cmpl $USER_RPL,%ecx
+       jae  .Ldo_upcall
+       cmpl $scrit,%eax
+       jb   0f
+       cmpl $ecrit,%eax
+       jb   critical_region_fixup
+0:
+#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL
+       cmpl $sysexit_scrit,%eax
+       jb   .Ldo_upcall
+       cmpl $sysexit_ecrit,%eax
+       ja   .Ldo_upcall
+       addl $PT_OLDESP,%esp            # Remove eflags...ebx from stack frame.
+#endif
+.Ldo_upcall:
+       pushl_cfi %esp
+       call evtchn_do_upcall
+       add  $4,%esp
+       CFI_ADJUST_CFA_OFFSET -4
+       jmp  ret_from_intr
+       CFI_ENDPROC
+
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of 32-bit stack slots which have already been popped
+# from the interrupted stack frame.
+critical_region_fixup:
+       movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped
+       testl %ecx,%ecx
+       leal (%esp,%ecx,4),%esi         # %esi points at end of src region
+       leal PT_OLDESP(%esp),%edi       # %edi points at end of dst region
+       jle   17f                       # skip loop if nothing to copy
+16:    subl $4,%esi                    # pre-decrementing copy loop
+       subl $4,%edi
+       movl (%esi),%eax
+       movl %eax,(%edi)
+       loop 16b
+17:    movl %edi,%esp                  # final %edi is top of merged stack
+       jmp  .Ldo_upcall
+
+.section .rodata,"a"
+critical_fixup_table:
+       .rept __SIZEOF_TEST_PENDING
+       .byte -1
+       .endr
+       .byte -1,-1                     # jnz  14f
+       .byte 0                         # pop  %ebx
+       .byte 1                         # pop  %ecx
+       .byte 2                         # pop  %edx
+       .byte 3                         # pop  %esi
+       .byte 4                         # pop  %edi
+       .byte 5                         # pop  %ebp
+       .byte 6                         # pop  %eax
+       .byte 7                         # pop  %ds
+       .byte 8                         # pop  %es
+       .byte 9,9                       # pop  %fs
+#ifndef CONFIG_X86_32_LAZY_GS
+       .byte 10,10                     # pop  %gs
+       .byte 11,11,11                  # add  $4,%esp
+#else
+       .byte 10,10,10                  # add  $8,%esp
+#endif
+       .byte 12                        # iret
+       .rept __SIZEOF_DISABLE_INTERRUPTS
+       .byte -1
+       .endr
+.previous
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(failsafe_callback)
+       pushl %eax
+       movl $1,%eax
+1:     mov 4(%esp),%ds
+2:     mov 8(%esp),%es
+3:     mov 12(%esp),%fs
+4:     mov 16(%esp),%gs
+       testl %eax,%eax
+       popl %eax
+       jz 5f
+       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
+       jmp iret_exc
+5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
+       RING0_INT_FRAME
+       pushl $0
+       SAVE_ALL
+       jmp ret_from_exception
+.section .fixup,"ax";          \
+6:     xorl %eax,%eax;         \
+       movl %eax,4(%esp);      \
+       jmp 1b;                 \
+7:     xorl %eax,%eax;         \
+       movl %eax,8(%esp);      \
+       jmp 2b;                 \
+8:     xorl %eax,%eax;         \
+       movl %eax,12(%esp);     \
+       jmp 3b;                 \
+9:     xorl %eax,%eax;         \
+       movl %eax,16(%esp);     \
+       jmp 4b;                 \
+.previous;                     \
+.section __ex_table,"a";       \
+       .align 4;               \
+       .long 1b,6b;            \
+       .long 2b,7b;            \
+       .long 3b,8b;            \
+       .long 4b,9b;            \
+.previous
+#endif
+       CFI_ENDPROC
+
+ENTRY(coprocessor_error)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_coprocessor_error
+       jmp error_code
+       CFI_ENDPROC
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+       RING0_INT_FRAME
+       pushl_cfi $0
+#ifdef CONFIG_X86_INVD_BUG
+       /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:   pushl_cfi $do_general_protection
+662:
+.section .altinstructions,"a"
+       altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:   pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
+       pushl_cfi $do_simd_coprocessor_error
+#endif
+       jmp error_code
+       CFI_ENDPROC
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+       RING0_INT_FRAME
+       pushl_cfi $-1                   # mark this as an int
+       pushl_cfi $do_device_not_available
+       jmp error_code
+       CFI_ENDPROC
+END(device_not_available)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+       iret
+.section __ex_table,"a"
+       .align 4
+       .long native_iret, iret_exc
+.previous
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+       sti
+       sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+ENTRY(overflow)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_overflow
+       jmp error_code
+       CFI_ENDPROC
+END(overflow)
+
+ENTRY(bounds)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_bounds
+       jmp error_code
+       CFI_ENDPROC
+END(bounds)
+
+ENTRY(invalid_op)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_invalid_op
+       jmp error_code
+       CFI_ENDPROC
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_coprocessor_segment_overrun
+       jmp error_code
+       CFI_ENDPROC
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+       RING0_EC_FRAME
+       pushl_cfi $do_invalid_TSS
+       jmp error_code
+       CFI_ENDPROC
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+       RING0_EC_FRAME
+       pushl_cfi $do_segment_not_present
+       jmp error_code
+       CFI_ENDPROC
+END(segment_not_present)
+
+ENTRY(stack_segment)
+       RING0_EC_FRAME
+       pushl_cfi $do_stack_segment
+       jmp error_code
+       CFI_ENDPROC
+END(stack_segment)
+
+ENTRY(alignment_check)
+       RING0_EC_FRAME
+       pushl_cfi $do_alignment_check
+       jmp error_code
+       CFI_ENDPROC
+END(alignment_check)
+
+ENTRY(divide_error)
+       RING0_INT_FRAME
+       pushl_cfi $0                    # no error code
+       pushl_cfi $do_divide_error
+       jmp error_code
+       CFI_ENDPROC
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi machine_check_vector
+       jmp error_code
+       CFI_ENDPROC
+END(machine_check)
+#endif
+
+#ifndef CONFIG_XEN
+ENTRY(spurious_interrupt_bug)
+       RING0_INT_FRAME
+       pushl_cfi $0
+       pushl_cfi $do_spurious_interrupt_bug
+       jmp error_code
+       CFI_ENDPROC
+END(spurious_interrupt_bug)
+#endif /* !CONFIG_XEN */
+
+ENTRY(fixup_4gb_segment)
+       RING0_EC_FRAME
+       pushl_cfi $do_fixup_4gb_segment
+       jmp error_code
+       CFI_ENDPROC
+END(fixup_4gb_segment)
+/*
+ * End of kprobes section
+ */
+       .popsection
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+       CFI_STARTPROC
+       movl    4(%esp), %edx           /* 1st stack arg: save area, PT_* layout */
+       movl    (%esp), %ecx            /* return address of our caller */
+       leal    4(%esp), %eax           /* stack pointer just above that address */
+       movl    %ebx, PT_EBX(%edx)      /* record %ebx, then use it as a zero */
+       xorl    %ebx, %ebx
+       movl    %ebx, PT_ECX(%edx)      /* ecx/edx/eax slots are zeroed, not captured */
+       movl    %ebx, PT_EDX(%edx)
+       movl    %esi, PT_ESI(%edx)
+       movl    %edi, PT_EDI(%edx)
+       movl    %ebp, PT_EBP(%edx)
+       movl    %ebx, PT_EAX(%edx)
+       movl    $__USER_DS, PT_DS(%edx) /* segment slots get fixed selector constants */
+       movl    $__USER_DS, PT_ES(%edx)
+       movl    $__KERNEL_PERCPU, PT_FS(%edx)
+       movl    $__KERNEL_STACK_CANARY, PT_GS(%edx)
+       movl    %eax, PT_OLDESP(%edx)   /* recorded esp = value computed above */
+       movl    16(%esp), %eax          /* word at 16(%esp), moved down to 12(%esp) below */
+       movl    %ebx, PT_ORIG_EAX(%edx)
+       movl    %ecx, PT_EIP(%edx)      /* recorded eip = our return address */
+       movl    12(%esp), %ecx          /* word at 12(%esp), moved down to 8(%esp) below */
+       movl    $__KERNEL_CS, PT_CS(%edx)
+       movl    %eax, 12(%esp)
+       movl    8(%esp), %eax           /* 2nd stack arg: address jumped to at the end */
+       movl    %ecx, 8(%esp)
+       movl    %ebx, PT_EFLAGS(%edx)   /* recorded eflags = 0 */
+       movl    PT_EBX(%edx), %ebx      /* restore the real %ebx */
+       movl    $__KERNEL_DS, PT_OLDSS(%edx)
+       jmpl    *%eax                   /* tail-jump; 4(%esp) still holds the save-area pointer */
+       CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
+
+ENTRY(kernel_thread_helper)
+       pushl $0                # fake return address for unwinder
+       CFI_STARTPROC
+       movl %edi,%eax
+       call *%esi
+       call do_exit
+       ud2                     # padding for call trace
+       CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+       ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+       call ftrace_stub
+
+       popl %edx
+       popl %ecx
+       popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+       jmp ftrace_stub
+#endif
+
+.globl ftrace_stub
+ftrace_stub:
+       ret
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
+       cmpl $ftrace_stub, ftrace_trace_function
+       jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpl $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+       ret
+
+       /* taken from glibc */
+trace:
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %eax
+       movl 0x4(%ebp), %edx
+       subl $MCOUNT_INSN_SIZE, %eax
+
+       call *ftrace_trace_function
+
+       popl %edx
+       popl %ecx
+       popl %eax
+       jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop    # skip everything while function_trace_stop is non-zero
+       jne ftrace_stub
+
+       pushl %eax                      # save the three registers loaded as arguments below
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %edx            # return address on entry (sits above the 3 saved regs)
+       lea 0x4(%ebp), %eax             # address of the word at 4(%ebp)
+       movl (%ebp), %ecx               # word stored at (%ebp)
+       subl $MCOUNT_INSN_SIZE, %edx    # step the address back by MCOUNT_INSN_SIZE
+       call prepare_ftrace_return
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+       pushl %eax                      # keep %eax/%edx intact across the call
+       pushl %edx
+       movl %ebp, %eax                 # argument: current frame pointer
+       call ftrace_return_to_handler
+       movl %eax, %ecx                 # result is the address to continue at
+       popl %edx
+       popl %eax
+       jmp *%ecx
+#endif
+
+#ifdef TIF_CSTAR
+       # pv syscall call handler stub
+ENTRY(ia32pv_cstar_target)
+       RING0_INT_FRAME
+       movl $__USER_DS,16(%esp)
+       movl %ebp,%ecx
+       movl $__USER_CS,4(%esp)
+       movl 12(%esp),%ebp
+       pushl_cfi %eax                  # save orig_eax
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+       cmpl $__PAGE_OFFSET-4,%ebp
+       CFI_REMEMBER_STATE
+       ja cstar_fault
+1:     movl (%ebp),%ebp
+.section __ex_table,"a"
+       .align 4
+       .long 1b,cstar_fault
+.previous
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+       jnz cstar_trace_entry
+       cmpl $NR_syscalls,%eax
+       jae cstar_badsys
+.Lcstar_call:
+       btl %eax,cstar_special
+       jc .Lcstar_special
+       call *cstar_call_table(,%eax,4)
+       movl %eax,PT_EAX(%esp)          # store the return value
+.Lcstar_exit:
+       movl PT_ECX(%esp),%ecx
+       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
+       jmp syscall_exit
+.Lcstar_special:
+       movl PT_ECX(%esp),%ecx
+       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
+       jmp syscall_call
+GLOBAL(cstar_set_tif)
+       movl $cstar_clear_tif,(%esp)    # replace return address
+       LOCK_PREFIX
+       orl $_TIF_CSTAR,TI_flags(%ebp)
+       jmp *sys_call_table(,%eax,4)
+cstar_clear_tif:
+       movl %eax,PT_EAX(%esp)          # store the return value
+       LOCK_PREFIX
+       andl $~_TIF_CSTAR,TI_flags(%ebp)
+       jmp .Lcstar_exit
+cstar_trace_entry:
+       movl $-ENOSYS,PT_EAX(%esp)
+       cmpl $NR_syscalls,%eax
+       jae 1f
+       btl %eax,cstar_special
+       jc .Lcstar_trace_special
+1:     movl %esp,%eax
+       LOCK_PREFIX
+       orl $_TIF_CSTAR,TI_flags(%ebp)
+       call syscall_trace_enter
+       LOCK_PREFIX
+       andl $~_TIF_CSTAR,TI_flags(%ebp)
+       /* What it returned is what we'll actually use.  */
+       cmpl $NR_syscalls,%eax
+       jb .Lcstar_call
+       jmp .Lcstar_exit
+.Lcstar_trace_special:
+       movl PT_ECX(%esp),%ecx
+       movl %esp,%eax
+       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
+       call syscall_trace_enter
+       /* What it returned is what we'll actually use.  */
+       cmpl $NR_syscalls,%eax
+       jb syscall_call
+       jmp syscall_exit
+cstar_badsys:
+       movl $-ENOSYS,PT_EAX(%esp)
+.Lcstar_resume:
+       movl PT_ECX(%esp),%ecx
+       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
+       jmp resume_userspace
+       CFI_RESTORE_STATE
+cstar_fault:
+       movl $-EFAULT,%eax
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+       jmp .Lcstar_resume
+       CFI_ENDPROC
+ENDPROC(ia32pv_cstar_target)
+
+ENTRY(cstar_ret_from_fork)
+       CFI_STARTPROC
+       movl PT_ECX(%esp),%ecx
+       GET_THREAD_INFO(%ebp)
+       movl %ecx,PT_EBP(%esp)          # put user EBP back in place
+       LOCK_PREFIX
+       andl $~_TIF_CSTAR,TI_flags(%ebp)
+       jmp ret_from_fork
+       CFI_ENDPROC
+END(cstar_ret_from_fork)
+
+#include <asm/unistd.h>
+.pushsection .rodata,"a"
+.balign 4
+cstar_special:                 /* bitmap with one bit per syscall number (tested with btl) */
+nr=0
+mask=0
+.rept NR_syscalls+31           /* +31 so the last, partially filled word is emitted too */
+ .irp n, __NR_sigreturn, __NR_rt_sigreturn     /* syscalls whose bit gets set */
+  .if nr == \n
+   mask = mask | (1 << (\n & 31))
+  .endif
+ .endr
+ nr = nr + 1
+ .if (nr & 31) == 0            /* 32 numbers processed: flush the finished word */
+  .long mask
+  mask = 0
+ .endif
+.endr
+.popsection
+#endif /* TIF_CSTAR */
+
+/*
+ * Some functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
+
+ENTRY(page_fault)
+       RING0_EC_FRAME
+       pushl_cfi $do_page_fault
+       ALIGN
+error_code:
+       /* the function address is in %gs's slot on the stack */
+       pushl_cfi %fs
+       /*CFI_REL_OFFSET fs, 0*/
+       pushl_cfi %es
+       /*CFI_REL_OFFSET es, 0*/
+       pushl_cfi %ds
+       /*CFI_REL_OFFSET ds, 0*/
+       pushl_cfi %eax
+       CFI_REL_OFFSET eax, 0
+       pushl_cfi %ebp
+       CFI_REL_OFFSET ebp, 0
+       pushl_cfi %edi
+       CFI_REL_OFFSET edi, 0
+       pushl_cfi %esi
+       CFI_REL_OFFSET esi, 0
+       pushl_cfi %edx
+       CFI_REL_OFFSET edx, 0
+       pushl_cfi %ecx
+       CFI_REL_OFFSET ecx, 0
+       pushl_cfi %ebx
+       CFI_REL_OFFSET ebx, 0
+       cld
+       movl $(__KERNEL_PERCPU), %ecx
+       movl %ecx, %fs
+       UNWIND_ESPFIX_STACK
+       GS_TO_REG %ecx
+       movl PT_GS(%esp), %edi          # get the function address
+       movl PT_ORIG_EAX(%esp), %edx    # get the error code
+       movl $-1, PT_ORIG_EAX(%esp)     # no syscall to restart
+       REG_TO_PTGS %ecx
+       SET_KERNEL_GS %ecx
+       movl $(__USER_DS), %ecx
+       movl %ecx, %ds
+       movl %ecx, %es
+       TRACE_IRQS_OFF
+       movl %esp,%eax                  # pt_regs pointer
+       call *%edi
+       jmp ret_from_exception
+       CFI_ENDPROC
+END(page_fault)
+
+#ifndef CONFIG_XEN
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+.macro FIX_STACK offset ok label
+       cmpw $__KERNEL_CS, 4(%esp)
+       jne \ok
+\label:
+       movl TSS_sysenter_sp0 + \offset(%esp), %esp
+       CFI_DEF_CFA esp, 0
+       CFI_UNDEFINED eip
+       pushfl_cfi
+       pushl_cfi $__KERNEL_CS
+       pushl_cfi $sysenter_past_esp
+       CFI_REL_OFFSET eip, 0
+.endm
+#endif /* CONFIG_XEN */
+
+ENTRY(debug)
+       RING0_INT_FRAME
+#ifndef CONFIG_XEN
+       cmpl $ia32_sysenter_target,(%esp)
+       jne debug_stack_correct
+       FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
+debug_stack_correct:
+#endif /* !CONFIG_XEN */
+       pushl_cfi $-1                   # mark this as an int
+       SAVE_ALL
+       TRACE_IRQS_OFF
+       xorl %edx,%edx                  # error code 0
+       movl %esp,%eax                  # pt_regs pointer
+       call do_debug
+       jmp ret_from_exception
+       CFI_ENDPROC
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+       RING0_INT_FRAME
+       pushl_cfi %eax
+#ifndef CONFIG_XEN
+       movl %ss, %eax
+       cmpw $__ESPFIX_SS, %ax
+       popl_cfi %eax
+       je nmi_espfix_stack
+       cmpl $ia32_sysenter_target,(%esp)
+       je nmi_stack_fixup
+       pushl_cfi %eax
+       movl %esp,%eax
+       /* Do not access memory above the end of our stack page,
+        * it might not exist.
+        */
+       andl $(THREAD_SIZE-1),%eax
+       cmpl $(THREAD_SIZE-20),%eax
+       popl_cfi %eax
+       jae nmi_stack_correct
+       cmpl $ia32_sysenter_target,12(%esp)
+       je nmi_debug_stack_check
+nmi_stack_correct:
+       /* We have a RING0_INT_FRAME here */
+       pushl_cfi %eax
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_nmi
+       jmp restore_all_notrace
+       CFI_ENDPROC
+
+nmi_stack_fixup:
+       RING0_INT_FRAME
+       FIX_STACK 12, nmi_stack_correct, 1
+       jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+       /* We have a RING0_INT_FRAME here */
+       cmpw $__KERNEL_CS,16(%esp)
+       jne nmi_stack_correct
+       cmpl $debug,(%esp)
+       jb nmi_stack_correct
+       cmpl $debug_esp_fix_insn,(%esp)
+       ja nmi_stack_correct
+       FIX_STACK 24, nmi_stack_correct, 1
+       jmp nmi_stack_correct
+
+nmi_espfix_stack:
+       /* We have a RING0_INT_FRAME here.
+        *
+        * create the far pointer used to lss back to the espfix stack
+        */
+       pushl_cfi %ss
+       pushl_cfi %esp
+       addl $4, (%esp)
+       /* copy the iret frame of 12 bytes */
+       .rept 3
+       pushl_cfi 16(%esp)
+       .endr
+       pushl_cfi %eax
+       SAVE_ALL
+       FIXUP_ESPFIX_STACK              # %eax == %esp
+       xorl %edx,%edx                  # zero error code
+       call do_nmi
+       RESTORE_REGS
+       lss 12+4(%esp), %esp            # back to espfix stack
+       CFI_ADJUST_CFA_OFFSET -24
+       jmp irq_return
+#else
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_nmi
+       orl  $NMI_MASK, PT_EFLAGS(%esp)
+       jmp restore_all
+#endif
+       CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+       RING0_INT_FRAME
+       pushl_cfi $-1                   # mark this as an int
+       SAVE_ALL
+       TRACE_IRQS_OFF
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_int3
+       jmp ret_from_exception
+       CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+       RING0_EC_FRAME
+       pushl_cfi $do_general_protection
+       jmp error_code
+       CFI_ENDPROC
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+       RING0_EC_FRAME
+       pushl_cfi $do_async_page_fault
+       jmp error_code
+       CFI_ENDPROC
+END(async_page_fault)
+#endif
+
+/*
+ * End of kprobes section
+ */
+       .popsection
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S

index 7b784f4..2bd1af4 100644 (file)
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -380,7 +380,7 @@ ENTRY(ia32_sysenter_target)
         CFI_SIGNAL_FRAME
         CFI_DEF_CFA esp, 0
         CFI_REGISTER esp, ebp
-       movl TSS_sysenter_sp0(%esp),%esp
+       movl SYSENTER_stack_sp0(%esp),%esp
  sysenter_past_esp:
         /*
          * Interrupts are disabled here, but we can't trace it until
@@ -1003,6 +1003,41 @@ END(spurious_interrupt_bug)
   */
         .popsection
  
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+       CFI_STARTPROC
+       movl    4(%esp), %edx           /* 1st stack arg: save area, PT_* layout */
+       movl    (%esp), %ecx            /* return address of our caller */
+       leal    4(%esp), %eax           /* stack pointer just above that address */
+       movl    %ebx, PT_EBX(%edx)      /* record %ebx, then use it as a zero */
+       xorl    %ebx, %ebx
+       movl    %ebx, PT_ECX(%edx)      /* ecx/edx/eax slots are zeroed, not captured */
+       movl    %ebx, PT_EDX(%edx)
+       movl    %esi, PT_ESI(%edx)
+       movl    %edi, PT_EDI(%edx)
+       movl    %ebp, PT_EBP(%edx)
+       movl    %ebx, PT_EAX(%edx)
+       movl    $__USER_DS, PT_DS(%edx) /* segment slots get fixed selector constants */
+       movl    $__USER_DS, PT_ES(%edx)
+       movl    $__KERNEL_PERCPU, PT_FS(%edx)
+       movl    $__KERNEL_STACK_CANARY, PT_GS(%edx)
+       movl    %eax, PT_OLDESP(%edx)   /* recorded esp = value computed above */
+       movl    16(%esp), %eax          /* word at 16(%esp), moved down to 12(%esp) below */
+       movl    %ebx, PT_ORIG_EAX(%edx)
+       movl    %ecx, PT_EIP(%edx)      /* recorded eip = our return address */
+       movl    12(%esp), %ecx          /* word at 12(%esp), moved down to 8(%esp) below */
+       movl    $__KERNEL_CS, PT_CS(%edx)
+       movl    %eax, 12(%esp)
+       movl    8(%esp), %eax           /* 2nd stack arg: address jumped to at the end */
+       movl    %ecx, 8(%esp)
+       movl    %ebx, PT_EFLAGS(%edx)   /* recorded eflags = 0 */
+       movl    PT_EBX(%edx), %ebx      /* restore the real %ebx */
+       movl    $__KERNEL_DS, PT_OLDSS(%edx)
+       jmpl    *%eax                   /* tail-jump; 4(%esp) still holds the save-area pointer */
+       CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
+
  ENTRY(kernel_thread_helper)
         pushl $0                # fake return address for unwinder
         CFI_STARTPROC
@@ -1013,7 +1048,7 @@ ENTRY(kernel_thread_helper)
         CFI_ENDPROC
  ENDPROC(kernel_thread_helper)
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
  /* Xen doesn't set %esp to be precisely what the normal sysenter
     entrypoint expects, so fix it up before using the normal path. */
  ENTRY(xen_sysenter_target)
@@ -1105,7 +1140,7 @@ ENDPROC(xen_failsafe_callback)
  BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
                 xen_evtchn_do_upcall)
  
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
  
  #ifdef CONFIG_FUNCTION_TRACER
  #ifdef CONFIG_DYNAMIC_FTRACE
@@ -1268,7 +1303,7 @@ END(page_fault)
   * that sets up the real kernel stack. Check here, since we can't
   * allow the wrong stack to be used.
   *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have
   * already pushed 3 words if it hits on the sysenter instruction:
   * eflags, cs and eip.
   *
@@ -1280,7 +1315,7 @@ END(page_fault)
         cmpw $__KERNEL_CS, 4(%esp)
         jne \ok
  \label:
-       movl TSS_sysenter_sp0 + \offset(%esp), %esp
+       movl SYSENTER_stack_sp0 + \offset(%esp), %esp
         CFI_DEF_CFA esp, 0
         CFI_UNDEFINED eip
         pushfl_cfi
diff --git a/arch/x86/kernel/entry_64-xen.S b/arch/x86/kernel/entry_64-xen.S

new file mode 100644 (file)

index 0000000..349aa58
--- /dev/null
+++ b/arch/x86/kernel/entry_64-xen.S
@@ -0,0 +1,1428 @@
+/*
+ *  linux/arch/x86_64/entry.S
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
+ *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
+ *  Jun Nakajima <jun.nakajima@intel.com>
+ *  Asit Mallick <asit.k.mallick@intel.com>
+ *      Modified for Xen
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
+ * only done for syscall tracing, signals or fork/exec et.al.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ * - partial stack frame: partially saved registers up to R11.
+ * - full stack frame: Like partial stack frame, but all registers saved.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * are not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/processor-flags.h>
+#include <asm/ftrace.h>
+#include <asm/percpu.h>
+#include <linux/err.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/features.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64      (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE           0x40000000
+
+       .code64
+       .section .entry.text, "ax"
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+ENTRY(mcount)
+       retq
+END(mcount)
+
+ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
+       MCOUNT_SAVE_FRAME
+
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+
+GLOBAL(ftrace_call)
+       call ftrace_stub
+
+       MCOUNT_RESTORE_FRAME
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+       jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+       retq
+END(ftrace_caller)
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
+       cmpq $ftrace_stub, ftrace_trace_function
+       jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpq $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+       retq
+
+trace:
+       MCOUNT_SAVE_FRAME
+
+       movq 0x38(%rsp), %rdi
+       movq 8(%rbp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rdi
+
+       call   *ftrace_trace_function
+
+       MCOUNT_RESTORE_FRAME
+
+       jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+
+       MCOUNT_SAVE_FRAME
+
+       leaq 8(%rbp), %rdi
+       movq 0x38(%rsp), %rsi
+       movq (%rbp), %rdx
+       subq $MCOUNT_INSN_SIZE, %rsi
+
+       call    prepare_ftrace_return
+
+       MCOUNT_RESTORE_FRAME
+
+       retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)      /* calls ftrace_return_to_handler, then jumps to the address it returns */
+       subq  $24, %rsp         /* scratch area; only the slots at 0 and 8 are used below */
+
+       /* Save the return values (%rax and %rdx) across the C call below */
+       movq %rax, (%rsp)
+       movq %rdx, 8(%rsp)
+       movq %rbp, %rdi         /* %rbp is handed to the C helper in %rdi */
+
+       call ftrace_return_to_handler
+
+       movq %rax, %rdi         /* value returned by the call is the jump target */
+       movq 8(%rsp), %rdx      /* put the saved return values back */
+       movq (%rsp), %rax
+       addq $24, %rsp          /* release the scratch area */
+       jmp *%rdi
+#endif
+
+
+#ifndef CONFIG_PREEMPT
+#define retint_kernel retint_restore_args
+#endif
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+       bt   $9,EFLAGS-\offset(%rsp)    /* bit 9 of the saved EFLAGS slot: interrupts off? */
+       jnc  1f                 /* bit clear: skip TRACE_IRQS_ON */
+       TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+NMI_MASK = 0x80000000
+       
+/*
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with a pt_regs argument is called from the SYSCALL based
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */
+
+       /* %rsp:at FRAMEEND */
+       .macro FIXUP_TOP_OF_STACK tmp offset=0
+       movq $__USER_CS,CS+\offset(%rsp)        /* write __USER_CS into the frame's CS slot; \tmp is not used */
+       movq $-1,RCX+\offset(%rsp)              /* write -1 into the frame's RCX slot */
+       .endm
+
+       .macro RESTORE_TOP_OF_STACK tmp offset=0
+       .endm   /* expands to nothing in this file; both arguments are ignored */
+
+       .macro FAKE_STACK_FRAME child_rip
+       /* push in order ss, rsp, eflags, cs, rip */
+       xorl %eax, %eax
+       pushq_cfi $__KERNEL_DS /* ss */
+       /*CFI_REL_OFFSET        ss,0*/
+       pushq_cfi %rax /* rsp */
+       CFI_REL_OFFSET  rsp,0
+       pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
+       /*CFI_REL_OFFSET        rflags,0*/
+       pushq_cfi $__KERNEL_CS /* cs */
+       /*CFI_REL_OFFSET        cs,0*/
+       pushq_cfi \child_rip /* rip */
+       CFI_REL_OFFSET  rip,0
+       pushq_cfi %rax /* orig rax */
+       .endm
+
+       .macro UNFAKE_STACK_FRAME
+       addq $8*6, %rsp                 /* drop the six quadwords FAKE_STACK_FRAME pushed */
+       CFI_ADJUST_CFA_OFFSET   -(6*8)  /* keep the unwind info in step with %rsp */
+       .endm
+
+/*
+ * initial frame state for syscall
+ */
+       .macro BASIC_FRAME start=1 offset=0
+       .if \start
+       CFI_STARTPROC simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA rsp, SS+8+\offset-RIP
+       .else
+       CFI_DEF_CFA_OFFSET SS+8+\offset-RIP
+       .endif
+       /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
+       CFI_REL_OFFSET rsp, RSP+\offset-RIP
+       /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
+       /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
+       CFI_REL_OFFSET rip, RIP+\offset-RIP
+       .endm
+
+/*
+ * initial frame state for interrupts (and exceptions without error code)
+ */
+       .macro INTR_FRAME start=1 offset=0
+       .if \start == 1
+       BASIC_FRAME 1, \offset+2*8
+       CFI_REL_OFFSET rcx, 0+\offset
+       CFI_REL_OFFSET r11, 8+\offset
+       .else
+       BASIC_FRAME \start, \offset
+       .endif
+       .endm
+
+/*
+ * initial frame state for exceptions with error code (and interrupts
+ * with vector already pushed)
+ */
+       .macro XCPT_FRAME start=1 offset=0
+       INTR_FRAME \start, RIP+\offset-ORIG_RAX
+       .endm
+
+/*
+ * frame that enables calling into C.
+ */
+       .macro PARTIAL_FRAME start=1 offset=0
+       .if \start >= 0
+       XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET
+       .endif
+       CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
+       CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
+       CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
+       CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
+       CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
+       CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
+       CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
+       CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
+       CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+       .endm
+
+/*
+ * frame that enables passing a complete pt_regs to a C function.
+ */
+       .macro DEFAULT_FRAME start=1 offset=0
+       .if \start >= -1
+       PARTIAL_FRAME \start, R11+\offset-R15
+       .endif
+       CFI_REL_OFFSET rbx, RBX+\offset
+       CFI_REL_OFFSET rbp, RBP+\offset
+       CFI_REL_OFFSET r12, R12+\offset
+       CFI_REL_OFFSET r13, R13+\offset
+       CFI_REL_OFFSET r14, R14+\offset
+       CFI_REL_OFFSET r15, R15+\offset
+       .endm
+
+        /*
+         * Must be consistent with the definition in arch-x86/xen-x86_64.h:
+         *     struct iret_context {
+         *        u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+         *     };
+         * with rax, r11, and rcx being taken care of in the hypercall stub.
+         */
+       .macro HYPERVISOR_IRET flag
+       .if \flag == 0  # return from syscall always uses the hypercall
+       testb $3,1*8(%rsp)              /* cs slot: either low bit set -> slow path at 2: */
+       jnz   2f
+       testl $NMI_MASK,2*8(%rsp)       /* rflags slot: NMI_MASK set -> slow path at 2: */
+       jnz   2f
+
+       cmpb  $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip)
+       jne   1f                        /* feature byte non-zero: iretq without touching CS/SS */
+
+       /* Direct iret to kernel space. Correct CS and SS. */
+       orl   $3,1*8(%rsp)              /* cs slot */
+       orl   $3,4*8(%rsp)              /* ss slot */
+1:     iretq
+       .endif
+
+2:     /* Slow iret via hypervisor. */
+       andl  $~NMI_MASK, 2*8(%rsp)     /* clear NMI_MASK in the rflags slot */
+       pushq $\flag & VGCF_in_syscall  /* becomes the 'flags' member of struct iret_context */
+       jmp  hypercall_page + (__HYPERVISOR_iret * 32)  /* stub located at index * 32 inside hypercall_page */
+       .endm
+
+#ifndef CONFIG_XEN
+/* save partial stack frame */
+       .macro SAVE_ARGS_IRQ
+       cld
+       /* start from rbp in pt_regs and jump over */
+       movq_cfi rdi, RDI-RBP
+       movq_cfi rsi, RSI-RBP
+       movq_cfi rdx, RDX-RBP
+       movq_cfi rcx, RCX-RBP
+       movq_cfi rax, RAX-RBP
+       movq_cfi  r8,  R8-RBP
+       movq_cfi  r9,  R9-RBP
+       movq_cfi r10, R10-RBP
+       movq_cfi r11, R11-RBP
+
+       /* Save rbp so that we can unwind from get_irq_regs() */
+       movq_cfi rbp, 0
+
+       /* Save previous stack value */
+       movq %rsp, %rsi
+
+       leaq -RBP(%rsp),%rdi    /* arg1 for handler */
+       testl $3, CS-RBP(%rsi)
+       je 1f
+       SWAPGS
+       /*
+        * irq_count is used to check if a CPU is already on an interrupt stack
+        * or not. While this is essentially redundant with preempt_count it is
+        * a little cheaper to use a separate counter in the PDA (short of
+        * moving irq_enter into assembly, which would be too much work)
+        */
+1:     incl PER_CPU_VAR(irq_count)
+       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+       CFI_DEF_CFA_REGISTER    rsi
+
+       /* Store previous stack value */
+       pushq %rsi
+       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
+                       0x77 /* DW_OP_breg7 */, 0, \
+                       0x06 /* DW_OP_deref */, \
+                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
+                       0x22 /* DW_OP_plus */
+       /* We entered an interrupt context - irqs are off: */
+       TRACE_IRQS_OFF
+       .endm
+#endif
+
+ENTRY(save_rest)       /* stores rbx, rbp and r12-r15 into the frame, then applies FIXUP_TOP_OF_STACK */
+       CFI_STARTPROC
+       movq 5*8+16(%rsp), %r11 /* fetch the return address kept at 5*8+16; NOTE(review): presumably the slot RBX+16 overwrites next - confirm RBX == 5*8 */
+       movq %rbx, RBX+16(%rsp)
+       movq %rbp, RBP+16(%rsp)
+       movq %r12, R12+16(%rsp)
+       movq %r13, R13+16(%rsp)
+       movq %r14, R14+16(%rsp)
+       movq %r15, R15+16(%rsp)
+       movq %r11, 8(%rsp)      /* re-store that return address one slot above our own */
+       FIXUP_TOP_OF_STACK %r11, 16     /* same +16 bias as the stores above; the tmp argument is unused by the macro */
+       ret
+       CFI_ENDPROC
+END(save_rest)
+
+#ifndef CONFIG_XEN
+/* save complete stack frame */
+       .pushsection .kprobes.text, "ax"
+ENTRY(save_paranoid)
+       XCPT_FRAME offset=ORIG_RAX-R15+8
+       cld
+       movq %rdi, RDI+8(%rsp)
+       movq %rsi, RSI+8(%rsp)
+       movq_cfi rdx, RDX+8
+       movq_cfi rcx, RCX+8
+       movq_cfi rax, RAX+8
+       movq %r8, R8+8(%rsp)
+       movq %r9, R9+8(%rsp)
+       movq %r10, R10+8(%rsp)
+       movq %r11, R11+8(%rsp)
+       movq_cfi rbx, RBX+8
+       movq %rbp, RBP+8(%rsp)
+       movq %r12, R12+8(%rsp)
+       movq %r13, R13+8(%rsp)
+       movq %r14, R14+8(%rsp)
+       movq %r15, R15+8(%rsp)
+       movl $1,%ebx
+       movl $MSR_GS_BASE,%ecx
+       rdmsr
+       testl %edx,%edx
+       js 1f   /* negative -> in kernel */
+       SWAPGS
+       xorl %ebx,%ebx
+1:     ret
+       CFI_ENDPROC
+END(save_paranoid)
+       .popsection
+#endif
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+       DEFAULT_FRAME
+
+       LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+       pushq_cfi kernel_eflags(%rip)
+       popfq_cfi                               # reset kernel eflags
+
+       call schedule_tail                      # rdi: 'prev' task parameter
+
+       GET_THREAD_INFO(%rcx)
+
+       RESTORE_REST
+
+       testl $3, CS-ARGOFFSET(%rsp)            # from kernel_thread?
+       jnz  1f
+       /* Need to set the proper %ss (not NULL) for ring 3 iretq */
+       movl $__KERNEL_DS,SS-ARGOFFSET(%rsp)
+       jmp  retint_restore_args
+1:
+       testl $_TIF_IA32, TI_flags(%rcx)        # 32-bit compat task needs IRET
+       jnz  int_ret_from_sys_call
+
+       RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+       jmp ret_from_sys_call                   # go to the SYSRET fastpath
+
+       CFI_ENDPROC
+END(ret_from_fork)
+
+/*
+ * System call entry. Up to 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer.
+ */
+
+/*
+ * Register setup:
+ * rax  system call number
+ * rdi  arg0
+ * rcx  return address for syscall/sysret, C arg3
+ * rsi  arg1
+ * rdx  arg2
+ * r10  arg3   (--> moved to rcx for C)
+ * r8   arg4
+ * r9   arg5
+ * r11  eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
+ * Interrupts are enabled on entry.
+ * Only called from user space.
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ *      and report it properly in ps. Unfortunately we haven't.
+ *
+ * When user can change the frames always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ENTRY(system_call)
+       INTR_FRAME start=2 offset=2*8
+       SAVE_ARGS -8,0
+       movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
+       testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jnz tracesys
+system_call_fastpath:
+#if __SYSCALL_MASK == ~0
+       cmpq $__NR_syscall_max,%rax
+#else
+       andl $__SYSCALL_MASK,%eax
+       cmpl $__NR_syscall_max,%eax
+#endif
+       ja badsys
+       movq %r10,%rcx
+       call *sys_call_table(,%rax,8)  # XXX:    rip relative
+       movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack.
+ */
+ret_from_sys_call:
+       movl $_TIF_ALLWORK_MASK,%edi
+       /* edi: flagmask */
+sysret_check:
+       LOCKDEP_SYS_EXIT
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
+       andl %edi,%edx
+       jnz  sysret_careful
+       CFI_REMEMBER_STATE
+       /*
+        * sysretq will re-enable interrupts:
+        */
+       TRACE_IRQS_ON
+       RESTORE_ARGS 1,8,0,0
+       xor %ecx,%ecx
+       xor %r11,%r11
+        HYPERVISOR_IRET VGCF_IN_SYSCALL
+
+       CFI_RESTORE_STATE
+       /* Handle reschedules */
+       /* edx: work, edi: workmask */
+sysret_careful:
+       bt $TIF_NEED_RESCHED,%edx
+       jnc sysret_signal
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       pushq_cfi %rdi
+       call schedule
+       popq_cfi %rdi
+       jmp sysret_check
+
+       /* Handle a signal */
+sysret_signal:
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+#ifdef CONFIG_AUDITSYSCALL
+       bt $TIF_SYSCALL_AUDIT,%edx
+       jc sysret_audit
+#endif
+       /*
+        * We have a signal, or exit tracing or single-step.
+        * These all wind up with the iret return path anyway,
+        * so just join that path right now.
+        */
+       FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+       jmp int_check_syscall_exit_work
+
+badsys:
+       movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+       jmp ret_from_sys_call
+
+#ifdef CONFIG_AUDITSYSCALL
+       /*
+        * Fast path for syscall audit without full syscall trace.
+        * We just call __audit_syscall_entry() directly, and then
+        * jump back to the normal fast path.
+        */
+auditsys:
+       movq %r10,%r9                   /* 6th arg: 4th syscall arg */
+       movq %rdx,%r8                   /* 5th arg: 3rd syscall arg */
+       movq %rsi,%rcx                  /* 4th arg: 2nd syscall arg */
+       movq %rdi,%rdx                  /* 3rd arg: 1st syscall arg */
+       movq %rax,%rsi                  /* 2nd arg: syscall number */
+       movl $AUDIT_ARCH_X86_64,%edi    /* 1st arg: audit arch */
+       call __audit_syscall_entry
+       LOAD_ARGS 0             /* reload call-clobbered registers */
+       jmp system_call_fastpath
+
+       /*
+        * Return fast path for syscall audit.  Call __audit_syscall_exit()
+        * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+        * masked off.
+        */
+sysret_audit:
+       movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
+       cmpq $-MAX_ERRNO,%rsi   /* is it < -MAX_ERRNO? */
+       setbe %al               /* 1 if so, 0 if not */
+       movzbl %al,%edi         /* zero-extend that into %edi */
+       call __audit_syscall_exit
+       movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+       jmp sysret_check
+#endif /* CONFIG_AUDITSYSCALL */
+
+       /* Do syscall tracing */
+tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+       testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jz auditsys
+#endif
+       SAVE_REST
+       movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
+       FIXUP_TOP_OF_STACK %rdi
+       movq %rsp,%rdi
+       call syscall_trace_enter
+       /*
+        * Reload arg registers from stack in case ptrace changed them.
+        * We don't reload %rax because syscall_trace_enter() returned
+        * the value it wants us to use in the table lookup.
+        */
+       LOAD_ARGS ARGOFFSET, 1
+       RESTORE_REST
+#if __SYSCALL_MASK == ~0
+       cmpq $__NR_syscall_max,%rax
+#else
+       andl $__SYSCALL_MASK,%eax
+       cmpl $__NR_syscall_max,%eax
+#endif
+       ja   int_ret_from_sys_call      /* RAX(%rsp) set to -ENOSYS above */
+       movq %r10,%rcx  /* fixup for C */
+       call *sys_call_table(,%rax,8)
+       movq %rax,RAX-ARGOFFSET(%rsp)
+       /* Use IRET because user could have changed frame */
+
+/*
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */
+GLOBAL(int_ret_from_sys_call)
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       movl $_TIF_ALLWORK_MASK,%edi
+       /* edi: mask to check */
+GLOBAL(int_with_check)
+       LOCKDEP_SYS_EXIT_IRQ
+       GET_THREAD_INFO(%rcx)
+       movl TI_flags(%rcx),%edx
+       andl %edi,%edx
+       jnz   int_careful
+       andl    $~TS_COMPAT,TI_status(%rcx)
+       jmp   retint_restore_args
+
+       /* Either reschedule or signal or syscall exit tracking needed. */
+       /* First do a reschedule test. */
+       /* edx: work, edi: workmask */
+int_careful:
+       bt $TIF_NEED_RESCHED,%edx
+       jnc  int_very_careful
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       pushq_cfi %rdi
+       call schedule
+       popq_cfi %rdi
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       jmp int_with_check
+
+       /* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+int_check_syscall_exit_work:
+       SAVE_REST
+       /* Check for syscall exit trace */
+       testl $_TIF_WORK_SYSCALL_EXIT,%edx
+       jz int_signal
+       pushq_cfi %rdi
+       leaq 8(%rsp),%rdi       # &ptregs -> arg1
+       call syscall_trace_leave
+       popq_cfi %rdi
+       andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
+       jmp int_restore_rest
+
+int_signal:
+       testl $_TIF_DO_NOTIFY_MASK,%edx
+       jz 1f
+       movq %rsp,%rdi          # &ptregs -> arg1
+       xorl %esi,%esi          # oldset -> arg2
+       call do_notify_resume
+1:     movl $_TIF_WORK_MASK,%edi
+int_restore_rest:
+       RESTORE_REST
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       jmp int_with_check
+       CFI_ENDPROC
+END(system_call)
+
+/*
+ * Certain special system calls that need to save a complete full stack frame.
+ */
+       .macro PTREGSCALL label,func,arg
+ENTRY(\label)
+       PARTIAL_FRAME 1 8               /* offset 8: return address */
+       subq $REST_SKIP, %rsp           /* reserve REST_SKIP bytes; ptregscall_common gives them back via ret $REST_SKIP */
+       CFI_ADJUST_CFA_OFFSET REST_SKIP
+       call save_rest                  /* stores rbx, rbp, r12-r15 into the frame */
+       DEFAULT_FRAME -2 8              /* offset 8: return address */
+       leaq 8(%rsp), \arg      /* pt_regs pointer, placed in the register named by \arg */
+       call \func
+       jmp ptregscall_common           /* shared tail: reload the extra registers and return */
+       CFI_ENDPROC
+END(\label)
+       .endm
+
+       PTREGSCALL stub_clone, sys_clone, %r8
+       PTREGSCALL stub_fork, sys_fork, %rdi
+       PTREGSCALL stub_vfork, sys_vfork, %rdi
+       PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
+       PTREGSCALL stub_iopl, sys_iopl, %rsi
+
+ENTRY(ptregscall_common)       /* common tail jumped to by every PTREGSCALL stub */
+       DEFAULT_FRAME 1 8       /* offset 8: return address */
+       RESTORE_TOP_OF_STACK %r11, 8    /* macro body is empty in this file */
+       movq_cfi_restore R15+8, r15     /* reload r15..rbx from their frame slots (+8 for the return address) */
+       movq_cfi_restore R14+8, r14
+       movq_cfi_restore R13+8, r13
+       movq_cfi_restore R12+8, r12
+       movq_cfi_restore RBP+8, rbp
+       movq_cfi_restore RBX+8, rbx
+       ret $REST_SKIP          /* return and pop the REST_SKIP bytes of extended registers */
+       CFI_ENDPROC
+END(ptregscall_common)
+
+ENTRY(stub_execve)
+       CFI_STARTPROC
+       addq $8, %rsp
+       PARTIAL_FRAME 0
+       SAVE_REST
+       FIXUP_TOP_OF_STACK %r11
+       movq %rsp, %rcx
+       call sys_execve
+       RESTORE_TOP_OF_STACK %r11
+       movq %rax,RAX(%rsp)
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_execve)
+
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */
+ENTRY(stub_rt_sigreturn)
+       CFI_STARTPROC
+       addq $8, %rsp
+       PARTIAL_FRAME 0
+       SAVE_REST
+       movq %rsp,%rdi
+       FIXUP_TOP_OF_STACK %r11
+       call sys_rt_sigreturn
+       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_rt_sigreturn)
+
+#ifdef CONFIG_X86_X32_ABI
+       PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
+
+ENTRY(stub_x32_rt_sigreturn)
+       CFI_STARTPROC
+       addq $8, %rsp
+       PARTIAL_FRAME 0
+       SAVE_REST
+       movq %rsp,%rdi
+       FIXUP_TOP_OF_STACK %r11
+       call sys32_x32_rt_sigreturn
+       movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_x32_rt_sigreturn)
+
+ENTRY(stub_x32_execve)
+       CFI_STARTPROC
+       addq $8, %rsp
+       PARTIAL_FRAME 0
+       SAVE_REST
+       FIXUP_TOP_OF_STACK %r11
+       movq %rsp, %rcx
+       call sys32_execve
+       RESTORE_TOP_OF_STACK %r11
+       movq %rax,RAX(%rsp)
+       RESTORE_REST
+       jmp int_ret_from_sys_call
+       CFI_ENDPROC
+END(stub_x32_execve)
+
+#endif
+
+/*
+ * Interrupt exit.
+ */ 
+
+retint_with_reschedule:
+       PARTIAL_FRAME
+       movl $_TIF_WORK_MASK,%edi
+retint_check:
+       LOCKDEP_SYS_EXIT_IRQ
+       movl TI_flags(%rcx),%edx
+       andl %edi,%edx
+       CFI_REMEMBER_STATE
+       jnz  retint_careful
+retint_restore_args:   /* return to kernel space */
+       movl EFLAGS-REST_SKIP(%rsp), %eax
+       shr $9, %eax                    # EAX[0] == IRET_EFLAGS.IF
+       GET_VCPU_INFO
+       andb evtchn_upcall_mask(%rsi),%al
+       andb $1,%al                     # EAX[0] == IRET_EFLAGS.IF & event_mask
+       jnz restore_all_enable_events   #        != 0 => enable event delivery
+               
+       RESTORE_ARGS 1,8,1
+       HYPERVISOR_IRET 0
+       
+       /* edi: workmask, edx: work */
+retint_careful:
+       CFI_RESTORE_STATE
+       bt    $TIF_NEED_RESCHED,%edx
+       jnc   retint_signal
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       pushq_cfi %rdi
+       call  schedule
+       popq_cfi %rdi
+       GET_THREAD_INFO(%rcx)
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       jmp retint_check
+
+retint_signal:
+       testl $_TIF_DO_NOTIFY_MASK,%edx
+       jz    retint_restore_args
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       SAVE_REST
+       movq $-1,ORIG_RAX(%rsp)
+       xorl %esi,%esi          # oldset
+       movq %rsp,%rdi          # &pt_regs
+       call do_notify_resume
+       RESTORE_REST
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       GET_THREAD_INFO(%rcx)
+       jmp retint_with_reschedule
+
+#ifdef CONFIG_PREEMPT
+       /* Returning to kernel space. Check if we need preemption */
+       /* rcx:  threadinfo. interrupts off. */
+ENTRY(retint_kernel)
+       cmpl $0,TI_preempt_count(%rcx)
+       jnz  retint_restore_args
+       bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
+       jnc  retint_restore_args
+       bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
+       jnc  retint_restore_args
+       call preempt_schedule_irq
+       jmp retint_kernel       /* check again */
+#endif
+
+       CFI_ENDPROC
+END(retint_check)
+
+#ifndef CONFIG_XEN
+/*
+ * APIC interrupts.
+ */
+.macro apicinterrupt num sym do_sym
+ENTRY(\sym)
+       INTR_FRAME
+       pushq_cfi $~(\num)
+       interrupt \do_sym
+       jmp error_entry
+       CFI_ENDPROC
+END(\sym)
+.endm
+
+#ifdef CONFIG_SMP
+apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
+       irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt REBOOT_VECTOR \
+       reboot_interrupt smp_reboot_interrupt
+#endif
+
+#ifdef CONFIG_X86_UV
+apicinterrupt UV_BAU_MESSAGE \
+       uv_bau_message_intr1 uv_bau_message_interrupt
+#endif
+apicinterrupt LOCAL_TIMER_VECTOR \
+       apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR \
+       x86_platform_ipi smp_x86_platform_ipi
+
+#ifdef CONFIG_SMP
+       ALIGN
+       INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+       16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if NUM_INVALIDATE_TLB_VECTORS > \idx
+ENTRY(invalidate_interrupt\idx)
+       pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+       jmp .Lcommon_invalidate_interrupt0
+       CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
+.endif
+.endr
+       CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+       invalidate_interrupt0, smp_invalidate_interrupt
+#endif
+
+apicinterrupt THRESHOLD_APIC_VECTOR \
+       threshold_interrupt smp_threshold_interrupt
+apicinterrupt THERMAL_APIC_VECTOR \
+       thermal_interrupt smp_thermal_interrupt
+
+#ifdef CONFIG_SMP
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
+       call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR \
+       call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR \
+       reschedule_interrupt smp_reschedule_interrupt
+#endif
+
+apicinterrupt ERROR_APIC_VECTOR \
+       error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR \
+       spurious_interrupt smp_spurious_interrupt
+
+#ifdef CONFIG_IRQ_WORK
+apicinterrupt IRQ_WORK_VECTOR \
+       irq_work_interrupt smp_irq_work_interrupt
+#endif
+#endif /* !CONFIG_XEN */
+
+/*
+ * Exception entry points.
+ */
+.macro zeroentry sym do_sym
+ENTRY(\sym)
+       INTR_FRAME
+        movq (%rsp),%rcx       /* reload %rcx from the lowest stack slot (INTR_FRAME records rcx there) */
+       CFI_RESTORE rcx
+        movq 8(%rsp),%r11      /* reload %r11 from the next slot (INTR_FRAME records r11 there) */
+       CFI_RESTORE r11
+       movq $-1,8(%rsp)        /* ORIG_RAX: no syscall to restart (reuses the slot r11 was read from) */
+       subq $ORIG_RAX-R15-1*8,%rsp     /* NOTE(review): the 1*8 presumably accounts for the one slot already below ORIG_RAX - confirm */
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-1*8
+       call error_entry
+       DEFAULT_FRAME -1
+       movq %rsp,%rdi          /* pt_regs pointer */
+       xorl %esi,%esi          /* no error code */
+       call \do_sym
+       jmp error_exit          /* %ebx: no swapgs flag */
+       CFI_ENDPROC
+END(\sym)
+.endm
+
+.macro paranoidzeroentry sym do_sym
+       zeroentry \sym \do_sym  /* in this file the paranoid variant is just zeroentry */
+.endm
+
+.macro paranoidzeroentry_ist sym do_sym ist
+       zeroentry \sym \do_sym  /* \ist is accepted but not used; expands to plain zeroentry */
+.endm
+
+.macro errorentry sym do_sym
+ENTRY(\sym)
+       XCPT_FRAME
+        movq (%rsp),%rcx       /* reload %rcx from the lowest stack slot */
+       CFI_RESTORE rcx
+        movq 8(%rsp),%r11      /* reload %r11 from the next slot */
+       CFI_RESTORE r11
+       subq $ORIG_RAX-R15-2*8,%rsp     /* NOTE(review): the 2*8 presumably accounts for the two slots already below the error code - confirm */
+       CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8
+       call error_entry
+       DEFAULT_FRAME -1
+       movq %rsp,%rdi                  /* pt_regs pointer */
+       movq ORIG_RAX(%rsp),%rsi        /* get error code (second argument to \do_sym) */
+       movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
+       call \do_sym
+       jmp error_exit                  /* %ebx: no swapgs flag */
+       CFI_ENDPROC
+END(\sym)
+.endm
+
+       /* error code is on the stack already */
+.macro paranoiderrorentry sym do_sym
+       errorentry \sym \do_sym /* in this file the paranoid variant is just errorentry */
+.endm
+
+/*
+ * Copied from arch/xen/i386/kernel/entry.S
+ */               
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+       CFI_STARTPROC
+# Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
+# see the correct pointer to the pt_regs
+       movq %rdi, %rsp            # we don't return, adjust the stack frame
+       CFI_ENDPROC
+       DEFAULT_FRAME
+11:    incl PER_CPU_VAR(irq_count)     /* label 11 is also the re-entry point used by restore_all_enable_events */
+       movq %rsp,%rbp                  /* remember the current stack pointer */
+       CFI_DEF_CFA_REGISTER rbp
+       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp  /* switch to irq_stack_ptr only if the incl above produced zero */
+       pushq %rbp                      # backlink for old unwinder
+       call evtchn_do_upcall
+       popq %rsp                       /* pops the value pushed above, i.e. the previous stack pointer */
+       CFI_DEF_CFA_REGISTER rsp
+       decl PER_CPU_VAR(irq_count)     /* undo the incl at label 11 */
+       jmp  error_exit
+       CFI_ENDPROC
+END(do_hypervisor_callback)
+
+        ALIGN
+restore_all_enable_events:  
+       PARTIAL_FRAME
+       TRACE_IRQS_ON
+       __ENABLE_INTERRUPTS
+
+scrit: /**** START OF CRITICAL REGION ****/
+       __TEST_PENDING
+       CFI_REMEMBER_STATE
+       jnz  14f                        # process more events if necessary...
+        RESTORE_ARGS 1,8,1
+        HYPERVISOR_IRET 0
+        
+       CFI_RESTORE_STATE
+14:    __DISABLE_INTERRUPTS
+       SAVE_REST
+        movq %rsp,%rdi                  # set the argument again
+       jmp  11b
+       CFI_ENDPROC
+ecrit:  /**** END OF CRITICAL REGION ****/
+# At this point, unlike on x86-32, we don't do the fixup to simplify the 
+# code and the stack frame is more complex on x86-64.
+# When the kernel is interrupted in the critical section, the kernel 
+# will do IRET in that case, and everything will be restored at that point, 
+# i.e. it just resumes from the next instruction interrupted with the same context. 
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we are in category 1.
+ENTRY(failsafe_callback)
+       INTR_FRAME offset=4*8
+       movw %ds,%cx
+       cmpw %cx,0x10(%rsp)
+       CFI_REMEMBER_STATE
+       jne 1f
+       movw %es,%cx
+       cmpw %cx,0x18(%rsp)
+       jne 1f
+       movw %fs,%cx
+       cmpw %cx,0x20(%rsp)
+       jne 1f
+       movw %gs,%cx
+       cmpw %cx,0x28(%rsp)
+       jne 1f
+       /* All segments match their saved values => Category 2 (Bad IRET). */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       movq $11,%rdi   /* SIGSEGV */
+       jmp do_exit                     
+       CFI_RESTORE_STATE
+1:     /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+       movq (%rsp),%rcx
+       CFI_RESTORE rcx
+       movq 8(%rsp),%r11
+       CFI_RESTORE r11
+       addq $0x30,%rsp
+       CFI_ADJUST_CFA_OFFSET -0x30
+       pushq_cfi $0
+       SAVE_ALL
+       jmp error_exit
+       CFI_ENDPROC
+
+zeroentry divide_error do_divide_error
+zeroentry overflow do_overflow
+zeroentry bounds do_bounds
+zeroentry invalid_op do_invalid_op
+zeroentry device_not_available do_device_not_available
+zeroentry hypervisor_callback do_hypervisor_callback
+zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
+errorentry invalid_TSS do_invalid_TSS
+errorentry segment_not_present do_segment_not_present
+zeroentry coprocessor_error do_coprocessor_error
+errorentry alignment_check do_alignment_check
+zeroentry simd_coprocessor_error do_simd_coprocessor_error
+       
+ENTRY(kernel_thread_helper)
+       pushq $0                # fake return address
+       CFI_STARTPROC
+       /*
+        * Here we are in the child and the registers are set as they were
+        * at kernel_thread() invocation in the parent.
+        */
+       call *%rsi
+       # exit
+       mov %eax, %edi
+       call do_exit
+       ud2                     # padding for call trace
+       CFI_ENDPROC
+END(kernel_thread_helper)
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
+ *
+ * C extern interface:
+ *      extern long execve(const char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ *     rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fall back to:
+ *     extern long sys_execve(const char *name, char **argv, char **envp, struct pt_regs *regs)
+ *
+ * sys_execve asm fallback arguments:
+ *     rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
+ */
+ENTRY(kernel_execve)
+       CFI_STARTPROC
+       FAKE_STACK_FRAME $0
+       SAVE_ALL
+       movq %rsp,%rcx
+       call sys_execve
+       movq %rax, RAX(%rsp)
+       RESTORE_REST
+       testq %rax,%rax
+       jne 1f
+        jmp int_ret_from_sys_call
+1:      RESTORE_ARGS
+       UNFAKE_STACK_FRAME
+       ret
+       CFI_ENDPROC
+END(kernel_execve)
+
+/* Call softirq on interrupt stack. Interrupts are off. */
+ENTRY(call_softirq)
+       CFI_STARTPROC
+       pushq_cfi %rbp
+       CFI_REL_OFFSET rbp,0
+       mov  %rsp,%rbp
+       CFI_DEF_CFA_REGISTER rbp
+       incl PER_CPU_VAR(irq_count)
+       cmove PER_CPU_VAR(irq_stack_ptr),%rsp
+       push  %rbp                      # backlink for old unwinder
+       call __do_softirq
+       leaveq
+       CFI_RESTORE             rbp
+       CFI_DEF_CFA_REGISTER    rsp
+       CFI_ADJUST_CFA_OFFSET   -8
+       decl PER_CPU_VAR(irq_count)
+       ret
+       CFI_ENDPROC
+END(call_softirq)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+       CFI_STARTPROC
+       movq    %r15, R15(%rdi)
+       movq    %r14, R14(%rdi)
+       xchgq   %rsi, %rdx
+       movq    %r13, R13(%rdi)
+       movq    %r12, R12(%rdi)
+       xorl    %eax, %eax
+       movq    %rbp, RBP(%rdi)
+       movq    %rbx, RBX(%rdi)
+       movq    (%rsp), %r9
+       xchgq   %rdx, %rcx
+       movq    %rax, R11(%rdi)
+       movq    %rax, R10(%rdi)
+       movq    %rax, R9(%rdi)
+       movq    %rax, R8(%rdi)
+       movq    %rax, RAX(%rdi)
+       movq    %rax, RCX(%rdi)
+       movq    %rax, RDX(%rdi)
+       movq    %rax, RSI(%rdi)
+       movq    %rax, RDI(%rdi)
+       movq    %rax, ORIG_RAX(%rdi)
+       movq    %r9, RIP(%rdi)
+       leaq    8(%rsp), %r9
+       movq    $__KERNEL_CS, CS(%rdi)
+       movq    %rax, EFLAGS(%rdi)
+       movq    %r9, RSP(%rdi)
+       movq    $__KERNEL_DS, SS(%rdi)
+       jmpq    *%rcx
+       CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
+/*
+ * Some functions should be protected against kprobes
+ */
+       .pushsection .kprobes.text, "ax"
+
+paranoidzeroentry_ist debug do_debug DEBUG_STACK
+zeroentry nmi do_nmi_callback
+paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
+paranoiderrorentry stack_segment do_stack_segment
+errorentry general_protection do_general_protection
+errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
+#ifdef CONFIG_X86_MCE
+paranoidzeroentry machine_check *machine_check_vector(%rip)
+#endif
+
+#ifndef CONFIG_XEN
+       /*
+        * "Paranoid" exit path from exception stack.
+        * Paranoid because this is used by NMIs and cannot take
+        * any kernel state for granted.
+        * We don't do kernel preemption checks here, because only
+        * NMI should be common and it does not enable IRQs and
+        * cannot get reschedule ticks.
+        *
+        * "trace" is 0 for the NMI handler only, because irq-tracing
+        * is fundamentally NMI-unsafe. (we cannot change the soft and
+        * hard flags at once, atomically)
+        */
+
+       /* ebx: no swapgs flag */
+ENTRY(paranoid_exit)
+       DEFAULT_FRAME
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       testl %ebx,%ebx                         /* swapgs needed? */
+       jnz paranoid_restore
+       testl $3,CS(%rsp)
+       jnz   paranoid_userspace
+paranoid_swapgs:
+       TRACE_IRQS_IRETQ 0
+       SWAPGS_UNSAFE_STACK
+       RESTORE_ALL 8
+       jmp irq_return
+paranoid_restore:
+       TRACE_IRQS_IRETQ 0
+       RESTORE_ALL 8
+       jmp irq_return
+paranoid_userspace:
+       GET_THREAD_INFO(%rcx)
+       movl TI_flags(%rcx),%ebx
+       andl $_TIF_WORK_MASK,%ebx
+       jz paranoid_swapgs
+       movq %rsp,%rdi                  /* &pt_regs */
+       call sync_regs
+       movq %rax,%rsp                  /* switch stack for scheduling */
+       testl $_TIF_NEED_RESCHED,%ebx
+       jnz paranoid_schedule
+       movl %ebx,%edx                  /* arg3: thread flags */
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_NONE)
+       xorl %esi,%esi                  /* arg2: oldset */
+       movq %rsp,%rdi                  /* arg1: &pt_regs */
+       call do_notify_resume
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       jmp paranoid_userspace
+paranoid_schedule:
+       TRACE_IRQS_ON
+       ENABLE_INTERRUPTS(CLBR_ANY)
+       call schedule
+       DISABLE_INTERRUPTS(CLBR_ANY)
+       TRACE_IRQS_OFF
+       jmp paranoid_userspace
+       CFI_ENDPROC
+END(paranoid_exit)
+#endif
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack.
+ * Returns the "no swapgs" flag in %ebx (not set when built with CONFIG_XEN).
+ */
+ENTRY(error_entry)
+       XCPT_FRAME start=2 offset=ORIG_RAX-R15+8
+       /* oldrax contains error code */
+       cld
+       movq %rdi, RDI+8(%rsp)
+       movq %rsi, RSI+8(%rsp)
+       movq %rdx, RDX+8(%rsp)
+       movq %rcx, RCX+8(%rsp)
+       movq %rax, RAX+8(%rsp)
+       movq  %r8,  R8+8(%rsp)
+       movq  %r9,  R9+8(%rsp)
+       movq %r10, R10+8(%rsp)
+       movq %r11, R11+8(%rsp)
+       movq_cfi rbx, RBX+8
+       movq %rbp, RBP+8(%rsp)
+       movq %r12, R12+8(%rsp)
+       movq %r13, R13+8(%rsp)
+       movq %r14, R14+8(%rsp)
+       movq %r15, R15+8(%rsp)
+#ifndef CONFIG_XEN
+       xorl %ebx,%ebx
+       testl $3,CS+8(%rsp)
+       je error_kernelspace
+error_swapgs:
+       SWAPGS
+error_sti:
+#endif
+       TRACE_IRQS_OFF
+       ret
+
+#ifndef CONFIG_XEN
+/*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. The exception handlers after iret run with
+ * kernel gs again, so don't set the user space flag. B stepping K8s
+ * sometimes report a truncated RIP for IRET exceptions returning to
+ * compat mode. Check for these here too.
+ */
+error_kernelspace:
+       CFI_REL_OFFSET rcx, RCX+8
+       incl %ebx
+       leaq irq_return(%rip),%rcx
+       cmpq %rcx,RIP+8(%rsp)
+       je error_swapgs
+       movl %ecx,%eax  /* zero extend */
+       cmpq %rax,RIP+8(%rsp)
+       je bstep_iret
+       cmpq $gs_change,RIP+8(%rsp)
+       je error_swapgs
+       jmp error_sti
+
+bstep_iret:
+       /* Fix truncated RIP */
+       movq %rcx,RIP+8(%rsp)
+       jmp error_swapgs
+#endif
+       CFI_ENDPROC
+END(error_entry)
+
+
+ENTRY(error_exit)
+       DEFAULT_FRAME
+       RESTORE_REST
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       GET_THREAD_INFO(%rcx)
+       testb $3,CS-ARGOFFSET(%rsp)
+       jz retint_kernel
+       LOCKDEP_SYS_EXIT_IRQ
+       movl TI_flags(%rcx),%edx
+       movl $_TIF_WORK_MASK,%edi
+       andl %edi,%edx
+       jnz retint_careful
+       jmp retint_restore_args
+       CFI_ENDPROC
+END(error_exit)
+
+
+#define extern #
+#include <asm-generic/percpu.h>
+
+.pushsection PER_CPU_BASE_SECTION, "aw", @progbits
+in_NMI:        .byte   0
+.popsection
+
+do_nmi_callback:
+       CFI_STARTPROC
+       addq $8, %rsp
+       CFI_ENDPROC
+       DEFAULT_FRAME
+       orb  $1, PER_CPU_VAR(in_NMI)
+       js   1f
+0:
+       movb $0x80, PER_CPU_VAR(in_NMI)
+       call do_nmi
+       movl $0x80, %eax
+       cmpxchgb %ah, PER_CPU_VAR(in_NMI)
+       jne  0b
+       orl  $NMI_MASK,EFLAGS(%rsp)
+1:
+       RESTORE_REST
+       DISABLE_INTERRUPTS(CLBR_NONE)
+       TRACE_IRQS_OFF
+       GET_THREAD_INFO(%rcx)
+       jmp  retint_restore_args
+       CFI_ENDPROC
+END(do_nmi_callback)
+
+
+#ifndef CONFIG_IA32_EMULATION
+ENTRY(ignore_sysret)
+       INTR_FRAME
+       popq_cfi %rcx
+       CFI_RESTORE rcx
+       popq_cfi %r11
+       CFI_RESTORE r11
+       mov $-ENOSYS,%eax
+       # any non-zero value not having VGCF_in_syscall set will do:
+       HYPERVISOR_IRET VGCF_i387_valid
+       CFI_ENDPROC
+END(ignore_sysret)
+#endif
+
+/*
+ * End of kprobes section
+ */
+       .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index cdc79b5..9ad9b48 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -239,21 +239,21 @@ ENDPROC(native_usergs_sysret64)
  /*
   * initial frame state for interrupts (and exceptions without error code)
   */
-       .macro EMPTY_FRAME start=1 offset=0
-       .if \start
+       .macro EMPTY_FRAME offset=0
         CFI_STARTPROC simple
         CFI_SIGNAL_FRAME
-       CFI_DEF_CFA rsp,8+\offset
-       .else
-       CFI_DEF_CFA_OFFSET 8+\offset
-       .endif
+       CFI_DEF_CFA rsp,\offset
         .endm
  
  /*
   * initial frame state for interrupts (and exceptions without error code)
   */
         .macro INTR_FRAME start=1 offset=0
-       EMPTY_FRAME \start, SS+8+\offset-RIP
+       .if \start
+       EMPTY_FRAME SS+8+\offset-RIP
+       .else
+       CFI_DEF_CFA_OFFSET SS+8+\offset-RIP
+       .endif
         /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
         CFI_REL_OFFSET rsp, RSP+\offset-RIP
         /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
@@ -267,14 +267,15 @@ ENDPROC(native_usergs_sysret64)
   */
         .macro XCPT_FRAME start=1 offset=0
         INTR_FRAME \start, RIP+\offset-ORIG_RAX
-       /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
         .endm
  
  /*
   * frame that enables calling into C.
   */
         .macro PARTIAL_FRAME start=1 offset=0
+       .if \start >= 0
         XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
+       .endif
         CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
         CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
         CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
@@ -290,7 +291,9 @@ ENDPROC(native_usergs_sysret64)
   * frame that enables passing a complete pt_regs to a C function.
   */
         .macro DEFAULT_FRAME start=1 offset=0
+       .if \start >= -1
         PARTIAL_FRAME \start, R11+\offset-R15
+       .endif
         CFI_REL_OFFSET rbx, RBX+\offset
         CFI_REL_OFFSET rbp, RBP+\offset
         CFI_REL_OFFSET r12, R12+\offset
@@ -345,14 +348,14 @@ ENDPROC(native_usergs_sysret64)
         .endm
  
  ENTRY(save_rest)
-       PARTIAL_FRAME 1 REST_SKIP+8
+       CFI_STARTPROC
         movq 5*8+16(%rsp), %r11 /* save return address */
-       movq_cfi rbx, RBX+16
-       movq_cfi rbp, RBP+16
-       movq_cfi r12, R12+16
-       movq_cfi r13, R13+16
-       movq_cfi r14, R14+16
-       movq_cfi r15, R15+16
+       movq %rbx, RBX+16(%rsp)
+       movq %rbp, RBP+16(%rsp)
+       movq %r12, R12+16(%rsp)
+       movq %r13, R13+16(%rsp)
+       movq %r14, R14+16(%rsp)
+       movq %r15, R15+16(%rsp)
         movq %r11, 8(%rsp)      /* return address */
         FIXUP_TOP_OF_STACK %r11, 16
         ret
@@ -362,23 +365,23 @@ END(save_rest)
  /* save complete stack frame */
         .pushsection .kprobes.text, "ax"
  ENTRY(save_paranoid)
-       XCPT_FRAME 1 RDI+8
+       XCPT_FRAME offset=ORIG_RAX-R15+8
         cld
-       movq_cfi rdi, RDI+8
-       movq_cfi rsi, RSI+8
+       movq %rdi, RDI+8(%rsp)
+       movq %rsi, RSI+8(%rsp)
         movq_cfi rdx, RDX+8
         movq_cfi rcx, RCX+8
         movq_cfi rax, RAX+8
-       movq_cfi r8, R8+8
-       movq_cfi r9, R9+8
-       movq_cfi r10, R10+8
-       movq_cfi r11, R11+8
+       movq %r8, R8+8(%rsp)
+       movq %r9, R9+8(%rsp)
+       movq %r10, R10+8(%rsp)
+       movq %r11, R11+8(%rsp)
         movq_cfi rbx, RBX+8
-       movq_cfi rbp, RBP+8
-       movq_cfi r12, R12+8
-       movq_cfi r13, R13+8
-       movq_cfi r14, R14+8
-       movq_cfi r15, R15+8
+       movq %rbp, RBP+8(%rsp)
+       movq %r12, R12+8(%rsp)
+       movq %r13, R13+8(%rsp)
+       movq %r14, R14+8(%rsp)
+       movq %r15, R15+8(%rsp)
         movl $1,%ebx
         movl $MSR_GS_BASE,%ecx
         rdmsr
@@ -685,7 +688,7 @@ ENTRY(\label)
         subq $REST_SKIP, %rsp
         CFI_ADJUST_CFA_OFFSET REST_SKIP
         call save_rest
-       DEFAULT_FRAME 0 8               /* offset 8: return address */
+       DEFAULT_FRAME -2 8              /* offset 8: return address */
         leaq 8(%rsp), \arg      /* pt_regs pointer */
         call \func
         jmp ptregscall_common
@@ -1068,7 +1071,7 @@ ENTRY(\sym)
         subq $ORIG_RAX-R15, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call error_entry
-       DEFAULT_FRAME 0
+       DEFAULT_FRAME -1
         movq %rsp,%rdi          /* pt_regs pointer */
         xorl %esi,%esi          /* no error code */
         call \do_sym
@@ -1085,6 +1088,7 @@ ENTRY(\sym)
         subq $ORIG_RAX-R15, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
+       DEFAULT_FRAME -1
         TRACE_IRQS_OFF
         movq %rsp,%rdi          /* pt_regs pointer */
         xorl %esi,%esi          /* no error code */
@@ -1103,6 +1107,7 @@ ENTRY(\sym)
         subq $ORIG_RAX-R15, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
+       DEFAULT_FRAME -1
         TRACE_IRQS_OFF
         movq %rsp,%rdi          /* pt_regs pointer */
         xorl %esi,%esi          /* no error code */
@@ -1121,7 +1126,7 @@ ENTRY(\sym)
         subq $ORIG_RAX-R15, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call error_entry
-       DEFAULT_FRAME 0
+       DEFAULT_FRAME -1
         movq %rsp,%rdi                  /* pt_regs pointer */
         movq ORIG_RAX(%rsp),%rsi        /* get error code */
         movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
@@ -1139,7 +1144,7 @@ ENTRY(\sym)
         subq $ORIG_RAX-R15, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
         call save_paranoid
-       DEFAULT_FRAME 0
+       DEFAULT_FRAME -1
         TRACE_IRQS_OFF
         movq %rsp,%rdi                  /* pt_regs pointer */
         movq ORIG_RAX(%rsp),%rsi        /* get error code */
@@ -1260,7 +1265,41 @@ ENTRY(call_softirq)
         CFI_ENDPROC
  END(call_softirq)
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+       CFI_STARTPROC
+       movq    %r15, R15(%rdi)
+       movq    %r14, R14(%rdi)
+       xchgq   %rsi, %rdx
+       movq    %r13, R13(%rdi)
+       movq    %r12, R12(%rdi)
+       xorl    %eax, %eax
+       movq    %rbp, RBP(%rdi)
+       movq    %rbx, RBX(%rdi)
+       movq    (%rsp), %r9
+       xchgq   %rdx, %rcx
+       movq    %rax, R11(%rdi)
+       movq    %rax, R10(%rdi)
+       movq    %rax, R9(%rdi)
+       movq    %rax, R8(%rdi)
+       movq    %rax, RAX(%rdi)
+       movq    %rax, RCX(%rdi)
+       movq    %rax, RDX(%rdi)
+       movq    %rax, RSI(%rdi)
+       movq    %rax, RDI(%rdi)
+       movq    %rax, ORIG_RAX(%rdi)
+       movq    %r9, RIP(%rdi)
+       leaq    8(%rsp), %r9
+       movq    $__KERNEL_CS, CS(%rdi)
+       movq    %rax, EFLAGS(%rdi)
+       movq    %r9, RSP(%rdi)
+       movq    $__KERNEL_DS, SS(%rdi)
+       jmpq    *%rcx
+       CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
+#ifdef CONFIG_PARAVIRT_XEN
  zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
  
  /*
@@ -1360,7 +1399,7 @@ END(xen_failsafe_callback)
  apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
         xen_hvm_callback_vector xen_evtchn_do_upcall
  
-#endif /* CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
  
  /*
   * Some functions should be protected against kprobes
@@ -1370,7 +1409,7 @@ apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
  paranoidzeroentry_ist debug do_debug DEBUG_STACK
  paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
  paranoiderrorentry stack_segment do_stack_segment
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
  zeroentry xen_debug do_debug
  zeroentry xen_int3 do_int3
  errorentry xen_stack_segment do_stack_segment
@@ -1449,25 +1488,24 @@ END(paranoid_exit)
   * returns in "no swapgs flag" in %ebx.
   */
  ENTRY(error_entry)
-       XCPT_FRAME
-       CFI_ADJUST_CFA_OFFSET 15*8
+       XCPT_FRAME offset=ORIG_RAX-R15+8
         /* oldrax contains error code */
         cld
-       movq_cfi rdi, RDI+8
-       movq_cfi rsi, RSI+8
-       movq_cfi rdx, RDX+8
-       movq_cfi rcx, RCX+8
-       movq_cfi rax, RAX+8
-       movq_cfi  r8,  R8+8
-       movq_cfi  r9,  R9+8
-       movq_cfi r10, R10+8
-       movq_cfi r11, R11+8
+       movq %rdi, RDI+8(%rsp)
+       movq %rsi, RSI+8(%rsp)
+       movq %rdx, RDX+8(%rsp)
+       movq %rcx, RCX+8(%rsp)
+       movq %rax, RAX+8(%rsp)
+       movq  %r8,  R8+8(%rsp)
+       movq  %r9,  R9+8(%rsp)
+       movq %r10, R10+8(%rsp)
+       movq %r11, R11+8(%rsp)
         movq_cfi rbx, RBX+8
-       movq_cfi rbp, RBP+8
-       movq_cfi r12, R12+8
-       movq_cfi r13, R13+8
-       movq_cfi r14, R14+8
-       movq_cfi r15, R15+8
+       movq %rbp, RBP+8(%rsp)
+       movq %r12, R12+8(%rsp)
+       movq %r13, R13+8(%rsp)
+       movq %r14, R14+8(%rsp)
+       movq %r15, R15+8(%rsp)
         xorl %ebx,%ebx
         testl $3,CS+8(%rsp)
         je error_kernelspace
@@ -1485,6 +1523,7 @@ error_sti:
   * compat mode. Check for these here too.
   */
  error_kernelspace:
+       CFI_REL_OFFSET rcx, RCX+8
         incl %ebx
         leaq irq_return(%rip),%rcx
         cmpq %rcx,RIP+8(%rsp)
@@ -1727,7 +1766,7 @@ end_repeat_nmi:
          * exceptions might do.
          */
         call save_paranoid
-       DEFAULT_FRAME 0
+       DEFAULT_FRAME -1
         /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
         movq %rsp,%rdi
         movq $-1,%rsi
diff --git a/arch/x86/kernel/fixup.c b/arch/x86/kernel/fixup.c

new file mode 100644 (file)

index 0000000..64cd323
--- /dev/null
+++ b/arch/x86/kernel/fixup.c
@@ -0,0 +1,89 @@
+/******************************************************************************
+ * fixup.c
+ * 
+ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
+ * Used to avoid repeated slow emulation of common instructions used by the
+ * user-space TLS (Thread-Local Storage) libraries.
+ * 
+ * **** NOTE ****
+ *  Issues with the binary rewriting have caused it to be removed. Instead
+ *  we rely on Xen's emulator to boot the kernel, and then print a banner
+ *  message recommending that the user disables /lib/tls.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/version.h>
+#include <asm/traps.h>
+
+#define DP(_f, _args...) pr_alert("  " _f "\n" , ## _args )
+
+dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
+{
+       static unsigned long printed = 0;
+       char info[100];
+       int i;
+
+       /* Ignore statically-linked init. */
+       if (current->tgid == 1)
+               return;
+            
+       VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable,
+                                 VMASST_TYPE_4gb_segments_notify));
+
+       if (test_and_set_bit(0, &printed))
+               return;
+
+       sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
+
+       DP("");
+       DP("***************************************************************");
+       DP("***************************************************************");
+       DP("** WARNING: Currently emulating unsupported memory accesses  **");
+       DP("**          in /lib/tls glibc libraries. The emulation is    **");
+       DP("**          slow. To ensure full performance you should      **");
+       DP("**          install a 'xen-friendly' (nosegneg) version of   **");
+       DP("**          the library, or disable tls support by executing **");
+       DP("**          the following as root:                           **");
+       DP("**          mv /lib/tls /lib/tls.disabled                    **");
+       DP("** Offending process: %-38.38s **", info);
+       DP("***************************************************************");
+       DP("***************************************************************");
+       DP("");
+
+       for (i = 5; i > 0; i--) {
+               touch_softlockup_watchdog();
+               printk("Pausing... %d", i);
+               mdelay(1000);
+               printk("\b\b\b\b\b\b\b\b\b\b\b\b");
+       }
+
+       printk("Continuing...\n\n");
+}
+
+static int __init fixup_init(void)
+{
+       WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                                    VMASST_TYPE_4gb_segments_notify));
+       return 0;
+}
+__initcall(fixup_init);
diff --git a/arch/x86/kernel/head-xen.c b/arch/x86/kernel/head-xen.c

new file mode 100644 (file)

index 0000000..b15fd60
--- /dev/null
+++ b/arch/x86/kernel/head-xen.c
@@ -0,0 +1,223 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/pci.h>
+
+#include <asm/setup.h>
+#ifndef CONFIG_XEN
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+       unsigned int lowmem, ebda_addr;
+
+       /* To determine the position of the EBDA and the */
+       /* end of conventional memory, we need to look at */
+       /* the BIOS data area. In a paravirtual environment */
+       /* that area is absent. We'll just have to assume */
+       /* that the paravirt case can handle memory setup */
+       /* correctly, without our help. */
+       if (paravirt_enabled())
+               return;
+
+       /* end of low (conventional) memory */
+       lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+       lowmem <<= 10;
+
+       /* start of EBDA area */
+       ebda_addr = get_bios_ebda();
+
+       /* Fixup: bios puts an EBDA in the top 64K segment */
+       /* of conventional memory, but does not adjust lowmem. */
+       if ((lowmem - ebda_addr) <= 0x10000)
+               lowmem = ebda_addr;
+
+       /* Fixup: bios does not report an EBDA at all. */
+       /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+       if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+               lowmem = 0x9f000;
+
+       /* Paranoia: should never happen, but... */
+       if ((lowmem == 0) || (lowmem >= 0x100000))
+               lowmem = 0x9f000;
+
+       /* reserve all memory between lowmem and the 1MB mark */
+       memblock_reserve(lowmem, 0x100000 - lowmem);
+}
+#else /* CONFIG_XEN */
+#include <linux/export.h>
+#include <asm/fixmap.h>
+#include <asm/mc146818rtc.h>
+#include <asm/pgtable.h>
+#include <asm/sections.h>
+#include <xen/interface/callback.h>
+#include <xen/interface/memory.h>
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void nmi(void);
+
+#ifdef CONFIG_X86_64
+#include <asm/proto.h>
+#define CALLBACK_ADDR(fn) ((unsigned long)(fn))
+#else
+#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) }
+#endif
+
+unsigned long __initdata xen_initrd_start;
+
+unsigned long *__read_mostly machine_to_phys_mapping =
+       (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned long __read_mostly machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
+
+void __init xen_start_kernel(void)
+{
+       unsigned int i;
+       struct xen_machphys_mapping mapping;
+
+       xen_setup_features();
+
+       if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+               machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+               machine_to_phys_nr = mapping.max_mfn + 1;
+       } else
+               machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
+#ifdef CONFIG_X86_32
+       WARN_ON(machine_to_phys_mapping + (machine_to_phys_nr - 1)
+               < machine_to_phys_mapping);
+#endif
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               phys_to_machine_mapping =
+                       (unsigned long *)xen_start_info->mfn_list;
+
+       WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                                    VMASST_TYPE_writable_pagetables));
+       /* NOTE(review): size arg below looks like an end address - verify */
+       memblock_reserve(ALIGN(__pa_symbol(&_end), PAGE_SIZE),
+                        __pa(xen_start_info->pt_base)
+                        + PFN_PHYS(xen_start_info->nr_pt_frames));
+
+#ifdef CONFIG_X86_32
+{
+       extern pte_t swapper_pg_fixmap[PTRS_PER_PTE];
+       unsigned long addr;
+
+       /* Do an early initialization of the fixmap area */
+       make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables);
+       addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+       set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr),
+                                     addr),
+                          addr),
+               __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE));
+}
+#else
+       x86_configure_nx();
+       xen_init_pt();
+#endif
+
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \
+                       != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE)))
+       FIX_BUG_ON(SHARED_INFO);
+       FIX_BUG_ON(ISAMAP_BEGIN);
+       FIX_BUG_ON(ISAMAP_END);
+#undef pmd_index
+#undef __FIXADDR_TOP
+
+       /* Switch to the real shared_info page, and clear the dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       clear_page(empty_zero_page);
+
+       setup_vcpu_info(0);
+
+       /* Set up mapping of lowest 1MB of physical memory. */
+       for (i = 0; i < NR_FIX_ISAMAPS; i++)
+               if (is_initial_xendomain())
+                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+               else
+                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
+                                    virt_to_machine(empty_zero_page),
+                                    PAGE_KERNEL_RO);
+
+       if (is_initial_xendomain()) {
+               x86_platform.get_wallclock = mach_get_cmos_time;
+               x86_platform.set_wallclock = mach_set_rtc_mmss;
+
+               pci_request_acs();
+       } else
+               x86_init.resources.probe_roms = x86_init_noop;
+}
+
+void __init xen_arch_setup(void)
+{
+       int ret;
+       static const struct callback_register __initconst event = {
+               .type = CALLBACKTYPE_event,
+               .address = CALLBACK_ADDR(hypervisor_callback)
+       };
+       static const struct callback_register __initconst failsafe = {
+               .type = CALLBACKTYPE_failsafe,
+               .address = CALLBACK_ADDR(failsafe_callback)
+       };
+#ifdef CONFIG_X86_64
+       static const struct callback_register __initconst syscall = {
+               .type = CALLBACKTYPE_syscall,
+               .address = CALLBACK_ADDR(system_call)
+       };
+#endif
+       static const struct callback_register __initconst nmi_cb = {
+               .type = CALLBACKTYPE_nmi,
+               .address = CALLBACK_ADDR(nmi)
+       };
+
+       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
+       if (ret == 0)
+               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
+#ifdef CONFIG_X86_64
+       if (ret == 0)
+               ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall);
+#endif
+#if CONFIG_XEN_COMPAT <= 0x030002
+#ifdef CONFIG_X86_32
+       if (ret == -ENOSYS)
+               ret = HYPERVISOR_set_callbacks(
+                       event.address.cs, event.address.eip,
+                       failsafe.address.cs, failsafe.address.eip);
+#else
+               ret = HYPERVISOR_set_callbacks(
+                       event.address,
+                       failsafe.address,
+                       syscall.address);
+#endif
+#endif
+       BUG_ON(ret);
+
+       ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb);
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (ret == -ENOSYS) {
+               static struct xennmi_callback __initdata cb = {
+                       .handler_address = (unsigned long)nmi
+               };
+
+               HYPERVISOR_nmi_op(XENNMI_register_callback, &cb);
+       }
+#endif
+}
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/head32-xen.c b/arch/x86/kernel/head32-xen.c

new file mode 100644 (file)

index 0000000..fcc893b
--- /dev/null
+++ b/arch/x86/kernel/head32-xen.c
@@ -0,0 +1,103 @@
+/*
+ *  linux/arch/i386/kernel/head32.c -- prepare to run common code
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/tlbflush.h>
+
+static void __init i386_default_early_setup(void)
+{
+       /* Install the 32-bit specific setup functions. */
+       x86_init.resources.reserve_resources = i386_reserve_resources;
+#ifndef CONFIG_XEN     /* the remaining setup is compiled out for Xen */
+       x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+
+       reserve_ebda_region();
+#endif /* !CONFIG_XEN */
+}
+
+void __init i386_start_kernel(void)
+{
+#ifdef CONFIG_XEN
+       struct xen_platform_parameters pp;
+
+       WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                                    VMASST_TYPE_4gb_segments));
+       /* Run on the page directory that start_info->pt_base names. */
+       init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base;
+       /* If Xen reports where its area starts, reserve from there to the top. */
+       if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
+               hypervisor_virt_start = pp.virt_start;
+               reserve_top_address(0UL - pp.virt_start);
+       }
+       /* hypervisor_virt_start must have a pte index of zero. */
+       BUG_ON(pte_index(hypervisor_virt_start));
+#endif /* CONFIG_XEN */
+       /* Reserve the kernel image, _text up to __bss_stop. */
+       memblock_reserve(__pa_symbol(&_text),
+                        __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
+
+#ifndef CONFIG_XEN
+#ifdef CONFIG_BLK_DEV_INITRD
+       /* Reserve INITRD */
+       if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+               /* Assume only end is not page aligned */
+               u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+               u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+               u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+               memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+       }
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+       /* Call the subarch specific early setup function */
+       switch (boot_params.hdr.hardware_subarch) {
+       case X86_SUBARCH_MRST:
+               x86_mrst_early_setup();
+               break;
+       case X86_SUBARCH_CE4100:
+               x86_ce4100_early_setup();
+               break;
+       default:
+               i386_default_early_setup();
+               break;
+       }
+#else /* CONFIG_XEN */
+#ifdef CONFIG_BLK_DEV_INITRD
+       BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN);
+       if (xen_start_info->mod_start)
+               xen_initrd_start = __pa(xen_start_info->mod_start);
+#endif /* CONFIG_BLK_DEV_INITRD */
+       {
+               int max_cmdline;
+               /* Copy min(MAX_GUEST_CMDLINE, COMMAND_LINE_SIZE) bytes, NUL-terminated. */
+               if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+                       max_cmdline = COMMAND_LINE_SIZE;
+               memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline);
+               boot_command_line[max_cmdline-1] = '\0';
+       }
+
+       i386_default_early_setup();
+       xen_start_kernel();
+#endif /* !CONFIG_XEN */
+
+       /*
+        * At this point everything still needed from the boot loader
+        * or BIOS or kernel text should be early reserved or marked not
+        * RAM in e820. All other memory is free game.
+        */
+
+       start_kernel();
+}
diff --git a/arch/x86/kernel/head64-xen.c b/arch/x86/kernel/head64-xen.c

new file mode 100644 (file)

index 0000000..b2010d8
--- /dev/null
+++ b/arch/x86/kernel/head64-xen.c
@@ -0,0 +1,146 @@
+/*
+ *  prepare to run common code
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *
+ *  Jun Nakajima <jun.nakajima@intel.com>
+ *     Modified for Xen.
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+#include <linux/start_kernel.h>
+#include <linux/io.h>
+#include <linux/memblock.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
+#include <asm/bios_ebda.h>
+
+#ifndef CONFIG_XEN
+static void __init zap_identity_mappings(void)
+{
+       pgd_t *pgd = pgd_offset_k(0UL);        /* kernel pgd entry for address 0 */
+       pgd_clear(pgd);
+       __flush_tlb_all();      /* flush after clearing the entry */
+}
+
+/* Zero everything from __bss_start to __bss_stop. Don't add a printk in
+   here: printk relies on the PDA, which is not initialized yet. */
+static void __init clear_bss(void)
+{
+       memset(__bss_start, 0,
+              (unsigned long) __bss_stop - (unsigned long) __bss_start);
+}
+#endif
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+#ifdef CONFIG_XEN
+       /* Xen: the command line comes from start_info; the argument is unused. */
+       int cmdline_len = MAX_GUEST_CMDLINE;
+
+       if (cmdline_len > COMMAND_LINE_SIZE)
+               cmdline_len = COMMAND_LINE_SIZE;
+       memcpy(boot_command_line, xen_start_info->cmd_line, cmdline_len);
+       boot_command_line[cmdline_len - 1] = '\0';
+#else
+       memcpy(&boot_params, real_mode_data, sizeof(boot_params));
+       if (boot_params.hdr.cmd_line_ptr) {
+               const char *src = __va(boot_params.hdr.cmd_line_ptr);
+
+               memcpy(boot_command_line, src, COMMAND_LINE_SIZE);
+       }
+#endif
+}
+
+#include <xen/interface/memory.h>
+
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+       /*
+        * Build-time sanity checks on the kernel image and module
+        * area mappings. (these are purely build-time and produce no code)
+        */
+       BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
+       BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+       BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+       BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+       BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+       BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+       BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+                               (__START_KERNEL & PGDIR_MASK)));
+       BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+       /* The argument is reinterpreted as the Xen start_info pointer. */
+       xen_start_info = (struct start_info *)real_mode_data;
+       xen_start_kernel();
+
+#ifndef CONFIG_XEN     /* native-only section, compiled out under Xen */
+       /* clear bss before set_intr_gate with early_idt_handler */
+       clear_bss();
+
+       /* Make NULL pointers segfault */
+       zap_identity_mappings();
+       /* NOTE(review): no declaration of 'i' is visible in this function. */
+       for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
+#ifdef CONFIG_EARLY_PRINTK
+               set_intr_gate(i, &early_idt_handlers[i]);
+#else
+               set_intr_gate(i, early_idt_handler);
+#endif
+       }
+       load_idt((const struct desc_ptr *)&idt_descr);
+#endif /* !CONFIG_XEN */
+
+       if (console_loglevel == 10)
+               early_printk("Kernel alive\n");
+
+       xen_switch_pt();
+
+       x86_64_start_reservations(real_mode_data);
+}
+
+void __init x86_64_start_reservations(char *real_mode_data)
+{
+       copy_bootdata(__va(real_mode_data));
+       /* Reserve the kernel image, _text up to __bss_stop. */
+       memblock_reserve(__pa_symbol(&_text),
+                        __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       /* Reserve INITRD if needed; with SIF_MOD_START_PFN mod_start is a PFN. */
+       if (xen_start_info->flags & SIF_MOD_START_PFN) {
+               reserve_pfn_range(xen_start_info->mod_start,
+                                 PFN_UP(xen_start_info->mod_len));
+               xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT;
+       } else if (xen_start_info->mod_start)
+               xen_initrd_start = __pa(xen_start_info->mod_start);
+#endif /* CONFIG_BLK_DEV_INITRD */
+       /* Reserve the p2m frames when mfn_list lies below __START_KERNEL_map. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               xen_start_info->mfn_list = ~0UL;
+       else if (xen_start_info->mfn_list < __START_KERNEL_map)
+               reserve_pfn_range(xen_start_info->first_p2m_pfn,
+                                 xen_start_info->nr_p2m_frames);
+
+       /*
+        * At this point everything still needed from the boot loader
+        * or BIOS or kernel text should be early reserved or marked not
+        * RAM in e820. All other memory is free game.
+        */
+
+       start_kernel();
+}
diff --git a/arch/x86/kernel/head_32-xen.S b/arch/x86/kernel/head_32-xen.S

new file mode 100644 (file)

index 0000000..c434cef
--- /dev/null
+++ b/arch/x86/kernel/head_32-xen.S
@@ -0,0 +1,220 @@
+
+
+.text
+#include <linux/elfnote.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page_types.h>
+#include <asm/pgtable_types.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/asm-offsets.h>
+#include <asm/boot.h>
+#include <asm/dwarf2.h>
+#include <asm/percpu.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86            new_cpu_data+CPUINFO_x86
+#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
+#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
+
+__HEAD
+#define VIRT_ENTRY_OFFSET 0x0
+.org VIRT_ENTRY_OFFSET
+ENTRY(startup_32)
+       movl %esi,xen_start_info        # %esi = start_info pointer on entry
+       cld
+
+       /* Set up the stack pointer */
+       movl $(init_thread_union+THREAD_SIZE),%esp
+
+       /* get vendor info */
+       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
+       XEN_CPUID
+       movl %eax,X86_CPUID             # save CPUID level
+       movl %ebx,X86_VENDOR_ID         # lo 4 chars
+       movl %edx,X86_VENDOR_ID+4       # next 4 chars
+       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
+
+       movl $1,%eax            # Use the CPUID instruction to get CPU type
+       XEN_CPUID
+       movb %al,%cl            # save reg for future use
+       andb $0x0f,%ah          # mask processor family
+       movb %ah,X86
+       andb $0xf0,%al          # mask model
+       shrb $4,%al
+       movb %al,X86_MODEL
+       andb $0x0f,%cl          # mask stepping (mask revision)
+       movb %cl,X86_MASK
+       movl %edx,X86_CAPABILITY
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+       /*
+        * The linker can't handle this by relocation.  Manually set
+        * base address in stack canary segment descriptor.
+        */
+       movl $gdt_page,%eax
+       movl $stack_canary,%ecx
+       movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)  # base bits 15..0
+       shrl $16, %ecx
+       movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)  # base bits 23..16
+       movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)  # base bits 31..24
+#endif
+
+       # %esi still points to start_info, and no registers
+       # need to be preserved.
+       /* Fetch the machine frame of gdt_page: mfn_list[phys(gdt_page) >> PAGE_SHIFT]. */
+       movl XEN_START_mfn_list(%esi), %ebx
+       movl $(gdt_page - __PAGE_OFFSET), %eax
+       shrl $PAGE_SHIFT, %eax
+       movl (%ebx,%eax,4), %ecx
+       pushl %ecx                      # frame number for set_gdt below
+       /* Remap gdt_page without _PAGE_RW; presumably %ebx = va, %edx:%ecx = pte, %esi = flags. */
+       xorl %esi, %esi
+       xorl %edx, %edx
+       shldl $PAGE_SHIFT, %ecx, %edx   # %edx = pte bits above bit 31
+       shll $PAGE_SHIFT, %ecx
+       orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx
+       movl $gdt_page, %ebx
+       movl $__HYPERVISOR_update_va_mapping, %eax
+       int $0x82
+       /* Install the GDT: %ebx points at the frame number pushed above, %ecx = entry count. */
+       movl $(PAGE_SIZE / 8), %ecx
+       movl %esp, %ebx
+       movl $__HYPERVISOR_set_gdt, %eax
+       int $0x82
+
+       popl %ecx                       # drop the frame number pushed above
+
+       movl $(__KERNEL_PERCPU), %eax
+       movl %eax,%fs                   # set this cpu's percpu
+
+       movl $(__KERNEL_STACK_CANARY),%eax
+       movl %eax,%gs
+
+       cld                     # gcc2 wants the direction flag cleared at all times
+
+       pushl $0                # fake return address for unwinder
+       jmp i386_start_kernel
+
+#define HYPERCALL_PAGE_OFFSET 0x1000
+.org HYPERCALL_PAGE_OFFSET
+ENTRY(hypercall_page)
+       CFI_STARTPROC
+.skip 0x1000
+       CFI_ENDPROC
+
+/*
+ * BSS section
+ */
+__PAGE_ALIGNED_BSS
+       .align PAGE_SIZE
+ENTRY(swapper_pg_fixmap)
+       .fill 1024,4,0
+ENTRY(empty_zero_page)
+       .fill 4096,1,0
+
+/*
+ * This starts the data section.
+ */
+.data
+
+#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST
+# define XEN_DOM0_CAP          0
+# define XEN_DOM0_CAP_STR      ""
+#else
+# define XEN_DOM0_CAP          (1 << XENFEAT_dom0)
+# if CONFIG_XEN_COMPAT < 0x040200
+#  define XEN_DOM0_CAP_STR     ""
+# else
+#  define XEN_DOM0_CAP_STR     "|dom0"
+# endif
+#endif
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+/*
+ * __xen_guest information
+ */
+.macro utoa value /* emit value as upper-case ASCII hex digits */
+ .if (\value) < 0 || (\value) >= 0x10
+       utoa (((\value)>>4)&0x0fffffff) /* recurse: higher digits come first */
+ .endif
+ .if ((\value) & 0xf) < 10 /* then emit the lowest nibble */
+  .byte '0' + ((\value) & 0xf)
+ .else
+  .byte 'A' + ((\value) & 0xf) - 10
+ .endif
+.endm
+
+.section __xen_guest
+       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
+       .ascii  ",XEN_VER=xen-3.0"
+       .ascii  ",VIRT_BASE=0x"
+               utoa __PAGE_OFFSET
+       .ascii  ",ELF_PADDR_OFFSET=0x"
+               utoa __PAGE_OFFSET
+       .ascii  ",VIRT_ENTRY=0x"
+               utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
+       .ascii  ",HYPERCALL_PAGE=0x"
+               utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
+       .ascii  ",FEATURES=writable_page_tables"
+       .ascii           "|writable_descriptor_tables"
+       .ascii           "|auto_translated_physmap"
+       .ascii           "|pae_pgdir_above_4gb"
+       .ascii           "|supervisor_mode_kernel"
+#ifdef CONFIG_X86_PAE
+       .ascii  ",PAE=yes[extended-cr3]"
+#else
+       .ascii  ",PAE=no"
+#endif
+       .ascii  ",LOADER=generic"
+       .byte   0
+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
+
+
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
+       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long __PAGE_OFFSET)
+#if CONFIG_XEN_COMPAT <= 0x030002
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long __PAGE_OFFSET)
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .long 0)
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long startup_32)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long HYPERVISOR_VIRT_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
+                                                .ascii "|writable_descriptor_tables";
+                                                .ascii "|auto_translated_physmap";
+                                                .ascii "|pae_pgdir_above_4gb";
+                                                .ascii "|supervisor_mode_kernel";
+                                                .asciz XEN_DOM0_CAP_STR)
+       ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP |
+                                          (1 << XENFEAT_writable_page_tables) |
+                                          (1 << XENFEAT_writable_descriptor_tables) |
+                                          (1 << XENFEAT_auto_translated_physmap) |
+                                          (1 << XENFEAT_pae_pgdir_above_4gb) |
+                                          (1 << XENFEAT_supervisor_mode_kernel))
+#ifdef CONFIG_X86_PAE
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
+       ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
+       ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .long _PAGE_PRESENT, _PAGE_PRESENT)
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+       ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
diff --git a/arch/x86/kernel/head_64-xen.S b/arch/x86/kernel/head_64-xen.S

new file mode 100644 (file)

index 0000000..c8ce8bd
--- /dev/null
+++ b/arch/x86/kernel/head_64-xen.S
@@ -0,0 +1,176 @@
+/*
+ *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *
+ *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
+ *  Jun Nakajima <jun.nakajima@intel.com>
+ *    Modified for Xen                                
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <linux/init.h>
+#include <linux/elfnote.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+#include <asm/dwarf2.h>
+#include <asm/percpu.h>
+#include <xen/interface/elfnote.h>
+#include <xen/interface/features.h>
+
+       __HEAD
+       .code64
+       .globl startup_64
+startup_64:
+       movq $(init_thread_union+THREAD_SIZE-8),%rsp    # stack: 8 bytes below the top of init_thread_union
+
+       /* %rsi holds the pointer to the startup info structure;
+          pass it to C as the first argument (%rdi) */
+       movq %rsi,%rdi
+
+       /* Set up %gs.
+        *
+        * The base of %gs always points to the bottom of the irqstack
+        * union.  If the stack protector canary is enabled, it is
+        * located at %gs:40.  Note that, on SMP, the boot cpu uses
+        * init data section till per cpu areas are set up.
+        */
+       movl    $MSR_GS_BASE,%ecx       # MSR to write
+       movq    $INIT_PER_CPU_VAR(irq_stack_union),%rax
+       movq    %rax,%rdx
+       shrq    $32,%rdx                # %edx = upper 32 bits of the base
+       wrmsr                           # MSR[%ecx] = %edx:%eax
+
+       pushq $0                # fake return address
+       jmp x86_64_start_kernel
+
+#define NEXT_PAGE(name) \
+       .balign PAGE_SIZE; \
+ENTRY(name)
+
+       __PAGE_ALIGNED_BSS
+NEXT_PAGE(init_level4_pgt)
+       .fill   512,8,0
+
+NEXT_PAGE(level3_kernel_pgt)
+       .fill   512,8,0
+
+        /*
+         * This is used for vsyscall area mapping as we have a different
+         * level4 page table for user.
+         */
+NEXT_PAGE(level3_user_pgt)
+        .fill  512,8,0
+
+NEXT_PAGE(level2_fixmap_pgt)
+       .fill   512,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+       .fill   512,8,0
+
+       .previous
+NEXT_PAGE(hypercall_page)
+       phys_hypercall_page = . - .head.text    /* offset of this page from .head.text */
+       CFI_STARTPROC
+       .rept 0x1000 / 0x20     /* unwind info for each 32-byte stub slot in the page */
+       .skip 1 /* push %rcx */
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET  rcx,0
+       .skip 2 /* push %r11 */
+       CFI_ADJUST_CFA_OFFSET   8
+       CFI_REL_OFFSET  r11,0   /* r11, not rcx, is what now sits at 0(%rsp) */
+       .skip 5 /* mov $#,%eax */
+       .skip 2 /* syscall */
+       .skip 2 /* pop %r11 */
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE r11
+       .skip 1 /* pop %rcx */
+       CFI_ADJUST_CFA_OFFSET -8
+       CFI_RESTORE rcx
+       .align 0x20,0 /* ret */
+       .endr
+       CFI_ENDPROC
+
+#undef NEXT_PAGE
+
+       __PAGE_ALIGNED_BSS
+       .align PAGE_SIZE
+ENTRY(empty_zero_page)
+       .skip PAGE_SIZE
+
+#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST
+# define XEN_DOM0_CAP          0
+# define XEN_DOM0_CAP_STR      ""
+#else
+# define XEN_DOM0_CAP          (1 << XENFEAT_dom0)
+# if CONFIG_XEN_COMPAT < 0x040200
+#  define XEN_DOM0_CAP_STR     ""
+# else
+#  define XEN_DOM0_CAP_STR     "|dom0"
+# endif
+#endif
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+/*
+ * __xen_guest information
+ */
+.macro utoh value /* emit value as 16 upper-case ASCII hex digits, high nibble first */
+ i = 64 /* gas yields -1 for a true comparison, so the (> 9) term below adds 'A' - '0' - 10 */
+ .rept 16 /* one digit per 4-bit nibble */
+  i = i - 4 /* shift count of the current nibble */
+  .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
+ .endr
+.endm
+
+.section __xen_guest
+       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"
+       .ascii  ",XEN_VER=xen-3.0"
+       .ascii  ",VIRT_BASE=0x"
+               utoh __START_KERNEL_map
+       .ascii  ",ELF_PADDR_OFFSET=0x"
+               utoh __START_KERNEL_map
+       .ascii  ",VIRT_ENTRY=0x"
+               utoh (__START_KERNEL_map + __PHYSICAL_START)
+       .ascii  ",HYPERCALL_PAGE=0x"
+               utoh (phys_hypercall_page >> PAGE_SHIFT)
+       .ascii  ",FEATURES=writable_page_tables"
+       .ascii           "|writable_descriptor_tables"
+       .ascii           "|auto_translated_physmap"
+       .ascii           "|supervisor_mode_kernel"
+       .ascii  ",LOADER=generic"
+       .byte   0
+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */
+       
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz "2.6")
+       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz "xen-3.0")
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .quad __START_KERNEL_map)
+#if CONFIG_XEN_COMPAT <= 0x030002
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad __START_KERNEL_map)
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   .quad 0)
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .quad startup_64)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,   .quad _PAGE_PRESENT, _PAGE_PRESENT)
+       ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
+       ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M,       .quad VMEMMAP_START)
+       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .ascii "writable_page_tables";
+                                                .ascii "|writable_descriptor_tables";
+                                                .ascii "|auto_translated_physmap";
+                                                .ascii "|supervisor_mode_kernel";
+                                                .asciz XEN_DOM0_CAP_STR)
+       ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP |
+                                          (1 << XENFEAT_writable_page_tables) |
+                                          (1 << XENFEAT_writable_descriptor_tables) |
+                                          (1 << XENFEAT_auto_translated_physmap) |
+                                          (1 << XENFEAT_supervisor_mode_kernel))
+       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+       ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S

index 40f4eb3..66c3fcb 100644 (file)
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -283,6 +283,8 @@ early_idt_handlers:
  
  ENTRY(early_idt_handler)
  #ifdef CONFIG_EARLY_PRINTK
+#include <asm/calling.h>
+#include <asm/dwarf2.h>
         cmpl $2,early_recursion_flag(%rip)
         jz  1f
         incl early_recursion_flag(%rip)
@@ -298,6 +300,16 @@ ENTRY(early_idt_handler)
         testl $0x27d00,%eax
         je 0f
         popq %r8                # get error code
+
+       CFI_STARTPROC   simple
+       CFI_SIGNAL_FRAME
+       CFI_DEF_CFA     rsp, SS+8-RIP
+#      CFI_REL_OFFSET  ss, SS-RIP
+       CFI_REL_OFFSET  rsp, RSP-RIP
+#      CFI_REL_OFFSET  rflags, EFLAGS-RIP
+#      CFI_REL_OFFSET  cs, CS-RIP
+       CFI_REL_OFFSET  rip, RIP-RIP
+
  0:     movq 0(%rsp),%rcx       # get ip
         movq 8(%rsp),%rdx       # get cs
         xorl %eax,%eax
@@ -311,6 +323,7 @@ ENTRY(early_idt_handler)
         movq 0(%rsp),%rsi       # get rip again
         call __print_symbol
  #endif
+       CFI_ENDPROC
  #endif /* EARLY_PRINTK */
  1:     hlt
         jmp 1b
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c

index ad0de0c..0634c36 100644 (file)
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -42,6 +42,7 @@ u8                                    hpet_msi_disable;
  static unsigned long                   hpet_num_timers;
  #endif
  static void __iomem                    *hpet_virt_address;
+static int hpet_legacy_use_64_bits;
  
  struct hpet_dev {
         struct clock_event_device       evt;
@@ -69,6 +70,33 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
  
  #ifdef CONFIG_X86_64
  #include <asm/pgtable.h>
+/* Read HPET register @a; 64 bits wide when timer 0 runs in 64-bit mode. */
+static inline unsigned long hpet_read_value(unsigned long a)
+{
+       if (hpet_legacy_use_64_bits)
+               return readq(hpet_virt_address + a);
+       return readl(hpet_virt_address + a);
+}
+
+/* Write @d to HPET register @a, using the same width as hpet_read_value(). */
+static void hpet_write_value(unsigned long d, unsigned long a)
+{
+       if (hpet_legacy_use_64_bits)
+               writeq(d, hpet_virt_address + a);
+       else
+               writel(d, hpet_virt_address + a);
+}
+#else
+/* Without CONFIG_X86_64 the HPET registers are always accessed 32 bits wide. */
+static inline unsigned long hpet_read_value(unsigned long a)
+{
+       return readl(hpet_virt_address + a);
+}
+
+static void hpet_write_value(unsigned long d, unsigned long a)
+{
+       writel(d, hpet_virt_address + a);
+}
  #endif
  
  static inline void hpet_set_mapping(void)
@@ -113,6 +141,17 @@ static int __init disable_hpet(char *str)
  }
  __setup("nohpet", disable_hpet);
  
+#ifdef CONFIG_X86_64
+static int hpet64;     /* non-zero once "hpet64" was seen on the command line */
+static int __init hpet64_setup(char *str)
+{
+       hpet64 = 1;
+       return 1;
+}
+__setup("hpet64", hpet64_setup);
+#endif
+
+
  static inline int is_hpet_capable(void)
  {
         return !boot_hpet_disable && hpet_address;
@@ -222,6 +261,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
   * Common hpet info
   */
  static unsigned long hpet_freq;
+static int hpet_legacy_use_64_bits; /* configure T0 in 64-bit mode? */
  
  static void hpet_legacy_set_mode(enum clock_event_mode mode,
                           struct clock_event_device *evt);
@@ -287,10 +327,38 @@ static void hpet_enable_legacy_int(void)
         hpet_legacy_int_enabled = 1;
  }
  
+static int timer0_use_64_bits(void)
+{
+#ifdef CONFIG_X86_64
+       u32 hpet_id, timer0_cfg;
+
+       /* 64-bit mode is opt-in via the "hpet64" boot parameter. */
+       if (likely(!hpet64))
+               return 0;
+
+       hpet_id = hpet_readl(HPET_ID);
+       timer0_cfg = hpet_readl(HPET_Tn_CFG(0));
+       if (!(hpet_id & HPET_ID_64BIT) || !(timer0_cfg & HPET_TN_64BIT_CAP)) {
+               printk(KERN_DEBUG "hpet timer0 does not support 64-bit mode\n");
+               return 0;
+       }
+
+       printk(KERN_DEBUG "hpet timer0 configured in 64-bit mode\n");
+       return 1;
+#else
+       /*
+        * Using the HPET in 64-bit mode without atomic 64-bit
+        * accesses is too inefficient.
+        */
+       return 0;
+#endif
+}
+
  static void hpet_legacy_clockevent_register(void)
  {
         /* Start HPET legacy interrupts */
         hpet_enable_legacy_int();
+       hpet_legacy_use_64_bits = timer0_use_64_bits();
  
         /*
          * Start hpet with the boot cpu mask and make it
@@ -322,9 +390,10 @@ static void hpet_set_mode(enum clock_event_mode mode,
                 /* Make sure we use edge triggered interrupts */
                 cfg &= ~HPET_TN_LEVEL;
                 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-                      HPET_TN_SETVAL | HPET_TN_32BIT;
+                      HPET_TN_SETVAL |
+                      (hpet_legacy_use_64_bits ? 0 : HPET_TN_32BIT);
                 hpet_writel(cfg, HPET_Tn_CFG(timer));
-               hpet_writel(cmp, HPET_Tn_CMP(timer));
+               hpet_write_value(cmp, HPET_Tn_CMP(timer));
                 udelay(1);
                 /*
                  * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
@@ -333,7 +402,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
                  * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
                  * Publication # 24674)
                  */
-               hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
+               hpet_write_value((unsigned long) delta, HPET_Tn_CMP(timer));
                 hpet_start_counter();
                 hpet_print_config();
                 break;
@@ -341,7 +410,8 @@ static void hpet_set_mode(enum clock_event_mode mode,
         case CLOCK_EVT_MODE_ONESHOT:
                 cfg = hpet_readl(HPET_Tn_CFG(timer));
                 cfg &= ~HPET_TN_PERIODIC;
-               cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+               cfg |= HPET_TN_ENABLE |
+                      (hpet_legacy_use_64_bits ? 0 : HPET_TN_32BIT);
                 hpet_writel(cfg, HPET_Tn_CFG(timer));
                 break;
  
@@ -370,12 +440,12 @@ static void hpet_set_mode(enum clock_event_mode mode,
  static int hpet_next_event(unsigned long delta,
                            struct clock_event_device *evt, int timer)
  {
-       u32 cnt;
+       unsigned long cnt;
         s32 res;
  
-       cnt = hpet_readl(HPET_COUNTER);
+       cnt = hpet_read_value(HPET_COUNTER);
         cnt += (u32) delta;
-       hpet_writel(cnt, HPET_Tn_CMP(timer));
+       hpet_write_value(cnt, HPET_Tn_CMP(timer));
  
         /*
          * HPETs are a complete disaster. The compare register is
@@ -399,7 +469,7 @@ static int hpet_next_event(unsigned long delta,
          * the event. The minimum programming delta for the generic
          * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
          */
-       res = (s32)(cnt - hpet_readl(HPET_COUNTER));
+       res = (s32)((u32)cnt - (u32)hpet_readl(HPET_COUNTER));
  
         return res < HPET_MIN_CYCLES ? -ETIME : 0;
  }
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c

index 43e9ccf..c50f863 100644 (file)
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -31,6 +31,7 @@ union thread_union init_thread_union __init_task_data =
  struct task_struct init_task = INIT_TASK(init_task);
  EXPORT_SYMBOL(init_task);
  
+#ifndef CONFIG_X86_NO_TSS
  /*
   * per-CPU TSS segments. Threads are completely 'soft' on Linux,
   * no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -39,4 +40,4 @@ EXPORT_SYMBOL(init_task);
   * on exact cacheline boundaries, to eliminate cacheline ping-pong.
   */
  DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-
+#endif
diff --git a/arch/x86/kernel/ioport-xen.c b/arch/x86/kernel/ioport-xen.c

new file mode 100644 (file)

index 0000000..3cb400e
--- /dev/null
+++ b/arch/x86/kernel/ioport-xen.c
@@ -0,0 +1,84 @@
+/*
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus. 32/64 bits code unification by Miguel Botón.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/syscalls.h>
+#include <linux/bitmap.h>
+#include <asm/syscalls.h>
+#include <xen/interface/physdev.h>
+
+/*
+ * Update the current task's I/O permission bitmap for ports [from, from + num).
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+       struct thread_struct *t = &current->thread;
+       struct physdev_set_iobitmap set_iobitmap;
+       /* Reject num == 0, wrap-around, and ranges beyond IO_BITMAP_BITS. */
+       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+               return -EINVAL;
+       if (turn_on && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       /*
+        * If it's the first ioperm() call in this thread's lifetime, set the
+        * IO bitmap up. ioperm() is much less timing critical than clone(),
+        * this is why we delay this operation until now:
+        */
+       if (!t->io_bitmap_ptr) {
+               unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+
+               if (!bitmap)
+                       return -ENOMEM;
+               /* Start with every bit set; turn_on clears bits further down. */
+               memset(bitmap, 0xff, IO_BITMAP_BYTES);
+               t->io_bitmap_ptr = bitmap;
+               set_thread_flag(TIF_IO_BITMAP);
+               /* Hand the new bitmap to the hypervisor; a failure only warns. */
+               set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
+               set_iobitmap.nr_ports = IO_BITMAP_BITS;
+               WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+                                             &set_iobitmap));
+       }
+       /* turn_on clears the bits of the range, otherwise they are set. */
+       if (turn_on)
+               bitmap_clear(t->io_bitmap_ptr, from, num);
+       else
+               bitmap_set(t->io_bitmap_ptr, from, num);
+
+       return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ */
+long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+       struct thread_struct *thread = &current->thread;
+       unsigned int cur_level = thread->iopl >> 12;
+
+       if (level > 3)
+               return -EINVAL;
+
+       /* Raising the I/O privilege level requires CAP_SYS_RAWIO. */
+       if (level > cur_level && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       thread->iopl = level << 12;
+       set_iopl_mask(thread->iopl);
+
+       return 0;
+}
diff --git a/arch/x86/kernel/irq-xen.c b/arch/x86/kernel/irq-xen.c

new file mode 100644 (file)

index 0000000..b902509
--- /dev/null
+++ b/arch/x86/kernel/irq-xen.c
@@ -0,0 +1,355 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/of.h>
+#include <linux/seq_file.h>
+#include <linux/smp.h>
+#include <linux/ftrace.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/irq.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+
+#ifndef CONFIG_XEN
+atomic_t irq_err_count;
+
+/* Function pointer for generic interrupt vector handling */
+void (*x86_platform_ipi_callback)(void) = NULL;
+#endif
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       if (printk_ratelimit())
+               pr_err("unexpected IRQ trap at vector %02x\n", irq);
+
+#ifndef CONFIG_XEN
+       /*
+        * Currently unexpected vectors happen only on SMP and APIC.
+        * We _must_ ack these because every local APIC has only N
+        * irq slots per priority level, and a 'hanging, unacked' IRQ
+        * holds up an irq slot - in excessive cases (when multiple
+        * unexpected vectors occur) that might lock up the APIC
+        * completely.
+        * But only ack when the APIC is enabled -AK
+        */
+       ack_APIC_irq();
+#endif
+}
+
+#define irq_stats(x)           (&per_cpu(irq_stat, x))
+/*
+ * /proc/interrupts printing for arch specific interrupts
+ */
+int arch_show_interrupts(struct seq_file *p, int prec)
+{
+       int j;
+
+       seq_printf(p, "%*s: ", prec, "NMI");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+       seq_printf(p, "  Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+#ifndef CONFIG_XEN
+       seq_printf(p, "%*s: ", prec, "LOC");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+       seq_printf(p, "  Local timer interrupts\n");
+
+       seq_printf(p, "%*s: ", prec, "SPU");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+       seq_printf(p, "  Spurious interrupts\n");
+       seq_printf(p, "%*s: ", prec, "PMI");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+       seq_printf(p, "  Performance monitoring interrupts\n");
+#endif
+       seq_printf(p, "%*s: ", prec, "IWI");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
+       seq_printf(p, "  IRQ work interrupts\n");
+#ifndef CONFIG_XEN
+       seq_printf(p, "%*s: ", prec, "RTR");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+       seq_printf(p, "  APIC ICR read retries\n");
+#endif
+#endif
+#ifndef CONFIG_XEN
+       if (x86_platform_ipi_callback) {
+               seq_printf(p, "%*s: ", prec, "PLT");
+               for_each_online_cpu(j)
+                       seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
+               seq_printf(p, "  Platform interrupts\n");
+       }
+#endif
+#ifdef CONFIG_SMP
+       seq_printf(p, "%*s: ", prec, "RES");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+       seq_printf(p, "  Rescheduling interrupts\n");
+       seq_printf(p, "%*s: ", prec, "CAL");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+       seq_printf(p, "  Function call interrupts\n");
+#ifndef CONFIG_XEN
+       seq_printf(p, "%*s: ", prec, "TLB");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+       seq_printf(p, "  TLB shootdowns\n");
+#else
+       seq_printf(p, "%*s: ", prec, "LCK");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_lock_count);
+       seq_printf(p, "  Spinlock wakeups\n");
+#endif
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+       seq_printf(p, "%*s: ", prec, "TRM");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+       seq_printf(p, "  Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+       seq_printf(p, "%*s: ", prec, "THR");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+       seq_printf(p, "  Threshold APIC interrupts\n");
+#endif
+#ifdef CONFIG_X86_MCE
+       seq_printf(p, "%*s: ", prec, "MCE");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
+       seq_printf(p, "  Machine check exceptions\n");
+       seq_printf(p, "%*s: ", prec, "MCP");
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
+       seq_printf(p, "  Machine check polls\n");
+#endif
+#ifndef CONFIG_XEN
+       seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+       seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
+#endif
+#endif
+       return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+       u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       sum += irq_stats(cpu)->apic_timer_irqs;
+       sum += irq_stats(cpu)->irq_spurious_count;
+       sum += irq_stats(cpu)->apic_perf_irqs;
+       sum += irq_stats(cpu)->apic_irq_work_irqs;
+       sum += irq_stats(cpu)->icr_read_retry_count;
+#endif
+#ifndef CONFIG_XEN
+       if (x86_platform_ipi_callback)
+               sum += irq_stats(cpu)->x86_platform_ipis;
+#endif
+#ifdef CONFIG_SMP
+       sum += irq_stats(cpu)->irq_resched_count;
+       sum += irq_stats(cpu)->irq_call_count;
+#ifndef CONFIG_XEN
+       sum += irq_stats(cpu)->irq_tlb_count;
+#else
+       sum += irq_stats(cpu)->irq_lock_count;
+#endif
+#endif
+#ifdef CONFIG_X86_THERMAL_VECTOR
+       sum += irq_stats(cpu)->irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_MCE_THRESHOLD
+       sum += irq_stats(cpu)->irq_threshold_count;
+#endif
+#ifdef CONFIG_X86_MCE
+       sum += per_cpu(mce_exception_count, cpu);
+       sum += per_cpu(mce_poll_count, cpu);
+#endif
+       return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+       /* Totals kept in global atomics; always 0 when built for Xen. */
+       u64 total = 0;
+
+#ifndef CONFIG_XEN
+       total += atomic_read(&irq_err_count);
+#ifdef CONFIG_X86_IO_APIC
+       total += atomic_read(&irq_mis_count);
+#endif
+#endif
+       return total;
+}
+
+
+#ifndef CONFIG_XEN
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       /* high bit used in ret_from_ code  */
+       unsigned vector = ~regs->orig_ax;
+       unsigned irq;
+
+       irq_enter();
+       exit_idle();
+
+       irq = __this_cpu_read(vector_irq[vector]);
+
+       if (!handle_irq(irq, regs)) {
+               ack_APIC_irq();
+
+               if (printk_ratelimit())
+                       pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+                               __func__, smp_processor_id(), vector, irq);
+       }
+
+       irq_exit();
+
+       set_irq_regs(old_regs);
+       return 1;
+}
+
+/*
+ * Handler for X86_PLATFORM_IPI_VECTOR.
+ */
+void smp_x86_platform_ipi(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       ack_APIC_irq();
+
+       irq_enter();
+
+       exit_idle();
+
+       inc_irq_stat(x86_platform_ipis);
+
+       if (x86_platform_ipi_callback)
+               x86_platform_ipi_callback();
+
+       irq_exit();
+
+       set_irq_regs(old_regs);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <xen/evtchn.h>
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
+{
+       unsigned int irq;
+       static int warned;
+       struct irq_desc *desc;
+       struct irq_data *data;
+       struct irq_chip *chip;
+       static DECLARE_BITMAP(irqs_used, NR_IRQS);
+
+       for_each_irq_desc(irq, desc) {
+               int break_affinity = 0;
+               int set_affinity = 1;
+               const struct cpumask *affinity;
+
+               if (!desc)
+                       continue;
+               if (irq == 2)
+                       continue;
+
+               /* interrupts are disabled at this point */
+               raw_spin_lock(&desc->lock);
+
+               data = irq_desc_get_irq_data(desc);
+               affinity = data->affinity;
+               if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
+                   cpumask_subset(affinity, cpu_online_mask)) {
+                       raw_spin_unlock(&desc->lock);
+                       continue;
+               }
+
+               if (cpumask_test_cpu(smp_processor_id(), affinity))
+                       __set_bit(irq, irqs_used);
+
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+                       break_affinity = 1;
+                       affinity = cpu_all_mask;
+               }
+
+               chip = irq_data_get_irq_chip(data);
+               if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
+                       chip->irq_mask(data);
+
+               if (chip->irq_set_affinity)
+                       chip->irq_set_affinity(data, affinity, true);
+               else if (data->chip != &no_irq_chip && !(warned++))
+                       set_affinity = 0;
+
+               /*
+                * We unmask if the irq was not marked masked by the
+                * core code. That respects the lazy irq disable
+                * behaviour.
+                */
+               if (!irqd_can_move_in_process_context(data) &&
+                   !irqd_irq_masked(data) && chip->irq_unmask)
+                       chip->irq_unmask(data);
+
+               raw_spin_unlock(&desc->lock);
+
+               if (break_affinity && set_affinity)
+                       /*printk("Broke affinity for irq %i\n", irq)*/;
+               else if (!set_affinity)
+                       pr_warn("Cannot set affinity for irq %u\n", irq);
+       }
+
+       /*
+        * We can remove mdelay() and then send spurious interrupts to
+        * new cpu targets for all the irqs that were handled previously by
+        * this cpu. While it works, I have seen spurious interrupt messages
+        * (nothing wrong but still...).
+        *
+        * So for now, retain mdelay(1) and check the IRR and then send those
+        * interrupts to new targets as this cpu is already offlined...
+        */
+       mdelay(1);
+
+       for_each_irq_desc(irq, desc) {
+               if (!__test_and_clear_bit(irq, irqs_used))
+                       continue;
+
+               if (xen_test_irq_pending(irq)) {
+                       desc = irq_to_desc(irq);
+                       data = irq_desc_get_irq_data(desc);
+                       chip = irq_data_get_irq_chip(data);
+                       raw_spin_lock(&desc->lock);
+                       if (chip->irq_retrigger)
+                               chip->irq_retrigger(data);
+                       raw_spin_unlock(&desc->lock);
+               }
+       }
+}
+#endif
diff --git a/arch/x86/kernel/irq_work-xen.c b/arch/x86/kernel/irq_work-xen.c

new file mode 100644 (file)

index 0000000..851414e
--- /dev/null
+++ b/arch/x86/kernel/irq_work-xen.c
@@ -0,0 +1,21 @@
+/*
+ * x86/Xen specific code for irq_work
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq_work.h>
+#include <linux/hardirq.h>
+#include <asm/ipi.h>
+
+#ifdef CONFIG_SMP
+void smp_irq_work_interrupt(struct pt_regs *regs)
+{
+       inc_irq_stat(apic_irq_work_irqs);
+       irq_work_run();
+}
+
+void arch_irq_work_raise(void)
+{
+       xen_send_IPI_self(IRQ_WORK_VECTOR);
+}
+#endif
diff --git a/arch/x86/kernel/ldt-xen.c b/arch/x86/kernel/ldt-xen.c

new file mode 100644 (file)

index 0000000..bf4a4f7
--- /dev/null
+++ b/arch/x86/kernel/ldt-xen.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+#include <asm/syscalls.h>
+
+#ifdef CONFIG_SMP
+static void flush_ldt(void *current_mm)
+{
+       if (current->active_mm == current_mm)
+               load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+       void *oldldt, *newldt;
+       int oldsize;
+
+       if (mincount <= pc->size)
+               return 0;
+       oldsize = pc->size;
+       mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+                       (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+       if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+               newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+       else
+               newldt = (void *)__get_free_page(GFP_KERNEL);
+
+       if (!newldt)
+               return -ENOMEM;
+
+       if (oldsize)
+               memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
+       oldldt = pc->ldt;
+       memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+              (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+       /* CHECKME: Do we really need this ? */
+       wmb();
+#endif
+       pc->ldt = newldt;
+       wmb();
+       pc->size = mincount;
+       wmb();
+
+       if (reload) {
+#ifdef CONFIG_SMP
+               preempt_disable();
+#endif
+               make_pages_readonly(newldt,
+                                   (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
+                                   XENFEAT_writable_descriptor_tables);
+               load_LDT(pc);
+#ifdef CONFIG_SMP
+               if (!cpumask_equal(mm_cpumask(current->mm),
+                                  cpumask_of(smp_processor_id())))
+                       smp_call_function(flush_ldt, current->mm, 1);
+               preempt_enable();
+#endif
+       }
+       if (oldsize) {
+               make_pages_writable(oldldt,
+                                   (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+                                   XENFEAT_writable_descriptor_tables);
+               if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(oldldt);
+               else
+                       put_page(virt_to_page(oldldt));
+       }
+       return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+       int err = alloc_ldt(new, old->size, 0);
+
+       if (err < 0)
+               return err;
+       memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+       make_pages_readonly(new->ldt,
+                           (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+                           XENFEAT_writable_descriptor_tables);
+       return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct mm_struct *old_mm;
+       int retval = 0;
+
+       memset(&mm->context, 0, sizeof(mm->context));
+       mutex_init(&mm->context.lock);
+       old_mm = current->mm;
+       if (old_mm)
+               mm->context.vdso = old_mm->context.vdso;
+       if (old_mm && old_mm->context.size > 0) {
+               mutex_lock(&old_mm->context.lock);
+               retval = copy_ldt(&mm->context, &old_mm->context);
+               mutex_unlock(&old_mm->context.lock);
+       }
+       return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+       if (mm->context.size) {
+               /* CHECKME: Can this ever happen ? */
+               if (mm == current->active_mm)
+                       clear_LDT();
+               make_pages_writable(mm->context.ldt,
+                                   (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+                                   XENFEAT_writable_descriptor_tables);
+               if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(mm->context.ldt);
+               else
+                       put_page(virt_to_page(mm->context.ldt));
+               mm->context.size = 0;
+       }
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long copied;
+       unsigned long not_copied;
+
+       /* An empty LDT reads as zero bytes. */
+       if (!mm->context.size)
+               return 0;
+
+       /* Cap the request at the size of a full LDT. */
+       if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+               bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+       /* Copy the table out while holding the context lock. */
+       mutex_lock(&mm->context.lock);
+       copied = mm->context.size * LDT_ENTRY_SIZE;
+       if (copied > bytecount)
+               copied = bytecount;
+
+       not_copied = copy_to_user(ptr, mm->context.ldt, copied);
+       mutex_unlock(&mm->context.lock);
+
+       if (not_copied)
+               return -EFAULT;
+
+       /* Zero-fill whatever was requested beyond the table itself. */
+       if (copied != bytecount &&
+           clear_user(ptr + copied, bytecount - copied) != 0)
+               return -EFAULT;
+
+       return bytecount;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+       /* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+       const unsigned long max_bytes = 5 * sizeof(struct desc_struct);
+#else
+       const unsigned long max_bytes = 128;
+#endif
+       unsigned long len = bytecount > max_bytes ? max_bytes : bytecount;
+
+       if (clear_user(ptr, len))
+               return -EFAULT;
+       return len;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+       struct mm_struct *mm = current->mm;
+       struct desc_struct ldt;
+       int error;
+       struct user_desc ldt_info;
+
+       error = -EINVAL;
+       if (bytecount != sizeof(ldt_info))
+               goto out;
+       error = -EFAULT;
+       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+               goto out;
+
+       error = -EINVAL;
+       if (ldt_info.entry_number >= LDT_ENTRIES)
+               goto out;
+       if (ldt_info.contents == 3) {
+               if (oldmode)
+                       goto out;
+               if (ldt_info.seg_not_present == 0)
+                       goto out;
+       }
+
+       mutex_lock(&mm->context.lock);
+       if (ldt_info.entry_number >= mm->context.size) {
+               error = alloc_ldt(&current->mm->context,
+                                 ldt_info.entry_number + 1, 1);
+               if (error < 0)
+                       goto out_unlock;
+       }
+
+       /* Allow LDTs to be cleared by the user. */
+       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+               if (oldmode || LDT_empty(&ldt_info)) {
+                       memset(&ldt, 0, sizeof(ldt));
+                       goto install;
+               }
+       }
+
+       fill_ldt(&ldt, &ldt_info);
+       if (oldmode)
+               ldt.avl = 0;
+
+       /* Install the new entry ...  */
+install:
+       error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+
+out_unlock:
+       mutex_unlock(&mm->context.lock);
+out:
+       return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr,
+                             unsigned long bytecount)
+{
+       /*
+        * Dispatch on func; any value not listed below yields -ENOSYS.
+        */
+       switch (func) {
+       case 0:
+               return read_ldt(ptr, bytecount);
+       case 1:
+               /* write with oldmode set */
+               return write_ldt(ptr, bytecount, 1);
+       case 2:
+               return read_default_ldt(ptr, bytecount);
+       case 0x11:
+               /* write with oldmode clear */
+               return write_ldt(ptr, bytecount, 0);
+       default:
+               return -ENOSYS;
+       }
+}
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c

index 5b19e4d..9f80077 100644 (file)
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -26,47 +26,9 @@
  #include <asm/cacheflush.h>
  #include <asm/debugreg.h>
  
-static void set_idt(void *newidt, __u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* ia32 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* ia32 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
-       __asm__ __volatile__ (
-               "\tljmp $"STR(__KERNEL_CS)",$1f\n"
-               "\t1:\n"
-               "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
-               "\tmovl %%eax,%%ds\n"
-               "\tmovl %%eax,%%es\n"
-               "\tmovl %%eax,%%fs\n"
-               "\tmovl %%eax,%%gs\n"
-               "\tmovl %%eax,%%ss\n"
-               : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
+#ifdef CONFIG_XEN
+#include <xen/interface/kexec.h>
+#endif
  
  static void machine_kexec_free_page_tables(struct kimage *image)
  {
@@ -83,6 +45,17 @@ static int machine_kexec_alloc_page_tables(struct kimage *image)
  {
         image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
  #ifdef CONFIG_X86_PAE
+#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */
+       if (image->arch.pgd) {
+               struct page *pg = virt_to_page(image->arch.pgd);
+
+               if (xen_limit_pages_to_max_mfn(pg, 0, BITS_PER_LONG) < 0) {
+                       image->arch.pgd = NULL;
+                       __free_page(pg);
+                       return -ENOMEM;
+               }
+       }
+#endif
         image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
         image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
  #endif
@@ -138,6 +111,38 @@ static void machine_kexec_prepare_page_tables(struct kimage *image)
                 __pa(control_page), __pa(control_page));
  }
  
+#ifdef CONFIG_XEN
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page);
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_PGD] = __ma(image->arch.pgd);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+#include "machine_kexec_xen.c"
+
+#endif /* CONFIG_XEN */
+
  /*
   * A architecture hook called to validate the
   * proposed image and prepare the control pages
@@ -175,6 +180,7 @@ void machine_kexec_cleanup(struct kimage *image)
         machine_kexec_free_page_tables(image);
  }
  
+#ifndef CONFIG_XEN
  /*
   * Do not allocate memory (or fail in any way) in machine_kexec().
   * We are past the point of no return, committed to rebooting now.
@@ -227,24 +233,6 @@ void machine_kexec(struct kimage *image)
                 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                 << PAGE_SHIFT);
  
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
         /* now call it */
         image->start = relocate_kernel_ptr((unsigned long)image->head,
                                            (unsigned long)page_list,
@@ -258,6 +246,7 @@ void machine_kexec(struct kimage *image)
  
         __ftrace_enabled_restore(save_ftrace_enabled);
  }
+#endif
  
  void arch_crash_save_vmcoreinfo(void)
  {
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c

index b3ea9db..d8d77db 100644 (file)
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -21,6 +21,101 @@
  #include <asm/mmu_context.h>
  #include <asm/debugreg.h>
  
+#ifdef CONFIG_XEN
+
+/* In the case of Xen, override hypervisor functions to be able to create
+ * a regular identity mapping page table...
+ */
+
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+
+#define x__pmd(x) ((pmd_t) { (x) } )
+#define x__pud(x) ((pud_t) { (x) } )
+#define x__pgd(x) ((pgd_t) { (x) } )
+
+#define x_pmd_val(x)   ((x).pmd)
+#define x_pud_val(x)   ((x).pud)
+#define x_pgd_val(x)   ((x).pgd)
+
+static inline void x_set_pmd(pmd_t *dst, pmd_t val)
+{
+       x_pmd_val(*dst) = x_pmd_val(val);
+}
+
+static inline void x_set_pud(pud_t *dst, pud_t val)
+{
+       x_pud_val(*dst) = phys_to_machine(x_pud_val(val));
+}
+
+static inline void x_pud_clear (pud_t *pud)
+{
+       x_pud_val(*pud) = 0;
+}
+
+static inline void x_set_pgd(pgd_t *dst, pgd_t val)
+{
+       x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val));
+}
+
+static inline void x_pgd_clear (pgd_t * pgd)
+{
+       x_pgd_val(*pgd) = 0;
+}
+
+#define X__PAGE_KERNEL_LARGE_EXEC \
+        (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE)
+#define X_KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT)
+
+#if PAGES_NR > KEXEC_XEN_NO_PAGES
+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break
+#endif
+
+#if PA_CONTROL_PAGE != 0
+#error PA_CONTROL_PAGE is non zero - Xen support will break
+#endif
+
+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       void *control_page;
+       void *table_page;
+
+       memset(xki->page_list, 0, sizeof(xki->page_list));
+
+       control_page = page_address(image->control_code_page) + PAGE_SIZE;
+       memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+       table_page = page_address(image->control_code_page);
+
+       xki->page_list[PA_CONTROL_PAGE] = __ma(control_page);
+       xki->page_list[PA_TABLE_PAGE] = __ma(table_page);
+
+       if (image->type == KEXEC_TYPE_DEFAULT)
+               xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page);
+}
+
+#include "machine_kexec_xen.c"
+
+#else /* CONFIG_XEN */
+
+#define x__pmd(x) __pmd(x)
+#define x__pud(x) __pud(x)
+#define x__pgd(x) __pgd(x)
+
+#define x_set_pmd(x, y) set_pmd(x, y)
+#define x_set_pud(x, y) set_pud(x, y)
+#define x_set_pgd(x, y) set_pgd(x, y)
+
+#define x_pud_clear(x) pud_clear(x)
+#define x_pgd_clear(x) pgd_clear(x)
+
+#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
+#define X_KERNPG_TABLE _KERNPG_TABLE
+
+#endif /* CONFIG_XEN */
+
  static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
                                 unsigned long addr)
  {
@@ -50,7 +145,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
         }
         pmd = pmd_offset(pud, addr);
         if (!pmd_present(*pmd))
-               set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
         result = 0;
  out:
         return result;
@@ -63,7 +158,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long addr)
         addr &= PAGE_MASK;
         end_addr = addr + PUD_SIZE;
         while (addr < end_addr) {
-               set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+               x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC));
                 addr += PMD_SIZE;
         }
  }
@@ -88,12 +183,12 @@ static int init_level3_page(struct kimage *image, pud_t *level3p,
                 }
                 level2p = (pmd_t *)page_address(page);
                 init_level2_page(level2p, addr);
-               set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
+               x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE));
                 addr += PUD_SIZE;
         }
         /* clear the unused entries */
         while (addr < end_addr) {
-               pud_clear(level3p++);
+               x_pud_clear(level3p++);
                 addr += PUD_SIZE;
         }
  out:
@@ -123,12 +218,12 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
                 result = init_level3_page(image, level3p, addr, last_addr);
                 if (result)
                         goto out;
-               set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
+               x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE));
                 addr += PGDIR_SIZE;
         }
         /* clear the unused entries */
         while (addr < end_addr) {
-               pgd_clear(level4p++);
+               x_pgd_clear(level4p++);
                 addr += PGDIR_SIZE;
         }
  out:
@@ -189,8 +284,14 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
  {
         pgd_t *level4p;
         int result;
+       unsigned long x_max_pfn = max_pfn;
+
+#ifdef CONFIG_XEN
+       x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+#endif
+
         level4p = (pgd_t *)__va(start_pgtable);
-       result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
+       result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT);
         if (result)
                 return result;
         /*
@@ -203,47 +304,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
         return init_transition_pgtable(image, level4p);
  }
  
-static void set_idt(void *newidt, u16 limit)
-{
-       struct desc_ptr curidt;
-
-       /* x86-64 supports unaliged loads & stores */
-       curidt.size    = limit;
-       curidt.address = (unsigned long)newidt;
-
-       __asm__ __volatile__ (
-               "lidtq %0\n"
-               : : "m" (curidt)
-               );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
-       struct desc_ptr curgdt;
-
-       /* x86-64 supports unaligned loads & stores */
-       curgdt.size    = limit;
-       curgdt.address = (unsigned long)newgdt;
-
-       __asm__ __volatile__ (
-               "lgdtq %0\n"
-               : : "m" (curgdt)
-               );
-};
-
-static void load_segments(void)
-{
-       __asm__ __volatile__ (
-               "\tmovl %0,%%ds\n"
-               "\tmovl %0,%%es\n"
-               "\tmovl %0,%%ss\n"
-               "\tmovl %0,%%fs\n"
-               "\tmovl %0,%%gs\n"
-               : : "a" (__KERNEL_DS) : "memory"
-               );
-}
-
  int machine_kexec_prepare(struct kimage *image)
  {
         unsigned long start_pgtable;
@@ -265,6 +325,7 @@ void machine_kexec_cleanup(struct kimage *image)
         free_transition_pgtable(image);
  }
  
+#ifndef CONFIG_XEN
  /*
   * Do not allocate memory (or fail in any way) in machine_kexec().
   * We are past the point of no return, committed to rebooting now.
@@ -311,24 +372,6 @@ void machine_kexec(struct kimage *image)
                 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                 << PAGE_SHIFT);
  
-       /*
-        * The segment registers are funny things, they have both a
-        * visible and an invisible part.  Whenever the visible part is
-        * set to a specific selector, the invisible part is loaded
-        * with from a table in memory.  At no other time is the
-        * descriptor table in memory accessed.
-        *
-        * I take advantage of this here by force loading the
-        * segments, before I zap the gdt with an invalid value.
-        */
-       load_segments();
-       /*
-        * The gdt & idt are now invalid.
-        * If you want to load them you must set up your own idt & gdt.
-        */
-       set_gdt(phys_to_virt(0), 0);
-       set_idt(phys_to_virt(0), 0);
-
         /* now call it */
         image->start = relocate_kernel((unsigned long)image->head,
                                        (unsigned long)page_list,
@@ -342,10 +385,13 @@ void machine_kexec(struct kimage *image)
  
         __ftrace_enabled_restore(save_ftrace_enabled);
  }
+#endif
  
  void arch_crash_save_vmcoreinfo(void)
  {
+#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */
         VMCOREINFO_SYMBOL(phys_base);
+#endif
         VMCOREINFO_SYMBOL(init_level4_pgt);
  
  #ifdef CONFIG_NUMA
diff --git a/arch/x86/kernel/machine_kexec_xen.c b/arch/x86/kernel/machine_kexec_xen.c

new file mode 100644 (file)

index 0000000..b171ce6
--- /dev/null
+++ b/arch/x86/kernel/machine_kexec_xen.c
@@ -0,0 +1,29 @@
+/* Attach one per-CPU crash note resource, preferring the hypervisor range. */
+int machine_kexec_setup_resource(struct resource *hypervisor,
+                                struct resource *phys_cpu)
+{
+       insert_resource(hypervisor, phys_cpu);
+       /* Still parentless: the note lies outside the hypervisor range. */
+       if (phys_cpu->parent == NULL)
+               insert_resource(&iomem_resource, phys_cpu);
+       return 0;
+}
+
+/*
+ * Register the hypervisor, crash kernel and per-CPU crash note resources.
+ */
+int __init machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus)
+{
+       unsigned int remaining = nr_phys_cpus;
+
+       insert_resource(&iomem_resource, hypervisor);
+       if (crashk_res.start < crashk_res.end)
+               insert_resource(&iomem_resource, &crashk_res);
+       while (remaining--)
+               machine_kexec_setup_resource(hypervisor, phys_cpus++);
+       return xen_create_contiguous_region((unsigned long)&vmcoreinfo_note,
+                                           get_order(sizeof(vmcoreinfo_note)),
+                                           BITS_PER_LONG);
+}
diff --git a/arch/x86/kernel/microcode_core-xen.c b/arch/x86/kernel/microcode_core-xen.c

new file mode 100644 (file)

index 0000000..7effbaa
--- /dev/null
+++ b/arch/x86/kernel/microcode_core-xen.c
@@ -0,0 +1,303 @@
+/*
+ *     CPU Microcode Update Driver for Linux on Xen
+ *
+ *     Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *                   2006      Shaohua Li <shaohua.li@intel.com>
+ *
+ *     This driver allows to upgrade microcode on Intel processors
+ *     belonging to IA-32 family - PentiumPro, Pentium II,
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *     Software Developer's Manual
+ *     Order Number 253668 or free download from:
+ *
+ *     http://developer.intel.com/Assets/PDF/manual/253668.pdf
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/firmware.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cpu_device_id.h>
+
+#include <xen/pcpu.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+static int verbose;
+module_param(verbose, int, 0644);
+
+#define MICROCODE_VERSION      "2.00-xen"
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+/* Copy a microcode blob from user space and hand it to the hypervisor. */
+static int do_microcode_update(const void __user *ubuf, size_t len)
+{
+       struct xen_platform_op op;
+       void *kbuf = vmalloc(len);
+       int err = -EFAULT;
+
+       if (!kbuf)
+               return -ENOMEM;
+
+       /* An incomplete copy is reported as a fault; no hypercall is made. */
+       if (copy_from_user(kbuf, ubuf, len) != 0)
+               goto out;
+
+       op.cmd = XENPF_microcode_update;
+       set_xen_guest_handle(op.u.microcode.data, kbuf);
+       op.u.microcode.length = len;
+       err = HYPERVISOR_platform_op(&op);
+
+out:
+       vfree(kbuf);
+       return err;
+}
+
+static int microcode_open(struct inode *inode, struct file *file)
+{
+       return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+}
+
+/* write() handler: returns len on success, -EINVAL on any failure. */
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+                              size_t len, loff_t *ppos)
+{
+       ssize_t ret = -EINVAL;
+
+       /* Refuse images with more pages than totalram_pages (unsigned long). */
+       if ((len >> PAGE_SHIFT) > totalram_pages) {
+               pr_err("too much data (max %lu pages)\n", totalram_pages);
+               return ret;
+       }
+
+       mutex_lock(&microcode_mutex);
+       if (do_microcode_update(buf, len) == 0)
+               ret = (ssize_t)len;
+       mutex_unlock(&microcode_mutex);
+
+       return ret;
+}
+
+static const struct file_operations microcode_fops = {
+       .owner                  = THIS_MODULE,
+       .write                  = microcode_write,
+       .open                   = microcode_open,
+       .llseek         = no_llseek,
+};
+
+static struct miscdevice microcode_dev = {
+       .minor                  = MICROCODE_MINOR,
+       .name                   = "microcode",
+       .nodename               = "cpu/microcode",
+       .fops                   = &microcode_fops,
+};
+
+/* Register the cpu/microcode misc device; initial Xen domain only. */
+static int __init microcode_dev_init(void)
+{
+       int rc;
+
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       rc = misc_register(&microcode_dev);
+       if (rc)
+               pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+
+       /* Zero on success, otherwise the misc_register() error unchanged. */
+       return rc;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+       misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
+#else
+#define microcode_dev_init()   0
+#define microcode_dev_exit()   do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device  *microcode_pdev;
+
+/*
+ * Fetch the named firmware file through the fake platform device and pass
+ * it to the hypervisor.  Returns 0 or the first error encountered.
+ */
+static int request_microcode(const char *name)
+{
+       const struct firmware *fw;
+       struct xen_platform_op op;
+       int rc = request_firmware(&fw, name, &microcode_pdev->dev);
+
+       if (rc) {
+               pr_debug("microcode: data file %s load failed\n", name);
+               return rc;
+       }
+
+       op.cmd = XENPF_microcode_update;
+       set_xen_guest_handle(op.u.microcode.data, fw->data);
+       op.u.microcode.length = fw->size;
+       rc = HYPERVISOR_platform_op(&op);
+       release_firmware(fw);
+       if (rc)
+               pr_debug("ucode load failed\n");
+       return rc;
+}
+
+static const char amd_uc_name[] = "amd-ucode/microcode_amd.bin";
+static const char amd_uc_fmt[] = "amd-ucode/microcode_amd_fam%x.bin";
+static const char intel_uc_fmt[] = "intel-ucode/%02x-%02x-%02x";
+
+/* pCPU notifier: fetch microcode for CPUs unlike the boot CPU. */
+static int ucode_cpu_callback(struct notifier_block *nfb,
+                             unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct xen_platform_op op;
+       char buf[36];
+       const char *uc_name = buf;
+
+       if (action != CPU_ONLINE)
+               return NOTIFY_OK;
+
+       op.cmd = XENPF_get_cpu_version;
+       op.u.pcpu_version.xen_cpuid = cpu;
+       if (HYPERVISOR_platform_op(&op))
+               return NOTIFY_OK;
+       /* The boot CPU's file was already requested by microcode_init(). */
+       if (op.u.pcpu_version.family == boot_cpu_data.x86
+           && op.u.pcpu_version.model == boot_cpu_data.x86_model
+           && op.u.pcpu_version.stepping == boot_cpu_data.x86_mask)
+               return NOTIFY_OK;
+       /* Compare all 12 bytes of the vendor string; spelling matters. */
+       if (strncmp(op.u.pcpu_version.vendor_id, "GenuineIntel", 12) == 0)
+               snprintf(buf, sizeof(buf), intel_uc_fmt,
+                        op.u.pcpu_version.family, op.u.pcpu_version.model,
+                        op.u.pcpu_version.stepping);
+       else if (strncmp(op.u.pcpu_version.vendor_id,
+                        "AuthenticAMD", 12) == 0) {
+               if (op.u.pcpu_version.family >= 0x15)
+                       snprintf(buf, sizeof(buf), amd_uc_fmt,
+                                op.u.pcpu_version.family);
+               else
+                       uc_name = amd_uc_name;
+       } else
+               return NOTIFY_OK;
+
+       request_microcode(uc_name);
+       return NOTIFY_OK;
+}
+
+static struct notifier_block ucode_cpu_notifier = {
+       .notifier_call = ucode_cpu_callback
+};
+
+#ifdef MODULE
+/* Autoload on Intel and AMD systems */
+static const struct x86_cpu_id microcode_id[] = {
+#ifdef CONFIG_MICROCODE_INTEL
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+#ifdef CONFIG_MICROCODE_AMD
+       { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
+#endif
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, microcode_id);
+#endif
+
+static int __init microcode_init(void)
+{
+       const struct cpuinfo_x86 *c = &boot_cpu_data;
+       char buf[36];
+       const char *fw_name = buf;
+       int error;
+
+       if (c->x86_vendor == X86_VENDOR_INTEL)
+               snprintf(buf, sizeof(buf), intel_uc_fmt,
+                        c->x86, c->x86_model, c->x86_mask);
+       else if (c->x86_vendor == X86_VENDOR_AMD) {
+               if (c->x86 >= 0x15)
+                       snprintf(buf, sizeof(buf), amd_uc_fmt, c->x86);
+               else
+                       fw_name = amd_uc_name;
+       } else {
+               pr_err("no support for this CPU vendor\n");
+               return -ENODEV;
+       }
+       /* request_microcode() passes this device to request_firmware(). */
+       microcode_pdev = platform_device_register_simple("microcode", -1,
+                                                        NULL, 0);
+       if (IS_ERR(microcode_pdev))
+               return PTR_ERR(microcode_pdev);
+       /* Best effort: the result of the initial load is not checked. */
+       request_microcode(fw_name);
+
+       error = microcode_dev_init();
+       if (error) {
+               platform_device_unregister(microcode_pdev);
+               return error;
+       }
+
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION
+               " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+       /* A notifier failure is only logged; module init still succeeds. */
+       error = register_pcpu_notifier(&ucode_cpu_notifier);
+       if (error)
+               pr_warn("pCPU notifier registration failed (%d)\n", error);
+
+       return 0;
+}
+module_init(microcode_init);
+
+static void __exit microcode_exit(void)
+{
+       unregister_pcpu_notifier(&ucode_cpu_notifier);
+       microcode_dev_exit();
+       platform_device_unregister(microcode_pdev);
+
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+}
+module_exit(microcode_exit);
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c

index ac861b8..2c88c6a 100644 (file)
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -205,12 +205,20 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
                 return;
         }
  
+#ifndef CONFIG_XEN
         printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
         val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
              (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT));
         val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
                FAM10H_MMIO_CONF_ENABLE;
         wrmsrl(address, val);
+#else
+       if ((val & ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
+            (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)))
+           != (fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+               FAM10H_MMIO_CONF_ENABLE))
+               pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
+#endif
  }
  
  static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/mpparse-xen.c b/arch/x86/kernel/mpparse-xen.c

new file mode 100644 (file)

index 0000000..5a54f95
--- /dev/null
+++ b/arch/x86/kernel/mpparse-xen.c
@@ -0,0 +1,962 @@
+/*
+ *     Intel Multiprocessor Specification 1.1 and 1.4
+ *     compliant MP-table parsing routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ *     (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ *      (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/pci.h>
+
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
+#include <asm/setup.h>
+#include <asm/smp.h>
+
+#include <asm/apic.h>
+
+static void *_bus_to_virt(unsigned long ma)
+{
+       return is_ISA_range(ma, ma) ? isa_bus_to_virt(ma) : bus_to_virt(ma);
+}
+
+/*
+ * Checksum an MP configuration block.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int total;
+
+       /* Plain byte sum over len bytes, reduced to its low eight bits. */
+       for (total = 0; len-- != 0; mp++)
+               total += *mp;
+       return total & 0xFF;
+}
+
+#ifndef CONFIG_XEN
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+       return m->apicid;
+}
+#endif
+
+static void __init MP_processor_info(struct mpc_cpu *m)
+{
+#ifndef CONFIG_XEN
+       int apicid;
+       char *bootup_cpu = "";
+
+       if (!(m->cpuflag & CPU_ENABLED)) {
+               disabled_cpus++;
+               return;
+       }
+
+       apicid = x86_init.mpparse.mpc_apic_id(m);
+
+       if (m->cpuflag & CPU_BOOTPROCESSOR) {
+               bootup_cpu = " (Bootup-CPU)";
+               boot_cpu_physical_apicid = m->apicid;
+       }
+
+       printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+       generic_processor_info(apicid, m->apicver);
+#else /* CONFIG_XEN */
+       num_processors++;
+#endif
+}
+
+#ifdef CONFIG_X86_IO_APIC
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
+{
+       memcpy(str, m->bustype, 6);
+       str[6] = 0;
+       apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
+
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+       char str[7];
+
+       x86_init.mpparse.mpc_oem_bus_info(m, str);
+
+#if MAX_MP_BUSSES < 256
+       if (m->busid >= MAX_MP_BUSSES) {
+               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+                      " is too large, max. supported is %d\n",
+                      m->busid, str, MAX_MP_BUSSES - 1);
+               return;
+       }
+#endif
+
+       set_bit(m->busid, mp_bus_not_pci);
+       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+               mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
+#endif
+       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+               if (x86_init.mpparse.mpc_oem_pci_bus)
+                       x86_init.mpparse.mpc_oem_pci_bus(m);
+
+               clear_bit(m->busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+               mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
+       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+               mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
+       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
+               mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
+#endif
+       } else
+               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+}
+
+static void __init MP_ioapic_info(struct mpc_ioapic *m)
+{
+       if (m->flags & MPC_APIC_USABLE)
+               mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
+}
+
+static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
+{
+       apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+               mp_irq->irqtype, mp_irq->irqflag & 3,
+               (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
+               mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
+}
+
+#else /* CONFIG_X86_IO_APIC */
+static inline void __init MP_bus_info(struct mpc_bus *m) {}
+static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
+{
+       apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
+               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+               m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
+               m->srcbusirq, m->destapic, m->destapiclint);
+}
+
+/*
+ * Check the MPC header and copy out its ID strings; returns 1 if OK, else 0.
+ */
+static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
+{
+       /* oem must hold at least 9 bytes and str at least 13 (see below). */
+       if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
+               printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+                      mpc->signature[0], mpc->signature[1],
+                      mpc->signature[2], mpc->signature[3]);
+               return 0;
+       }
+       if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
+               printk(KERN_ERR "MPTABLE: checksum error!\n");
+               return 0;
+       }
+       if (mpc->spec != 0x01 && mpc->spec != 0x04) {
+               printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+                      mpc->spec);
+               return 0;
+       }
+       if (!mpc->lapic) {
+               printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+               return 0;
+       }
+       memcpy(oem, mpc->oem, 8);
+       oem[8] = 0;
+       printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+
+       memcpy(str, mpc->productid, 12);
+       str[12] = 0;
+
+       printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+
+#ifndef CONFIG_XEN
+       printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+#endif
+
+       return 1;
+}
+
+static void skip_entry(unsigned char **ptr, int *count, int size)
+{
+       *ptr += size;
+       *count += size;
+}
+
+static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
+{
+       printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
+               "type %x\n", *mpt);
+       print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+                       1, mpc, mpc->length, 1);
+}
+
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
+static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
+{
+       char str[16];
+       char oem[10];
+
+       int count = sizeof(*mpc);
+       unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+       if (!smp_check_mpc(mpc, oem, str))
+               return 0;
+
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_32
+       generic_mps_oem_check(mpc, oem, str);
+#endif
+       /* Initialize the lapic mapping */
+       if (!acpi_lapic)
+               register_lapic_address(mpc->lapic);
+#endif
+       /* An early call returns success here without parsing any entries. */
+       if (early)
+               return 1;
+
+       if (mpc->oemptr)
+               x86_init.mpparse.smp_read_mpc_oem(mpc);
+
+       /*
+        * Now walk the configuration entries; byte 0 of each entry is its type.
+        */
+       x86_init.mpparse.mpc_record(0);
+
+       while (count < mpc->length) {
+               switch (*mpt) {
+               case MP_PROCESSOR:
+                       /* ACPI may have already provided this data */
+                       if (!acpi_lapic)
+                               MP_processor_info((struct mpc_cpu *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+                       break;
+               case MP_BUS:
+                       MP_bus_info((struct mpc_bus *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+                       break;
+               case MP_IOAPIC:
+                       MP_ioapic_info((struct mpc_ioapic *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+                       break;
+               case MP_INTSRC:
+                       mp_save_irq((struct mpc_intsrc *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+                       break;
+               case MP_LINTSRC:
+                       MP_lintsrc_info((struct mpc_lintsrc *)mpt);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+                       break;
+               default:
+                       /* unknown entry type: dump the table, stop parsing */
+                       smp_dump_mptable(mpc, mpt);
+                       count = mpc->length;
+                       break;
+               }
+               x86_init.mpparse.mpc_record(1);
+       }
+       /* The processor count doubles as the result: zero means failure. */
+       if (!num_processors)
+               printk(KERN_ERR "MPTABLE: no processors registered!\n");
+       return num_processors;
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int __init ELCR_trigger(unsigned int irq)
+{
+       /* One bit per IRQ: eight IRQs per port, starting at port 0x4d0. */
+       unsigned int port = 0x4d0 + irq / 8;
+
+       return (inb(port) >> (irq % 8)) & 1;
+}
+
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+       struct mpc_intsrc intsrc;
+       int i;
+       int ELCR_fallback = 0;
+
+       intsrc.type = MP_INTSRC;
+       intsrc.irqflag = 0;     /* conforming */
+       intsrc.srcbus = 0;
+       intsrc.dstapic = mpc_ioapic_id(0);
+
+       intsrc.irqtype = mp_INT;
+
+       /*
+        *  If true, we have an ISA/PCI system with no IRQ entries
+        *  in the MP table. To prevent the PCI interrupts from being set up
+        *  incorrectly, we try to use the ELCR. The sanity check to see if
+        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+        *  never be level sensitive, so we simply see if the ELCR agrees.
+        *  If it does, we assume it's valid.
+        */
+       if (mpc_default_type == 5) {
+               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
+                      "falling back to ELCR\n");
+
+               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+                   ELCR_trigger(13))
+                       printk(KERN_ERR "ELCR contains invalid data... "
+                              "not using ELCR\n");
+               else {
+                       printk(KERN_INFO
+                              "Using ELCR to identify PCI interrupts\n");
+                       ELCR_fallback = 1;
+               }
+       }
+
+       for (i = 0; i < 16; i++) {
+               switch (mpc_default_type) {
+               case 2:
+                       if (i == 0 || i == 13)
+                               continue;       /* IRQ0 & IRQ13 not connected */
+                       /* fall through */
+               default:
+                       if (i == 2)
+                               continue;       /* IRQ2 is never connected */
+               }
+
+               if (ELCR_fallback) {
+                       /*
+                        *  If the ELCR indicates a level-sensitive interrupt, we
+                        *  copy that information over to the MP table in the
+                        *  irqflag field (level sensitive, active high polarity).
+                        */
+                       if (ELCR_trigger(i))
+                               intsrc.irqflag = 13;
+                       else
+                               intsrc.irqflag = 0;
+               }
+
+               intsrc.srcbusirq = i;
+               intsrc.dstirq = i ? i : 2;      /* IRQ0 to INTIN2 */
+               mp_save_irq(&intsrc);
+       }
+
+       intsrc.irqtype = mp_ExtINT;
+       intsrc.srcbusirq = 0;
+       intsrc.dstirq = 0;      /* 8259A to INTIN0 */
+       mp_save_irq(&intsrc);
+}
+
+
+static void __init construct_ioapic_table(int mpc_default_type)
+{
+       struct mpc_ioapic ioapic;
+       struct mpc_bus bus;
+
+       bus.type = MP_BUS;
+       bus.busid = 0;
+       switch (mpc_default_type) {
+       default:
+               printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+                      mpc_default_type);
+               /* fall through */
+       case 1:
+       case 5:
+               memcpy(bus.bustype, "ISA   ", 6);
+               break;
+       case 2:
+       case 6:
+       case 3:
+               memcpy(bus.bustype, "EISA  ", 6);
+               break;
+       case 4:
+       case 7:
+               memcpy(bus.bustype, "MCA   ", 6);
+       }
+       MP_bus_info(&bus);
+       if (mpc_default_type > 4) {
+               bus.busid = 1;
+               memcpy(bus.bustype, "PCI   ", 6);
+               MP_bus_info(&bus);
+       }
+
+       ioapic.type     = MP_IOAPIC;
+       ioapic.apicid   = 2;
+       ioapic.apicver  = mpc_default_type > 4 ? 0x10 : 0x01;
+       ioapic.flags    = MPC_APIC_USABLE;
+       ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE;
+       MP_ioapic_info(&ioapic);
+
+       /*
+        * We set up most of the low 16 IO-APIC pins according to MPS rules.
+        */
+       construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
+#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+       struct mpc_cpu processor;
+       struct mpc_lintsrc lintsrc;
+       int linttypes[2] = { mp_ExtINT, mp_NMI };
+       int i;
+
+#ifndef CONFIG_XEN
+       /*
+        * local APIC has default address
+        */
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+       /*
+        * 2 CPUs, numbered 0 & 1.
+        */
+       processor.type = MP_PROCESSOR;
+       /* Either an integrated APIC or a discrete 82489DX. */
+       processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+       processor.cpuflag = CPU_ENABLED;
+       processor.cpufeature = (boot_cpu_data.x86 << 8) |
+           (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+       processor.featureflag = boot_cpu_data.x86_capability[0];
+       processor.reserved[0] = 0;
+       processor.reserved[1] = 0;
+       for (i = 0; i < 2; i++) {
+               processor.apicid = i;
+               MP_processor_info(&processor);
+       }
+
+       construct_ioapic_table(mpc_default_type);
+
+       lintsrc.type = MP_LINTSRC;
+       lintsrc.irqflag = 0;            /* conforming */
+       lintsrc.srcbusid = 0;
+       lintsrc.srcbusirq = 0;
+       lintsrc.destapic = MP_APIC_ALL;
+       for (i = 0; i < 2; i++) {
+               lintsrc.irqtype = linttypes[i];
+               lintsrc.destapiclint = i;
+               MP_lintsrc_info(&lintsrc);
+       }
+}
+
+static struct mpf_intel *mpf_found;
+
+/* Map the MPC header just long enough to read its length field. */
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+       struct mpc_table *hdr = early_ioremap(physptr, PAGE_SIZE);
+       unsigned long len;
+
+       len = hdr->length;
+       early_iounmap(hdr, PAGE_SIZE);
+       apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + len);
+
+       return len;
+}
+
+static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
+{
+       struct mpc_table *mpc;
+       unsigned long size;
+
+       size = get_mpc_size(mpf->physptr);
+       mpc = early_ioremap(mpf->physptr, size);
+       /*
+        * Read the physical hardware table.  Anything here will
+        * override the defaults.
+        */
+       if (!smp_read_mpc(mpc, early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
+               smp_found_config = 0;
+#endif
+               printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
+                       "... disabling SMP support. (tell your hw vendor)\n");
+               early_iounmap(mpc, size);
+               return -1;
+       }
+       early_iounmap(mpc, size);
+
+       if (early)
+               return -1;
+
+#ifdef CONFIG_X86_IO_APIC
+       /*
+        * If there are no explicit MP IRQ entries, then we are
+        * broken.  We set up most of the low 16 IO-APIC pins to
+        * ISA defaults and hope it will work.
+        */
+       if (!mp_irq_entries) {
+               struct mpc_bus bus;
+
+               printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+                      "using default mptable. (tell your hw vendor)\n");
+
+               bus.type = MP_BUS;
+               bus.busid = 0;
+               memcpy(bus.bustype, "ISA   ", 6);
+               MP_bus_info(&bus);
+
+               construct_default_ioirq_mptable(0);
+       }
+#endif
+
+       return 0;
+}
+
+/*
+ * Parse the MP configuration that mpf_found points to, if one was found.
+ */
+void __init default_get_smp_config(unsigned int early)
+{
+       struct mpf_intel *mpf = mpf_found;
+
+       if (!mpf)
+               return;
+       /* Under Xen an early call is a bug; 'early' is then pinned to 0. */
+#ifdef CONFIG_XEN
+       BUG_ON(early);
+#define early 0
+#endif
+
+       if (acpi_lapic && early)
+               return;
+
+       /*
+        * MPS doesn't support hyperthreading, aka only have
+        * thread 0 apic id in MPS table
+        */
+       if (acpi_lapic && acpi_ioapic)
+               return;
+
+       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+              mpf->specification);
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+       if (mpf->feature2 & (1 << 7)) {
+               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+               pic_mode = 1;
+       } else {
+               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+               pic_mode = 0;
+       }
+#endif
+       /*
+        * Now see if we need to read further.
+        */
+       if (mpf->feature1 != 0) {
+#ifndef CONFIG_XEN
+               if (early) {
+                       /*
+                        * local APIC has default address
+                        */
+                       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+                       return;
+               }
+#endif
+
+               printk(KERN_INFO "Default MP configuration #%d\n",
+                      mpf->feature1);
+               construct_default_ISA_mptable(mpf->feature1);
+
+       } else if (mpf->physptr) {
+               if (check_physptr(mpf, early))
+                       return;
+       } else
+               BUG();
+
+       if (!early)
+               printk(KERN_INFO "Processors: %d\n", num_processors);
+       /*
+        * Only use the first configuration found.
+        */
+#undef early
+}
+
+#ifndef CONFIG_XEN
+static void __init smp_reserve_memory(struct mpf_intel *mpf)
+{ /* Reserve get_mpc_size() bytes starting at mpf->physptr in memblock. */
+       memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
+}
+#endif /* !CONFIG_XEN */
+
+static int __init smp_scan_config(unsigned long base, unsigned long length)
+{ /* Search 'length' bytes at bus address 'base' for a struct mpf_intel; 1 if found, else 0. */
+       unsigned int *bp = _bus_to_virt(base);
+       struct mpf_intel *mpf;
+#ifndef CONFIG_XEN
+       unsigned long mem;
+#endif
+
+       apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+                       bp, length);
+       BUILD_BUG_ON(sizeof(*mpf) != 16); /* the loop below steps in 16-byte units */
+
+       while (length > 0) { /* callers in this file pass multiples of 16 */
+               mpf = (struct mpf_intel *)bp;
+               if ((*bp == SMP_MAGIC_IDENT) &&
+                   (mpf->length == 1) &&
+                   !mpf_checksum((unsigned char *)bp, 16) &&
+                   ((mpf->specification == 1)
+                    || (mpf->specification == 4))) {
+#ifdef CONFIG_X86_LOCAL_APIC
+                       smp_found_config = 1;
+#endif
+                       mpf_found = mpf; /* remembered for default_get_smp_config() */
+
+#ifndef CONFIG_XEN
+                       printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
+                              mpf, (u64)virt_to_phys(mpf));
+
+                       mem = virt_to_phys(mpf);
+                       memblock_reserve(mem, sizeof(*mpf));
+                       if (mpf->physptr)
+                               smp_reserve_memory(mpf);
+#else /* CONFIG_XEN: only report the address, nothing is reserved here */
+                       printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
+                              mpf, ((void *)bp - _bus_to_virt(base)) + base);
+#endif
+                       return 1; /* stop at the first candidate that passes all checks */
+               }
+               bp += 4; /* four unsigned ints, paired with the 16 taken off length */
+               length -= 16;
+       }
+       return 0;
+}
+
+void __init default_find_smp_config(void)
+{ /* Try the fixed low-memory regions, then (native only) the EBDA. */
+#ifndef CONFIG_XEN
+       unsigned int address;
+#endif
+
+       /*
+        * FIXME: Linux assumes you have 640K of base ram..
+        * this continues the error...
+        *
+        * 1) Scan the bottom 1K for a signature
+        * 2) Scan the top 1K of base RAM
+        * 3) Scan the 64K of bios
+        */
+       if (smp_scan_config(0x0, 0x400) ||
+           smp_scan_config(639 * 0x400, 0x400) ||
+           smp_scan_config(0xF0000, 0x10000))
+               return; /* found in one of the three fixed regions */
+       /*
+        * If it is an SMP machine we should know now, unless the
+        * configuration is in an EISA/MCA bus machine with an
+        * extended bios data area.
+        *
+        * there is a real-mode segmented pointer pointing to the
+        * 4K EBDA area at 0x40E, calculate and scan it here.
+        *
+        * NOTE! There are Linux loaders that will corrupt the EBDA
+        * area, and as such this kind of SMP config may be less
+        * trustworthy, simply because the SMP table may have been
+        * stomped on during early boot. These loaders are buggy and
+        * should be fixed.
+        *
+        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+        */
+
+#ifndef CONFIG_XEN /* the EBDA scan is compiled out of the Xen build */
+       address = get_bios_ebda();
+       if (address)
+               smp_scan_config(address, 0x400);
+#endif
+}
+
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES]; /* 1 once mp_irqs[i] has been matched */
+
+static int  __init get_MP_intsrc_index(struct mpc_intsrc *m)
+{ /* Returns 0 if m is skipped, the matching mp_irqs[] index, -1 (none) or -2 (already used). */
+       int i;
+
+       if (m->irqtype != mp_INT)
+               return 0;
+
+       if (m->irqflag != 0x0f)
+               return 0;
+
+       /* not legacy: find the mp_irqs[] entry with the same source bus and bus irq */
+
+       for (i = 0; i < mp_irq_entries; i++) {
+               if (mp_irqs[i].irqtype != mp_INT)
+                       continue;
+
+               if (mp_irqs[i].irqflag != 0x0f)
+                       continue;
+
+               if (mp_irqs[i].srcbus != m->srcbus)
+                       continue;
+               if (mp_irqs[i].srcbusirq != m->srcbusirq)
+                       continue;
+               if (irq_used[i]) {
+                       /* already claimed */
+                       return -2;
+               }
+               irq_used[i] = 1;
+               return i; /* NOTE: i == 0 is indistinguishable from the "return 0" skips above */
+       }
+
+       /* not found */
+       return -1;
+}
+
+#define SPARE_SLOT_NUM 20 /* capacity of m_spare[] */
+
+static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; /* table slots free for reuse */
+
+static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
+{ /* Refresh table entry m from mp_irqs[], or record it in m_spare[] if it has no usable match. */
+       int i;
+
+       apic_printk(APIC_VERBOSE, "OLD ");
+       print_mp_irq_info(m);
+
+       i = get_MP_intsrc_index(m);
+       if (i > 0) { /* matched: overwrite the table entry with the mp_irqs[] copy */
+               memcpy(m, &mp_irqs[i], sizeof(*m));
+               apic_printk(APIC_VERBOSE, "NEW ");
+               print_mp_irq_info(&mp_irqs[i]);
+               return;
+       }
+       if (!i) {
+               /* legacy, do nothing */
+               return;
+       }
+       if (*nr_m_spare < SPARE_SLOT_NUM) { /* further entries are dropped once m_spare[] is full */
+               /*
+                * not found (-1), or duplicated (-2) are invalid entries,
+                * we need to use the slot later
+                */
+               m_spare[*nr_m_spare] = m;
+               *nr_m_spare += 1;
+       }
+}
+
+static int __init
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
+{ /* 0 if a table of 'count' bytes fits the newly allocated buffer, else WARN and -1. */
+       if (!mpc_new_phys || count > mpc_new_length) { /* no new buffer, or it is too small */
+               WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+               return -1;
+       }
+
+       return 0;
+}
+#else /* CONFIG_X86_IO_APIC */
+static /* empty stand-in used when CONFIG_X86_IO_APIC is not set */
+inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
+#endif /* CONFIG_X86_IO_APIC */
+
+static int  __init replace_intsrc_all(struct mpc_table *mpc, /* table to patch */
+                                       unsigned long mpc_new_phys, /* 0: no new buffer */
+                                       unsigned long mpc_new_length) /* size of that buffer */
+{ /* Sync the table's MP_INTSRC entries with mp_irqs[] and redo its checksum; always returns 0. */
+#ifdef CONFIG_X86_IO_APIC
+       int i;
+#endif
+       int count = sizeof(*mpc); /* bytes walked so far, header included */
+       int nr_m_spare = 0;
+       unsigned char *mpt = ((unsigned char *)mpc) + count; /* first entry after the header */
+
+       printk(KERN_INFO "mpc_length %x\n", mpc->length);
+       while (count < mpc->length) { /* the first byte of each entry selects its type */
+               switch (*mpt) {
+               case MP_PROCESSOR:
+                       skip_entry(&mpt, &count, sizeof(struct mpc_cpu));
+                       break;
+               case MP_BUS:
+                       skip_entry(&mpt, &count, sizeof(struct mpc_bus));
+                       break;
+               case MP_IOAPIC:
+                       skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
+                       break;
+               case MP_INTSRC:
+                       check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare);
+                       skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
+                       break;
+               case MP_LINTSRC:
+                       skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc));
+                       break;
+               default:
+                       /* wrong mptable */
+                       smp_dump_mptable(mpc, mpt);
+                       goto out;
+               }
+       }
+
+#ifdef CONFIG_X86_IO_APIC
+       for (i = 0; i < mp_irq_entries; i++) { /* place mp_irqs[] entries not matched above */
+               if (irq_used[i])
+                       continue;
+
+               if (mp_irqs[i].irqtype != mp_INT)
+                       continue;
+
+               if (mp_irqs[i].irqflag != 0x0f)
+                       continue;
+
+               if (nr_m_spare > 0) { /* reuse a slot recorded by check_irq_src() */
+                       apic_printk(APIC_VERBOSE, "*NEW* found\n");
+                       nr_m_spare--;
+                       memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
+                       m_spare[nr_m_spare] = NULL;
+               } else { /* no spare slot left: append at the end of the table */
+                       struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
+                       count += sizeof(struct mpc_intsrc);
+                       if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
+                               goto out;
+                       memcpy(m, &mp_irqs[i], sizeof(*m));
+                       mpc->length = count; /* the table grew by the appended entry */
+                       mpt += sizeof(struct mpc_intsrc);
+               }
+               print_mp_irq_info(&mp_irqs[i]);
+       }
+#endif
+out:
+       /* update checksum */
+       mpc->checksum = 0;
+       mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length);
+
+       return 0;
+}
+
+int enable_update_mptable; /* set by "update_mptable" and by "alloc_mptable" */
+
+static int __init update_mptable_setup(char *str)
+{ /* "update_mptable" handler; 'str' is not used. */
+       enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
+       return 0;
+}
+early_param("update_mptable", update_mptable_setup);
+
+static unsigned long __initdata mpc_new_phys; /* 0 unless early_reserve_e820_mpc_new() sets it */
+static unsigned long mpc_new_length __initdata = 4096; /* default; "alloc_mptable=<size>" overrides */
+
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{ /* Implies everything "update_mptable" does and also requests a new table buffer. */
+       enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
+       alloc_mptable = 1;
+       if (!p) /* no "=<size>" given: keep the default length */
+               return 0;
+       mpc_new_length = memparse(p, &p);
+       return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
+
+void __init early_reserve_e820_mpc_new(void)
+{ /* Reserve mpc_new_length bytes for the replacement table when "alloc_mptable" was given. */
+       if (enable_update_mptable && alloc_mptable)
+               mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
+}
+
+static int __init update_mp_table(void)
+{ /* Late initcall: rewrite the MP table's interrupt entries, in place or in the new buffer. */
+       char str[16];
+       char oem[10];
+       struct mpf_intel *mpf;
+       struct mpc_table *mpc, *mpc_new;
+
+       if (!enable_update_mptable)
+               return 0;
+
+       mpf = mpf_found;
+       if (!mpf)
+               return 0;
+
+       /*
+        * Now see if we need to go further.
+        */
+       if (mpf->feature1 != 0) /* a default configuration is in use, no table to patch */
+               return 0;
+
+       if (!mpf->physptr)
+               return 0;
+
+       mpc = _bus_to_virt(mpf->physptr);
+
+       if (!smp_check_mpc(mpc, oem, str))
+               return 0;
+
+       printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf));
+       printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+
+       if (mpc_new_phys && mpc->length > mpc_new_length) { /* buffer too small: fall back to in-place */
+               mpc_new_phys = 0;
+               printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+                        mpc_new_length);
+       }
+
+       if (!mpc_new_phys) {
+               unsigned char old, new;
+               /* check that the table is writable: the sum must follow the checksum byte */
+               mpc->checksum = 0;
+               old = mpf_checksum((unsigned char *)mpc, mpc->length);
+               mpc->checksum = 0xff;
+               new = mpf_checksum((unsigned char *)mpc, mpc->length);
+               if (old == new) { /* the two stores above had no effect */
+                       printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+                       return 0;
+               }
+               printk(KERN_INFO "use in-position replacing\n");
+       } else {
+               maddr_t mpc_new_bus;
+
+               mpc_new_bus = phys_to_machine(mpc_new_phys);
+               mpf->physptr = mpc_new_bus;
+               mpc_new = phys_to_virt(mpc_new_phys);
+               memcpy(mpc_new, mpc, mpc->length);
+               mpc = mpc_new; /* all further edits go to the copy */
+               /* check if we can modify that */
+               if (mpc_new_bus - mpf->physptr) { /* physptr does not read back as written */
+                       struct mpf_intel *mpf_new;
+                       /* steal 16 bytes from [0, 1k) */
+                       printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+                       mpf_new = isa_bus_to_virt(0x400 - 16);
+                       memcpy(mpf_new, mpf, 16);
+                       mpf = mpf_new;
+                       mpf->physptr = mpc_new_bus;
+               }
+               mpf->checksum = 0;
+               mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
+               printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+       }
+
+       /*
+        * only replace the one with mp_INT and
+        *       MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+        * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+        * may need pci=routeirq for all coverage
+        */
+       replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+       return 0;
+}
+
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/msr-xen.c b/arch/x86/kernel/msr-xen.c

new file mode 100644 (file)

index 0000000..900a894
--- /dev/null
+++ b/arch/x86/kernel/msr-xen.c
@@ -0,0 +1,337 @@
+#ifndef CONFIG_XEN_PRIVILEGED_GUEST
+#include "msr.c"
+#else
+/* ----------------------------------------------------------------------- *
+ *
+ *   Copyright 2010 Novell, Inc.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
+ *   USA; either version 2 of the License, or (at your option) any later
+ *   version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * x86 MSR access device
+ *
+ * This device is accessed by lseek() to the appropriate register number
+ * and then read/write in chunks of 8 bytes.  A larger size means multiple
+ * reads or writes of the same register.
+ *
+ * This driver uses /dev/xen/cpu/%d/msr where %d correlates to the minor
+ * number, and on an SMP box will direct the access to pCPU %d.
+ */
+
+static int msr_init(void); /* the Xen-aware versions defined at the end of this file */
+static void msr_exit(void);
+
+#define msr_init(args...) _msr_init(args) /* msr.c's own init/exit get a leading underscore */
+#define msr_exit(args...) _msr_exit(args)
+#include "msr.c"
+#undef msr_exit
+#undef msr_init
+
+#include <linux/slab.h>
+#include <xen/pcpu.h>
+
+static struct class *pmsr_class;
+static unsigned int minor_bias = 10; /* first minor used for pmsr devices; raised in msr_init() */
+static unsigned int nr_xen_cpu_ids; /* CPU ids the bitmap below can hold; 0 = pmsr not set up */
+static unsigned long *xen_cpu_online_map; /* bit set while that CPU's pmsr device exists */
+
+#define PMSR_DEV(cpu) MKDEV(MSR_MAJOR, (cpu) + minor_bias)
+
+static unsigned int pmsr_minor(struct inode *inode)
+{ /* Inverse of PMSR_DEV(): CPU number encoded in the inode's minor. */
+       return iminor(inode) - minor_bias;
+}
+
+static ssize_t pmsr_read(struct file *file, char __user *buf,
+                        size_t count, loff_t *ppos)
+{ /* Read MSR number *ppos on the file's CPU, once per 8 bytes requested. */
+       u32 __user *tmp = (u32 __user *) buf;
+       u32 data[2];
+       u32 reg = *ppos; /* the file offset selects the register */
+       unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
+       int err = 0;
+       ssize_t bytes = 0;
+
+       if (count % 8)
+               return -EINVAL; /* Invalid chunk size */
+
+       for (; count; count -= 8) { /* 'reg' never changes: the same MSR is read each time */
+               err = rdmsr_safe_on_pcpu(cpu, reg, &data[0], &data[1]);
+               if (err)
+                       break;
+               if (copy_to_user(tmp, &data, 8)) {
+                       err = -EFAULT;
+                       break;
+               }
+               tmp += 2; /* two u32s, i.e. the 8 bytes just copied */
+               bytes += 8;
+       }
+
+       return bytes ? bytes : err; /* a partial transfer is reported instead of the error */
+}
+
+static ssize_t pmsr_write(struct file *file, const char __user *buf,
+                         size_t count, loff_t *ppos)
+{ /* Write each 8-byte chunk of buf to MSR number *ppos on the file's CPU. */
+       const u32 __user *tmp = (const u32 __user *)buf;
+       u32 data[2];
+       u32 reg = *ppos; /* the file offset selects the register */
+       unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
+       int err = 0;
+       ssize_t bytes = 0;
+
+       if (count % 8)
+               return -EINVAL; /* Invalid chunk size */
+
+       for (; count; count -= 8) { /* 'reg' never changes: the same MSR is written each time */
+               if (copy_from_user(&data, tmp, 8)) {
+                       err = -EFAULT;
+                       break;
+               }
+               err = wrmsr_safe_on_pcpu(cpu, reg, data[0], data[1]);
+               if (err)
+                       break;
+               tmp += 2; /* two u32s, i.e. the 8 bytes just consumed */
+               bytes += 8;
+       }
+
+       return bytes ? bytes : err; /* a partial transfer is reported instead of the error */
+}
+
+static long pmsr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{ /* X86_IOC_{RD,WR}MSR_REGS on the file's CPU; 'arg' is a user pointer to u32[8]. */
+       u32 __user *uregs = (u32 __user *)arg;
+       u32 regs[8];
+       unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
+       int err;
+
+       switch (ioc) {
+       case X86_IOC_RDMSR_REGS:
+               if (!(file->f_mode & FMODE_READ)) { /* needs a descriptor opened for reading */
+                       err = -EBADF;
+                       break;
+               }
+               if (copy_from_user(&regs, uregs, sizeof regs)) {
+                       err = -EFAULT;
+                       break;
+               }
+               err = rdmsr_safe_regs_on_pcpu(cpu, regs);
+               if (err)
+                       break;
+               if (copy_to_user(uregs, &regs, sizeof regs)) /* hand the register block back */
+                       err = -EFAULT;
+               break;
+
+       case X86_IOC_WRMSR_REGS:
+               if (!(file->f_mode & FMODE_WRITE)) { /* needs a descriptor opened for writing */
+                       err = -EBADF;
+                       break;
+               }
+               if (copy_from_user(&regs, uregs, sizeof regs)) {
+                       err = -EFAULT;
+                       break;
+               }
+               err = wrmsr_safe_regs_on_pcpu(cpu, regs);
+               if (err)
+                       break;
+               if (copy_to_user(uregs, &regs, sizeof regs)) /* copied back on the write path too */
+                       err = -EFAULT;
+               break;
+
+       default:
+               err = -ENOTTY;
+               break;
+       }
+
+       return err;
+}
+
+static int pmsr_open(struct inode *inode, struct file *file)
+{ /* Allow the open only while the CPU's bit is set in xen_cpu_online_map. */
+       unsigned int cpu;
+
+       cpu = pmsr_minor(file->f_path.dentry->d_inode); /* the 'inode' argument itself is not used */
+       if (cpu >= nr_xen_cpu_ids || !test_bit(cpu, xen_cpu_online_map))
+               return -ENXIO;  /* No such CPU */
+
+       return 0;
+}
+
+/*
+ * File operations for the physical-CPU MSR devices
+ */
+static const struct file_operations pmsr_fops = {
+       .owner = THIS_MODULE,
+       .llseek = msr_seek, /* comes from the included msr.c, not defined in this file */
+       .read = pmsr_read,
+       .write = pmsr_write,
+       .open = pmsr_open,
+       .unlocked_ioctl = pmsr_ioctl,
+       .compat_ioctl = pmsr_ioctl, /* same handler for native and compat callers */
+};
+
+static int pmsr_device_create(unsigned int cpu)
+{ /* Mark 'cpu' in the bitmap (growing it if needed) and create its "pmsr%d" device. */
+       struct device *dev;
+
+       if (cpu >= nr_xen_cpu_ids) { /* bitmap too small for this CPU: grow it */
+               static bool warned; /* the warnings below are printed at most once in total */
+               unsigned long *map;
+
+               if ((minor_bias + cpu) >> MINORBITS) { /* minor would not fit in MINORBITS */
+                       if (!warned) {
+                               warned = true;
+                               pr_warn("Physical MSRs of CPUs beyond %u"
+                                       " will not be accessible\n",
+                                       MINORMASK - minor_bias);
+                       }
+                       return -EDOM;
+               }
+
+               map = kcalloc(BITS_TO_LONGS(cpu + 1), sizeof(*map),
+                             GFP_KERNEL);
+               if (!map) {
+                       if (!warned) {
+                               warned = true;
+                               pr_warn("Physical MSRs of CPUs beyond %u"
+                                       " may not be accessible\n",
+                                       nr_xen_cpu_ids - 1);
+                       }
+                       return -ENOMEM;
+               }
+
+               memcpy(map, xen_cpu_online_map, /* carry over the bits of the old map */
+                      BITS_TO_LONGS(nr_xen_cpu_ids)
+                      * sizeof(*xen_cpu_online_map));
+               nr_xen_cpu_ids = min_t(unsigned int,
+                                    BITS_TO_LONGS(cpu + 1) * BITS_PER_LONG,
+                                    MINORMASK + 1 - minor_bias);
+               kfree(xchg(&xen_cpu_online_map, map)); /* install the new map, free the old one */
+       }
+       set_bit(cpu, xen_cpu_online_map); /* NOTE(review): stays set even if device_create() fails */
+       dev = device_create(pmsr_class, NULL, PMSR_DEV(cpu), NULL,
+                           "pmsr%d", cpu);
+       return IS_ERR(dev) ? PTR_ERR(dev) : 0;
+}
+
+static void pmsr_device_destroy(unsigned int cpu)
+{ /* Undo pmsr_device_create(): clear the bitmap bit first, then remove the device. */
+       clear_bit(cpu, xen_cpu_online_map);
+       device_destroy(pmsr_class, PMSR_DEV(cpu));
+}
+
+static int pmsr_cpu_callback(struct notifier_block *nfb,
+                            unsigned long action, void *hcpu)
+{ /* Notifier: create the device on CPU_ONLINE, remove it on CPU_DEAD. */
+       unsigned int cpu = (unsigned long)hcpu; /* the CPU number is passed as the pointer value */
+
+       switch (action) {
+       case CPU_ONLINE:
+               pmsr_device_create(cpu); /* a failure here is not reported to the notifier chain */
+               break;
+       case CPU_DEAD:
+               pmsr_device_destroy(cpu);
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block pmsr_cpu_notifier = {
+       .notifier_call = pmsr_cpu_callback,
+};
+
+static char *pmsr_devnode(struct device *dev, umode_t *mode)
+{ /* Node name "xen/cpu/<cpu>/msr" in a kasprintf() allocation; 'mode' is left untouched. */
+       return kasprintf(GFP_KERNEL, "xen/cpu/%u/msr",
+                        MINOR(dev->devt) - minor_bias);
+}
+
+static int __init msr_init(void)
+{ /* Run msr.c's init, then (initial domain only) set up the physical-CPU MSR devices. */
+       int err;
+       xen_platform_op_t op;
+
+       err = _msr_init(); /* msr.c's original msr_init(), renamed by the macros above */
+       if (err || !is_initial_xendomain())
+               return err;
+
+       op.cmd = XENPF_get_cpuinfo;
+       op.u.pcpu_info.xen_cpuid = 0;
+       do {
+               err = HYPERVISOR_platform_op(&op);
+       } while (err == -EBUSY); /* retry for as long as the call reports -EBUSY */
+       if (err)
+               goto out;
+       nr_xen_cpu_ids = BITS_TO_LONGS(op.u.pcpu_info.max_present + 1)
+                        * BITS_PER_LONG; /* rounded up to a whole number of longs */
+
+       while (minor_bias < NR_CPUS) /* starts at 10: first power of ten not below NR_CPUS */
+               minor_bias *= 10;
+       if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) /* range does not fit: use NR_CPUS */
+               minor_bias = NR_CPUS;
+       if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) /* still too large: clamp the count */
+               nr_xen_cpu_ids = MINORMASK + 1 - NR_CPUS;
+
+       xen_cpu_online_map = kcalloc(BITS_TO_LONGS(nr_xen_cpu_ids),
+                                    sizeof(*xen_cpu_online_map),
+                                    GFP_KERNEL);
+       if (!xen_cpu_online_map) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       if (__register_chrdev(MSR_MAJOR, minor_bias,
+                             MINORMASK + 1 - minor_bias,
+                             "pcpu/msr", &pmsr_fops)) {
+               pr_err("msr: unable to get minors for pmsr\n");
+               goto out; /* err is still 0 here, so out: prints no second message */
+       }
+       pmsr_class = class_create(THIS_MODULE, "pmsr");
+       if (IS_ERR(pmsr_class)) {
+               err = PTR_ERR(pmsr_class);
+               goto out_chrdev;
+       }
+       pmsr_class->devnode = pmsr_devnode;
+       err = register_pcpu_notifier(&pmsr_cpu_notifier);
+
+       if (!err && !nr_xen_cpu_ids)
+               err = -ENODEV; /* NOTE(review): the notifier stays registered on this path */
+       if (!err)
+               return 0;
+
+       class_destroy(pmsr_class);
+
+out_chrdev:
+       __unregister_chrdev(MSR_MAJOR, minor_bias,
+                           MINORMASK + 1 - minor_bias, "pcpu/msr");
+out:
+       if (err)
+               pr_warn("msr: can't initialize physical MSR access (%d)\n",
+                       err);
+       nr_xen_cpu_ids = 0; /* msr_exit() keys its pmsr teardown off this */
+       kfree(xen_cpu_online_map);
+       return 0; /* _msr_init() already succeeded, so a pmsr failure is not fatal */
+}
+
+static void __exit msr_exit(void)
+{ /* Tear down the pmsr devices (if any were set up), then run msr.c's own exit. */
+       if (nr_xen_cpu_ids) { /* zeroed by msr_init() whenever the pmsr setup failed */
+               unsigned int cpu = 0;
+
+               unregister_pcpu_notifier(&pmsr_cpu_notifier);
+               for_each_set_bit(cpu, xen_cpu_online_map, nr_xen_cpu_ids)
+                       pmsr_device_destroy(cpu);
+               class_destroy(pmsr_class);
+               __unregister_chrdev(MSR_MAJOR, minor_bias,
+                                   MINORMASK + 1 - minor_bias, "pcpu/msr");
+               kfree(xen_cpu_online_map);
+       }
+       _msr_exit(); /* msr.c's original msr_exit(), renamed by the macros above */
+}
+#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c

index 47acaf3..0509542 100644 (file)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -232,15 +232,12 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
         pr_emerg("Dazed and confused, but trying to continue\n");
  
         /* Clear and disable the PCI SERR error line. */
-       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
-       outb(reason, NMI_REASON_PORT);
+       clear_serr_error(reason);
  }
  
  static notrace __kprobes void
  io_check_error(unsigned char reason, struct pt_regs *regs)
  {
-       unsigned long i;
-
         pr_emerg(
         "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
                  reason, smp_processor_id());
@@ -250,17 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
                 panic("NMI IOCK error: Not continuing");
  
         /* Re-enable the IOCK line, wait for a few seconds */
-       reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
-       outb(reason, NMI_REASON_PORT);
-
-       i = 20000;
-       while (--i) {
-               touch_nmi_watchdog();
-               udelay(100);
-       }
-
-       reason &= ~NMI_REASON_CLEAR_IOCHK;
-       outb(reason, NMI_REASON_PORT);
+       clear_io_check_error(reason);
  }
  
  static notrace __kprobes void
diff --git a/arch/x86/kernel/pci-dma-xen.c b/arch/x86/kernel/pci-dma-xen.c

new file mode 100644 (file)

index 0000000..04a26d3
--- /dev/null
+++ b/arch/x86/kernel/pci-dma-xen.c
@@ -0,0 +1,365 @@
+#include <linux/dma-mapping.h>
+#include <linux/dma-debug.h>
+#include <linux/export.h>
+#include <linux/bootmem.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+#include <linux/kmemleak.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static int forbid_dac __read_mostly; /* iommu=allowdac: 0, nodac: 1, usedac: -1 */
+
+struct dma_map_ops *dma_ops = &nommu_dma_ops; /* switched to swiotlb_dma_ops in pci_xen_swiotlb_init() */
+EXPORT_SYMBOL(dma_ops);
+
+static int iommu_sac_force __read_mostly; /* set by iommu=forcesac */
+
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __initdata = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __initdata = 0;
+#endif
+
+int iommu_merge __initdata; /* iommu_setup() sets it to 1 before parsing any option */
+
+int no_iommu __initdata; /* set by iommu=off */
+#ifndef CONFIG_XEN
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
+/*
+ * This variable becomes 1 if iommu=pt is passed on the kernel command line.
+ * If this variable is 1, IOMMU implementations do no DMA translation for
+ * devices and allow every device to access to whole physical memory. This is
+ * useful if a user wants to use an IOMMU only for KVM device assignment to
+ * guests and not for driver dma translation.
+ */
+int iommu_pass_through __read_mostly;
+
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface.  This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device.  Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+#endif
+
+extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
+
+/* Dummy device used for NULL arguments (normally ISA). */
+struct device x86_dma_fallback_dev = {
+       .init_name = "fallback device",
+       .coherent_dma_mask = ISA_DMA_BIT_MASK,
+       .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, /* both masks share one variable */
+};
+EXPORT_SYMBOL(x86_dma_fallback_dev);
+
+/* Number of entries preallocated for DMA-API debugging */
+#define PREALLOC_DMA_DEBUG_ENTRIES       32768
+
+int dma_set_mask(struct device *dev, u64 mask)
+{ /* Store 'mask' in *dev->dma_mask; -EIO if dev has no mask or dma_supported() rejects it. */
+       if (!dev->dma_mask || !dma_supported(dev, mask))
+               return -EIO;
+
+       *dev->dma_mask = mask;
+
+       return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+static struct dma_map_ops swiotlb_dma_ops = { /* installed by pci_xen_swiotlb_init() */
+       .alloc = dma_generic_alloc_coherent, /* alloc/free are the routines defined in this file */
+       .free = dma_generic_free_coherent,
+       .mapping_error = swiotlb_dma_mapping_error,
+       .map_page = swiotlb_map_page,
+       .unmap_page = swiotlb_unmap_page,
+       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+       .sync_single_for_device = swiotlb_sync_single_for_device,
+       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+       .sync_sg_for_device = swiotlb_sync_sg_for_device,
+       .map_sg = swiotlb_map_sg_attrs,
+       .unmap_sg = swiotlb_unmap_sg_attrs,
+       .dma_supported = swiotlb_dma_supported
+};
+
+static int __init pci_xen_swiotlb_detect(void)
+{ /* Detect hook that reports success unconditionally. */
+       return 1;
+}
+
+static void __init pci_xen_swiotlb_init(void)
+{ /* Initialise swiotlb and, if it ends up enabled, route DMA through swiotlb_dma_ops. */
+       swiotlb_init(1);
+       if (swiotlb) { /* dma_ops keeps its previous value otherwise */
+               printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+               dma_ops = &swiotlb_dma_ops;
+       }
+}
+
+IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, NULL, pci_xen_swiotlb_init, NULL); /* presumably adds the __iommu_table entry that pci_iommu_alloc() walks - confirm in asm/iommu_table.h */
+
+void __init pci_iommu_alloc(void)
+{ /* Sort and check __iommu_table, then run each entry's detect() and early_init() hooks. */
+       struct iommu_table_entry *p;
+
+       sort_iommu_table(__iommu_table, __iommu_table_end);
+       check_iommu_entries(__iommu_table, __iommu_table_end);
+
+       for (p = __iommu_table; p < __iommu_table_end; p++) {
+               if (p && p->detect && p->detect() > 0) { /* only a positive detect() counts */
+                       p->flags |= IOMMU_DETECTED;
+                       if (p->early_init)
+                               p->early_init();
+                       if (p->flags & IOMMU_FINISH_IF_DETECTED)
+                               break; /* this entry asked for the scan to end once detected */
+               }
+       }
+}
+void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+                                dma_addr_t *dma_addr, gfp_t flag,
+                                struct dma_attrs *attrs)
+{ /* Allocate zeroed pages for 'size' bytes; bus address in *dma_addr, NULL on failure. */
+       unsigned long dma_mask;
+       struct page *page;
+#ifndef CONFIG_XEN
+       dma_addr_t addr;
+#else
+       void *memory;
+#endif
+       unsigned int order = get_order(size);
+
+       dma_mask = dma_alloc_coherent_mask(dev, flag);
+
+#ifndef CONFIG_XEN
+       flag |= __GFP_ZERO;
+again:
+#else
+       flag &= ~(__GFP_DMA | __GFP_DMA32); /* Xen: the mask width goes to xen_create_contiguous_region() below */
+#endif
+       page = alloc_pages_node(dev_to_node(dev), flag, order);
+       if (!page)
+               return NULL;
+
+#ifndef CONFIG_XEN
+       addr = page_to_phys(page);
+       if (addr + size > dma_mask) { /* pages lie beyond what the device can address */
+               __free_pages(page, order);
+
+               if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) {
+                       flag = (flag & ~GFP_DMA32) | GFP_DMA; /* retry once with GFP_DMA */
+                       goto again;
+               }
+
+               return NULL;
+       }
+
+       *dma_addr = addr;
+       return page_address(page);
+#else
+       memory = page_address(page);
+       if (xen_create_contiguous_region((unsigned long)memory, order,
+                                        fls64(dma_mask))) {
+               __free_pages(page, order);
+               return NULL;
+       }
+
+       *dma_addr = virt_to_bus(memory);
+       return memset(memory, 0, size); /* Xen path zeroes by hand instead of using __GFP_ZERO */
+#endif
+}
+
+#ifdef CONFIG_XEN
+void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
+                              dma_addr_t dma_addr, struct dma_attrs *attrs)
+{ /* Reverse of the Xen allocation path; dev, dma_addr and attrs are not used. */
+       unsigned int order = get_order(size);
+       unsigned long va = (unsigned long)vaddr;
+
+       xen_destroy_contiguous_region(va, order); /* undo xen_create_contiguous_region() first */
+       free_pages(va, order);
+}
+#endif /* CONFIG_XEN */
+
+/*
+ * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
+ * parameter documentation.
+ */
+static __init int iommu_setup(char *p)
+{ /* Parse the comma-separated "iommu=" list; every keyword is matched by prefix only. */
+       iommu_merge = 1;
+
+       if (!p)
+               return -EINVAL;
+
+       while (*p) {
+               if (!strncmp(p, "off", 3))
+                       no_iommu = 1;
+               /* gart_parse_options has more force support */
+               if (!strncmp(p, "force", 5)) /* prefix match: "forcesac" lands here as well */
+                       force_iommu = 1;
+               if (!strncmp(p, "noforce", 7)) {
+                       iommu_merge = 0;
+                       force_iommu = 0;
+               }
+
+               if (!strncmp(p, "biomerge", 8)) {
+                       iommu_merge = 1;
+                       force_iommu = 1;
+               }
+               if (!strncmp(p, "panic", 5))
+                       panic_on_overflow = 1;
+               if (!strncmp(p, "nopanic", 7))
+                       panic_on_overflow = 0;
+               if (!strncmp(p, "merge", 5)) {
+                       iommu_merge = 1;
+                       force_iommu = 1;
+               }
+               if (!strncmp(p, "nomerge", 7))
+                       iommu_merge = 0;
+               if (!strncmp(p, "forcesac", 8))
+                       iommu_sac_force = 1;
+               if (!strncmp(p, "allowdac", 8))
+                       forbid_dac = 0;
+               if (!strncmp(p, "nodac", 5))
+                       forbid_dac = 1;
+               if (!strncmp(p, "usedac", 6)) {
+                       forbid_dac = -1;
+                       return 1; /* parsing stops here: options after "usedac" are ignored */
+               }
+#ifdef CONFIG_SWIOTLB
+               if (!strncmp(p, "soft", 4))
+                       swiotlb = 1;
+#endif
+#ifndef CONFIG_XEN
+               if (!strncmp(p, "pt", 2))
+                       iommu_pass_through = 1;
+               if (!strncmp(p, "group_mf", 8))
+                       iommu_group_mf = 1;
+
+               gart_parse_options(p);
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+               if (!strncmp(p, "calgary", 7))
+                       use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+               p += strcspn(p, ","); /* skip to the end of the current option */
+               if (*p == ',')
+                       ++p;
+       }
+       return 0;
+}
+early_param("iommu", iommu_setup);
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+                                            unsigned int offset,
+                                            size_t length)
+{      /* Returns 1 when every page of the range maps to consecutive pfn_to_mfn() values, else 0. */
+       unsigned long next_mfn;
+       int i;
+       int nr_pages;
+
+       next_mfn = pfn_to_mfn(pfn);     /* value for the first page is the reference */
+       nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;       /* pages touched, rounded up */
+
+       for (i = 1; i < nr_pages; i++) {        /* starts at 1: only the following pages are compared */
+               if (pfn_to_mfn(++pfn) != ++next_mfn)
+                       return 0;
+       }
+       return 1;
+}
+
+int range_straddles_page_boundary(paddr_t p, size_t size)
+{      /* Non-zero only if [p, p+size) runs past its first page AND the pages fail the contiguity check. */
+       unsigned long pfn = p >> PAGE_SHIFT;
+       unsigned int offset = p & ~PAGE_MASK;   /* byte offset of p inside its page */
+
+       return ((offset + size > PAGE_SIZE) &&
+               !check_pages_physically_contiguous(pfn, offset, size));
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{      /* Decides whether DMA address mask 'mask' is accepted for 'dev'; the local paths return 1 or 0. */
+       struct dma_map_ops *ops = get_dma_ops(dev);
+
+#ifdef CONFIG_PCI
+       if (mask > 0xffffffff && forbid_dac > 0) {      /* mask wider than 32 bits while forbid_dac is positive */
+               dev_info(dev, "PCI: Disallowing DAC for device\n");
+               return 0;
+       }
+#endif
+
+       if (ops->dma_supported)         /* when the ops provide a hook, its result is returned as-is */
+               return ops->dma_supported(dev, mask);
+
+       /* Copied from i386. Doesn't make much sense, because it will
+          only work for pci_alloc_coherent.
+          The caller just has to use GFP_DMA in this case. */
+       if (mask < DMA_BIT_MASK(24))
+               return 0;
+
+       /* Tell the device to use SAC when IOMMU force is on.  This
+          allows the driver to use cheaper accesses in some cases.
+
+          Problem with this is that if we overflow the IOMMU area and
+          return DAC as fallback address the device may not handle it
+          correctly.
+
+          As a special case some controllers have a 39bit address
+          mode that is as efficient as 32bit (aic79xx). Don't force
+          SAC for these.  Assume all masks <= 40 bits are of this
+          type. Normally this doesn't make any difference, but gives
+          more gentle handling of IOMMU overflow. */
+       if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) {    /* iommu_sac_force is set by "iommu=forcesac" */
+               dev_info(dev, "Force SAC with mask %Lx\n", mask);
+               return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+static int __init pci_iommu_init(void)
+{      /* Sets up DMA debugging, runs the x86_init iommu hook, then the late_init hooks; always returns 0. */
+       struct iommu_table_entry *p;
+       dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+
+#ifdef CONFIG_PCI
+       dma_debug_add_bus(&pci_bus_type);
+#endif
+       x86_init.iommu.iommu_init();
+
+       for (p = __iommu_table; p < __iommu_table_end; p++) {   /* walk every entry of the table */
+               if (p && (p->flags & IOMMU_DETECTED) && p->late_init)   /* detected entries with a late_init hook only */
+                       p->late_init();
+       }
+
+       return 0;
+}
+/* Must execute after PCI subsystem */
+rootfs_initcall(pci_iommu_init);
+
+#ifdef CONFIG_PCI
+/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
+
+static __devinit void via_no_dac(struct pci_dev *dev)
+{
+       if (forbid_dac == 0) {  /* acts only while forbid_dac is 0; -1 ("iommu=usedac") and 1 are left alone */
+               dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
+               forbid_dac = 1;
+       }
+}
+DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID,   /* any VIA device ... */
+                               PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);   /* ... of PCI-bridge class */
+#endif
diff --git a/arch/x86/kernel/pci-nommu-xen.c b/arch/x86/kernel/pci-nommu-xen.c

new file mode 100644 (file)

index 0000000..6983576
--- /dev/null
+++ b/arch/x86/kernel/pci-nommu-xen.c
@@ -0,0 +1,112 @@
+#include <linux/dma-mapping.h>
+#include <linux/pci.h>
+
+#include <xen/gnttab.h>
+
+#include <asm/iommu.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/swiotlb.h>
+#include <asm/tlbflush.h>
+#include <asm/gnttab_dma.h>
+#include <asm/bug.h>
+
+#define IOMMU_BUG_ON(test)     /* print an alert, then BUG(), when 'test' is true */ \
+do {                                                   \
+       if (unlikely(test)) {                           \
+               printk(KERN_ALERT "Fatal DMA error! "   \
+                      "Please use 'swiotlb=force'\n"); \
+               BUG();                                  \
+       }                                               \
+} while (0)
+
+static int
+gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+             enum dma_data_direction dir, struct dma_attrs *attrs)
+{      /* Fills dma_address/dma_length of each of the nents entries and returns nents; dir/attrs unused. */
+       unsigned int i;
+       struct scatterlist *sg;
+
+       WARN_ON(nents == 0 || sgl->length == 0);        /* warning only; execution continues */
+
+       for_each_sg(sgl, sg, nents, i) {
+               BUG_ON(!sg_page(sg));
+               sg->dma_address =
+                       gnttab_dma_map_page(sg_page(sg)) + sg->offset;
+               sg->dma_length  = sg->length;
+               IOMMU_BUG_ON(!dma_capable(
+                       hwdev, sg->dma_address, sg->length));   /* BUG if dma_capable() rejects the address/length */
+               IOMMU_BUG_ON(range_straddles_page_boundary(
+                       page_to_pseudophys(sg_page(sg)) + sg->offset,
+                       sg->length));   /* BUG if range_straddles_page_boundary() reports a straddle */
+       }
+
+       return nents;
+}
+
+static void
+gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents,
+               enum dma_data_direction dir, struct dma_attrs *attrs)
+{      /* Calls gnttab_dma_unmap_page() once per entry; hwdev, dir and attrs are unused. */
+       unsigned int i;
+       struct scatterlist *sg;
+
+       for_each_sg(sgl, sg, nents, i)
+               gnttab_dma_unmap_page(sg->dma_address); /* the field written by gnttab_map_sg() */
+}
+
+static dma_addr_t
+gnttab_map_page(struct device *dev, struct page *page, unsigned long offset,
+               size_t size, enum dma_data_direction dir,
+               struct dma_attrs *attrs)
+{      /* Returns gnttab_dma_map_page(page) + offset; BUGs instead of returning an error. */
+       dma_addr_t dma;
+
+       WARN_ON(size == 0);     /* warning only; execution continues */
+
+       dma = gnttab_dma_map_page(page) + offset;
+       IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) +
+                                                  offset, size));
+       IOMMU_BUG_ON(!dma_capable(dev, dma, size));     /* checked after the mapping call has been made */
+
+       return dma;
+}
+
+static void
+gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
+                 enum dma_data_direction dir, struct dma_attrs *attrs)
+{      /* Thin wrapper: only dma_addr is used. */
+       gnttab_dma_unmap_page(dma_addr);
+}
+
+static void nommu_sync_single_for_device(struct device *dev,
+                       dma_addr_t addr, size_t size,
+                       enum dma_data_direction dir)
+{      /* All arguments are ignored; the only action is flush_write_buffers(). */
+       flush_write_buffers();
+}
+
+
+static void nommu_sync_sg_for_device(struct device *dev,
+                       struct scatterlist *sg, int nelems,
+                       enum dma_data_direction dir)
+{      /* All arguments are ignored; the only action is flush_write_buffers(). */
+       flush_write_buffers();
+}
+
+static int nommu_dma_supported(struct device *hwdev, u64 mask)
+{      /* Accepts every device/mask combination unconditionally. */
+       return 1;
+}
+
+struct dma_map_ops nommu_dma_ops = {   /* only the hooks listed here are set */
+       .alloc                  = dma_generic_alloc_coherent,
+       .free                   = dma_generic_free_coherent,
+       .map_page               = gnttab_map_page,
+       .unmap_page             = gnttab_unmap_page,
+       .map_sg                 = gnttab_map_sg,
+       .unmap_sg               = gnttab_unmap_sg,
+       .sync_single_for_device = nommu_sync_single_for_device, /* flush_write_buffers() only */
+       .sync_sg_for_device     = nommu_sync_sg_for_device,     /* flush_write_buffers() only */
+       .dma_supported          = nommu_dma_supported,          /* always returns 1 */
+};
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c

index a311ffc..965c549 100644 (file)
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -6,6 +6,11 @@ static __init int add_pcspkr(void)
  {
         struct platform_device *pd;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return 0;
+#endif
+
         pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
  
         return IS_ERR(pd) ? PTR_ERR(pd) : 0;
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c

index 0bc72e2..787a60b 100644 (file)
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -115,6 +115,11 @@ static struct resource *find_oprom(struct pci_dev *pdev)
         struct resource *oprom = NULL;
         int i;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return NULL;
+#endif
+
         for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
                 struct resource *res = &adapter_rom_resources[i];
                 unsigned short offset, vendor, device, list, rev;
@@ -233,7 +238,7 @@ void __init probe_roms(void)
         upper = system_rom_resource.start;
  
         /* check for extension rom (ignore length byte!) */
-       rom = isa_bus_to_virt(extension_rom_resource.start);
+       rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
         if (romsignature(rom)) {
                 length = resource_size(&extension_rom_resource);
                 if (romchecksum(rom, length)) {
diff --git a/arch/x86/kernel/process-xen.c b/arch/x86/kernel/process-xen.c

new file mode 100644 (file)

index 0000000..5a1dc3d
--- /dev/null
+++ b/arch/x86/kernel/process-xen.c
@@ -0,0 +1,745 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/prctl.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+#include <linux/clockchips.h>
+#include <linux/random.h>
+#include <linux/user-return-notifier.h>
+#include <linux/dmi.h>
+#include <linux/utsname.h>
+#include <linux/stackprotector.h>
+#include <linux/tick.h>
+#include <linux/cpuidle.h>
+#include <trace/events/power.h>
+#include <linux/hw_breakpoint.h>
+#include <asm/cpu.h>
+#include <asm/apic.h>
+#include <asm/syscalls.h>
+#include <asm/idle.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
+#include <asm/debugreg.h>
+#include <asm/nmi.h>
+#include <xen/evtchn.h>
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU(unsigned char, is_idle); /* written by enter_idle(), tested/cleared by __exit_idle() */
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);    /* chain called with IDLE_START / IDLE_END */
+
+void idle_notifier_register(struct notifier_block *n)
+{      /* Adds n to the idle_notifier chain. */
+       atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{      /* Removes n from the idle_notifier chain. */
+       atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+#endif
+
+struct kmem_cache *task_xstate_cachep; /* created in arch_task_cache_init() */
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
+
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{      /* Copies *src into *dst; returns 0, or the non-zero result of fpu_alloc(). */
+       int ret;
+
+       *dst = *src;    /* whole-struct copy, so dst->thread.fpu initially mirrors src's */
+       if (fpu_allocated(&src->thread.fpu)) {
+               memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));   /* discard the copied fpu member first */
+               ret = fpu_alloc(&dst->thread.fpu);
+               if (ret)
+                       return ret;
+               fpu_copy(&dst->thread.fpu, &src->thread.fpu);
+       }
+       return 0;
+}
+
+void free_thread_xstate(struct task_struct *tsk)
+{      /* Passes tsk's thread.fpu to fpu_free(). */
+       fpu_free(&tsk->thread.fpu);
+}
+
+void free_thread_info(struct thread_info *ti)
+{
+       free_thread_xstate(ti->task);   /* uses ti->task, so it runs before ti's pages are freed */
+       free_pages((unsigned long)ti, THREAD_ORDER);
+}
+
+void arch_task_cache_init(void)
+{      /* Creates the "task_xstate" slab cache with objects of xstate_size bytes. */
+        task_xstate_cachep =
+               kmem_cache_create("task_xstate", xstate_size,
+                                 __alignof__(union thread_xstate),
+                                 SLAB_PANIC | SLAB_NOTRACK, NULL);     /* result is not checked here */
+}
+
+/*
+ * Free current thread data structures etc..
+ */
+void exit_thread(void)
+{
+       struct task_struct *me = current;
+       struct thread_struct *t = &me->thread;
+       unsigned long *bp = t->io_bitmap_ptr;
+
+       if (bp) {       /* nothing to do for a thread without an I/O bitmap */
+               struct physdev_set_iobitmap set_iobitmap;
+
+               t->io_bitmap_ptr = NULL;
+               clear_thread_flag(TIF_IO_BITMAP);
+               /*
+                * Careful, clear this in the hypervisor too (all-zero request):
+                */
+               memset(&set_iobitmap, 0, sizeof(set_iobitmap));
+               WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap,
+                                             &set_iobitmap));  /* a failing hypercall only warns */
+               t->io_bitmap_max = 0;
+               kfree(bp);
+       }
+}
+
+void show_regs(struct pt_regs *regs)
+{      /* Prints the registers, then a trace starting at kernel_stack_pointer(regs). */
+       show_registers(regs);
+       show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
+}
+
+void show_regs_common(void)
+{      /* Prints pid, comm, taint state, kernel release/version and DMI vendor/product[/board]. */
+       const char *vendor, *product, *board;
+
+       vendor = dmi_get_system_info(DMI_SYS_VENDOR);
+       if (!vendor)
+               vendor = "";    /* missing DMI string is printed as empty */
+       product = dmi_get_system_info(DMI_PRODUCT_NAME);
+       if (!product)
+               product = "";
+
+       /* Board Name is optional */
+       board = dmi_get_system_info(DMI_BOARD_NAME);
+
+       printk(KERN_CONT "\n");
+       printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
+               current->pid, current->comm, print_tainted(),
+               init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),     /* %.*s width: version up to its first space */
+               init_utsname()->version);
+       printk(KERN_CONT " %s %s", vendor, product);
+       if (board)
+               printk(KERN_CONT "/%s", board);
+       printk(KERN_CONT "\n");
+}
+
+void flush_thread(void)
+{      /* Resets hw breakpoints, the TLS array and FPU-related state of the current task. */
+       struct task_struct *tsk = current;
+
+       flush_ptrace_hw_breakpoint(tsk);
+       memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+       /*
+        * Forget coprocessor state..
+        */
+       tsk->fpu_counter = 0;
+       clear_fpu(tsk);
+       clear_used_math();
+}
+
+static void hard_disable_TSC(void)
+{      /* Read-modify-write of CR4 that sets the X86_CR4_TSD bit. */
+       write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{      /* Sets TIF_NOTSC on the current thread; CR4 is touched only if the flag was previously clear. */
+       preempt_disable();
+       if (!test_and_set_thread_flag(TIF_NOTSC))
+               /*
+                * Must flip the CPU state synchronously with
+                * TIF_NOTSC in the current running context.
+                */
+               hard_disable_TSC();
+       preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{      /* Read-modify-write of CR4 that clears the X86_CR4_TSD bit. */
+       write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{      /* Clears TIF_NOTSC on the current thread; CR4 is touched only if the flag was previously set. */
+       preempt_disable();
+       if (test_and_clear_thread_flag(TIF_NOTSC))
+               /*
+                * Must flip the CPU state synchronously with
+                * TIF_NOTSC in the current running context.
+                */
+               hard_enable_TSC();
+       preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{      /* Stores the current mode at user address 'adr'; returns put_user()'s result. */
+       unsigned int val;
+
+       if (test_thread_flag(TIF_NOTSC))
+               val = PR_TSC_SIGSEGV;   /* flag set: mode reported as PR_TSC_SIGSEGV */
+       else
+               val = PR_TSC_ENABLE;
+
+       return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{      /* Returns 0 for the two known modes, -EINVAL for any other value. */
+       if (val == PR_TSC_SIGSEGV)
+               disable_TSC();
+       else if (val == PR_TSC_ENABLE)
+               enable_TSC();
+       else
+               return -EINVAL;
+
+       return 0;
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev, *next;      /* assigned below but not read again in this body */
+
+       prev = &prev_p->thread;
+       next = &next_p->thread;
+
+       if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+           test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {      /* only when the two tasks differ in this flag */
+               unsigned long debugctl = get_debugctlmsr();
+
+               debugctl &= ~DEBUGCTLMSR_BTF;
+               if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+                       debugctl |= DEBUGCTLMSR_BTF;    /* BTF ends up matching next_p's flag */
+
+               update_debugctlmsr(debugctl);
+       }
+
+       if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+           test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+               /* prev and next are different */
+               if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+                       hard_disable_TSC();
+               else
+                       hard_enable_TSC();
+       }
+       propagate_user_return_notify(prev_p, next_p);   /* called unconditionally */
+}
+
+int sys_fork(struct pt_regs *regs)
+{      /* do_fork() with SIGCHLD only, using the caller's current stack pointer. */
+       return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ */
+int sys_vfork(struct pt_regs *regs)
+{      /* Same call as sys_fork() with CLONE_VFORK | CLONE_VM added to the flags. */
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
+                      NULL, NULL);
+}
+
+long
+sys_clone(unsigned long clone_flags, unsigned long newsp,
+         void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
+{
+       if (!newsp)     /* a zero newsp means: reuse the caller's stack pointer */
+               newsp = regs->sp;
+       return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * This gets run with %si containing the
+ * function to call, and %di containing
+ * the "args".
+ */
+extern void kernel_thread_helper(void);
+
+/*
+ * Create a kernel thread
+ */
+int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+{      /* Builds a zeroed pt_regs that starts at kernel_thread_helper and hands it to do_fork(). */
+       struct pt_regs regs;
+
+       memset(&regs, 0, sizeof(regs));
+
+       regs.si = (unsigned long) fn;   /* function for the helper to call */
+       regs.di = (unsigned long) arg;  /* its argument */
+
+#ifdef CONFIG_X86_32
+       regs.ds = __USER_DS;
+       regs.es = __USER_DS;
+       regs.fs = __KERNEL_PERCPU;
+       regs.gs = __KERNEL_STACK_CANARY;
+#else
+       regs.ss = __KERNEL_DS;
+#endif
+
+       regs.orig_ax = -1;
+       regs.ip = (unsigned long) kernel_thread_helper;
+       regs.cs = __KERNEL_CS | get_kernel_rpl();
+       regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
+
+       /* Ok, create the new process.. */
+       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);     /* both flags are always added */
+}
+EXPORT_SYMBOL(kernel_thread);
+
+/*
+ * sys_execve() executes a new program.
+ */
+long sys_execve(const char __user *name,
+               const char __user *const __user *argv,
+               const char __user *const __user *envp, struct pt_regs *regs)
+{      /* Returns do_execve()'s result, or the getname() error if the name cannot be fetched. */
+       long error;
+       char *filename;
+
+       filename = getname(name);
+       error = PTR_ERR(filename);      /* only meaningful when IS_ERR(filename) holds */
+       if (IS_ERR(filename))
+               return error;
+       error = do_execve(filename, argv, envp, regs);
+
+#ifdef CONFIG_X86_32
+       if (error == 0) {
+               /* Make sure we don't return using sysenter.. */
+                set_thread_flag(TIF_IRET);
+        }
+#endif
+
+       putname(filename);      /* released on success and on do_execve() failure alike */
+       return error;
+}
+
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;    /* changed by idle_setup() for "idle=" options */
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Power-management idle function, if any.
+ */
+void (*pm_idle)(void); /* NULL until an idle routine is chosen in this file */
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(pm_idle);
+#endif
+
+#ifndef CONFIG_SMP
+static inline void play_dead(void)
+{      /* !CONFIG_SMP stub: reaching it is treated as a bug. */
+       BUG();
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void enter_idle(void)
+{      /* Marks this CPU idle (per-cpu is_idle = 1) and notifies the chain with IDLE_START. */
+       percpu_write(is_idle, 1);
+       atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+}
+
+static void __exit_idle(void)
+{
+       if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)     /* presumably the old bit value: 0 means not idle, so no IDLE_END */
+               return;
+       atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+}
+
+/* Called from interrupts to signify idle end */
+void exit_idle(void)
+{
+       /* idle loop has pid 0 */
+       if (current->pid)       /* any other task: nothing to do */
+               return;
+       __exit_idle();
+}
+#endif
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ */
+void cpu_idle(void)
+{
+       /*
+        * If we're the non-boot CPU, nothing set the stack canary up
+        * for us.  CPU0 already has it initialized but no harm in
+        * doing it again.  This is a good place for updating it, as
+        * we won't ever return from this function (so the invalid
+        * canaries already on the stack won't ever trigger).
+        */
+       boot_init_stack_canary();
+       current_thread_info()->status |= TS_POLLING;
+
+       /* endless idle loop with no priority at all */
+       while (1) {
+               tick_nohz_idle_enter();
+
+               while (!need_resched()) {       /* inner loop: stay idle until a reschedule is requested */
+                       rmb();
+
+                       if (cpu_is_offline(smp_processor_id()))
+                               play_dead();
+
+                       /*
+                        * Idle routines should keep interrupts disabled
+                        * from here on, until they go to idle.
+                        * Otherwise, idle callbacks can misfire.
+                        */
+                       local_touch_nmi();
+                       local_irq_disable();
+
+                       enter_idle();
+
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
+
+                       /* enter_idle() needs rcu for notifiers */
+                       rcu_idle_enter();
+
+                       if (cpuidle_idle_call())        /* non-zero result: fall back to xen_idle() */
+                               xen_idle();
+
+                       rcu_idle_exit();
+                       start_critical_timings();
+
+                       /* In many cases the interrupt that ended idle
+                          has already called exit_idle. But some idle
+                          loops can be woken up without interrupt. */
+                       __exit_idle();
+               }
+
+               tick_nohz_idle_exit();
+               preempt_enable_no_resched();
+               schedule();
+               preempt_disable();      /* preemption is off again before the next idle pass */
+       }
+}
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void xen_idle(void)
+{
+       trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+       trace_cpu_idle_rcuidle(1, smp_processor_id());
+       current_thread_info()->status &= ~TS_POLLING;
+       /*
+        * TS_POLLING-cleared state must be visible before we
+        * test NEED_RESCHED:
+        */
+       smp_mb();
+
+       if (!need_resched())
+               safe_halt();    /* enables interrupts racelessly */
+       else
+               local_irq_enable();     /* no halt: interrupts are simply re-enabled */
+       current_thread_info()->status |= TS_POLLING;    /* restored on both paths */
+       trace_power_end_rcuidle(smp_processor_id());
+       trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);   /* NOTE(review): function above is xen_idle; no default_idle definition visible here - confirm */
+#endif
+
+bool __init set_pm_idle_to_default(void)
+{      /* Installs xen_idle as pm_idle; returns whether pm_idle was already non-NULL. */
+       bool ret = !!pm_idle;
+
+       pm_idle = xen_idle;
+
+       return ret;
+}
+void stop_this_cpu(void *dummy)
+{      /* Never returns; 'dummy' is unused. */
+       local_irq_disable();
+       /*
+        * Remove this CPU:
+        */
+       set_cpu_online(smp_processor_id(), false);
+       disable_all_local_evtchn();
+
+       for (;;) {
+               if (hlt_works(smp_processor_id()))      /* when this is false the loop just spins */
+                       halt();
+       }
+}
+
+static void do_nothing(void *unused)
+{      /* Intentionally empty: the callback cpu_idle_wait() passes to smp_call_function(). */
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+       smp_mb();
+       /* kick all the CPUs so that they exit out of pm_idle */
+       smp_call_function(do_nothing, NULL, 1); /* last argument 1: presumably waits for completion - confirm */
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+#ifndef CONFIG_XEN
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{      /* Compiled only when CONFIG_XEN is not defined. */
+       if (!need_resched()) {
+               trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
+               trace_cpu_idle_rcuidle(1, smp_processor_id());
+               if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+                       clflush((void *)&current_thread_info()->flags);
+
+               __monitor((void *)&current_thread_info()->flags, 0, 0); /* monitor the thread flags word */
+               smp_mb();
+               if (!need_resched())    /* re-checked after arming the monitor */
+                       __sti_mwait(0, 0);
+               else
+                       local_irq_enable();
+               trace_power_end_rcuidle(smp_processor_id());
+               trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+       } else
+               local_irq_enable();     /* work pending: just re-enable interrupts */
+}
+#endif
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{      /* Busy-waits with interrupts enabled until need_resched() becomes true. */
+       trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
+       trace_cpu_idle_rcuidle(0, smp_processor_id());
+       local_irq_enable();
+       while (!need_resched())
+               cpu_relax();
+       trace_power_end_rcuidle(smp_processor_id());
+       trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+}
+
+#ifndef CONFIG_XEN
+/*
+ * mwait selection logic:
+ *
+ * It depends on the CPU. For AMD CPUs that support MWAIT this is
+ * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
+ * then depend on a clock divisor and current Pstate of the core. If
+ * all cores of a processor are in halt state (C1) the processor can
+ * enter the C1E (C1 enhanced) state. If mwait is used this will never
+ * happen.
+ *
+ * idle=mwait overrides this decision and forces the usage of mwait.
+ */
+
+#define MWAIT_INFO                     0x05    /* cpuid leaf queried below */
+#define MWAIT_ECX_EXTENDED_INFO                0x01    /* bit tested in ecx */
+#define MWAIT_EDX_C1                   0xf0    /* mask applied to edx */
+
+int mwait_usable(const struct cpuinfo_x86 *c)
+{      /* Non-zero when MWAIT may be used for idle on CPU 'c' (see the tests below). */
+       u32 eax, ebx, ecx, edx;
+
+       if (boot_option_idle_override == IDLE_FORCE_MWAIT)      /* "idle=mwait" short-circuits every check */
+               return 1;
+
+       if (c->cpuid_level < MWAIT_INFO)        /* leaf not available on this CPU */
+               return 0;
+
+       cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+       /* Check, whether EDX has extended info about MWAIT */
+       if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+               return 1;
+
+       /*
+        * edx enumerates MONITOR/MWAIT extensions. Check, whether
+        * C1  supports MWAIT
+        */
+       return (edx & MWAIT_EDX_C1);    /* masked value is returned, not normalised to 0/1 */
+}
+
+bool amd_e400_c1e_detected;    /* set to true in amd_e400_idle() */
+EXPORT_SYMBOL(amd_e400_c1e_detected);
+
+static cpumask_var_t amd_e400_c1e_mask;        /* allocated in init_amd_e400_c1e_mask() */
+
+void amd_e400_remove_cpu(int cpu)
+{
+       if (amd_e400_c1e_mask != NULL)  /* mask may never have been allocated */
+               cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
+}
+
+/*
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void amd_e400_idle(void)
+{
+       if (need_resched())     /* returns without idling (and without touching interrupts) */
+               return;
+
+       if (!amd_e400_c1e_detected) {   /* MSR is polled only until C1E has been seen once */
+               u32 lo, hi;
+
+               rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+
+               if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+                       amd_e400_c1e_detected = true;
+                       if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+                               mark_tsc_unstable("TSC halt in AMD C1E");
+                       printk(KERN_INFO "System has AMD C1E enabled\n");
+               }
+       }
+
+       if (amd_e400_c1e_detected) {
+               int cpu = smp_processor_id();
+
+               if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {        /* first time on this CPU */
+                       cpumask_set_cpu(cpu, amd_e400_c1e_mask);
+                       /*
+                        * Force broadcast so ACPI can not interfere.
+                        */
+                       clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+                                          &cpu);
+                       printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+                              cpu);
+               }
+               clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+               default_idle();
+
+               /*
+                * The switch back from broadcast mode needs to be
+                * called with interrupts disabled.
+                */
+                local_irq_disable();
+                clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+                local_irq_enable();
+       } else
+               default_idle(); /* C1E not detected: plain default_idle() */
+}
+#endif
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{      /* With CONFIG_XEN defined the whole body is compiled out and this is a no-op. */
+#ifndef CONFIG_XEN
+#ifdef CONFIG_SMP
+       if (pm_idle == poll_idle && smp_num_siblings > 1) {
+               printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
+                       " performance may degrade.\n");
+       }
+#endif
+       if (pm_idle)    /* an already chosen routine is kept */
+               return;
+
+       if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+               /*
+                * One CPU supports mwait => All CPUs supports mwait
+                */
+               printk(KERN_INFO "using mwait in idle threads.\n");
+               pm_idle = mwait_idle;
+       } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+               /* E400: APIC timer interrupt does not wake up CPU from C1e */
+               printk(KERN_INFO "using AMD E400 aware idle routine\n");
+               pm_idle = amd_e400_idle;
+       } else
+               pm_idle = default_idle;
+#endif
+}
+
+void __init init_amd_e400_c1e_mask(void)
+{      /* With CONFIG_XEN defined the body is compiled out and this is a no-op. */
+#ifndef CONFIG_XEN
+       /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+       if (pm_idle == amd_e400_idle)
+               zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);     /* result is not checked here */
+#endif
+}
+
+static int __init idle_setup(char *str)        /* handler registered below for the "idle" early parameter */
+{
+       if (!str)
+               return -EINVAL;
+
+       if (!strcmp(str, "poll")) {     /* exact match (strcmp), unlike the prefix matching in iommu_setup() */
+               printk("using polling idle threads.\n");
+               pm_idle = poll_idle;
+               boot_option_idle_override = IDLE_POLL;
+#ifndef CONFIG_XEN
+       } else if (!strcmp(str, "mwait")) {
+               boot_option_idle_override = IDLE_FORCE_MWAIT;
+               WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
+       } else if (!strcmp(str, "halt")) {
+               /*
+                * When the boot option of idle=halt is added, halt is
+                * forced to be used for CPU idle. In such case CPU C2/C3
+                * won't be used again.
+                * The choice is also recorded below by setting
+                * boot_option_idle_override to IDLE_HALT.
+                */
+               pm_idle = default_idle;
+               boot_option_idle_override = IDLE_HALT;
+       } else if (!strcmp(str, "nomwait")) {
+               /*
+                * If the boot option of "idle=nomwait" is added,
+                * it means that mwait will be disabled for CPU C2/C3
+                * states. Only boot_option_idle_override is recorded
+                * (IDLE_NOMWAIT); pm_idle is left untouched here.
+                */
+               boot_option_idle_override = IDLE_NOMWAIT;
+#endif
+       } else
+               return -1;      /* unrecognised value (on Xen builds: anything but "poll") */
+
+       return 0;
+}
+early_param("idle", idle_setup);
+
+unsigned long arch_align_stack(unsigned long sp)
+{      /* Optionally lowers sp by a random amount, then rounds it down to a multiple of 16. */
+       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+               sp -= get_random_int() % 8192;  /* random offset in the range 0..8191 bytes */
+       return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       unsigned long range_end = mm->brk + 0x02000000; /* window of 0x02000000 bytes (32 MiB) above brk */
+       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;      /* a zero result falls back to mm->brk */
+}
+
diff --git a/arch/x86/kernel/process_32-xen.c b/arch/x86/kernel/process_32-xen.c

new file mode 100644 (file)

index 0000000..cce5aac
--- /dev/null
+++ b/arch/x86/kernel/process_32-xen.c
@@ -0,0 +1,387 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling.
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/personality.h>
+#include <linux/percpu.h>
+#include <linux/prctl.h>
+#include <linux/ftrace.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kdebug.h>
+
+#include <asm/pgtable.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
+#include <asm/desc.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <xen/interface/physdev.h>
+
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+#include <asm/idle.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+
+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork");
+
+/*
+ * Return saved PC of a blocked thread.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+       return ((unsigned long *)tsk->thread.sp)[3];
+}
+
+void __show_regs(struct pt_regs *regs, int all)
+{
+       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
+       unsigned long d0, d1, d2, d3, d6, d7;
+       unsigned long sp;
+       unsigned short ss, gs;
+
+       if (user_mode_vm(regs)) {
+               sp = regs->sp;
+               ss = regs->ss & 0xffff;
+               gs = get_user_gs(regs);
+       } else {
+               sp = kernel_stack_pointer(regs);
+               savesegment(ss, ss);
+               savesegment(gs, gs);
+       }
+
+       show_regs_common();
+
+       printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
+                       (u16)regs->cs, regs->ip, regs->flags,
+                       smp_processor_id());
+       print_symbol("EIP is at %s\n", regs->ip);
+
+       printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+               regs->ax, regs->bx, regs->cx, regs->dx);
+       printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+               regs->si, regs->di, regs->bp, sp);
+       printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
+              (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
+
+       if (!all)
+               return;
+
+       cr0 = read_cr0();
+       cr2 = read_cr2();
+       cr3 = read_cr3();
+       cr4 = read_cr4_safe();
+       printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+                       cr0, cr2, cr3, cr4);
+
+       get_debugreg(d0, 0);
+       get_debugreg(d1, 1);
+       get_debugreg(d2, 2);
+       get_debugreg(d3, 3);
+       printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+                       d0, d1, d2, d3);
+
+       get_debugreg(d6, 6);
+       get_debugreg(d7, 7);
+       printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
+                       d6, d7);
+}
+
+void release_thread(struct task_struct *dead_task)
+{
+       BUG_ON(dead_task->mm);
+       release_vm86_irqs(dead_task);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+       unlazy_fpu(tsk);
+}
+
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+       unsigned long unused,
+       struct task_struct *p, struct pt_regs *regs)
+{
+       struct pt_regs *childregs;
+       struct task_struct *tsk;
+       int err;
+
+       childregs = task_pt_regs(p);
+       *childregs = *regs;
+       childregs->ax = 0;
+       childregs->sp = sp;
+
+       p->thread.sp = (unsigned long) childregs;
+       p->thread.sp0 = (unsigned long) (childregs+1);
+
+       p->thread.ip = (unsigned long) ret_from_fork;
+
+       task_user_gs(p) = get_user_gs(regs);
+
+       p->fpu_counter = 0;
+       p->thread.io_bitmap_ptr = NULL;
+       tsk = current;
+       err = -ENOMEM;
+
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+#ifdef TIF_CSTAR
+       if (test_tsk_thread_flag(tsk, TIF_CSTAR))
+               p->thread.ip = (unsigned long) cstar_ret_from_fork;
+#endif
+       if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
+               p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
+                                               IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!p->thread.io_bitmap_ptr) {
+                       p->thread.io_bitmap_max = 0;
+                       return -ENOMEM;
+               }
+               set_tsk_thread_flag(p, TIF_IO_BITMAP);
+       }
+
+       err = 0;
+
+       /*
+        * Set a new TLS for the child thread?
+        */
+       if (clone_flags & CLONE_SETTLS)
+               err = do_set_thread_area(p, -1,
+                       (struct user_desc __user *)childregs->si, 0);
+
+       p->thread.iopl = current->thread.iopl;
+
+       if (err && p->thread.io_bitmap_ptr) {
+               kfree(p->thread.io_bitmap_ptr);
+               p->thread.io_bitmap_max = 0;
+       }
+       return err;
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+       set_user_gs(regs, 0);
+       regs->fs                = 0;
+       regs->ds                = __USER_DS;
+       regs->es                = __USER_DS;
+       regs->ss                = __USER_DS;
+       regs->cs                = __USER_CS;
+       regs->ip                = new_ip;
+       regs->sp                = new_sp;
+       /*
+        * Free the old FP and other extended state
+        */
+       free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+/*
+ *     switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPUs, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is largely a red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %ax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev = &prev_p->thread,
+                                *next = &next_p->thread;
+       int cpu = smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+#endif
+       fpu_switch_t fpu;
+#if CONFIG_XEN_COMPAT > 0x030002
+       struct physdev_set_iopl iopl_op;
+       struct physdev_set_iobitmap iobmp_op;
+#else
+       struct physdev_op _pdo[2], *pdo = _pdo;
+#define iopl_op pdo->u.set_iopl
+#define iobmp_op pdo->u.set_iobitmap
+#endif
+       multicall_entry_t _mcl[8], *mcl = _mcl;
+
+       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
+
+       fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+
+       /*
+        * Reload sp0.
+        * This is load_sp0(tss, next) with a multicall.
+        */
+       mcl->op      = __HYPERVISOR_stack_switch;
+       mcl->args[0] = __KERNEL_DS;
+       mcl->args[1] = next->sp0;
+       mcl++;
+
+       /*
+        * Load the per-thread Thread-Local Storage descriptor.
+        * This is load_TLS(next, cpu) with multicalls.
+        */
+#define C(i) do {                                                      \
+       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
+                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
+               mcl->op = __HYPERVISOR_update_descriptor;               \
+               *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine(      \
+                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
+               *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i];    \
+               mcl++;                                                  \
+       }                                                               \
+} while (0)
+       C(0); C(1); C(2);
+#undef C
+
+       if (unlikely(prev->iopl != next->iopl)) {
+               iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
+#if CONFIG_XEN_COMPAT > 0x030002
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = PHYSDEVOP_set_iopl;
+               mcl->args[1] = (unsigned long)&iopl_op;
+#else
+               mcl->op      = __HYPERVISOR_physdev_op_compat;
+               pdo->cmd     = PHYSDEVOP_set_iopl;
+               mcl->args[0] = (unsigned long)pdo++;
+#endif
+               mcl++;
+       }
+
+       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+               set_xen_guest_handle(iobmp_op.bitmap,
+                                    (char *)next->io_bitmap_ptr);
+               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+#if CONFIG_XEN_COMPAT > 0x030002
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = PHYSDEVOP_set_iobitmap;
+               mcl->args[1] = (unsigned long)&iobmp_op;
+#else
+               mcl->op      = __HYPERVISOR_physdev_op_compat;
+               pdo->cmd     = PHYSDEVOP_set_iobitmap;
+               mcl->args[0] = (unsigned long)pdo++;
+#endif
+               mcl++;
+       }
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
+#endif
+       BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+       if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
+               BUG();
+
+       /*
+        * Now maybe handle debug registers
+        */
+       if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
+                    task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
+               __switch_to_xtra(prev_p, next_p);
+
+       /*
+        * Leave lazy mode, flushing any hypercalls made here.
+        * This must be done before restoring TLS segments so
+        * the GDT and LDT are properly updated, and must be
+        * done before math_state_restore, so the TS bit is up
+        * to date.
+        */
+       arch_end_context_switch(next_p);
+
+       /*
+        * Restore %gs if needed (which is common)
+        */
+       if (prev->gs | next->gs)
+               lazy_load_gs(next->gs);
+
+       switch_fpu_finish(next_p, fpu);
+
+       percpu_write(current_task, next_p);
+
+       return prev_p;
+}
+
+#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
+#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
+
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long bp, sp, ip;
+       unsigned long stack_page;
+       int count = 0;
+       if (!p || p == current || p->state == TASK_RUNNING)
+               return 0;
+       stack_page = (unsigned long)task_stack_page(p);
+       sp = p->thread.sp;
+       if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
+               return 0;
+       /* include/asm-i386/system.h:switch_to() pushes bp last. */
+       bp = *(unsigned long *) sp;
+       do {
+               if (bp < stack_page || bp > top_ebp+stack_page)
+                       return 0;
+               ip = *(unsigned long *) (bp+4);
+               if (!in_sched_functions(ip))
+                       return ip;
+               bp = *(unsigned long *) bp;
+       } while (count++ < 16);
+       return 0;
+}
+
diff --git a/arch/x86/kernel/process_64-xen.c b/arch/x86/kernel/process_64-xen.c

new file mode 100644 (file)

index 0000000..5961605
--- /dev/null
+++ b/arch/x86/kernel/process_64-xen.c
@@ -0,0 +1,602 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ *  X86-64 port
+ *     Andi Kleen.
+ *
+ *     CPU hotplug support - ashok.raj@intel.com
+ * 
+ *  Jun Nakajima <jun.nakajima@intel.com> 
+ *     Modified for Xen
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling.
+ */
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/user.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/ftrace.h>
+
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
+#include <asm/mmu_context.h>
+#include <asm/prctl.h>
+#include <xen/interface/physdev.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/hardirq.h>
+#include <asm/ia32.h>
+#include <asm/idle.h>
+#include <asm/syscalls.h>
+#include <asm/debugreg.h>
+#include <asm/switch_to.h>
+
+asmlinkage extern void ret_from_fork(void);
+
+/* Also prints some state that isn't saved in the pt_regs */
+void __show_regs(struct pt_regs *regs, int all)
+{
+       unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
+       unsigned long d0, d1, d2, d3, d6, d7;
+       unsigned int fsindex, gsindex;
+       unsigned int ds, cs, es;
+
+       show_regs_common();
+       printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+       printk_address(regs->ip, 1);
+       printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
+                       regs->sp, regs->flags);
+       printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
+              regs->ax, regs->bx, regs->cx);
+       printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
+              regs->dx, regs->si, regs->di);
+       printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
+              regs->bp, regs->r8, regs->r9);
+       printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
+              regs->r10, regs->r11, regs->r12);
+       printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
+              regs->r13, regs->r14, regs->r15);
+
+       asm("movl %%ds,%0" : "=r" (ds));
+       asm("movl %%cs,%0" : "=r" (cs));
+       asm("movl %%es,%0" : "=r" (es));
+       asm("mov %%fs,%0" : "=r" (fsindex));
+       asm("mov %%gs,%0" : "=r" (gsindex));
+
+       rdmsrl(MSR_FS_BASE, fs);
+       rdmsrl(MSR_GS_BASE, gs);
+       rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+       if (!all)
+               return;
+
+       cr0 = read_cr0();
+       cr2 = read_cr2();
+       cr3 = read_cr3();
+       cr4 = read_cr4();
+
+       printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+              fs, fsindex, gs, gsindex, shadowgs);
+       printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+                       es, cr0);
+       printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+                       cr4);
+
+       get_debugreg(d0, 0);
+       get_debugreg(d1, 1);
+       get_debugreg(d2, 2);
+       printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+       get_debugreg(d3, 3);
+       get_debugreg(d6, 6);
+       get_debugreg(d7, 7);
+       printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+}
+
+void xen_load_gs_index(unsigned gs)
+{
+       WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs));
+}
+EXPORT_SYMBOL(xen_load_gs_index);
+
+void release_thread(struct task_struct *dead_task)
+{
+       if (dead_task->mm) {
+               if (dead_task->mm->context.size) {
+                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+                                       dead_task->comm,
+                                       dead_task->mm->context.ldt,
+                                       dead_task->mm->context.size);
+                       BUG();
+               }
+       }
+}
+
+static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
+{
+       struct user_desc ud = {
+               .base_addr = addr,
+               .limit = 0xfffff,
+               .seg_32bit = 1,
+               .limit_in_pages = 1,
+               .useable = 1,
+       };
+       struct desc_struct *desc = t->thread.tls_array;
+       desc += tls;
+       fill_ldt(desc, &ud);
+}
+
+static inline u32 read_32bit_tls(struct task_struct *t, int tls)
+{
+       return get_desc_base(&t->thread.tls_array[tls]);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+       unlazy_fpu(tsk);
+}
+
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+               unsigned long unused,
+       struct task_struct *p, struct pt_regs *regs)
+{
+       int err;
+       struct pt_regs *childregs;
+       struct task_struct *me = current;
+
+       childregs = ((struct pt_regs *)
+                       (THREAD_SIZE + task_stack_page(p))) - 1;
+       *childregs = *regs;
+
+       childregs->ax = 0;
+       if (user_mode(regs))
+               childregs->sp = sp;
+       else
+               childregs->sp = (unsigned long)childregs;
+
+       p->thread.sp = (unsigned long) childregs;
+       p->thread.sp0 = (unsigned long) (childregs+1);
+
+       set_tsk_thread_flag(p, TIF_FORK);
+
+       p->fpu_counter = 0;
+       p->thread.io_bitmap_ptr = NULL;
+
+       savesegment(gs, p->thread.gsindex);
+       p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
+       savesegment(fs, p->thread.fsindex);
+       p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
+       savesegment(es, p->thread.es);
+       savesegment(ds, p->thread.ds);
+
+       err = -ENOMEM;
+       memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+       if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
+               p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+                                                 IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!p->thread.io_bitmap_ptr) {
+                       p->thread.io_bitmap_max = 0;
+                       return -ENOMEM;
+               }
+               set_tsk_thread_flag(p, TIF_IO_BITMAP);
+       }
+
+       /*
+        * Set a new TLS for the child thread?
+        */
+       if (clone_flags & CLONE_SETTLS) {
+#ifdef CONFIG_IA32_EMULATION
+               if (test_thread_flag(TIF_IA32))
+                       err = do_set_thread_area(p, -1,
+                               (struct user_desc __user *)childregs->si, 0);
+               else
+#endif
+                       err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+               if (err)
+                       goto out;
+       }
+        p->thread.iopl = current->thread.iopl;
+
+       err = 0;
+out:
+       if (err && p->thread.io_bitmap_ptr) {
+               kfree(p->thread.io_bitmap_ptr);
+               p->thread.io_bitmap_max = 0;
+       }
+
+       return err;
+}
+
+static void
+start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+                   unsigned long new_sp,
+                   unsigned int _cs, unsigned int _ss, unsigned int _ds)
+{
+       loadsegment(fs, 0);
+       loadsegment(es, _ds);
+       loadsegment(ds, _ds);
+       load_gs_index(0);
+       regs->ip                = new_ip;
+       regs->sp                = new_sp;
+       regs->cs                = _cs;
+       regs->ss                = _ss;
+       regs->flags             = X86_EFLAGS_IF;
+       /*
+        * Free the old FP and other extended state
+        */
+       free_thread_xstate(current);
+}
+
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+       start_thread_common(regs, new_ip, new_sp,
+                           __USER_CS, __USER_DS, 0);
+}
+
+#ifdef CONFIG_IA32_EMULATION
+void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+{
+       start_thread_common(regs, new_ip, new_sp,
+                           test_thread_flag(TIF_X32)
+                           ? __USER_CS : __USER32_CS,
+                           __USER_DS, __USER_DS);
+}
+#endif
+
+/*
+ *     switch_to(x,y) should switch tasks from x to y.
+ *
+ * This could still be optimized:
+ * - fold all the options into a flag word and test it with a single test.
+ * - could test fs/gs bitsliced
+ *
+ * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer is not supported either.
+ */
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev = &prev_p->thread;
+       struct thread_struct *next = &next_p->thread;
+       int cpu = smp_processor_id();
+#ifndef CONFIG_X86_NO_TSS
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+#endif
+       fpu_switch_t fpu;
+#if CONFIG_XEN_COMPAT > 0x030002
+       struct physdev_set_iopl iopl_op;
+       struct physdev_set_iobitmap iobmp_op;
+#else
+       struct physdev_op _pdo[2], *pdo = _pdo;
+#define iopl_op pdo->u.set_iopl
+#define iobmp_op pdo->u.set_iobitmap
+#endif
+       multicall_entry_t _mcl[8], *mcl = _mcl;
+
+       fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl);
+
+       /*
+        * Reload sp0.
+        * This is load_sp0(tss, next) with a multicall.
+        */
+       mcl->op      = __HYPERVISOR_stack_switch;
+       mcl->args[0] = __KERNEL_DS;
+       mcl->args[1] = next->sp0;
+       mcl++;
+
+       /*
+        * Load the per-thread Thread-Local Storage descriptor.
+        * This is load_TLS(next, cpu) with multicalls.
+        */
+#define C(i) do {                                                      \
+       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
+                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
+               mcl->op      = __HYPERVISOR_update_descriptor;          \
+               mcl->args[0] = arbitrary_virt_to_machine(               \
+                       &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\
+               mcl->args[1] = *(u64 *)&next->tls_array[i];             \
+               mcl++;                                                  \
+       }                                                               \
+} while (0)
+       C(0); C(1); C(2);
+#undef C
+
+       if (unlikely(prev->iopl != next->iopl)) {
+               iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3;
+#if CONFIG_XEN_COMPAT > 0x030002
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = PHYSDEVOP_set_iopl;
+               mcl->args[1] = (unsigned long)&iopl_op;
+#else
+               mcl->op      = __HYPERVISOR_physdev_op_compat;
+               pdo->cmd     = PHYSDEVOP_set_iopl;
+               mcl->args[0] = (unsigned long)pdo++;
+#endif
+               mcl++;
+       }
+
+       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+               set_xen_guest_handle(iobmp_op.bitmap,
+                                    (char *)next->io_bitmap_ptr);
+               iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+#if CONFIG_XEN_COMPAT > 0x030002
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = PHYSDEVOP_set_iobitmap;
+               mcl->args[1] = (unsigned long)&iobmp_op;
+#else
+               mcl->op      = __HYPERVISOR_physdev_op_compat;
+               pdo->cmd     = PHYSDEVOP_set_iobitmap;
+               mcl->args[0] = (unsigned long)pdo++;
+#endif
+               mcl++;
+       }
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo));
+#endif
+       BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl));
+       if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
+               BUG();
+
+       /*
+        * Switch DS and ES.
+        * This won't pick up thread selector changes, but I guess that is ok.
+        */
+       if (unlikely(next->es))
+               loadsegment(es, next->es);
+
+       if (unlikely(next->ds))
+               loadsegment(ds, next->ds);
+
+       /*
+        * Leave lazy mode, flushing any hypercalls made here.
+        * This must be done before restoring TLS segments so
+        * the GDT and LDT are properly updated, and must be
+        * done before math_state_restore, so the TS bit is up
+        * to date.
+        */
+       arch_end_context_switch(next_p);
+
+       /*
+        * Switch FS and GS.
+        *
+        * Segment register != 0 always requires a reload.  Also
+        * reload when it has changed.  When prev process used 64bit
+        * base always reload to avoid an information leak.
+        */
+       if (unlikely(next->fsindex))
+               loadsegment(fs, next->fsindex);
+
+       if (next->fs)
+               WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs));
+       
+       if (unlikely(next->gsindex))
+               load_gs_index(next->gsindex);
+
+       if (next->gs)
+               WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs));
+
+       switch_fpu_finish(next_p, fpu);
+
+       /*
+        * Switch the PDA context.
+        */
+       percpu_write(current_task, next_p);
+
+       percpu_write(kernel_stack,
+                 (unsigned long)task_stack_page(next_p) +
+                 THREAD_SIZE - KERNEL_STACK_OFFSET);
+
+       /*
+        * Now maybe reload the debug registers
+        */
+       if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+                    task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
+               __switch_to_xtra(prev_p, next_p);
+
+       return prev_p;
+}
+
+void set_personality_64bit(void)
+{
+       /* inherit personality from parent */
+
+       /* Make sure to be in 64bit mode */
+       clear_thread_flag(TIF_IA32);
+       clear_thread_flag(TIF_ADDR32);
+       clear_thread_flag(TIF_X32);
+
+       /* Ensure the corresponding mm is not marked. */
+       if (current->mm)
+               current->mm->context.ia32_compat = 0;
+
+       /* TBD: overwrites user setup. Should have two bits.
+          But 64bit processes have always behaved this way,
+          so it's not too bad. The main problem is just that
+          32bit children are affected again. */
+       current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+void set_personality_ia32(bool x32)
+{
+       /* inherit personality from parent */
+
+       /* Make sure to be in 32bit mode */
+       set_thread_flag(TIF_ADDR32);
+
+       /* Mark the associated mm as containing 32-bit tasks. */
+       if (current->mm)
+               current->mm->context.ia32_compat = 1;
+
+       if (x32) {
+               clear_thread_flag(TIF_IA32);
+               set_thread_flag(TIF_X32);
+               current->personality &= ~READ_IMPLIES_EXEC;
+               /* is_compat_task() uses the presence of the x32
+                  syscall bit flag to determine compat status */
+               current_thread_info()->status &= ~TS_COMPAT;
+       } else {
+               set_thread_flag(TIF_IA32);
+               clear_thread_flag(TIF_X32);
+               current->personality |= force_personality32;
+               /* Prepare the first "return" to user space */
+               current_thread_info()->status |= TS_COMPAT;
+       }
+}
+EXPORT_SYMBOL_GPL(set_personality_ia32);
+
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long stack;
+       u64 fp, ip;
+       int count = 0;
+
+       if (!p || p == current || p->state == TASK_RUNNING)
+               return 0;
+       stack = (unsigned long)task_stack_page(p);
+       if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
+               return 0;
+       fp = *(u64 *)(p->thread.sp);
+       do {
+               if (fp < (unsigned long)stack ||
+                   fp >= (unsigned long)stack+THREAD_SIZE)
+                       return 0;
+               ip = *(u64 *)(fp+8);
+               if (!in_sched_functions(ip))
+                       return ip;
+               fp = *(u64 *)fp;
+       } while (count++ < 16);
+       return 0;
+}
+
+long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+{
+       int ret = 0;
+       int doit = task == current;
+       int cpu;
+
+       switch (code) {
+       case ARCH_SET_GS:
+               if (addr >= TASK_SIZE_OF(task))
+                       return -EPERM;
+               cpu = get_cpu();
+               /* handle small bases via the GDT because that's faster to
+                  switch. */
+               if (addr <= 0xffffffff) {
+                       set_32bit_tls(task, GS_TLS, addr);
+                       if (doit) {
+                               load_TLS(&task->thread, cpu);
+                               load_gs_index(GS_TLS_SEL);
+                       }
+                       task->thread.gsindex = GS_TLS_SEL;
+                       task->thread.gs = 0;
+               } else {
+                       task->thread.gsindex = 0;
+                       task->thread.gs = addr;
+                       if (doit) {
+                               load_gs_index(0);
+                               ret = HYPERVISOR_set_segment_base(
+                                       SEGBASE_GS_USER, addr);
+                       }
+               }
+               put_cpu();
+               break;
+       case ARCH_SET_FS:
+               /* Not strictly needed for fs, but do it for symmetry
+                  with gs */
+               if (addr >= TASK_SIZE_OF(task))
+                       return -EPERM;
+               cpu = get_cpu();
+               /* handle small bases via the GDT because that's faster to
+                  switch. */
+               if (addr <= 0xffffffff) {
+                       set_32bit_tls(task, FS_TLS, addr);
+                       if (doit) {
+                               load_TLS(&task->thread, cpu);
+                               loadsegment(fs, FS_TLS_SEL);
+                       }
+                       task->thread.fsindex = FS_TLS_SEL;
+                       task->thread.fs = 0;
+               } else {
+                       task->thread.fsindex = 0;
+                       task->thread.fs = addr;
+                       if (doit) {
+                               /* set the selector to 0 to not confuse
+                                  __switch_to */
+                               loadsegment(fs, 0);
+                                ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
+                                                                 addr);
+                       }
+               }
+               put_cpu();
+               break;
+       case ARCH_GET_FS: {
+               unsigned long base;
+               if (task->thread.fsindex == FS_TLS_SEL)
+                       base = read_32bit_tls(task, FS_TLS);
+               else if (doit)
+                       rdmsrl(MSR_FS_BASE, base);
+               else
+                       base = task->thread.fs;
+               ret = put_user(base, (unsigned long __user *)addr);
+               break;
+       }
+       case ARCH_GET_GS: {
+               unsigned long base;
+               unsigned gsindex;
+               if (task->thread.gsindex == GS_TLS_SEL)
+                       base = read_32bit_tls(task, GS_TLS);
+               else if (doit) {
+                       savesegment(gs, gsindex);
+                       if (gsindex)
+                               rdmsrl(MSR_KERNEL_GS_BASE, base);
+                       else
+                               base = task->thread.gs;
+               } else
+                       base = task->thread.gs;
+               ret = put_user(base, (unsigned long __user *)addr);
+               break;
+       }
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+long sys_arch_prctl(int code, unsigned long addr)
+{
+       return do_arch_prctl(current, code, addr);
+}
+
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c

index 03920a1..e22394c 100644 (file)
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -4,9 +4,7 @@
  #include <linux/pci.h>
  #include <linux/irq.h>
  
-#include <asm/hpet.h>
-
-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
  
  static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
  {
@@ -34,10 +32,21 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
         if (!(word & (1 << 13))) {
                 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
                         "disabling irq balancing and affinity\n");
+#ifndef CONFIG_XEN
                 noirqdebug_setup("");
  #ifdef CONFIG_PROC_FS
                 no_irq_affinity = 1;
  #endif
+#else
+               {
+                       struct xen_platform_op op = {
+                               .cmd = XENPF_platform_quirk,
+                               .u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING
+                       };
+
+                       WARN_ON(HYPERVISOR_platform_op(&op));
+               }
+#endif
         }
  
         /* put back the original value for config space*/
@@ -53,6 +62,8 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
  #endif
  
  #if defined(CONFIG_HPET_TIMER)
+#include <asm/hpet.h>
+
  unsigned long force_hpet_address;
  
  static enum {
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c

index d840e69..8d647a9 100644 (file)
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -439,28 +439,20 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
                         DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
                 },
         },
-       {       /* Handle problems with rebooting on the Latitude E6320. */
+       {
                 .callback = set_pci_reboot,
-               .ident = "Dell Latitude E6320",
+               .ident = "Dell Latitude E5xxx",
                 .matches = {
                         DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5"),
                 },
         },
-       {       /* Handle problems with rebooting on the Latitude E5420. */
+       {
                 .callback = set_pci_reboot,
-               .ident = "Dell Latitude E5420",
+               .ident = "Dell Latitude E6xxx",
                 .matches = {
                         DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
-               },
-       },
-       {       /* Handle problems with rebooting on the Latitude E6420. */
-               .callback = set_pci_reboot,
-               .ident = "Dell Latitude E6420",
-               .matches = {
-                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6"),
                 },
         },
         {       /* Handle problems with rebooting on the OptiPlex 990. */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S

index 36818f8..06ee365 100644 (file)
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -87,14 +87,32 @@ relocate_kernel:
         movl    PTR(PA_PGD)(%ebp), %eax
         movl    %eax, %cr3
  
+       /* setup idt */
+       lidtl   idt_48 - relocate_kernel(%edi)
+
+       /* setup gdt */
+       leal    gdt - relocate_kernel(%edi), %eax
+       movl    %eax, (gdt_48 - relocate_kernel) + 2(%edi)
+       lgdtl   gdt_48 - relocate_kernel(%edi)
+
+       /* setup data segment registers */
+       mov     $(gdt_ds - gdt), %eax
+       mov     %eax, %ds
+       mov     %eax, %es
+       mov     %eax, %fs
+       mov     %eax, %gs
+       mov     %eax, %ss
+
         /* setup a new stack at the end of the physical control page */
         lea     PAGE_SIZE(%edi), %esp
  
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
+       pushl   $0
+       pushl   $(gdt_cs - gdt)
         movl    %edi, %eax
         addl    $(identity_mapped - relocate_kernel), %eax
         pushl   %eax
-       ret
+       iretl
  
  identity_mapped:
         /* set return address to 0 if not preserving context */
@@ -273,5 +291,22 @@ swap_pages:
         popl    %ebp
         ret
  
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00cf9a000000ffff      /* kernel 4GB code at 0x00000000 */
+gdt_ds:
+       .quad   0x00cf92000000ffff      /* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+       .word   gdt_end - gdt - 1       /* limit */
+       .long   0                       /* base - filled in by code above */
+
+idt_48:
+       .word   0                       /* limit */
+       .long   0                       /* base */
+
         .globl kexec_control_code_size
  .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S

index 7a6f3b3..b591fca 100644 (file)
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -91,13 +91,30 @@ relocate_kernel:
         /* Switch to the identity mapped page tables */
         movq    %r9, %cr3
  
+       /* setup idt */
+       lidtq   idt_80 - relocate_kernel(%r8)
+
+       /* setup gdt */
+       leaq    gdt - relocate_kernel(%r8), %rax
+       movq    %rax, (gdt_80 - relocate_kernel) + 2(%r8)
+       lgdtq   gdt_80 - relocate_kernel(%r8)
+
+       /* setup data segment registers */
+       xorl    %eax, %eax
+       movl    %eax, %ds
+       movl    %eax, %es
+       movl    %eax, %fs
+       movl    %eax, %gs
+       movl    %eax, %ss
+
         /* setup a new stack at the end of the physical control page */
         lea     PAGE_SIZE(%r8), %rsp
  
-       /* jump to identity mapped page */
+       /* load new code segment and jump to identity mapped page */
         addq    $(identity_mapped - relocate_kernel), %r8
+       pushq   $(gdt_cs - gdt)
         pushq   %r8
-       ret
+       lretq
  
  identity_mapped:
         /* set return address to 0 if not preserving context */
@@ -264,5 +281,20 @@ swap_pages:
  3:
         ret
  
+       .align  16
+gdt:
+       .quad   0x0000000000000000      /* NULL descriptor */
+gdt_cs:
+       .quad   0x00af9a000000ffff      /* kernel 64-bit code segment (L bit set) */
+gdt_end:
+
+gdt_80:
+       .word   gdt_end - gdt - 1       /* limit */
+       .quad   0                       /* base - filled in by code above */
+
+idt_80:
+       .word   0                       /* limit */
+       .quad   0                       /* base */
+
         .globl kexec_control_code_size
  .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c

index 2a26819..7cc4a75 100644 (file)
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,3 +1,7 @@
+#ifdef CONFIG_XEN
+# define e820 machine_e820
+# include <asm/hypervisor.h>
+#endif
  #include <linux/ioport.h>
  #include <asm/e820.h>
  
@@ -37,6 +41,10 @@ static void remove_e820_regions(struct resource *avail)
  
  void arch_remove_reservations(struct resource *avail)
  {
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return;
+#endif
         /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
         if (avail->flags & IORESOURCE_MEM) {
                 if (avail->start < BIOS_END)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c

index af6db6e..4598f0b 100644 (file)
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL(cmos_lock);
  DEFINE_SPINLOCK(rtc_lock);
  EXPORT_SYMBOL(rtc_lock);
  
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
  /*
   * In order to set the CMOS clock precisely, set_rtc_mmss has to be
   * called 500 ms after the second nowtime has started, because when
@@ -155,6 +156,7 @@ unsigned long mach_get_cmos_time(void)
  
         return mktime(year, mon, day, hour, min, sec);
  }
+#endif /* !CONFIG_XEN_UNPRIVILEGED_GUEST */
  
  /* Routines for accessing the CMOS RAM/RTC. */
  unsigned char rtc_cmos_read(unsigned char addr)
@@ -202,6 +204,7 @@ unsigned long long native_read_tsc(void)
  EXPORT_SYMBOL(native_read_tsc);
  
  
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
  static struct resource rtc_resources[] = {
         [0] = {
                 .start  = RTC_PORT(0),
@@ -247,6 +250,11 @@ static __init int add_rtc_cmos(void)
         if (mrst_identify_cpu())
                 return -ENODEV;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return -ENODEV;
+#endif
+
         platform_device_register(&rtc_device);
         dev_info(&rtc_device.dev,
                  "registered platform RTC device (no PNP device found)\n");
@@ -254,3 +262,4 @@ static __init int add_rtc_cmos(void)
         return 0;
  }
  device_initcall(add_rtc_cmos);
+#endif /* !CONFIG_XEN_UNPRIVILEGED_GUEST */
diff --git a/arch/x86/kernel/setup-xen.c b/arch/x86/kernel/setup-xen.c

new file mode 100644 (file)

index 0000000..56aca1c
--- /dev/null
+++ b/arch/x86/kernel/setup-xen.c
@@ -0,0 +1,1456 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ *  Memory region support
+ *     David Parsons <orc@pell.chi.il.us>, July-August 1999
+ *
+ *  Added E820 sanitization routine (removes overlapping memory regions);
+ *  Brian Moyle <bmoyle@mvista.com>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *    Patrick Mochel <mochel@osdl.org>, March 2002
+ *
+ *  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *  Alex Achenbach <xela@slit.de>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/screen_info.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/sfi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
+#include <linux/nodemask.h>
+#include <linux/kexec.h>
+#include <linux/dmi.h>
+#include <linux/pfn.h>
+#include <linux/pci.h>
+#include <asm/pci-direct.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/delay.h>
+
+#include <linux/kallsyms.h>
+#include <linux/cpufreq.h>
+#include <linux/dma-mapping.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+
+#include <linux/percpu.h>
+#include <linux/crash_dump.h>
+#include <linux/tboot.h>
+
+#include <video/edid.h>
+
+#include <asm/mtrr.h>
+#include <asm/apic.h>
+#include <asm/trampoline.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
+#include <asm/sections.h>
+#include <asm/dmi.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/setup_arch.h>
+#include <asm/bios_ebda.h>
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/bugs.h>
+
+#include <asm/vsyscall.h>
+#include <asm/cpu.h>
+#include <asm/desc.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+
+#include <asm/paravirt.h>
+#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
+
+#include <asm/percpu.h>
+#include <asm/topology.h>
+#include <asm/apicdef.h>
+#include <asm/amd_nb.h>
+#ifdef CONFIG_X86_64
+#include <asm/numa_64.h>
+#endif
+#include <asm/mce.h>
+#include <asm/alternative.h>
+#include <asm/prom.h>
+
+#ifdef CONFIG_XEN
+#include <asm/hypervisor.h>
+#include <xen/interface/kexec.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/nmi.h>
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+#include <xen/firmware.h>
+#include <xen/xencons.h>
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+       xen_panic_event, NULL, 0 /* try to go last */
+};
+
+unsigned long *phys_to_machine_mapping;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+static unsigned long *pfn_to_mfn_frame_list_list, **pfn_to_mfn_frame_list;
+
+/* Raw start-of-day parameters from the hypervisor. */
+start_info_t *xen_start_info;
+EXPORT_SYMBOL(xen_start_info);
+#endif
+
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
+#ifdef CONFIG_DMI
+RESERVE_BRK(dmi_alloc, 65536);
+#endif
+
+
+static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
+
+#ifndef CONFIG_XEN
+#ifdef CONFIG_X86_64
+int default_cpu_present_to_apicid(int mps_cpu)
+{
+       return __default_cpu_present_to_apicid(mps_cpu);
+}
+
+int default_check_phys_apicid_present(int phys_apicid)
+{
+       return __default_check_phys_apicid_present(phys_apicid);
+}
+#endif
+
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+struct boot_params __initdata boot_params;
+#else
+struct boot_params boot_params;
+#endif
+#else /* CONFIG_XEN */
+/*
+ * Build the two-level frame list describing the p2m table and publish it in
+ * HYPERVISOR_shared_info; memory is allocated only if __alloc_bootmem != NULL.
+ * Used by save/restore and kexec/crash.
+ */
+#ifdef CONFIG_PM_SLEEP
+void
+#else
+static void __init
+#endif
+setup_pfn_to_mfn_frame_list(typeof(__alloc_bootmem) *__alloc_bootmem) /* NOTE(review): presumably the alloc_bootmem*() macros call through this parameter - confirm */
+{
+       unsigned long i, j, size;
+       unsigned int k, fpp = PAGE_SIZE / sizeof(unsigned long); /* fpp: unsigned longs per page */
+
+       size = (max_pfn + fpp - 1) / fpp; /* pages worth of p2m entries, rounded up */
+       size = (size + fpp - 1) / fpp; /* pages needed to list those pages */
+       ++size; /* include a zero terminator for crash tools */
+       size *= sizeof(unsigned long);
+       if (__alloc_bootmem)
+               pfn_to_mfn_frame_list_list = alloc_bootmem_pages(size);
+       if (size > PAGE_SIZE
+           && xen_create_contiguous_region((unsigned long)
+                                           pfn_to_mfn_frame_list_list,
+                                           get_order(size), 0))
+               BUG();
+       size -= sizeof(unsigned long); /* pfn_to_mfn_frame_list gets no terminator slot */
+       if (__alloc_bootmem)
+               pfn_to_mfn_frame_list = alloc_bootmem(size);
+
+       for (i = j = 0, k = -1; i < max_pfn; i += fpp, j++) { /* k wraps from -1 to 0 on the first pass */
+               if (j == fpp)
+                       j = 0;
+               if (j == 0) { /* starting a new second-level page */
+                       k++;
+                       BUG_ON(k * sizeof(unsigned long) >= size);
+                       if (__alloc_bootmem)
+                               pfn_to_mfn_frame_list[k] =
+                                       alloc_bootmem_pages(PAGE_SIZE);
+                       pfn_to_mfn_frame_list_list[k] =
+                               virt_to_mfn(pfn_to_mfn_frame_list[k]);
+               }
+               pfn_to_mfn_frame_list[k][j] =
+                       virt_to_mfn(&phys_to_machine_mapping[i]);
+       }
+       HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
+       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+               virt_to_mfn(pfn_to_mfn_frame_list_list);
+}
+#endif
+
+/*
+ * Machine setup..
+ */
+static struct resource data_resource = {
+       .name   = "Kernel data",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+       .name   = "Kernel code",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource bss_resource = {
+       .name   = "Kernel bss",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+
+#ifdef CONFIG_X86_32
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __cpuinitdata = { .wp_works_ok = 1, .hard_math = 1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { .wp_works_ok = 1, .hard_math = 1 };
+EXPORT_SYMBOL(boot_cpu_data);
+#ifndef CONFIG_XEN
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+       MCA_bus = x;
+#endif
+}
+
+unsigned int def_to_bigsmp;
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+
+struct apm_info apm_info;
+EXPORT_SYMBOL(apm_info);
+#endif
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+struct ist_info ist_info;
+EXPORT_SYMBOL(ist_info);
+#elif defined(CONFIG_X86_SPEEDSTEP_SMI)
+struct ist_info ist_info;
+#endif
+
+#else
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+       .x86_phys_bits = MAX_PHYSMEM_BITS,
+};
+EXPORT_SYMBOL(boot_cpu_data);
+#endif
+
+
+#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+unsigned long mmu_cr4_features;
+#else
+unsigned long mmu_cr4_features = X86_CR4_PAE;
+#endif
+
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
+
+/*
+ * Setup options
+ */
+struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
+struct edid_info edid_info;
+EXPORT_SYMBOL_GPL(edid_info);
+
+extern int root_mountflags;
+
+unsigned long saved_video_mode;
+
+#define RAMDISK_IMAGE_START_MASK       0x07FF
+#define RAMDISK_PROMPT_FLAG            0x8000
+#define RAMDISK_LOAD_FLAG              0x4000
+
+static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+#ifndef CONFIG_XEN
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ * Copies the MBR signature buffer, the EDD info buffer and both entry counts.
+ */
+static inline void __init copy_edd(void)
+{
+     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+           sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+     edd.edd_info_nr = boot_params.eddbuf_entries;
+}
+#endif
+#else
+static inline void __init copy_edd(void)
+{
+}
+#endif
+
+void * __init extend_brk(size_t size, size_t align) /* hand out size zeroed bytes from the brk area */
+{
+       size_t mask = align - 1;
+       void *ret;
+
+       BUG_ON(_brk_start == 0); /* reserve_brk() zeroes _brk_start to close the area */
+       BUG_ON(align & mask); /* a non-zero align must be a power of two */
+
+       _brk_end = (_brk_end + mask) & ~mask; /* round the cursor up to align */
+       BUG_ON((char *)(_brk_end + size) > __brk_limit); /* request must fit below __brk_limit */
+
+       ret = (void *)_brk_end;
+       _brk_end += size;
+
+       memset(ret, 0, size); /* the returned range is always zero-filled */
+
+       return ret;
+}
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+static void __init init_gbpages(void)
+{
+       if (direct_gbpages && cpu_has_gbpages)
+               printk(KERN_INFO "Using GB pages for direct mapping\n");
+       else
+               direct_gbpages = 0; /* not requested, or requested but cpu_has_gbpages is false */
+}
+#else /* !CONFIG_X86_64 || CONFIG_XEN */
+static inline void init_gbpages(void) /* empty stub for this configuration */
+{
+}
+static void __init cleanup_highmap(void) /* empty stub for this configuration */
+{
+}
+#endif /* CONFIG_X86_64 && !CONFIG_XEN */
+
+static void __init reserve_brk(void)
+{
+       if (_brk_end > _brk_start) /* only if extend_brk() has advanced _brk_end */
+               memblock_reserve(__pa(_brk_start),
+                                __pa(_brk_end) - __pa(_brk_start));
+
+       /* Mark brk area as locked down and no longer taking any
+          new allocations */
+       _brk_start = 0; /* extend_brk() hits BUG_ON() once this is zero */
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+
+#define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
+static void __init relocate_initrd(void)
+{
+#ifndef CONFIG_XEN
+       /* Assume only end is not page aligned */
+       u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+       u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+       u64 area_size     = PAGE_ALIGN(ramdisk_size);
+       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+       u64 ramdisk_here;
+       unsigned long slop, clen, mapaddr;
+       char *p, *q;
+
+       /* We need to move the initrd down into lowmem */
+       ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
+                                        PAGE_SIZE);
+
+       if (!ramdisk_here)
+               panic("Cannot find place for new RAMDISK of size %lld\n",
+                        ramdisk_size);
+
+       /* Note: this includes all the lowmem currently occupied by
+          the initrd, we rely on that fact to keep the data intact. */
+       memblock_reserve(ramdisk_here, area_size);
+       initrd_start = ramdisk_here + PAGE_OFFSET;
+       initrd_end   = initrd_start + ramdisk_size;
+       printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+                        ramdisk_here, ramdisk_here + ramdisk_size);
+
+       q = (char *)initrd_start; /* q: write cursor in the new location */
+
+       /* Copy any lowmem portion of the initrd */
+       if (ramdisk_image < end_of_lowmem) {
+               clen = end_of_lowmem - ramdisk_image;
+               p = (char *)__va(ramdisk_image);
+               memcpy(q, p, clen);
+               q += clen;
+               ramdisk_image += clen;
+               ramdisk_size  -= clen;
+       }
+
+       /* Copy the highmem portion of the initrd */
+       while (ramdisk_size) {
+               slop = ramdisk_image & ~PAGE_MASK; /* offset of the source within its page */
+               clen = ramdisk_size;
+               if (clen > MAX_MAP_CHUNK-slop) /* keep clen+slop within MAX_MAP_CHUNK per mapping */
+                       clen = MAX_MAP_CHUNK-slop;
+               mapaddr = ramdisk_image & PAGE_MASK;
+               p = early_memremap(mapaddr, clen+slop);
+               memcpy(q, p+slop, clen);
+               early_iounmap(p, clen+slop);
+               q += clen;
+               ramdisk_image += clen;
+               ramdisk_size  -= clen;
+       }
+       /* high pages are not converted by early_res_to_bootmem */
+       ramdisk_image = boot_params.hdr.ramdisk_image; /* reload: the copy steps above consumed both values */
+       ramdisk_size  = boot_params.hdr.ramdisk_size;
+       printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
+               " %08llx - %08llx\n",
+               ramdisk_image, ramdisk_image + ramdisk_size - 1,
+               ramdisk_here, ramdisk_here + ramdisk_size - 1);
+#else /* CONFIG_XEN: nothing is moved; log the problem and disable the initrd */
+       printk(KERN_ERR "initrd extends beyond end of memory "
+              "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+              xen_initrd_start + xen_start_info->mod_len,
+              max_low_pfn_mapped << PAGE_SHIFT);
+       initrd_start = 0;
+#endif /* !CONFIG_XEN */
+}
+
+static void __init reserve_initrd(void)
+{
+       /* Assume only end is not page aligned */
+#ifndef CONFIG_XEN
+       u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+       u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+       u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+       u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+
+       if (!boot_params.hdr.type_of_loader ||
+           !ramdisk_image || !ramdisk_size)
+               return;         /* No initrd provided by bootloader */
+#else /* CONFIG_XEN: location and size come from xen_initrd_start / xen_start_info */
+       unsigned long ramdisk_image = xen_initrd_start;
+       unsigned long ramdisk_size  = xen_start_info->mod_len;
+       unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+       unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+
+       if (!xen_start_info->mod_start || !ramdisk_size)
+               return;         /* No initrd provided by bootloader */
+#endif
+
+       initrd_start = 0; /* default: no usable initrd until a path below sets it */
+
+       if (ramdisk_size >= (end_of_lowmem>>1)) { /* refuse an initrd of half of lowmem or more */
+               memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+               printk(KERN_ERR "initrd too large to handle, "
+                      "disabling initrd\n");
+               return;
+       }
+
+       printk(KERN_INFO "RAMDISK: %08lx - %08lx\n", ramdisk_image,
+                       ramdisk_end);
+
+
+       if (ramdisk_end <= end_of_lowmem) {
+               /* All in lowmem, easy case */
+               /*
+                * don't need to reserve again, already reserved early
+                * in i386_start_kernel
+                */
+               initrd_start = ramdisk_image + PAGE_OFFSET;
+               initrd_end = initrd_start + ramdisk_size;
+#ifdef CONFIG_X86_64_XEN
+               initrd_below_start_ok = 1;
+#endif
+       } else {
+               relocate_initrd(); /* NOTE(review): with CONFIG_XEN this only logs and sets initrd_start = 0 */
+               memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
+       }
+#ifdef CONFIG_ACPI_INITRD_TABLE_OVERRIDE /* NOTE(review): also reached with initrd_start == 0 after the else branch on Xen - confirm the callee copes */
+       acpi_initrd_offset = acpi_initrd_table_override((void *)initrd_start,
+                                                       (void *)initrd_end);
+       if (!acpi_initrd_offset)
+               return;
+       printk(KERN_INFO "Found acpi tables of size: %lu at 0x%lx\n",
+              acpi_initrd_offset, initrd_start);
+       initrd_start += acpi_initrd_offset; /* advance initrd_start past the reported table size */
+       return;
+#endif
+}
+#else
+static void __init reserve_initrd(void) /* empty stub: CONFIG_BLK_DEV_INITRD is not set */
+{
+}
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+static void __init parse_setup_data(void)
+{
+#ifndef CONFIG_XEN /* the whole body is compiled out on Xen */
+       struct setup_data *data;
+       u64 pa_data;
+
+       if (boot_params.hdr.version < 0x0209) /* older header versions: nothing to walk */
+               return;
+       pa_data = boot_params.hdr.setup_data;
+       while (pa_data) { /* follow the ->next chain until a zero address */
+               u32 data_len, map_len;
+
+               map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
+                             (u64)sizeof(struct setup_data)); /* first try: rest of the page, at least one header */
+               data = early_memremap(pa_data, map_len);
+               data_len = data->len + sizeof(struct setup_data);
+               if (data_len > map_len) { /* payload exceeds the first mapping: remap at full length */
+                       early_iounmap(data, map_len);
+                       data = early_memremap(pa_data, data_len);
+                       map_len = data_len;
+               }
+
+               switch (data->type) {
+               case SETUP_E820_EXT:
+                       parse_e820_ext(data);
+                       break;
+               case SETUP_DTB:
+                       add_dtb(pa_data);
+                       break;
+               default: /* other entry types are skipped here */
+                       break;
+               }
+               pa_data = data->next; /* fetch the link before the mapping is torn down */
+               early_iounmap(data, map_len);
+       }
+#endif
+}
+
+static void __init e820_reserve_setup_data(void)
+{
+#ifndef CONFIG_XEN /* the whole body is compiled out on Xen */
+       struct setup_data *data;
+       u64 pa_data;
+       int found = 0;
+
+       if (boot_params.hdr.version < 0x0209) /* older header versions: nothing to walk */
+               return;
+       pa_data = boot_params.hdr.setup_data;
+       while (pa_data) {
+               data = early_memremap(pa_data, sizeof(*data)); /* only the header is mapped */
+               e820_update_range(pa_data, sizeof(*data)+data->len,
+                        E820_RAM, E820_RESERVED_KERN); /* retype header + payload from RAM to RESERVED_KERN */
+               found = 1;
+               pa_data = data->next; /* fetch the link before the mapping is torn down */
+               early_iounmap(data, sizeof(*data));
+       }
+       if (!found) /* map untouched: skip the sanitize/print below */
+               return;
+
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+       memcpy(&e820_saved, &e820, sizeof(struct e820map)); /* e820_saved now mirrors the updated map */
+       printk(KERN_INFO "extended physical RAM map:\n");
+       e820_print_map("reserve setup_data");
+#endif
+}
+
+static void __init memblock_x86_reserve_range_setup_data(void)
+{
+#ifndef CONFIG_XEN /* the whole body is compiled out on Xen */
+       struct setup_data *data;
+       u64 pa_data;
+
+       if (boot_params.hdr.version < 0x0209) /* older header versions: nothing to walk */
+               return;
+       pa_data = boot_params.hdr.setup_data;
+       while (pa_data) {
+               data = early_memremap(pa_data, sizeof(*data)); /* only the header is mapped */
+               memblock_reserve(pa_data, sizeof(*data) + data->len); /* reserve header + payload */
+               pa_data = data->next; /* fetch the link before the mapping is torn down */
+               early_iounmap(data, sizeof(*data));
+       }
+#endif
+}
+
+#ifndef CONFIG_XEN
+/*
+ * --------- Crashkernel reservation ------------------------------
+ */
+
+#ifdef CONFIG_KEXEC
+
+/*
+ * Keep the crash kernel below this limit.  On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
+ * limit once kexec-tools are fixed.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_KERNEL_ADDR_MAX (512 << 20)
+#else
+# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+#endif
+
+static void __init reserve_crashkernel(void)
+{
+       unsigned long long total_mem;
+       unsigned long long crash_size, crash_base;
+       int ret;
+
+       total_mem = memblock_phys_mem_size();
+
+       ret = parse_crashkernel(boot_command_line, total_mem,
+                       &crash_size, &crash_base);
+       if (ret != 0 || crash_size <= 0) /* crash_size is unsigned, so "<= 0" means "== 0" */
+               return;
+
+       /* 0 means: find the address automatically */
+       if (crash_base <= 0) {
+               const unsigned long long alignment = 16<<20;    /* 16M */
+
+               /*
+                *  kexec wants the bzImage to be below CRASH_KERNEL_ADDR_MAX
+                */
+               crash_base = memblock_find_in_range(alignment,
+                              CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
+
+               if (!crash_base) {
+                       pr_info("crashkernel reservation failed - No suitable area found.\n");
+                       return;
+               }
+       } else {
+               unsigned long long start;
+
+               start = memblock_find_in_range(crash_base,
+                                crash_base + crash_size, crash_size, 1<<20);
+               if (start != crash_base) { /* the exact requested range was not returned */
+                       pr_info("crashkernel reservation failed - memory is in use.\n");
+                       return;
+               }
+       }
+       memblock_reserve(crash_base, crash_size);
+
+       printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+                       "for crashkernel (System RAM: %ldMB)\n",
+                       (unsigned long)(crash_size >> 20),
+                       (unsigned long)(crash_base >> 20),
+                       (unsigned long)(total_mem >> 20));
+
+       crashk_res.start = crash_base;
+       crashk_res.end   = crash_base + crash_size - 1; /* resource end is inclusive */
+       insert_resource(&iomem_resource, &crashk_res); /* return value is ignored */
+}
+#else
+static void __init reserve_crashkernel(void) /* empty stub: CONFIG_KEXEC is not set */
+{
+}
+#endif
+#endif /* !CONFIG_XEN */
+
+static struct resource standard_io_resources[] = {
+       { .name = "dma1", .start = 0x00, .end = 0x1f,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "pic1", .start = 0x20, .end = 0x21,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "timer0", .start = 0x40, .end = 0x43,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "timer1", .start = 0x50, .end = 0x53,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "keyboard", .start = 0x60, .end = 0x60,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "keyboard", .start = 0x64, .end = 0x64,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "dma page reg", .start = 0x80, .end = 0x8f,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "pic2", .start = 0xa0, .end = 0xa1,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "dma2", .start = 0xc0, .end = 0xdf,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO },
+       { .name = "fpu", .start = 0xf0, .end = 0xff,
+               .flags = IORESOURCE_BUSY | IORESOURCE_IO }
+};
+
+void __init reserve_standard_io_resources(void)
+{
+       int i;
+
+       /* Nothing to do if not running in dom0. */
+       if (!is_initial_xendomain())
+               return;
+
+       /* request I/O space for devices used on all i[345]86 PCs */
+       for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+               request_resource(&ioport_resource, &standard_io_resources[i]); /* return value is ignored */
+
+}
+
+static __init void reserve_ibft_region(void)
+{
+       unsigned long addr, size = 0;
+
+       addr = find_ibft_region(&size);
+       /* Under CONFIG_XEN the lookup still runs, but nothing is reserved. */
+#ifndef CONFIG_XEN
+       if (size)
+               memblock_reserve(addr, size);
+#endif
+}
+
+#ifndef CONFIG_XEN
+static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+
+static void __init trim_bios_range(void)
+{
+       /*
+        * A special case is the first 4Kb of memory;
+        * This is a BIOS owned area, not kernel ram, but generally
+        * not listed as such in the E820 table.
+        *
+        * This typically reserves additional memory (64KiB by default)
+        * since some BIOSes are known to corrupt low memory.  See the
+        * Kconfig help text for X86_RESERVE_LOW.
+        */
+       e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
+                         E820_RAM, E820_RESERVED);
+
+       /*
+        * special case: Some BIOSen report the PC BIOS
+        * area (640->1Mb) as ram even though it is not.
+        * take them out.
+        */
+       e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+       sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
+static int __init parse_reservelow(char *p)
+{
+       unsigned long long size;
+
+       if (!p)
+               return -EINVAL;
+
+       size = memparse(p, &p);
+
+       if (size < 4096)
+               size = 4096;
+
+       if (size > 640*1024)
+               size = 640*1024;
+
+       reserve_low = size;
+
+       return 0;
+}
+
+early_param("reservelow", parse_reservelow);
+#endif
+
+/*
+ * Determine if we were loaded by an EFI loader.  If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization.  Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+/*
+ * setup_arch - architecture-specific boot-time initializations
+ *
+ * Note: On x86_64, fixmaps are ready for use even before this is called.
+ */
+
+void __init setup_arch(char **cmdline_p)
+{
+#ifdef CONFIG_XEN
+       unsigned long p2m_pages;
+       struct physdev_set_iopl set_iopl;
+
+       if (!is_initial_xendomain()) {
+#ifdef CONFIG_X86_32
+               /* Force a quick death if the kernel panics (not domain 0). */
+               extern int panic_timeout;
+               if (!panic_timeout)
+                       panic_timeout = 1;
+#endif
+
+               /* Register a call for panic conditions. */
+               atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+       }
+
+       set_iopl.iopl = 1;
+       WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+#endif /* CONFIG_XEN */
+
+#ifdef CONFIG_X86_32
+       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+       visws_early_detect();
+
+#ifndef CONFIG_XEN
+       /*
+        * copy kernel address range established so far and switch
+        * to the proper swapper page table
+        */
+       clone_pgd_range(swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
+                       initial_page_table + KERNEL_PGD_BOUNDARY,
+                       KERNEL_PGD_PTRS);
+
+       load_cr3(swapper_pg_dir);
+       __flush_tlb_all();
+#endif
+#else
+       printk(KERN_INFO "Command line: %s\n", boot_command_line);
+#endif
+
+       /*
+        * If we have OLPC OFW, we might end up relocating the fixmap due to
+        * reserve_top(), so do this before touching the ioremap area.
+        */
+       olpc_ofw_detect();
+
+       early_trap_init();
+       early_cpu_init();
+       early_ioremap_init();
+
+       setup_olpc_ofw_pgd();
+
+#ifndef CONFIG_XEN
+       ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+       screen_info = boot_params.screen_info;
+       edid_info = boot_params.edid_info;
+#ifdef CONFIG_X86_32
+       apm_info.bios = boot_params.apm_bios_info;
+       ist_info = boot_params.ist_info;
+       if (boot_params.sys_desc_table.length != 0) {
+               set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+               machine_id = boot_params.sys_desc_table.table[0];
+               machine_submodel_id = boot_params.sys_desc_table.table[1];
+               BIOS_revision = boot_params.sys_desc_table.table[2];
+       }
+#endif
+       saved_video_mode = boot_params.hdr.vid_mode;
+       bootloader_type = boot_params.hdr.type_of_loader;
+       if ((bootloader_type >> 4) == 0xe) {
+               bootloader_type &= 0xf;
+               bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+       }
+       bootloader_version  = bootloader_type & 0xf;
+       bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
+
+#ifdef CONFIG_BLK_DEV_RAM
+       rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+       rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+       rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
+#endif
+#ifdef CONFIG_EFI
+       if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+                    "EL32", 4)) {
+               efi_enabled = 1;
+               efi_64bit = false;
+       } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
+                    "EL64", 4)) {
+               efi_enabled = 1;
+               efi_64bit = true;
+       }
+       if (efi_enabled && efi_memblock_x86_reserve_range())
+               efi_enabled = 0;
+#endif
+#else /* CONFIG_XEN */
+#ifdef CONFIG_X86_32
+       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+       */
+       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
+#else
+       ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
+#endif
+       if (is_initial_xendomain()) {
+               const struct dom0_vga_console_info *info =
+                       (void *)((char *)xen_start_info +
+                                xen_start_info->console.dom0.info_off);
+
+               dom0_init_screen_info(info,
+                                     xen_start_info->console.dom0.info_size);
+               xen_start_info->console.domU.mfn = 0;
+               xen_start_info->console.domU.evtchn = 0;
+
+               efi_probe();
+       } else
+               screen_info.orig_video_isVGA = 0;
+       copy_edid();
+#endif /* CONFIG_XEN */
+
+       x86_init.oem.arch_setup();
+
+       iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
+       setup_memory_map();
+       parse_setup_data();
+       /* update the e820_saved too */
+       e820_reserve_setup_data();
+
+       copy_edd();
+
+#ifndef CONFIG_XEN
+       if (!boot_params.hdr.root_flags)
+               root_mountflags &= ~MS_RDONLY;
+#endif
+       init_mm.start_code = (unsigned long) _text;
+       init_mm.end_code = (unsigned long) _etext;
+       init_mm.end_data = (unsigned long) _edata;
+       init_mm.brk = _brk_end;
+
+       code_resource.start = virt_to_phys(_text);
+       code_resource.end = virt_to_phys(_etext)-1;
+       data_resource.start = virt_to_phys(_etext);
+       data_resource.end = virt_to_phys(_edata)-1;
+       bss_resource.start = virt_to_phys(&__bss_start);
+       bss_resource.end = virt_to_phys(&__bss_stop)-1;
+
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+       strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+       if (builtin_cmdline[0]) {
+               /* append boot loader cmdline to builtin */
+               strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+               strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+               strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+       }
+#endif
+#endif
+
+       strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+       *cmdline_p = command_line;
+
+       /*
+        * x86_configure_nx() is called before parse_early_param() to detect
+        * whether hardware doesn't support NX (so that the early EHCI debug
+        * console setup can safely call set_fixmap()). It may then be called
+        * again from within noexec_setup() during parsing early parameters
+        * to honor the respective command line option.
+        */
+       x86_configure_nx();
+
+       parse_early_param();
+
+       x86_report_nx();
+
+       /* after early param, so could get panic from serial */
+       memblock_x86_reserve_range_setup_data();
+
+       if (acpi_mps_check()) {
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)
+               disable_apic = 1;
+#endif
+               setup_clear_cpu_cap(X86_FEATURE_APIC);
+       }
+
+#ifdef CONFIG_PCI
+       if (pci_early_dump_regs)
+               early_dump_pci_devices();
+#endif
+
+       finish_e820_parsing();
+
+       if (efi_enabled)
+               efi_init();
+
+       if (is_initial_xendomain())
+               dmi_scan_machine();
+
+       /*
+        * VMware detection requires dmi to be available, so this
+        * needs to be done after dmi_scan_machine, for the BP.
+        */
+       init_hypervisor_platform();
+
+       x86_init.resources.probe_roms();
+
+#ifndef CONFIG_XEN
+       /* after parse_early_param, so could debug it */
+       insert_resource(&iomem_resource, &code_resource);
+       insert_resource(&iomem_resource, &data_resource);
+       insert_resource(&iomem_resource, &bss_resource);
+
+       trim_bios_range();
+#ifdef CONFIG_X86_32
+       if (ppro_with_ram_bug()) {
+               e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+                                 E820_RESERVED);
+               sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+               printk(KERN_INFO "fixed physical RAM map:\n");
+               e820_print_map("bad_ppro");
+       }
+#else
+       early_gart_iommu_check();
+#endif
+#endif /* CONFIG_XEN */
+
+       /*
+        * partially used pages are not usable - thus
+        * we are rounding upwards:
+        */
+       max_pfn = e820_end_of_ram_pfn();
+
+       /* update e820 for memory not covered by WB MTRRs */
+       mtrr_bp_init();
+#ifndef CONFIG_XEN
+       if (mtrr_trim_uncached_memory(max_pfn))
+               max_pfn = e820_end_of_ram_pfn();
+#endif
+
+#ifdef CONFIG_X86_32
+       /* max_low_pfn get updated here */
+       find_low_pfn_range();
+#else
+       num_physpages = max_pfn;
+       max_mapnr = max_pfn;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       check_x2apic();
+#endif
+
+       /* How many end-of-memory variables you have, grandma! */
+       /* need this before calling reserve_initrd */
+       if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+               max_low_pfn = e820_end_of_low_ram_pfn();
+       else
+               max_low_pfn = max_pfn;
+
+       high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+       /*
+        * Find and reserve possible boot-time SMP configuration:
+        */
+       find_smp_config();
+
+       reserve_ibft_region();
+
+       /*
+        * Need to conclude brk, before memblock_x86_fill()
+        *  it could use memblock_find_in_range, could overlap with
+        *  brk area.
+        */
+       reserve_brk();
+
+       cleanup_highmap();
+
+       memblock.current_limit = get_max_mapped();
+       memblock_x86_fill();
+
+       /*
+        * The EFI specification says that boot service code won't be called
+        * after ExitBootServices(). This is, in fact, a lie.
+        */
+       if (efi_enabled)
+               efi_reserve_boot_services();
+
+       /* preallocate 4k for mptable mpc */
+       early_reserve_e820_mpc_new();
+
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+       setup_bios_corruption_check();
+#endif
+
+       printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+                       max_pfn_mapped<<PAGE_SHIFT);
+
+#ifndef CONFIG_XEN
+       setup_trampolines();
+#endif
+
+       init_gbpages();
+
+       /* max_pfn_mapped is updated here */
+#ifdef CONFIG_X86_64_XEN
+       if (xen_start_info->mfn_list < __START_KERNEL_map) {
+               /* Map P2M space only after all usable memory. */
+               unsigned long p2m_start = xen_start_info->first_p2m_pfn;
+               unsigned long p2m_end = p2m_start
+                                       + xen_start_info->nr_p2m_frames;
+               unsigned long temp;
+
+               max_low_pfn_mapped = init_memory_mapping(
+                       0, min(max_low_pfn, p2m_start) << PAGE_SHIFT);
+               max_pfn_mapped = max_low_pfn_mapped;
+
+               if (p2m_end < max_low_pfn)
+                       max_low_pfn_mapped = init_memory_mapping(
+                               p2m_end << PAGE_SHIFT,
+                               max_low_pfn << PAGE_SHIFT);
+               max_pfn_mapped = max_low_pfn_mapped;
+
+               if (max_low_pfn < p2m_start)
+                       max_pfn_mapped = init_memory_mapping(
+                               max_low_pfn << PAGE_SHIFT,
+                               p2m_start << PAGE_SHIFT);
+
+               if (max(max_low_pfn, p2m_end) < max_pfn)
+                       max_pfn_mapped = init_memory_mapping(
+                               max(max_low_pfn, p2m_end) << PAGE_SHIFT,
+                               max_pfn << PAGE_SHIFT);
+
+               temp = max_pfn_mapped;
+               if (p2m_start < max_low_pfn) {
+                       temp = init_memory_mapping(
+                               p2m_start << PAGE_SHIFT,
+                               min(max_low_pfn, p2m_end) << PAGE_SHIFT);
+                       if (temp > max_low_pfn_mapped)
+                               max_low_pfn_mapped = temp;
+               }
+
+               if (max_low_pfn < p2m_end)
+                       temp = init_memory_mapping(
+                               max(max_low_pfn, p2m_start) << PAGE_SHIFT,
+                               p2m_end << PAGE_SHIFT);
+               if (temp > max_pfn_mapped)
+                       max_pfn_mapped = temp;
+
+               goto init_memory_mapping_done;
+       }
+#endif
+
+       max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+       max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+       if (max_pfn > max_low_pfn) {
+               max_pfn_mapped = init_memory_mapping(1UL<<32,
+                                                    max_pfn<<PAGE_SHIFT);
+ init_memory_mapping_done:
+               /* can we preserve max_low_pfn ?*/
+               max_low_pfn = max_pfn;
+       }
+#endif
+       memblock.current_limit = get_max_mapped();
+
+       /*
+        * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
+        */
+
+#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
+       if (init_ohci1394_dma_early)
+               init_ohci1394_dma_on_all_controllers();
+#endif
+       /* Allocate bigger log buffer */
+       setup_log_buf(1);
+
+       reserve_initrd();
+
+#ifndef CONFIG_XEN
+       reserve_crashkernel();
+
+       vsmp_init();
+#endif
+
+       io_delay_init();
+
+#ifdef CONFIG_ACPI
+       if (!is_initial_xendomain()) {
+               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+               disable_acpi();
+       }
+#endif
+
+       /*
+        * Parse the ACPI tables for possible boot-time SMP configuration.
+        */
+       acpi_boot_table_init();
+
+       early_acpi_boot_init();
+
+       initmem_init();
+       memblock_find_dma_reserve();
+
+#ifdef CONFIG_KVM_CLOCK
+       kvmclock_init();
+#endif
+
+       x86_init.paging.pagetable_setup_start(swapper_pg_dir);
+       paging_init();
+       x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+
+       if (boot_cpu_data.cpuid_level >= 0) {
+               /* A CPU has %cr4 if and only if it has CPUID */
+               mmu_cr4_features = read_cr4();
+       }
+
+#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN)
+       /* sync back kernel address range */
+       clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+                       swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
+                       KERNEL_PGD_PTRS);
+#endif
+
+       tboot_probe();
+
+#ifdef CONFIG_X86_64
+       map_vsyscall();
+#endif
+
+#ifdef CONFIG_XEN
+#ifdef CONFIG_KEXEC
+       xen_machine_kexec_setup_resources();
+# define kexec_enabled() (crashk_res.start < crashk_res.end)
+#else
+# define kexec_enabled() 0
+#endif
+       p2m_pages = max_pfn;
+       if (xen_start_info->nr_pages > max_pfn) {
+               /*
+                * the max_pfn was shrunk (probably by mem= or highmem=
+                * kernel parameter); shrink reservation with the HV
+                */
+               struct xen_memory_reservation reservation = {
+                       .address_bits = 0,
+                       .extent_order = 0,
+                       .domid = DOMID_SELF
+               };
+               unsigned int difference;
+               int ret;
+
+               difference = xen_start_info->nr_pages - max_pfn;
+
+               set_xen_guest_handle(reservation.extent_start,
+                                    phys_to_machine_mapping + max_pfn);
+               reservation.nr_extents = difference;
+               ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                          &reservation);
+               BUG_ON(ret != difference);
+       } else if (max_pfn > xen_start_info->nr_pages)
+               p2m_pages = xen_start_info->nr_pages;
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               /* Make sure we have a large enough P->M table. */
+               phys_to_machine_mapping = alloc_bootmem_pages(
+                       max_pfn * sizeof(unsigned long));
+               memcpy(phys_to_machine_mapping,
+                      __va(__pa(xen_start_info->mfn_list)),
+                      p2m_pages * sizeof(unsigned long));
+               memset(phys_to_machine_mapping + p2m_pages, ~0,
+                      (max_pfn - p2m_pages) * sizeof(unsigned long));
+#ifdef CONFIG_X86_64
+               if (xen_start_info->mfn_list == VMEMMAP_START) {
+                       /*
+                        * Since it is well isolated we can (and since it is
+                        * perhaps large we should) also free the page tables
+                        * mapping the initial P->M table.
+                        */
+                       unsigned long va = VMEMMAP_START, pa;
+                       pgd_t *pgd = pgd_offset_k(va);
+                       pud_t *pud_page = pud_offset(pgd, 0);
+
+                       BUILD_BUG_ON(VMEMMAP_START & ~PGDIR_MASK);
+                       xen_l4_entry_update(pgd, __pgd(0));
+                       do {
+                               pud_t *pud = pud_page + pud_index(va);
+
+                               if (pud_none(*pud))
+                                       va += PUD_SIZE;
+                               else if (pud_large(*pud)) {
+                                       pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+                                       make_pages_writable(__va(pa),
+                                               PUD_SIZE >> PAGE_SHIFT,
+                                               XENFEAT_writable_page_tables);
+                                       free_bootmem(pa, PUD_SIZE);
+                                       va += PUD_SIZE;
+                               } else {
+                                       pmd_t *pmd = pmd_offset(pud, va);
+
+                                       if (pmd_large(*pmd)) {
+                                               pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+                                               make_pages_writable(__va(pa),
+                                                       PMD_SIZE >> PAGE_SHIFT,
+                                                       XENFEAT_writable_page_tables);
+                                               free_bootmem(pa, PMD_SIZE);
+                                       } else if (!pmd_none(*pmd)) {
+                                               unsigned int i;
+                                               pte_t *pte = pte_offset_kernel(pmd, va);
+
+                                               for (i = 0; i < PTRS_PER_PTE; ++i) {
+                                                       if (pte_none(pte[i]))
+                                                               break;
+                                                       pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+                                                       make_page_writable(__va(pa),
+                                                               XENFEAT_writable_page_tables);
+                                                       free_bootmem(pa, PAGE_SIZE);
+                                               }
+                                               ClearPagePinned(virt_to_page(pte));
+                                               make_page_writable(pte,
+                                                       XENFEAT_writable_page_tables);
+                                               free_bootmem(__pa(pte), PAGE_SIZE);
+                                       }
+                                       va += PMD_SIZE;
+                                       if (pmd_index(va))
+                                               continue;
+                                       ClearPagePinned(virt_to_page(pmd));
+                                       make_page_writable(pmd,
+                                               XENFEAT_writable_page_tables);
+                                       free_bootmem(__pa((unsigned long)pmd
+                                                         & PAGE_MASK),
+                                                    PAGE_SIZE);
+                               }
+                       } while (pud_index(va));
+                       ClearPagePinned(virt_to_page(pud_page));
+                       make_page_writable(pud_page,
+                                          XENFEAT_writable_page_tables);
+                       free_bootmem(__pa((unsigned long)pud_page & PAGE_MASK),
+                                    PAGE_SIZE);
+               } else if (!WARN_ON(xen_start_info->mfn_list
+                                   < __START_KERNEL_map))
+#endif
+                       free_bootmem(__pa(xen_start_info->mfn_list),
+                                    PFN_PHYS(PFN_UP(xen_start_info->nr_pages *
+                                                    sizeof(unsigned long))));
+
+               if (!is_initial_xendomain() || kexec_enabled())
+                       setup_pfn_to_mfn_frame_list(__alloc_bootmem);
+       }
+
+#ifdef CONFIG_ISA_DMA_API
+# define ch p2m_pages
+       /* Mark all ISA DMA channels in-use - using them wouldn't work. */
+       for (ch = 0; ch < MAX_DMA_CHANNELS; ++ch)
+               if (ch != 4 && request_dma(ch, "xen") != 0)
+                       BUG();
+# undef ch
+#endif
+#else /* CONFIG_XEN */
+       generic_apic_probe();
+
+       early_quirks();
+#endif
+
+       /*
+        * Read APIC and some other early information from ACPI tables.
+        */
+       acpi_boot_init();
+       sfi_init();
+       x86_dtb_init();
+
+       /*
+        * get boot-time SMP configuration:
+        */
+       if (smp_found_config)
+               get_smp_config();
+
+       prefill_possible_map();
+
+       init_cpu_to_node();
+
+#ifndef CONFIG_XEN
+       init_apic_mappings();
+       ioapic_and_gsi_init();
+
+       kvm_guest_init();
+
+       e820_reserve_resources();
+       e820_mark_nosave_regions(max_low_pfn);
+#else
+       if (is_initial_xendomain())
+               e820_reserve_resources();
+#endif
+
+       x86_init.resources.reserve_resources();
+
+#ifdef CONFIG_XEN
+       if (is_initial_xendomain())
+#endif
+               e820_setup_gap();
+
+#ifdef CONFIG_VT
+#ifdef CONFIG_DUMMY_CONSOLE
+       conswitchp = &dummy_con;
+#endif
+#ifdef CONFIG_VGA_CONSOLE
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               ;
+       else
+#endif
+       if (!efi_enabled || efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)
+               conswitchp = &vga_con;
+#endif
+#endif
+       x86_init.oem.banner();
+
+       x86_init.timers.wallclock_init();
+
+       x86_platform.wallclock_init();
+
+       mcheck_init();
+
+       arch_init_ideal_nops();
+}
+
+#ifdef CONFIG_X86_32
+
+static struct resource video_ram_resource = {
+       .name   = "Video RAM area",
+       .start  = 0xa0000,
+       .end    = 0xbffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+void __init i386_reserve_resources(void)
+{
+       if (is_initial_xendomain())
+               request_resource(&iomem_resource, &video_ram_resource);
+       reserve_standard_io_resources();
+}
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_XEN
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+       HYPERVISOR_shutdown(SHUTDOWN_crash);
+       /* we're never actually going to get here... */
+       return NOTIFY_DONE;
+}
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c

index 5a98aa2..2e440b9 100644 (file)
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -231,6 +231,7 @@ void __init setup_per_cpu_areas(void)
                  * are zeroed indicating that the static arrays are
                  * gone.
                  */
+#ifndef CONFIG_XEN
  #ifdef CONFIG_X86_LOCAL_APIC
                 per_cpu(x86_cpu_to_apicid, cpu) =
                         early_per_cpu_map(x86_cpu_to_apicid, cpu);
@@ -241,6 +242,7 @@ void __init setup_per_cpu_areas(void)
                 per_cpu(x86_cpu_to_logical_apicid, cpu) =
                         early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
  #endif
+#endif
  #ifdef CONFIG_X86_64
                 per_cpu(irq_stack_ptr, cpu) =
                         per_cpu(irq_stack_union.irq_stack, cpu) +
@@ -268,6 +270,7 @@ void __init setup_per_cpu_areas(void)
         }
  
         /* indicate the early static arrays will soon be gone */
+#ifndef CONFIG_XEN
  #ifdef CONFIG_X86_LOCAL_APIC
         early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
         early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
@@ -275,6 +278,7 @@ void __init setup_per_cpu_areas(void)
  #ifdef CONFIG_X86_32
         early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
  #endif
+#endif
  #ifdef CONFIG_NUMA
         early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
  #endif
diff --git a/arch/x86/kernel/smp-xen.c b/arch/x86/kernel/smp-xen.c

new file mode 100644 (file)

index 0000000..1abab7b
--- /dev/null
+++ b/arch/x86/kernel/smp-xen.c
@@ -0,0 +1,237 @@
+/*
+ *     Intel SMP support routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
+ *     (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com>
+ *      (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ *     i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ *     This code is released under the GNU General Public License version 2 or
+ *     later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/gfp.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/ipi.h>
+#include <asm/nmi.h>
+#include <xen/evtchn.h>
+/*
+ *     Some notes on x86 processor bugs affecting SMP operation:
+ *
+ *     Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ *     The Linux implications for SMP are handled as follows:
+ *
+ *     Pentium III / [Xeon]
+ *             None of the E1AP-E3AP errata are visible to the user.
+ *
+ *     E1AP.   see PII A1AP
+ *     E2AP.   see PII A2AP
+ *     E3AP.   see PII A3AP
+ *
+ *     Pentium II / [Xeon]
+ *             None of the A1AP-A3AP errata are visible to the user.
+ *
+ *     A1AP.   see PPro 1AP
+ *     A2AP.   see PPro 2AP
+ *     A3AP.   see PPro 7AP
+ *
+ *     Pentium Pro
+ *             None of 1AP-9AP errata are visible to the normal user,
+ *     except occasional delivery of 'spurious interrupt' as trap #15.
+ *     This is very rare and a non-problem.
+ *
+ *     1AP.    Linux maps APIC as non-cacheable
+ *     2AP.    worked around in hardware
+ *     3AP.    fixed in C0 and above steppings microcode update.
+ *             Linux does not use excessive STARTUP_IPIs.
+ *     4AP.    worked around in hardware
+ *     5AP.    symmetric IO mode (normal Linux operation) not affected.
+ *             'noapic' mode has vector 0xf filled out properly.
+ *     6AP.    'noapic' mode might be affected - fixed in later steppings
+ *     7AP.    We do not assume writes to the LVT deasserting IRQs
+ *     8AP.    We do not enable low power mode (deep sleep) during MP bootup
+ *     9AP.    We do not use mixed mode
+ *
+ *     Pentium
+ *             There is a marginal case where REP MOVS on 100MHz SMP
+ *     machines with B stepping processors can fail. XXX should provide
+ *     an L1cache=Writethrough or L1cache=off option.
+ *
+ *             B stepping CPUs may hang. There are hardware work arounds
+ *     for this. We warn about it in case your board doesn't have the work
+ *     arounds. Basically that's so I can tell anyone with a B stepping
+ *     CPU and SMP problems "tough".
+ *
+ *     Specific items [From Pentium Processor Specification Update]
+ *
+ *     1AP.    Linux doesn't use remote read
+ *     2AP.    Linux doesn't trust APIC errors
+ *     3AP.    We work around this
+ *     4AP.    Linux never generated 3 interrupts of the same priority
+ *             to cause a lost local interrupt.
+ *     5AP.    Remote read is never used
+ *     6AP.    not affected - worked around in hardware
+ *     7AP.    not affected - worked around in hardware
+ *     8AP.    worked around in hardware - we get explicit CS errors if not
+ *     9AP.    only 'noapic' mode affected. Might generate spurious
+ *             interrupts, we log only the first one and count the
+ *             rest silently.
+ *     10AP.   not affected - worked around in hardware
+ *     11AP.   Linux reads the APIC between writes to avoid this, as per
+ *             the documentation. Make sure you preserve this as it affects
+ *             the C stepping chips too.
+ *     12AP.   not affected - worked around in hardware
+ *     13AP.   not affected - worked around in hardware
+ *     14AP.   we always deassert INIT during bootup
+ *     15AP.   not affected - worked around in hardware
+ *     16AP.   not affected - worked around in hardware
+ *     17AP.   not affected - worked around in hardware
+ *     18AP.   not affected - worked around in hardware
+ *     19AP.   not affected - worked around in BIOS
+ *
+ *     If this sounds worrying believe me these bugs are either ___RARE___,
+ *     or are signal timing bugs worked around in hardware and there's
+ *     about nothing of note with C stepping upwards.
+ */
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+void xen_smp_send_reschedule(int cpu)
+{
+       if (unlikely(cpu_is_offline(cpu))) {
+               WARN_ON(1);
+               return;
+       }
+       xen_send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+}
+
+void xen_send_call_func_single_ipi(int cpu)
+{
+       xen_send_IPI_mask(cpumask_of(cpu), CALL_FUNC_SINGLE_VECTOR);
+}
+
+void xen_send_call_func_ipi(const struct cpumask *mask)
+{
+       xen_send_IPI_mask_allbutself(mask, CALL_FUNCTION_VECTOR);
+}
+
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+static bool __read_mostly xen_smp_disable_nmi_ipi;
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+       /* We are registered on stopping cpu too, avoid spurious NMI */
+       if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+               return NMI_HANDLED;
+
+       stop_this_cpu(NULL);
+
+       return NMI_HANDLED;
+}
+
+/*
+ * Calls stop_this_cpu() on the CPU that runs this handler.  Asking the
+ * other CPUs in the system to stop is done by xen_stop_other_cpus().
+ */
+void smp_reboot_interrupt(struct pt_regs *regs)
+{
+       stop_this_cpu(NULL);
+}
+
+void xen_stop_other_cpus(int wait)
+{
+       unsigned long flags;
+       unsigned long timeout;
+
+       /*
+        * Ask every other online CPU to stop.  A dedicated vector is
+        * used because smp_call_function does lots of things not
+        * suitable in a panic situation.  The IPI goes out as an NMI
+        * unless "nonmi_ipi" was given on the command line (there are
+        * a few systems around where NMI is problematic); with the
+        * plain REBOOT_VECTOR we cannot stop CPUs spinning with irqs
+        * off.
+        */
+       if (num_online_cpus() > 1) {
+               unsigned int vector = REBOOT_VECTOR;
+
+               if (!xen_smp_disable_nmi_ipi) {
+                       /* did someone beat us here? */
+                       if (atomic_cmpxchg(&stopping_cpu, -1,
+                                          safe_smp_processor_id()) != -1)
+                               return;
+
+                       if (register_nmi_handler(NMI_LOCAL,
+                                                smp_stop_nmi_callback,
+                                                NMI_FLAG_FIRST, "smp_stop"))
+                               /* not reported; returns before any IPI is sent */
+                               return;
+
+                       /* sync above data before sending NMI */
+                       wmb();
+
+                       vector = NMI_VECTOR;
+               }
+
+               xen_send_IPI_allbutself(vector);
+
+               /*
+                * Don't wait longer than a second if the caller
+                * didn't ask us to wait.
+                */
+               timeout = USEC_PER_SEC;
+               while (num_online_cpus() > 1 && (wait || timeout--))
+                       udelay(1);
+       }
+
+       local_irq_save(flags);
+       disable_all_local_evtchn();
+       local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back.
+ */
+void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+       inc_irq_stat(irq_resched_count);
+       scheduler_ipi();
+}
+
+void smp_call_function_interrupt(struct pt_regs *regs)
+{
+       generic_smp_call_function_interrupt();
+       inc_irq_stat(irq_call_count);
+}
+
+void smp_call_function_single_interrupt(struct pt_regs *regs)
+{
+       generic_smp_call_function_single_interrupt();
+       inc_irq_stat(irq_call_count);
+}
+
+static int __init nonmi_ipi_setup(char *str)
+{
+        xen_smp_disable_nmi_ipi = true;
+        return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c

index fdd0c64..15f0e0b 100644 (file)
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,6 +9,15 @@
  #include <linux/uaccess.h>
  #include <asm/stacktrace.h>
  
+static void save_stack_warning(void *data, char *msg)
+{
+}
+
+static void
+save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+}
+
  static int save_stack_stack(void *data, char *name)
  {
         return 0;
@@ -44,12 +53,16 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
  }
  
  static const struct stacktrace_ops save_stack_ops = {
+       .warning        = save_stack_warning,
+       .warning_symbol = save_stack_warning_symbol,
         .stack          = save_stack_stack,
         .address        = save_stack_address,
         .walk_stack     = print_context_stack,
  };
  
  static const struct stacktrace_ops save_stack_ops_nosched = {
+       .warning        = save_stack_warning,
+       .warning_symbol = save_stack_warning_symbol,
         .stack          = save_stack_stack,
         .address        = save_stack_address_nosched,
         .walk_stack     = print_context_stack,
diff --git a/arch/x86/kernel/syscall_32-xen.c b/arch/x86/kernel/syscall_32-xen.c

new file mode 100644 (file)

index 0000000..6e89a5a
--- /dev/null
+++ b/arch/x86/kernel/syscall_32-xen.c
@@ -0,0 +1,20 @@
+#include "syscall_32.c"
+
+#include <linux/thread_info.h>
+
+#ifdef TIF_CSTAR
+extern asmlinkage void cstar_set_tif(void);
+
+#define        ptregs_fork cstar_set_tif
+#define        ptregs_clone cstar_set_tif
+#define        ptregs_vfork cstar_set_tif
+
+const sys_call_ptr_t cstar_call_table[__NR_syscall_max+1] = {
+       /*
+        * Smells like a compiler bug -- it doesn't work
+        * when the & below is removed.
+        */
+       [0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+#endif /* TIF_CSTAR */
diff --git a/arch/x86/kernel/time-xen.c b/arch/x86/kernel/time-xen.c

new file mode 100644 (file)

index 0000000..e6bedf7
--- /dev/null
+++ b/arch/x86/kernel/time-xen.c
@@ -0,0 +1,626 @@
+/*
+ *  Copyright (c) 1991,1992,1995  Linus Torvalds
+ *  Copyright (c) 1994  Alan Modra
+ *  Copyright (c) 1995  Markus Kuhn
+ *  Copyright (c) 1996  Ingo Molnar
+ *  Copyright (c) 1998  Andrea Arcangeli
+ *  Copyright (c) 2002,2006  Vojtech Pavlik
+ *  Copyright (c) 2003  Andi Kleen
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/export.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
+#include <linux/cpufreq.h>
+#include <linux/clocksource.h>
+
+#include <asm/vsyscall.h>
+#include <asm/delay.h>
+#include <asm/time.h>
+#include <asm/timer.h>
+
+#include <xen/clock.h>
+#include <xen/sysctl.h>
+#include <xen/interface/vcpu.h>
+
+#ifdef CONFIG_X86_64
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
+#endif
+
+#define XEN_SHIFT 22
+
+unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
+EXPORT_SYMBOL(cpu_khz);
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;
+       u32 tsc_to_usec_mul;
+       int tsc_shift;
+       u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timespec shadow_tv;
+static u32 shadow_tv_version;
+
+static u64 jiffies_bias, system_time_bias;
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
+#define NS_PER_TICK (1000000000LL/HZ)
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+static int __init __independent_wallclock(char *str)
+{
+       independent_wallclock = 1;
+       return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+
+/* Permitted clock jitter, in nsecs, beyond which a warning will be printed. */
+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+static int __init __permitted_clock_jitter(char *str)
+{
+       permitted_clock_jitter = simple_strtoul(str, NULL, 0);
+       return 1;
+}
+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
+
+/*
+ * Scale a 64-bit delta by shifting it (left, or right if shift is negative)
+ * and multiplying by a 32-bit fraction, yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+       u64 product;
+#ifdef __i386__
+       u32 tmp1, tmp2;
+#endif
+
+       if (shift < 0)
+               delta >>= -shift;
+       else
+               delta <<= shift;
+
+#ifdef __i386__
+       __asm__ (
+               "mul  %5       ; "
+               "mov  %4,%%eax ; "
+               "mov  %%edx,%4 ; "
+               "mul  %5       ; "
+               "xor  %5,%5    ; "
+               "add  %4,%%eax ; "
+               "adc  %5,%%edx ; "
+               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#else
+       __asm__ (
+               "mul %%rdx ; shrd $32,%%rdx,%%rax"
+               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#endif
+
+       return product;
+}
+
+/* Compute cpu_khz from VCPU 0's tsc_to_system_mul and tsc_shift values. */
+static void init_cpu_khz(void)
+{
+       struct vcpu_time_info *info = &vcpu_info(0)->time;
+       u64 khz = 1000000ULL << 32;
+
+       do_div(khz, info->tsc_to_system_mul);
+       cpu_khz = info->tsc_shift < 0 ? khz << -info->tsc_shift
+                                     : khz >> info->tsc_shift;
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 now, delta;
+       rdtscll(now);
+       delta = now - shadow->tsc_timestamp;
+       return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+static inline u64 processed_system_time(u64 jiffies_64)
+{
+       return (jiffies_64 - jiffies_bias) * NS_PER_TICK + system_time_bias;
+}
+
+static void update_wallclock(bool local)
+{
+       static DEFINE_MUTEX(uwc_mutex);
+       shared_info_t *s = HYPERVISOR_shared_info;
+
+       mutex_lock(&uwc_mutex);
+
+       do {
+               shadow_tv_version = s->wc_version;
+               rmb();
+               shadow_tv.tv_sec  = s->wc_sec;
+               shadow_tv.tv_nsec = s->wc_nsec;
+               rmb();
+       } while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version));
+
+       if (local) {
+               u64 tmp = processed_system_time(get_jiffies_64());
+               long nsec = do_div(tmp, NSEC_PER_SEC);
+               struct timespec tv;
+
+               set_normalized_timespec(&tv, shadow_tv.tv_sec + tmp,
+                                       shadow_tv.tv_nsec + nsec);
+               do_settimeofday(&tv);
+       }
+
+       mutex_unlock(&uwc_mutex);
+}
+
+static void _update_wallclock(struct work_struct *unused)
+{
+       update_wallclock(true);
+}
+static DECLARE_WORK(update_wallclock_work, _update_wallclock);
+
+void xen_check_wallclock_update(void)
+{
+       if (shadow_tv_version != HYPERVISOR_shared_info->wc_version
+           && !is_initial_xendomain() && !independent_wallclock
+           && keventd_up())
+               schedule_work(&update_wallclock_work);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen into the given CPU's
+ * shadow area, retrying while the version is odd or changes during the copy.
+ */
+static void get_time_values_from_xen(unsigned int cpu)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+       unsigned long flags;
+       u32 pre_version, post_version;
+
+       src = &vcpu_info(cpu)->time;
+       dst = &per_cpu(shadow_time, cpu);
+
+       local_irq_save(flags);
+
+       do {
+               pre_version = dst->version = src->version;
+               rmb();
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
+               rmb();
+               post_version = src->version;
+       } while ((pre_version & 1) | (pre_version ^ post_version));
+
+       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+       local_irq_restore(flags);
+}
+
+static inline int time_values_up_to_date(void)
+{
+       rmb();
+       return this_cpu_read(shadow_time.version) == vcpu_info_read(time.version);
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+int xen_update_wallclock(const struct timespec *tv)
+{
+       struct timespec now;
+       s64 nsec;
+       struct shadow_time_info *shadow;
+       struct xen_platform_op op;
+
+       if (!is_initial_xendomain() || independent_wallclock)
+               return -EPERM;
+
+       shadow = &__get_cpu_var(shadow_time);
+
+       /*
+        * Ensure we don't get blocked for a long time so that our time delta
+        * overflows. If that were to happen then our shadow time values would
+        * be stale, so we can retry with fresh ones.
+        */
+       for (;;) {
+               nsec = tv->tv_nsec - get_nsec_offset(shadow);
+               if (time_values_up_to_date())
+                       break;
+               get_time_values_from_xen(smp_processor_id());
+       }
+       set_normalized_timespec(&now, tv->tv_sec, nsec);
+
+       op.cmd = XENPF_settime;
+       op.u.settime.secs        = now.tv_sec;
+       op.u.settime.nsecs       = now.tv_nsec;
+       op.u.settime.system_time = shadow->system_timestamp;
+       WARN_ON(HYPERVISOR_platform_op(&op));
+       update_wallclock(false);
+
+       return 0;
+}
+
+static void sync_xen_wallclock(unsigned long dummy);
+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+static void sync_xen_wallclock(unsigned long dummy)
+{
+       struct timespec now, ignore;
+       struct xen_platform_op op;
+
+       BUG_ON(!is_initial_xendomain());
+       if (!ntp_synced() || independent_wallclock)
+               return;
+
+       get_xtime_and_monotonic_and_sleep_offset(&now, &ignore, &ignore);
+       set_normalized_timespec(&now, now.tv_sec, now.tv_nsec);
+
+       op.cmd = XENPF_settime;
+       op.u.settime.secs        = now.tv_sec;
+       op.u.settime.nsecs       = now.tv_nsec;
+       op.u.settime.system_time = processed_system_time(get_jiffies_64());
+       WARN_ON(HYPERVISOR_platform_op(&op));
+
+       update_wallclock(false);
+
+       /* Once per minute. */
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+}
+#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
+
+unsigned long long xen_local_clock(void)
+{
+       unsigned int cpu = get_cpu();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       u64 time;
+       u32 local_time_version;
+
+       do {
+               local_time_version = shadow->version;
+               rdtsc_barrier();
+               time = shadow->system_timestamp + get_nsec_offset(shadow);
+               if (!time_values_up_to_date())
+                       get_time_values_from_xen(cpu);
+               barrier();
+       } while (local_time_version != shadow->version);
+
+       put_cpu();
+
+       return time;
+}
+
+unsigned long xen_read_wallclock(void)
+{
+       const shared_info_t *s = HYPERVISOR_shared_info;
+       u32 version, sec, nsec;
+       u64 delta;
+
+       do {
+               version = s->wc_version;
+               rmb();
+               sec     = s->wc_sec;
+               nsec    = s->wc_nsec;
+               rmb();
+       } while ((s->wc_version & 1) | (version ^ s->wc_version));
+
+       delta = xen_local_clock() + (u64)sec * NSEC_PER_SEC + nsec;
+       do_div(delta, NSEC_PER_SEC);
+
+       return delta;
+}
+
+int xen_write_wallclock(unsigned long now)
+{
+       if (!is_initial_xendomain() || independent_wallclock)
+               return 0;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
+#endif
+
+       return mach_set_rtc_mmss(now);
+}
+
+/*
+ * Runstate accounting
+ */
+static void get_runstate_snapshot(struct vcpu_runstate_info *res)
+{
+       u64 state_time;
+       struct vcpu_runstate_info *state;
+
+       BUG_ON(preemptible());
+
+       state = &__get_cpu_var(runstate);
+
+       do {
+               state_time = get_64bit_local(&state->state_entry_time);
+               *res = *state;
+       } while (get_64bit_local(&state->state_entry_time) != state_time);
+
+       WARN_ON_ONCE(res->state != RUNSTATE_running);
+}
+
+/*
+ * Xen sched_clock implementation.  Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+unsigned long long sched_clock(void)
+{
+       struct vcpu_runstate_info runstate;
+       cycle_t now;
+       u64 ret;
+       s64 offset;
+
+       /*
+        * Ideally sched_clock should be called on a per-cpu basis
+        * anyway, so preempt should already be disabled, but that's
+        * not current practice at the moment.
+        */
+       preempt_disable();
+
+       now = xen_local_clock();
+
+       get_runstate_snapshot(&runstate);
+
+       offset = now - runstate.state_entry_time;
+       if (offset < 0)
+               offset = 0;
+
+       ret = offset + runstate.time[RUNSTATE_running]
+             + runstate.time[RUNSTATE_blocked];
+
+       preempt_enable();
+
+       return ret;
+}
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+       unsigned long pc = instruction_pointer(regs);
+
+       if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+               return *(unsigned long *)(regs->bp + sizeof(long));
+#else
+               unsigned long *sp =
+                       (unsigned long *)kernel_stack_pointer(regs);
+
+               /*
+                * Return address is either directly at stack pointer
+                * or above a saved flags. Eflags has bits 22-31 zero,
+                * kernel addresses don't.
+                */
+               if (sp[0] >> 22)
+                       return sp[0];
+               if (sp[1] >> 22)
+                       return sp[1];
+#endif
+       }
+
+       return pc;
+}
+EXPORT_SYMBOL(profile_pc);
+
+void mark_tsc_unstable(char *reason)
+{
+#ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
+       tsc_unstable = 1;
+#endif
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+static cycle_t cs_last;
+
+static cycle_t xen_clocksource_read(struct clocksource *cs)
+{
+#ifdef CONFIG_SMP
+       cycle_t last = get_64bit(&cs_last); /* value currently recorded in cs_last */
+       cycle_t ret = xen_local_clock();
+
+       if (unlikely((s64)(ret - last) < 0)) { /* local reading is older than cs_last */
+               if (last - ret > permitted_clock_jitter
+                   && printk_ratelimit()) {
+                       unsigned int cpu = get_cpu();
+                       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+                       printk(KERN_WARNING "clocksource/%u: "
+                              "Time went backwards: "
+                              "ret=%Lx delta=%Ld shadow=%Lx offset=%Lx\n",
+                              cpu, ret, ret - last, shadow->system_timestamp,
+                              get_nsec_offset(shadow));
+                       put_cpu();
+               }
+               return last; /* hand back the recorded value, not the older reading */
+       }
+
+       for (;;) { /* try to record ret in cs_last */
+               cycle_t cur = cmpxchg64(&cs_last, last, ret);
+
+               if (cur == last || (s64)(ret - cur) < 0)
+                       return ret; /* stored, or cs_last is already past ret */
+               last = cur; /* cs_last changed meanwhile; retry against its new value */
+       }
+#else
+       return xen_local_clock();
+#endif
+}
+
+/* No locking required. Interrupts are disabled on all CPUs. */
+static void xen_clocksource_resume(struct clocksource *cs)
+{
+       unsigned int cpu;
+
+       init_cpu_khz();
+
+       for_each_online_cpu(cpu)
+               get_time_values_from_xen(cpu);
+
+       jiffies_bias = get_jiffies_64();
+       system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
+
+       cs_last = xen_local_clock();
+}
+
+static struct clocksource clocksource_xen = {
+       .name                   = "xen",
+       .rating                 = 400,
+       .read                   = xen_clocksource_read,
+       .mask                   = CLOCKSOURCE_MASK(64),
+       .mult                   = 1 << XEN_SHIFT,               /* time directly in nanoseconds */
+       .shift                  = XEN_SHIFT,
+       .flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
+       .resume                 = xen_clocksource_resume,
+};
+
+void setup_runstate_area(unsigned int cpu)
+{
+       struct vcpu_register_runstate_memory_area area;
+       struct vcpu_runstate_info *rs = &per_cpu(runstate, cpu);
+       int rc;
+
+       set_xen_guest_handle(area.addr.h, rs);
+       rc = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+       if (rc) {
+               BUILD_BUG_ON(RUNSTATE_running);
+               memset(rs, 0, sizeof(*rs));
+               WARN_ON(rc != -ENOSYS);
+       }
+}
+
+static void __init _late_time_init(void)
+{
+       update_wallclock(false);
+       xen_clockevents_init();
+}
+
+void __init time_init(void)
+{
+       init_cpu_khz();
+       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
+              cpu_khz / 1000, cpu_khz % 1000);
+
+       setup_runstate_area(0);
+       get_time_values_from_xen(0);
+
+       jiffies_bias     = jiffies_64;
+       system_time_bias = per_cpu(shadow_time, 0).system_timestamp;
+
+       clocksource_register_hz(&clocksource_xen, NSEC_PER_SEC);
+
+       use_tsc_delay();
+
+       /*
+        * Cannot request_irq() until kmem is initialised, and cannot
+        * do_settimeofday() (i.e. clock_was_set()) until interrupts are on.
+        */
+       late_time_init = _late_time_init;
+}
+
+/* Convert an absolute jiffies value to system time, in nanoseconds. */
+u64 jiffies_to_st(unsigned long j)
+{
+       u64 j64 = get_jiffies_64();
+       long delta = j - (unsigned long)j64;
+
+       if (delta < 1)
+               /* Triggers in some wrap-around cases, but that's okay:
+                * we just end up with a shorter timeout. */
+               return processed_system_time(j64) + NS_PER_TICK;
+
+       if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0)
+               /* Very long timeout means there is no pending timer.
+                * We indicate this to Xen by passing zero timeout. */
+               return 0;
+
+       return processed_system_time(j64) + delta * (u64)NS_PER_TICK;
+}
+EXPORT_SYMBOL(jiffies_to_st);
+
+#ifdef CONFIG_CPU_FREQ
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 
+                               void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       struct xen_platform_op op;
+
+       if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
+               return 0;
+
+       if (val == CPUFREQ_PRECHANGE)
+               return 0;
+
+       op.cmd = XENPF_change_freq;
+       op.u.change_freq.flags = 0;
+       op.u.change_freq.cpu = freq->cpu;
+       op.u.change_freq.freq = (u64)freq->new * 1000;
+       WARN_ON(HYPERVISOR_platform_op(&op));
+
+       return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+       .notifier_call = time_cpufreq_notifier
+};
+
+/* Register the notifier; cpufreq_register_notifier() returns 0 on success. */
+static int __init cpufreq_time_setup(void)
+{
+       if (cpufreq_register_notifier(&time_cpufreq_notifier_block,
+                       CPUFREQ_TRANSITION_NOTIFIER)) {
+               printk(KERN_ERR "failed to set up cpufreq notifier\n");
+               return -ENODEV;
+       }
+       return 0;
+}
+core_initcall(cpufreq_time_setup);
+#endif
+
+/*
+ * /proc/sys/xen: This really belongs in another file. It can stay here for
+ * now however.
+ */
+static ctl_table xen_subtable[] = {
+       {
+               .procname       = "independent_wallclock",
+               .data           = &independent_wallclock,
+               .maxlen         = sizeof(independent_wallclock),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
+       {
+               .procname       = "permitted_clock_jitter",
+               .data           = &permitted_clock_jitter,
+               .maxlen         = sizeof(permitted_clock_jitter),
+               .mode           = 0644,
+               .proc_handler   = proc_doulongvec_minmax
+       },
+       { }
+};
+static ctl_table xen_table[] = {
+       {
+               .procname       = "xen",
+               .mode           = 0555,
+               .child          = xen_subtable
+       },
+       { }
+};
+static int __init xen_sysctl_init(void)
+{
+       (void)register_sysctl_table(xen_table);
+       return 0;
+}
+__initcall(xen_sysctl_init);
diff --git a/arch/x86/kernel/traps-xen.c b/arch/x86/kernel/traps-xen.c

new file mode 100644 (file)

index 0000000..20d5ca1
--- /dev/null
+++ b/arch/x86/kernel/traps-xen.c
@@ -0,0 +1,748 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+
+/*
+ * Handle hardware traps and faults.
+ */
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kgdb.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
+
+#ifdef CONFIG_EISA
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#endif
+
+#ifdef CONFIG_MCA
+#include <linux/mca.h>
+#endif
+
+#if defined(CONFIG_EDAC)
+#include <linux/edac.h>
+#endif
+
+#include <asm/kmemcheck.h>
+#include <asm/stacktrace.h>
+#include <asm/processor.h>
+#include <asm/debugreg.h>
+#include <linux/atomic.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
+#include <asm/mce.h>
+
+#include <asm/mach_traps.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/x86_init.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#else
+#include <asm/processor-flags.h>
+#include <asm/setup.h>
+
+asmlinkage int system_call(void);
+
+/* Do we ignore FPU interrupts ? */
+char ignore_fpu_irq;
+
+#ifndef CONFIG_X86_NO_IDT
+/*
+ * The IDT has to be page-aligned to simplify the Pentium
+ * F0 0F bug workaround.
+ */
+gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
+#endif
+#endif
+
+#ifndef CONFIG_XEN
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+#endif
+
+static inline void conditional_sti(struct pt_regs *regs)
+{
+       if (regs->flags & X86_EFLAGS_IF)
+               local_irq_enable();
+}
+
+static inline void preempt_conditional_sti(struct pt_regs *regs)
+{
+       inc_preempt_count();
+       if (regs->flags & X86_EFLAGS_IF)
+               local_irq_enable();
+}
+
+static inline void conditional_cli(struct pt_regs *regs)
+{
+       if (regs->flags & X86_EFLAGS_IF)
+               local_irq_disable();
+}
+
+static inline void preempt_conditional_cli(struct pt_regs *regs)
+{
+       if (regs->flags & X86_EFLAGS_IF)
+               local_irq_disable();
+       dec_preempt_count();
+}
+
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+       long error_code, siginfo_t *info)
+{
+       struct task_struct *tsk = current;
+
+#ifdef CONFIG_X86_32
+       if (regs->flags & X86_VM_MASK) {
+               /*
+                * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+                * On nmi (interrupt 2), do_trap should not be called.
+                */
+               if (trapnr < X86_TRAP_UD)
+                       goto vm86_trap;
+               goto trap_signal;
+       }
+#endif
+
+       if (!user_mode(regs))
+               goto kernel_trap;
+
+#ifdef CONFIG_X86_32
+trap_signal:
+#endif
+       /*
+        * We want error_code and trap_nr set for userspace faults and
+        * kernelspace faults which result in die(), but not
+        * kernelspace faults which are fixed up.  die() gives the
+        * process no chance to handle the signal and notice the
+        * kernel fault information, so that won't result in polluting
+        * the information about previously queued, but not yet
+        * delivered, faults.  See also do_general_protection below.
+        */
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_nr = trapnr;
+
+#ifdef CONFIG_X86_64
+       if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+           printk_ratelimit()) {
+               printk(KERN_INFO
+                      "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+                      tsk->comm, tsk->pid, str,
+                      regs->ip, regs->sp, error_code);
+               print_vma_addr(" in ", regs->ip);
+               printk("\n");
+       }
+#endif
+
+       if (info)
+               force_sig_info(signr, info, tsk);
+       else
+               force_sig(signr, tsk);
+       return;
+
+kernel_trap:
+       if (!fixup_exception(regs)) {
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_nr = trapnr;
+               die(str, regs, error_code);
+       }
+       return;
+
+#ifdef CONFIG_X86_32
+vm86_trap:
+       if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
+                                               error_code, trapnr))
+               goto trap_signal;
+       return;
+#endif
+}
+
+#define DO_ERROR(trapnr, signr, str, name)                             \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code)    \
+{                                                                      \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)  \
+                                                       == NOTIFY_STOP) \
+               return;                                                 \
+       conditional_sti(regs);                                          \
+       do_trap(trapnr, signr, str, regs, error_code, NULL);            \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)                \
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code)    \
+{                                                                      \
+       siginfo_t info;                                                 \
+       info.si_signo = signr;                                          \
+       info.si_errno = 0;                                              \
+       info.si_code = sicode;                                          \
+       info.si_addr = (void __user *)siaddr;                           \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)  \
+                                                       == NOTIFY_STOP) \
+               return;                                                 \
+       conditional_sti(regs);                                          \
+       do_trap(trapnr, signr, str, regs, error_code, &info);           \
+}
+
+DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
+               regs->ip)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
+               regs->ip)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
+               coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+#ifdef CONFIG_X86_32
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+#endif
+DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
+               BUS_ADRALN, 0)
+
+#ifdef CONFIG_X86_64
+/* Runs on IST stack */
+dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+       if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+                       X86_TRAP_SS, SIGBUS) == NOTIFY_STOP)
+               return;
+       preempt_conditional_sti(regs);
+       do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
+       preempt_conditional_cli(regs);
+}
+
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+{
+       static const char str[] = "double fault";
+       struct task_struct *tsk = current;
+
+       /* Return value not checked because a double fault cannot be ignored */
+       notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_nr = X86_TRAP_DF;
+
+       /*
+        * This is always a kernel trap and never fixable (and thus must
+        * never return).
+        */
+       for (;;)
+               die(str, regs, error_code);
+}
+#endif
+
+dotraplinkage void __kprobes
+do_general_protection(struct pt_regs *regs, long error_code)
+{
+       struct task_struct *tsk;
+
+       conditional_sti(regs);
+
+#ifdef CONFIG_X86_32
+       if (regs->flags & X86_VM_MASK)
+               goto gp_in_vm86;
+#endif
+
+       tsk = current;
+       if (!user_mode(regs))
+               goto gp_in_kernel;
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_nr = X86_TRAP_GP;
+
+       if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+                       printk_ratelimit()) {
+               printk(KERN_INFO
+                       "%s[%d] general protection ip:%lx sp:%lx error:%lx",
+                       tsk->comm, task_pid_nr(tsk),
+                       regs->ip, regs->sp, error_code);
+               print_vma_addr(" in ", regs->ip);
+               printk("\n");
+       }
+
+       force_sig(SIGSEGV, tsk);
+       return;
+
+#ifdef CONFIG_X86_32
+gp_in_vm86:
+       local_irq_enable();
+       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+       return;
+#endif
+
+gp_in_kernel:
+       if (fixup_exception(regs))
+               return;
+
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_nr = X86_TRAP_GP;
+       if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+                       X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
+               return;
+       die("general protection fault", regs, error_code);
+}
+
+/* May run on IST stack. */
+dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
+{
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+       if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+                               SIGTRAP) == NOTIFY_STOP)
+               return;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+
+       if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+                       SIGTRAP) == NOTIFY_STOP)
+               return;
+
+       /*
+        * Let others (NMI) know that the debug stack is in use
+        * as we may switch to the interrupt stack.
+        */
+       debug_stack_usage_inc();
+       preempt_conditional_sti(regs);
+       do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
+       preempt_conditional_cli(regs);
+       debug_stack_usage_dec();
+}
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+/*
+ * Helper for handlers running on an IST stack: choose the pt_regs
+ * location to continue on so that scheduling or signal handling becomes
+ * possible, and copy the register frame there if it has to move. The
+ * actual stack switch is done in entry.S.
+ *
+ * Returns eregs itself when the frame stays where it is, otherwise the
+ * new location, which then holds a copy of *eregs.
+ */
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+       struct pt_regs *regs = eregs;
+       /* Already synced: the saved sp points at this very frame */
+       if (eregs == (struct pt_regs *)eregs->sp)
+               ;
+       /* Exception from user space */
+       else if (user_mode(eregs))
+               regs = task_pt_regs(current);
+       /*
+        * Exception from kernel and interrupts are enabled. Move to
+        * kernel process stack.
+        * Note that this also lowers the saved eregs->sp by
+        * sizeof(struct pt_regs): the copy is placed directly below
+        * the interrupted stack pointer.
+        */
+       else if (eregs->flags & X86_EFLAGS_IF)
+               regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
+       /* Any remaining case (kernel, interrupts off) stays on eregs. */
+       if (eregs != regs)
+               *regs = *eregs;
+       return regs;
+}
+#endif
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ *
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ *
+ * May run on IST stack.
+ */
+dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+{
+       struct task_struct *tsk = current;
+       int user_icebp = 0;
+       unsigned long dr6;
+       int si_code;
+
+       get_debugreg(dr6, 6);
+
+       /* Filter out all the reserved bits which are preset to 1 */
+       dr6 &= ~DR6_RESERVED;
+
+       /*
+        * If dr6 has no reason to give us about the origin of this trap,
+        * then it's very likely the result of an icebp/int01 trap.
+        * User wants a sigtrap for that.
+        */
+       if (!dr6 && user_mode(regs))
+               user_icebp = 1;
+
+       /* Catch kmemcheck conditions first of all! */
+       if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
+               return;
+
+       /* DR6 may or may not be cleared by the CPU */
+       set_debugreg(0, 6);
+
+       /*
+        * The processor cleared BTF, so don't mark that we need it set.
+        */
+       clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
+       /* Store the virtualized DR6 value */
+       tsk->thread.debugreg6 = dr6;
+
+       /*
+        * Listeners are handed the address of the local dr6 through the
+        * 'err' argument; PTR_ERR() serves only as a pointer-to-long
+        * conversion here, no error code is involved.
+        */
+       if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
+                                                       SIGTRAP) == NOTIFY_STOP)
+               return;
+
+       /*
+        * Let others (NMI) know that the debug stack is in use
+        * as we may switch to the interrupt stack.
+        */
+       debug_stack_usage_inc();
+
+       /* It's safe to allow irq's after DR6 has been saved */
+       preempt_conditional_sti(regs);
+
+       /*
+        * Virtual-8086 mode: let the vm86 code handle the trap, then
+        * undo the irq and debug-stack state taken above before leaving.
+        */
+       if (regs->flags & X86_VM_MASK) {
+               handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
+                                       X86_TRAP_DB);
+               preempt_conditional_cli(regs);
+               debug_stack_usage_dec();
+               return;
+       }
+
+       /*
+        * Single-stepping through system calls: ignore any exceptions in
+        * kernel space, but re-enable TF when returning to user mode.
+        *
+        * We already checked v86 mode above, so we can check for kernel mode
+        * by just checking the CPL of CS.
+        */
+       if ((dr6 & DR_STEP) && !user_mode(regs)) {
+               tsk->thread.debugreg6 &= ~DR_STEP;
+               set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+               regs->flags &= ~X86_EFLAGS_TF;
+       }
+       /*
+        * A SIGTRAP is sent only if a step/breakpoint bit is still set in
+        * the virtualized DR6, or for the user-mode icebp case noted above.
+        */
+       si_code = get_si_code(tsk->thread.debugreg6);
+       if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
+               send_sigtrap(tsk, regs, error_code, si_code);
+       preempt_conditional_cli(regs);
+       debug_stack_usage_dec();
+
+       return;
+}
+
+/*
+ * Common handler for x87 (X86_TRAP_MF) and SIMD (any other trapnr, in
+ * practice X86_TRAP_XF) floating point exceptions.
+ *
+ * For a fault taken in kernel mode, try an exception-table fixup and
+ * die() if there is none. For a fault taken in user mode, save the FPU
+ * state, work out which unmasked exception is pending and force a
+ * SIGFPE with the matching si_code; if no unmasked exception bit is
+ * found the trap is treated as spurious and silently ignored.
+ *
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
+{
+       struct task_struct *task = current;
+       siginfo_t info;
+       unsigned short err;
+       char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
+                                               "simd exception";
+
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+               return;
+       conditional_sti(regs);
+
+       /* Kernel-mode fault: fix up via the exception table or die. */
+       if (!user_mode_vm(regs))
+       {
+               if (!fixup_exception(regs)) {
+                       task->thread.error_code = error_code;
+                       task->thread.trap_nr = trapnr;
+                       die(str, regs, error_code);
+               }
+               return;
+       }
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       save_init_fpu(task);
+       task->thread.trap_nr = trapnr;
+       task->thread.error_code = error_code;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_addr = (void __user *)regs->ip;
+       if (trapnr == X86_TRAP_MF) {
+               unsigned short cwd, swd;
+               /*
+                * (~cwd & swd) will mask out exceptions that are not set to unmasked
+                * status.  0x3f is the exception bits in these regs, 0x200 is the
+                * C1 reg you need in case of a stack fault, 0x040 is the stack
+                * fault bit.  We should only be taking one exception at a time,
+                * so if this combination doesn't produce any single exception,
+                * then we have a bad program that isn't synchronizing its FPU usage
+                * and it will suffer the consequences since we won't be able to
+                * fully reproduce the context of the exception
+                */
+               cwd = get_fpu_cwd(task);
+               swd = get_fpu_swd(task);
+
+               err = swd & ~cwd;
+       } else {
+               /*
+                * The SIMD FPU exceptions are handled a little differently, as there
+                * is only a single status/control register.  Thus, to determine which
+                * unmasked exception was caught we must mask the exception mask bits
+                * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+                */
+               unsigned short mxcsr = get_fpu_mxcsr(task);
+               err = ~(mxcsr >> 7) & mxcsr;
+       }
+
+       /*
+        * Map the pending exception bit to an si_code. The tests are
+        * ordered, so the first matching bit wins if several are set.
+        */
+       if (err & 0x001) {      /* Invalid op */
+               /*
+                * swd & 0x240 == 0x040: Stack Underflow
+                * swd & 0x240 == 0x240: Stack Overflow
+                * User must clear the SF bit (0x40) if set
+                */
+               info.si_code = FPE_FLTINV;
+       } else if (err & 0x004) { /* Divide by Zero */
+               info.si_code = FPE_FLTDIV;
+       } else if (err & 0x008) { /* Overflow */
+               info.si_code = FPE_FLTOVF;
+       } else if (err & 0x012) { /* Denormal, Underflow */
+               info.si_code = FPE_FLTUND;
+       } else if (err & 0x020) { /* Precision */
+               info.si_code = FPE_FLTRES;
+       } else {
+               /*
+                * If we're using IRQ 13, or supposedly even some trap
+                * X86_TRAP_MF implementations, it's possible
+                * we get a spurious trap, which is not an error.
+                */
+               return;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+/*
+ * x87 floating point error (X86_TRAP_MF) entry point; the real work is
+ * done by math_error().
+ */
+dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+#ifdef CONFIG_X86_32
+       /*
+        * NOTE(review): presumably tells the legacy IRQ13 FPU interrupt
+        * path to stand down once the exception vector is seen to work;
+        * confirm against the users of ignore_fpu_irq.
+        */
+       ignore_fpu_irq = 1;
+#endif
+
+       math_error(regs, error_code, X86_TRAP_MF);
+}
+
+/*
+ * SIMD floating point error (X86_TRAP_XF) entry point; the real work is
+ * done by math_error().
+ */
+dotraplinkage void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+{
+       math_error(regs, error_code, X86_TRAP_XF);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Handler for the "P6 Local APIC Spurious Interrupt Bug" trap (see the
+ * disabled message below). Apart from conditionally re-enabling
+ * interrupts it deliberately does nothing.
+ */
+dotraplinkage void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+{
+       conditional_sti(regs);
+#if 0
+       /* No need to warn about this any longer. */
+       printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+#endif
+}
+
+/* Weak, empty default; a strong definition elsewhere takes precedence. */
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+/* Weak, empty default; a strong definition elsewhere takes precedence. */
+asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+{
+}
+#endif /* CONFIG_XEN */
+
+/*
+ * 'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (e.g. with local
+ * interrupts disabled, as in the case of do_device_not_available).
+ *
+ * If the task has not used math before, its FPU state is first set up
+ * via init_fpu(); interrupts are enabled around that call and disabled
+ * again afterwards. If init_fpu() fails the whole thread group is
+ * killed; if restoring the state fails the task gets SIGSEGV.
+ */
+void math_state_restore(void)
+{
+       struct task_struct *tsk = current;
+
+       if (!tsk_used_math(tsk)) {
+               local_irq_enable();
+               /*
+                * does a slab alloc which can sleep
+                */
+               if (init_fpu(tsk)) {
+                       /*
+                        * ran out of memory!
+                        */
+                       do_group_exit(SIGKILL);
+                       return;
+               }
+               local_irq_disable();
+       }
+
+       /* Xen-specific replacement for the native FPU "begin" step. */
+       xen_thread_fpu_begin(tsk, NULL);
+       /*
+        * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+        */
+       if (unlikely(restore_fpu_checking(tsk))) {
+               __thread_fpu_end(tsk);
+               force_sig(SIGSEGV, tsk);
+               return;
+       }
+
+       /* Count successful restores for this task. */
+       tsk->fpu_counter++;
+}
+EXPORT_SYMBOL_GPL(math_state_restore);
+
+/*
+ * Device-not-available trap handler.
+ *
+ * With CONFIG_MATH_EMULATION and CR0.EM set, the faulting instruction
+ * is handed to math_emulate(). Otherwise the task's FPU state is
+ * brought in through math_state_restore(), which is entered with
+ * interrupts still disabled.
+ */
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error_code)
+{
+#ifdef CONFIG_MATH_EMULATION
+       if (read_cr0() & X86_CR0_EM) {
+               struct math_emu_info info = { };
+
+               conditional_sti(regs);
+
+               info.regs = regs;
+               math_emulate(&info);
+               return;
+       }
+#endif
+       math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+       /* 32-bit only: re-enable interrupts here if the trapped context had them on. */
+       conditional_sti(regs);
+#endif
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * 32-bit only: handler for X86_TRAP_IRET. Unless a die-chain listener
+ * claims the event (NOTIFY_STOP), the trap is passed to do_trap() as
+ * SIGILL with si_code ILL_BADSTK and a NULL fault address.
+ */
+dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+{
+       siginfo_t info;
+       local_irq_enable();
+
+       info.si_signo = SIGILL;
+       info.si_errno = 0;
+       info.si_code = ILL_BADSTK;
+       info.si_addr = NULL;
+       if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
+                       X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
+               return;
+       do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+               &info);
+}
+#endif
+
+/*
+ * Trap tables handed to Xen through HYPERVISOR_set_trap_table().
+ *
+ * Each entry is { vector, flags, code selector, handler address }; an
+ * all-zero entry terminates the table.
+ *
+ * NB. All these are "trap gates" (i.e. events_mask isn't set) except
+ * for those that specify <dpl>|4 in the second field.
+ *
+ * X is a helper local to these tables: it supplies that "|4" bit for
+ * the bulk of the entries, clear on 32-bit and set on 64-bit. It is
+ * undefined again right after the tables so that the one-letter name
+ * does not leak into the rest of the file.
+ */
+#ifdef CONFIG_X86_32
+#define X 0
+#else
+#define X 4
+#endif
+/* Minimal set installed by early_trap_init(). */
+static const trap_info_t __initconst early_trap_table[] = {
+       { X86_TRAP_DB, 0|4, __KERNEL_CS, (unsigned long)debug                   },
+       { X86_TRAP_BP, 3|4, __KERNEL_CS, (unsigned long)int3                    },
+       { X86_TRAP_PF, 0|4, __KERNEL_CS, (unsigned long)page_fault              },
+       { }
+};
+/* Full set installed by trap_init() and copied by smp_trap_init(). */
+static const trap_info_t __cpuinitconst trap_table[] = {
+       { X86_TRAP_DE, 0|X, __KERNEL_CS, (unsigned long)divide_error            },
+       { X86_TRAP_DB, 0|4, __KERNEL_CS, (unsigned long)debug                   },
+       { X86_TRAP_BP, 3|4, __KERNEL_CS, (unsigned long)int3                    },
+       { X86_TRAP_OF, 3|X, __KERNEL_CS, (unsigned long)overflow                },
+       { X86_TRAP_BR, 0|X, __KERNEL_CS, (unsigned long)bounds                  },
+       { X86_TRAP_UD, 0|X, __KERNEL_CS, (unsigned long)invalid_op              },
+       { X86_TRAP_NM, 0|4, __KERNEL_CS, (unsigned long)device_not_available    },
+       { X86_TRAP_OLD_MF, 0|X, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+       { X86_TRAP_TS, 0|X, __KERNEL_CS, (unsigned long)invalid_TSS             },
+       { X86_TRAP_NP, 0|X, __KERNEL_CS, (unsigned long)segment_not_present     },
+       { X86_TRAP_SS, 0|X, __KERNEL_CS, (unsigned long)stack_segment           },
+       { X86_TRAP_GP, 0|X, __KERNEL_CS, (unsigned long)general_protection      },
+       { X86_TRAP_PF, 0|4, __KERNEL_CS, (unsigned long)page_fault              },
+       { X86_TRAP_MF, 0|X, __KERNEL_CS, (unsigned long)coprocessor_error       },
+       { X86_TRAP_AC, 0|X, __KERNEL_CS, (unsigned long)alignment_check         },
+#ifdef CONFIG_X86_MCE
+       { X86_TRAP_MC, 0|X, __KERNEL_CS, (unsigned long)machine_check           },
+#endif
+       { X86_TRAP_XF, 0|X, __KERNEL_CS, (unsigned long)simd_coprocessor_error  },
+#ifdef CONFIG_X86_32
+       { X86_TRAP_SPURIOUS, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment   },
+       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
+#elif defined(CONFIG_IA32_EMULATION)
+       { IA32_SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)ia32_syscall },
+#endif
+       { }
+};
+#undef X
+
+/*
+ * Set of traps needed for early debugging.
+ *
+ * Registers early_trap_table with the hypervisor. A failure is only
+ * reported, at error level, and not propagated, since the function
+ * returns void.
+ */
+void __init early_trap_init(void)
+{
+       int ret;
+
+       ret = HYPERVISOR_set_trap_table(early_trap_table);
+       if (ret)
+               printk(KERN_ERR "early set_trap_table failed (%d)\n", ret);
+}
+
+/*
+ * Register the full trap_table with the hypervisor, then run the
+ * per-CPU initialisation and the platform trap_init hook. A failure of
+ * the hypercall is only reported, at error level; initialisation
+ * continues regardless.
+ */
+void __init trap_init(void)
+{
+       int ret;
+
+       ret = HYPERVISOR_set_trap_table(trap_table);
+       if (ret)
+               printk(KERN_ERR "HYPERVISOR_set_trap_table failed (%d)\n", ret);
+
+       /*
+        * Should be a barrier for any external CPU state:
+        */
+       cpu_init();
+
+       x86_init.irqs.trap_init();
+}
+
+/*
+ * Fill a secondary CPU's trap context from trap_table: every table
+ * entry is copied into the slot indexed by its vector, then the NMI
+ * slot is set up by hand (IF flag set, kernel code selector, nmi entry).
+ */
+void __cpuinit smp_trap_init(trap_info_t *trap_ctxt)
+{
+       const trap_info_t *src;
+       trap_info_t *nmi_slot = trap_ctxt + NMI_VECTOR;
+
+       /* The table ends at the first entry without a handler address. */
+       for (src = trap_table; src->address; src++) {
+               trap_info_t *dst = &trap_ctxt[src->vector];
+
+               dst->flags = src->flags;
+               dst->cs = src->cs;
+               dst->address = src->address;
+       }
+
+       TI_SET_IF(nmi_slot, 1);
+       nmi_slot->cs = __KERNEL_CS;
+       nmi_slot->address = (unsigned long)nmi;
+}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c

index 255f58a..a89b4a3 100644 (file)
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -125,7 +125,9 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
  
  struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
  {
+#ifndef CONFIG_X86_NO_TSS
         struct tss_struct *tss;
+#endif
         struct pt_regs *ret;
         unsigned long tmp;
  
@@ -148,12 +150,16 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
                 do_exit(SIGSEGV);
         }
  
+#ifndef CONFIG_X86_NO_TSS
         tss = &per_cpu(init_tss, get_cpu());
+#endif
         current->thread.sp0 = current->thread.saved_sp0;
         current->thread.sysenter_cs = __KERNEL_CS;
         load_sp0(tss, &current->thread);
         current->thread.saved_sp0 = 0;
+#ifndef CONFIG_X86_NO_TSS
         put_cpu();
+#endif
  
         ret = KVM86->regs32;
  
@@ -282,7 +288,9 @@ out:
  
  static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
  {
+#ifndef CONFIG_X86_NO_TSS
         struct tss_struct *tss;
+#endif
  /*
   * make sure the vm86() system call doesn't try to do anything silly
   */
@@ -326,12 +334,16 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
         tsk->thread.saved_fs = info->regs32->fs;
         tsk->thread.saved_gs = get_user_gs(info->regs32);
  
+#ifndef CONFIG_X86_NO_TSS
         tss = &per_cpu(init_tss, get_cpu());
+#endif
         tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
         if (cpu_has_sep)
                 tsk->thread.sysenter_cs = 0;
         load_sp0(tss, &tsk->thread);
+#ifndef CONFIG_X86_NO_TSS
         put_cpu();
+#endif
  
         tsk->thread.screen_bitmap = info->screen_bitmap;
         if (info->flags & VM86_SCREEN_BITMAP)
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index 0f703f1..69c4a2c 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -16,8 +16,10 @@
  
  #ifdef CONFIG_X86_32
  #define LOAD_OFFSET __PAGE_OFFSET
-#else
+#elif !defined(CONFIG_XEN) || CONFIG_XEN_COMPAT > 0x030002
  #define LOAD_OFFSET __START_KERNEL_map
+#else
+#define LOAD_OFFSET 0
  #endif
  
  #include <asm-generic/vmlinux.lds.h>
@@ -41,7 +43,7 @@ ENTRY(phys_startup_64)
  jiffies_64 = jiffies;
  #endif
  
-#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN)
  /*
   * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
   * we retain large page mappings for boundaries spanning kernel text, rodata
@@ -83,6 +85,10 @@ SECTIONS
  {
  #ifdef CONFIG_X86_32
          . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
+#undef LOAD_OFFSET
+#define LOAD_OFFSET 0
+#endif
          phys_startup_32 = startup_32 - LOAD_OFFSET;
  #else
          . = __START_KERNEL;
@@ -335,7 +341,9 @@ SECTIONS
  
         /* Sections to be discarded */
         DISCARDS
+#ifndef CONFIG_UNWIND_INFO
         /DISCARD/ : { *(.eh_frame) }
+#endif
  }
  
  
diff --git a/arch/x86/kernel/vsyscall_64-xen.c b/arch/x86/kernel/vsyscall_64-xen.c

new file mode 100644 (file)

index 0000000..ca85c69
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_64-xen.c
@@ -0,0 +1,362 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
+ *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
+ *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
+ *  jumping out of line if necessary. We cannot add more with this
+ *  mechanism because older kernels won't return -ENOSYS.
+ *
+ *  Note: the concept clashes with user mode linux.  UML users should
+ *  use the vDSO.
+ */
+
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/seqlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysctl.h>
+#include <linux/topology.h>
+#include <linux/clocksource.h>
+#include <linux/getcpu.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
+
+#include <asm/vsyscall.h>
+#include <asm/pgtable.h>
+#include <asm/compat.h>
+#include <asm/page.h>
+#include <asm/unistd.h>
+#include <asm/fixmap.h>
+#include <asm/errno.h>
+#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
+#include <asm/vgtod.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include "vsyscall_trace.h"
+
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+
+/*
+ * Parse the "vsyscall=" early parameter. Accepted values are
+ * "emulate", "native" and "none"; a missing or unknown value yields
+ * -EINVAL and leaves vsyscall_mode untouched.
+ */
+static int __init vsyscall_setup(char *str)
+{
+       if (!str)
+               return -EINVAL;
+
+       if (strcmp("emulate", str) == 0)
+               vsyscall_mode = EMULATE;
+       else if (strcmp("native", str) == 0)
+               vsyscall_mode = NATIVE;
+       else if (strcmp("none", str) == 0)
+               vsyscall_mode = NONE;
+       else
+               return -EINVAL;
+
+       return 0;
+}
+early_param("vsyscall", vsyscall_setup);
+
+/* Copy the kernel's current timezone (sys_tz) into the vsyscall gtod data. */
+void update_vsyscall_tz(void)
+{
+       vsyscall_gtod_data.sys_tz = sys_tz;
+}
+
+/*
+ * Refresh the time data exported through vsyscall_gtod_data from the
+ * given wall time, wall-to-monotonic offset (wtm), clocksource and
+ * multiplier. All stores happen inside the write side of
+ * vsyscall_gtod_data.seq.
+ */
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+                       struct clocksource *clock, u32 mult)
+{
+       struct timespec monotonic;
+
+       write_seqcount_begin(&vsyscall_gtod_data.seq);
+
+       /* copy vsyscall data */
+#ifndef CONFIG_XEN
+       /* vclock_mode is not copied in Xen builds. */
+       vsyscall_gtod_data.clock.vclock_mode    = clock->archdata.vclock_mode;
+#endif
+       vsyscall_gtod_data.clock.cycle_last     = clock->cycle_last;
+       vsyscall_gtod_data.clock.mask           = clock->mask;
+       vsyscall_gtod_data.clock.mult           = mult;
+       vsyscall_gtod_data.clock.shift          = clock->shift;
+
+       vsyscall_gtod_data.wall_time_sec        = wall_time->tv_sec;
+       vsyscall_gtod_data.wall_time_nsec       = wall_time->tv_nsec;
+
+       /* Monotonic time is wall time plus the wall-to-monotonic offset. */
+       monotonic = timespec_add(*wall_time, *wtm);
+       vsyscall_gtod_data.monotonic_time_sec   = monotonic.tv_sec;
+       vsyscall_gtod_data.monotonic_time_nsec  = monotonic.tv_nsec;
+
+       /* The coarse clocks are derived from __current_kernel_time(). */
+       vsyscall_gtod_data.wall_time_coarse     = __current_kernel_time();
+       vsyscall_gtod_data.monotonic_time_coarse =
+               timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
+
+       write_seqcount_end(&vsyscall_gtod_data.seq);
+}
+
+/*
+ * Emit a rate-limited log line describing a rejected or suspicious
+ * vsyscall, prefixed with the given printk level. Nothing is printed
+ * (and the rate limiter is not consulted) when show_unhandled_signals
+ * is clear.
+ */
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+                             const char *message)
+{
+       static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+       struct task_struct *me = current;
+
+       if (!show_unhandled_signals)
+               return;
+       if (!__ratelimit(&rs))
+               return;
+
+       printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+              level, me->comm, task_pid_nr(me),
+              message, regs->ip, regs->cs,
+              regs->sp, regs->ax, regs->si, regs->di);
+}
+
+/*
+ * Translate a faulting address into a vsyscall slot number.
+ *
+ * Only bits 10-11 may differ from VSYSCALL_START, i.e. the address
+ * must be the exact start of a 1024-byte slot. Slots 0-2 are valid;
+ * anything else yields -EINVAL.
+ */
+static int addr_to_vsyscall_nr(unsigned long addr)
+{
+       const unsigned long slot_mask = 0xC00UL;
+       int slot;
+
+       if ((addr & ~slot_mask) != VSYSCALL_START)
+               return -EINVAL;
+
+       slot = (addr & slot_mask) >> 10;
+
+       return slot < 3 ? slot : -EINVAL;
+}
+
+/*
+ * Check that the user range starting at ptr passes access_ok() for a
+ * write of the given size. On failure, record a page-fault-like error
+ * in the thread struct, force a SIGSEGV (SEGV_MAPERR at ptr) on the
+ * current task and return false; return true otherwise.
+ */
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+       struct thread_struct *thread;
+       siginfo_t info;
+
+       /*
+        * XXX: if access_ok, get_user, and put_user handled
+        * sig_on_uaccess_error, this could go away.
+        */
+
+       if (access_ok(VERIFY_WRITE, (void __user *)ptr, size))
+               return true;
+
+       thread = &current->thread;
+       thread->error_code      = 6;  /* user fault, no page, write */
+       thread->cr2             = ptr;
+       thread->trap_nr         = X86_TRAP_PF;
+
+       memset(&info, 0, sizeof(info));
+       info.si_signo           = SIGSEGV;
+       info.si_errno           = 0;
+       info.si_code            = SEGV_MAPERR;
+       info.si_addr            = (void __user *)ptr;
+
+       force_sig_info(SIGSEGV, &info, current);
+       return false;
+}
+
+/*
+ * Emulate a call into the legacy vsyscall page.
+ *
+ * @regs:    user register state at the time of the trap
+ * @address: faulting address, expected to equal regs->ip
+ *
+ * Returns false only when vsyscall=none is in effect; in every other
+ * case the event is considered handled and true is returned, whether
+ * the call was emulated or a signal was raised instead.
+ *
+ * On success the result is placed in regs->ax and a "ret" is emulated
+ * by loading regs->ip from the user stack and popping 8 bytes.
+ */
+bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+{
+       struct task_struct *tsk;
+       unsigned long caller;
+       int vsyscall_nr;
+       int prev_sig_on_uaccess_error;
+       long ret;
+
+       /*
+        * No point in checking CS -- the only way to get here is a user mode
+        * trap to a high address, which means that we're in 64-bit user code.
+        */
+
+       WARN_ON_ONCE(address != regs->ip);
+
+       if (vsyscall_mode == NONE) {
+               warn_bad_vsyscall(KERN_INFO, regs,
+                                 "vsyscall attempted with vsyscall=none");
+               return false;
+       }
+
+       vsyscall_nr = addr_to_vsyscall_nr(address);
+
+       /* Traced before validation, so negative numbers show up too. */
+       trace_emulate_vsyscall(vsyscall_nr);
+
+       if (vsyscall_nr < 0) {
+               warn_bad_vsyscall(KERN_WARNING, regs,
+                                 "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround");
+               goto sigsegv;
+       }
+
+       /* Fetch the return address the emulated "ret" will jump to. */
+       if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+               warn_bad_vsyscall(KERN_WARNING, regs,
+                                 "vsyscall with bad stack (exploit attempt?)");
+               goto sigsegv;
+       }
+
+       /* Tasks with any seccomp mode active are killed outright. */
+       tsk = current;
+       if (seccomp_mode(&tsk->seccomp))
+               do_exit(SIGKILL);
+
+       /*
+        * With a real vsyscall, page faults cause SIGSEGV.  We want to
+        * preserve that behavior to make writing exploits harder.
+        */
+       prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+       current_thread_info()->sig_on_uaccess_error = 1;
+
+       /*
+        * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
+        * 64-bit, so we don't need to special-case it here.  For all the
+        * vsyscalls, NULL means "don't write anything" not "write it at
+        * address 0".
+        */
+       ret = -EFAULT;
+       switch (vsyscall_nr) {
+       case 0:
+               if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+                   !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+                       break;
+
+               ret = sys_gettimeofday(
+                       (struct timeval __user *)regs->di,
+                       (struct timezone __user *)regs->si);
+               break;
+
+       case 1:
+               if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+                       break;
+
+               ret = sys_time((time_t __user *)regs->di);
+               break;
+
+       case 2:
+               if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+                   !write_ok_or_segv(regs->si, sizeof(unsigned)))
+                       break;
+
+               ret = sys_getcpu((unsigned __user *)regs->di,
+                                (unsigned __user *)regs->si,
+                                NULL);
+               break;
+       default:
+               ret = -ENOSYS;
+               break;
+       }
+
+       /* Restore the caller's uaccess-error policy saved above. */
+       current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
+       if (ret == -EFAULT) {
+               /* Bad news -- userspace fed a bad pointer to a vsyscall. */
+               warn_bad_vsyscall(KERN_INFO, regs,
+                                 "vsyscall fault (exploit attempt?)");
+
+               /*
+                * If we failed to generate a signal for any reason,
+                * generate one here.  (This should be impossible.)
+                */
+               if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+                                !sigismember(&tsk->pending.signal, SIGSEGV)))
+                       goto sigsegv;
+
+               return true;  /* Don't emulate the ret. */
+       }
+
+       regs->ax = ret;
+
+       /* Emulate a ret instruction. */
+       regs->ip = caller;
+       regs->sp += 8;
+
+       return true;
+
+sigsegv:
+       force_sig(SIGSEGV, current);
+       return true;
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ *
+ * vsyscall_set_cpu() publishes the CPU and node number of @cpu in two
+ * places: the RDTSCP auxiliary value (only when the CPU has the RDTSCP
+ * feature) and the GDT_ENTRY_PER_CPU descriptor in that CPU's GDT.
+ */
+static void __cpuinit vsyscall_set_cpu(int cpu)
+{
+       unsigned long d;
+       unsigned long node = 0;
+#ifdef CONFIG_NUMA
+       node = cpu_to_node(cpu);
+#endif
+       if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
+               write_rdtscp_aux((node << 12) | cpu);
+
+       /*
+        * Store cpu number in limit so that it can be loaded quickly
+        * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+        *
+        * The cpu goes into the low bits, the low 4 node bits into
+        * bits 12-15 and the remaining node bits start at bit 48.
+        */
+       d = 0x0f40000000000ULL;
+       d |= cpu;
+       d |= (node & 0xf) << 12;
+       d |= (node >> 4) << 48;
+
+       write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+}
+
+/*
+ * Cross-call callback (see on_each_cpu() / smp_call_function_single()
+ * users in this file): set up the vsyscall CPU data for whichever CPU
+ * it runs on. @arg is unused.
+ */
+static void __cpuinit cpu_vsyscall_init(void *arg)
+{
+       /* preemption should be already off */
+       vsyscall_set_cpu(raw_smp_processor_id());
+}
+
+/*
+ * CPU hotplug callback: once a CPU has come online (normally or while
+ * frozen), run cpu_vsyscall_init() on it and wait for completion. All
+ * other actions are ignored. Always returns NOTIFY_DONE.
+ */
+static int __cpuinit
+cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
+{
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               /* The notifier argument carries the CPU number. */
+               smp_call_function_single((long)arg, cpu_vsyscall_init, NULL, 1);
+               break;
+       default:
+               break;
+       }
+
+       return NOTIFY_DONE;
+}
+
+/*
+ * Install the fixmap entries for the vsyscall page and the vvar page.
+ *
+ * The vsyscall page gets PAGE_KERNEL_VSYSCALL protection only in
+ * NATIVE mode; in every other mode it is mapped with PAGE_KERNEL_VVAR,
+ * like the vvar page itself. The BUILD_BUG_ON()s check at compile time
+ * that the fixmap slots match VSYSCALL_START and VVAR_ADDRESS.
+ */
+void __init map_vsyscall(void)
+{
+       extern char __vsyscall_page;
+       unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
+       extern char __vvar_page;
+       unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
+
+       __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+                    vsyscall_mode == NATIVE
+                    ? PAGE_KERNEL_VSYSCALL
+                    : PAGE_KERNEL_VVAR);
+       BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
+                    (unsigned long)VSYSCALL_START);
+
+       __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+       BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
+                    (unsigned long)VVAR_ADDRESS);
+}
+
+/*
+ * Initcall: verify the vsyscall address layout, set up the per-CPU
+ * vsyscall data on every CPU that is currently online and register
+ * the hotplug notifier that does the same for CPUs coming up later.
+ * Always returns 0.
+ */
+static int __init vsyscall_init(void)
+{
+       BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
+       on_each_cpu(cpu_vsyscall_init, NULL, 1);
+       /* notifier priority > KVM */
+       hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+       return 0;
+}
+__initcall(vsyscall_init);
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c

index 9796c2f..4d10890 100644 (file)
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -55,6 +55,6 @@ EXPORT_SYMBOL(__memcpy);
  EXPORT_SYMBOL(memmove);
  
  EXPORT_SYMBOL(empty_zero_page);
-#ifndef CONFIG_PARAVIRT
+#if !defined(CONFIG_PARAVIRT) && !defined(CONFIG_XEN)
  EXPORT_SYMBOL(native_load_gs_index);
  #endif
diff --git a/arch/x86/kernel/x86_init-xen.c b/arch/x86/kernel/x86_init-xen.c

new file mode 100644 (file)

index 0000000..94cce78
--- /dev/null
+++ b/arch/x86/kernel/x86_init-xen.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#include <linux/bitmap.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/spinlock_types.h>
+#include <linux/threads.h>
+
+#include <asm/pci_x86.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/pat.h>
+#include <asm/iommu.h>
+#include <asm/mach_traps.h>
+
+/*
+ * Do-nothing implementations, one per hook signature, used to fill
+ * x86_init / x86_platform slots that need no work on this platform.
+ */
+void __cpuinit x86_init_noop(void) { }
+void __init x86_init_uint_noop(unsigned int unused) { }
+void __init x86_init_pgd_noop(pgd_t *unused) { }
+int __init iommu_init_noop(void) { return 0; }  /* reports success */
+void wallclock_init_noop(void) { }
+
+/*
+ * The platform setup functions are preset with the default functions
+ * for standard PC hardware.
+ *
+ * In this Xen variant a number of hooks deviate from those defaults:
+ * several are no-ops or NULL, and the arch_setup and pagetable_reserve
+ * hooks point at xen_* implementations (see the individual entries).
+ */
+struct x86_init_ops x86_init __initdata = {
+
+       .resources = {
+               /* ROMs are probed only by a privileged guest. */
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+               .probe_roms             = probe_roms,
+#else
+               .probe_roms             = x86_init_noop,
+#endif
+               .reserve_resources      = reserve_standard_io_resources,
+               .memory_setup           = default_machine_specific_memory_setup,
+       },
+
+       .mpparse = {
+               .mpc_record             = x86_init_uint_noop,
+               .setup_ioapic_ids       = x86_init_noop,
+               .mpc_apic_id            = NULL,
+               .smp_read_mpc_oem       = default_smp_read_mpc_oem,
+               .mpc_oem_bus_info       = default_mpc_oem_bus_info,
+               .find_smp_config        = default_find_smp_config,
+               .get_smp_config         = default_get_smp_config,
+       },
+
+       .irqs = {
+               .pre_vector_init        = NULL,
+               .intr_init              = NULL,
+               /* Called at the end of trap_init(); nothing to do here. */
+               .trap_init              = x86_init_noop,
+       },
+
+       .oem = {
+               .arch_setup             = xen_arch_setup,
+               .banner                 = x86_init_noop,
+       },
+
+       .mapping = {
+               .pagetable_reserve              = xen_pagetable_reserve,
+       },
+
+       .paging = {
+               .pagetable_setup_start  = x86_init_pgd_noop,
+               .pagetable_setup_done   = x86_init_pgd_noop,
+       },
+
+       .timers = {
+               .setup_percpu_clockev   = NULL,
+               .tsc_pre_init           = x86_init_noop,
+               .timer_init             = x86_init_noop,
+               .wallclock_init         = x86_init_noop,
+       },
+
+       .iommu = {
+               .iommu_init             = iommu_init_noop,
+       },
+
+       .pci = {
+               .init                   = x86_default_pci_init,
+               .init_irq               = x86_default_pci_init_irq,
+               .fixup_irqs             = x86_default_pci_fixup_irqs,
+       },
+};
+
+/* Default i8042_detect hook: always reports (returns 1) that a controller may be present. */
+static int default_i8042_detect(void) { return 1; }
+
+/*
+ * Runtime platform hooks for the Xen variant: wallclock access and NMI
+ * reason lookup go through xen_* helpers, no TSC calibration hook is
+ * provided, and wallclock initialisation is a no-op. Exported
+ * (GPL-only) for use by modules.
+ */
+struct x86_platform_ops x86_platform = {
+       .calibrate_tsc                  = NULL,
+       .wallclock_init                 = wallclock_init_noop,
+       .get_wallclock                  = xen_read_wallclock,
+       .set_wallclock                  = xen_write_wallclock,
+       .is_untracked_pat_range         = is_ISA_range,
+       .get_nmi_reason                 = xen_get_nmi_reason,
+       .i8042_detect                   = default_i8042_detect
+};
+
+EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig

index 1a7fe86..18419a2 100644 (file)
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -7,6 +7,7 @@ source "virt/kvm/Kconfig"
  menuconfig VIRTUALIZATION
         bool "Virtualization"
         depends on HAVE_KVM || X86
+       depends on !XEN
         default y
         ---help---
           Say Y here to get to see options for using your Linux host to run other
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c

index 9fed5be..7e94adb 100644 (file)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -224,7 +224,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
         /* cpuid 1.ecx */
         const u32 kvm_supported_word4_x86_features =
-               F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+               F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64 */ | F(MWAIT) |
                 0 /* DS-CPL, VMX, SMX, EST */ |
                 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
                 F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c

index e334389..5d80d19 100644 (file)
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2760,6 +2760,22 @@ static int xsetbv_interception(struct vcpu_svm *svm)
         return 1;
  }
  
+/*
+ * MONITOR intercept handler: step over the instruction (taken to be
+ * three bytes long) and return 1; nothing else is emulated.
+ */
+static int monitor_interception(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+
+       svm->next_rip = kvm_rip_read(vcpu) + 3;
+       skip_emulated_instruction(vcpu);
+
+       return 1;
+}
+
+/*
+ * MWAIT intercept handler: step over the instruction (taken to be
+ * three bytes long), then hand off to kvm_emulate_halt() and return
+ * its result.
+ */
+static int mwait_interception(struct vcpu_svm *svm)
+{
+       struct kvm_vcpu *vcpu = &svm->vcpu;
+
+       svm->next_rip = kvm_rip_read(vcpu) + 3;
+       skip_emulated_instruction(vcpu);
+
+       return kvm_emulate_halt(vcpu);
+}
+
  static int invalid_op_interception(struct vcpu_svm *svm)
  {
         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -3318,8 +3334,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
         [SVM_EXIT_CLGI]                         = clgi_interception,
         [SVM_EXIT_SKINIT]                       = skinit_interception,
         [SVM_EXIT_WBINVD]                       = emulate_on_interception,
-       [SVM_EXIT_MONITOR]                      = invalid_op_interception,
-       [SVM_EXIT_MWAIT]                        = invalid_op_interception,
+       [SVM_EXIT_MONITOR]                      = monitor_interception,
+       [SVM_EXIT_MWAIT]                        = mwait_interception,
         [SVM_EXIT_XSETBV]                       = xsetbv_interception,
         [SVM_EXIT_NPF]                          = pf_interception,
  };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 185a2b8..7e0f8e1 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1582,6 +1582,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
         case MSR_VM_HSAVE_PA:
         case MSR_AMD64_PATCH_LOADER:
                 break;
+       case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
         case 0x200 ... 0x2ff:
                 return set_msr_mtrr(vcpu, msr, data);
         case MSR_IA32_APICBASE:
@@ -1904,6 +1905,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
         case MSR_K8_INT_PENDING_MSG:
         case MSR_AMD64_NB_CFG:
         case MSR_FAM10H_MMIO_CONF_BASE:
+       case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
                 data = 0;
                 break;
         case MSR_P6_PERFCTR0:
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile

index b00f678..88b35c4 100644 (file)
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -15,6 +15,7 @@ $(obj)/inat.o: $(obj)/inat-tables.c
  clean-files := inat-tables.c
  
  obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
+obj-$(CONFIG_XEN) += cache-smp.o
  
  lib-y := delay.o
  lib-y += thunk_$(BITS).o
@@ -45,3 +46,5 @@ else
          lib-y += copy_user_64.o copy_user_nocache_64.o
         lib-y += cmpxchg16b_emu.o
  endif
+
+lib-$(CONFIG_XEN_SCRUB_PAGES) += scrub.o
diff --git a/arch/x86/lib/cache-smp-xen.c b/arch/x86/lib/cache-smp-xen.c

new file mode 100644 (file)

index 0000000..48bfd37
--- /dev/null
+++ b/arch/x86/lib/cache-smp-xen.c
@@ -0,0 +1,27 @@
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+
+/* Callback form of wbinvd(); the pointer argument is ignored. */
+static void __wbinvd(void *unused)
+{
+       wbinvd();
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Run __wbinvd() on @cpu through smp_call_function_single().
+ * Only compiled when CONFIG_XEN is not defined.
+ */
+void wbinvd_on_cpu(int cpu)
+{
+       smp_call_function_single(cpu, __wbinvd, NULL, 1);
+}
+EXPORT_SYMBOL(wbinvd_on_cpu);
+#endif
+
+/*
+ * Flush caches everywhere.  The hypervisor is asked first via the
+ * MMUEXT_FLUSH_CACHE_GLOBAL mmuext operation; if that call does not
+ * return 0, fall back to running __wbinvd() through on_each_cpu() and
+ * return whatever that yields.
+ */
+int wbinvd_on_all_cpus(void)
+{
+       struct mmuext_op flush = { .cmd = MMUEXT_FLUSH_CACHE_GLOBAL };
+       int rc = HYPERVISOR_mmuext_op(&flush, 1, NULL, DOMID_SELF);
+
+       if (rc != 0) {
+               /* Best effort as fallback. */
+               return on_each_cpu(__wbinvd, NULL, 1);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/lib/scrub.c b/arch/x86/lib/scrub.c

new file mode 100644 (file)

index 0000000..a8c6b44
--- /dev/null
+++ b/arch/x86/lib/scrub.c
@@ -0,0 +1,21 @@
+#include <asm/cpufeature.h>
+#include <asm/page.h>
+#include <asm/processor.h>
+
+/*
+ * Zero @count pages starting at virtual address @v.
+ *
+ * When cpu_has_xmm2 is set the memory is cleared with MOVNTI stores of
+ * zero, four longs per loop iteration, followed by a single SFENCE.
+ * Otherwise clear_page() is called once per page.
+ */
+void xen_scrub_pages(void *v, unsigned int count)
+{
+       if (likely(cpu_has_xmm2)) {
+               /* Number of four-long groups covering all @count pages. */
+               unsigned long n = count * (PAGE_SIZE / sizeof(long) / 4);
+
+               for (; n--; v += sizeof(long) * 4)
+                       /* Store 0 at v, v + 1, v + 2 and v + 3 longs. */
+                       asm("movnti %1,(%0)\n\t"
+                           "movnti %1,%c2(%0)\n\t"
+                           "movnti %1,2*%c2(%0)\n\t"
+                           "movnti %1,3*%c2(%0)\n\t"
+                           : : "r" (v), "r" (0L), "i" (sizeof(long))
+                           : "memory");
+               asm volatile("sfence" : : : "memory");
+       } else
+               for (; count--; v += PAGE_SIZE)
+                       clear_page(v);
+}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 23d8e5f..f9bba2e 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -27,4 +27,7 @@ obj-$(CONFIG_AMD_NUMA)                += amdtopology.o
  obj-$(CONFIG_ACPI_NUMA)                += srat.o
  obj-$(CONFIG_NUMA_EMU)         += numa_emulation.o
  
+obj-$(CONFIG_XEN)              += hypervisor.o
+disabled-obj-$(CONFIG_XEN)     := gup.o tlb.o
+
  obj-$(CONFIG_MEMTEST)          += memtest.o
diff --git a/arch/x86/mm/dump_pagetables-xen.c b/arch/x86/mm/dump_pagetables-xen.c

new file mode 100644 (file)

index 0000000..d352692
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables-xen.c
@@ -0,0 +1,392 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <xen/interface/xen.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+       int level;                      /* level of the current run; 0 until the first entry is noted */
+       pgprot_t current_prot;          /* protection value of the current run */
+       unsigned long start_address;    /* address at which the current run starts */
+       unsigned long current_address;  /* address the walkers are currently visiting */
+       const struct addr_marker *marker;       /* current entry in address_markers[] */
+};
+
+/* One named boundary in the address space, printed as "---[ name ]---". */
+struct addr_marker {
+       unsigned long start_address;    /* first address of the named region */
+       const char *name;               /* label printed for the region */
+};
+
+/*
+ * indices for address_markers; keep sync'd w/ address_markers below.
+ * The 32-bit indices are used by pt_dump_init() to fill in start
+ * addresses that are not compile-time constants.
+ */
+enum address_markers_idx {
+       USER_SPACE_NR = 0,
+#ifdef CONFIG_X86_64
+       XEN_SPACE_NR,
+       LOW_KERNEL_NR,
+       VMALLOC_START_NR,
+       VMEMMAP_START_NR,
+       HIGH_KERNEL_NR,
+       MODULES_VADDR_NR,
+       MODULES_END_NR,
+#else
+       KERNEL_SPACE_NR,
+       VMALLOC_START_NR,
+       VMALLOC_END_NR,
+# ifdef CONFIG_HIGHMEM
+       PKMAP_BASE_NR,
+# endif
+       FIXADDR_START_NR,
+       XEN_SPACE_NR,
+#endif
+};
+
+/*
+ * Address space markers hints.
+ *
+ * note_page() starts at the first entry and steps forward one entry at a
+ * time, so the order here must match enum address_markers_idx.  On 32-bit
+ * the entries written as 0 are filled in by pt_dump_init().
+ */
+static struct addr_marker address_markers[] = {
+       { 0, "User Space" },
+#ifdef CONFIG_X86_64
+       { HYPERVISOR_VIRT_START,      "Hypervisor Space" },
+       { PAGE_OFFSET,                "Low Kernel Mapping" },
+       { VMALLOC_START,              "vmalloc() Area" },
+       { VMEMMAP_START,              "Vmemmap" },
+       { __START_KERNEL_map,         "High Kernel Mapping" },
+       { MODULES_VADDR,              "Modules" },
+       { MODULES_END,                "End Modules" },
+#else
+       { PAGE_OFFSET,                "Kernel Mapping" },
+       { 0/* VMALLOC_START */,       "vmalloc() Area" },
+       { 0/*VMALLOC_END*/,           "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+       { 0/*PKMAP_BASE*/,            "Persistent kmap() Area" },
+# endif
+       { 0/*FIXADDR_START*/,         "Fixmap Area" },
+       { 0/*HYPERVISOR_VIRT_START*/, "Hypervisor Space" },
+#endif
+       { -1, NULL }                  /* End of list */
+};
+
+/*
+ * Tell whether @addr lies in the hypervisor's part of the address space:
+ * [HYPERVISOR_VIRT_START, HYPERVISOR_VIRT_END) on 64-bit, everything from
+ * hypervisor_virt_start upwards on 32-bit.
+ */
+static inline bool hypervisor_space(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+       if (addr < HYPERVISOR_VIRT_START)
+               return false;
+       return addr < HYPERVISOR_VIRT_END;
+#else
+       return hypervisor_virt_start <= addr;
+#endif
+}
+
+/*
+ * Multipliers for offsets within the PTEs: the walkers compute the
+ * address of entry i at a given level as base + i * <LEVEL>_LEVEL_MULT.
+ */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file, followed by the
+ * name of the paging level and a newline.  A zero pgprot is printed as
+ * blank padding in place of the flag columns.
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+{
+       static const char * const level_name[] =
+               { "cr3", "pgd", "pud", "pmd", "pte" };
+       pgprotval_t bits = pgprot_val(prot);
+
+       if (bits) {
+               seq_printf(m, (bits & _PAGE_USER) ? "USR " : "    ");
+               seq_printf(m, (bits & _PAGE_RW) ? "RW " : "ro ");
+               seq_printf(m, (bits & _PAGE_PWT) ? "PWT " : "    ");
+               seq_printf(m, (bits & _PAGE_PCD) ? "PCD " : "    ");
+
+               /* This column shows PSE up to level 3 and PAT at level 4 */
+               if (level <= 3)
+                       seq_printf(m, (bits & _PAGE_PSE) ? "PSE " : "    ");
+               else
+                       seq_printf(m, (bits & _PAGE_PAT) ? "pat " : "    ");
+
+               seq_printf(m, (bits & _PAGE_GLOBAL) ? "GLB " : "    ");
+               seq_printf(m, (bits & _PAGE_NX) ? "NX " : "x  ");
+       } else {
+               /* Not present */
+               seq_printf(m, "                          ");
+       }
+
+       seq_printf(m, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit;
+ * on 32 bits the address is returned unchanged.
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+#ifdef CONFIG_X86_64
+       signed long shifted = (signed long)(u << 16);
+
+       return shifted >> 16;
+#else
+       return u;
+#endif
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ *
+ * @st carries the run being accumulated; the caller sets
+ * st->current_address before each call.  @new_prot and @level describe
+ * the entry at that address.  Only the PTE_FLAGS_MASK bits of the
+ * protection values take part in the comparison.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+                     pgprot_t new_prot, int level)
+{
+       pgprotval_t prot, cur;
+       static const char units[] = "KMGTPE";
+
+       /*
+        * If we have a "break" in the series, we need to flush the state that
+        * we have now. "break" is either changing perms, levels or
+        * address space marker.
+        */
+       prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+       cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
+
+       if (!st->level) {
+               /* First entry */
+               st->current_prot = new_prot;
+               st->level = level;
+               st->marker = address_markers;
+               seq_printf(m, "---[ %s ]---\n", st->marker->name);
+       } else if (prot != cur || level != st->level ||
+                  st->current_address >= st->marker[1].start_address) {
+               const char *unit = units;
+               unsigned long delta;
+               int width = sizeof(unsigned long) * 2;
+
+               /*
+                * Now print the actual finished series
+                */
+               seq_printf(m, "0x%0*lx-0x%0*lx   ",
+                          width, st->start_address,
+                          width, st->current_address);
+
+               /*
+                * Size of the run in units of 1024 bytes, then scaled up
+                * through "KMGTPE" for as long as it stays a multiple of
+                * 1024 and a larger unit letter remains.
+                */
+               delta = (st->current_address - st->start_address) >> 10;
+               while (!(delta & 1023) && unit[1]) {
+                       delta >>= 10;
+                       unit++;
+               }
+               seq_printf(m, "%9lu%c ", delta, *unit);
+               printk_prot(m, st->current_prot, st->level);
+
+               /*
+                * We print markers for special areas of address space,
+                * such as the start of vmalloc space etc.
+                * This helps in the interpretation.
+                */
+               if (st->current_address >= st->marker[1].start_address) {
+                       st->marker++;
+                       seq_printf(m, "---[ %s ]---\n", st->marker->name);
+               }
+
+               /* Start a new run at the current address. */
+               st->start_address = st->current_address;
+               st->current_prot = new_prot;
+               st->level = level;
+       }
+}
+
+/*
+ * Visit every entry of the PTE table referenced by @addr and report each
+ * one to note_page() at level 4.  @P is the (un-normalized) address that
+ * the first entry of the table maps.
+ */
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+                                                       unsigned long P)
+{
+       pte_t *pte = (pte_t *) pmd_page_vaddr(addr);
+       int idx;
+
+       for (idx = 0; idx < PTRS_PER_PTE; idx++, pte++) {
+               st->current_address = normalize_addr(P + idx * PTE_LEVEL_MULT);
+               note_page(m, st, pte_pgprot(*pte), 4);
+       }
+}
+
+#if PTRS_PER_PMD > 1
+
+/*
+ * Walk the PTRS_PER_PMD entries of the PMD table referenced by @addr;
+ * @P is the (un-normalized) address mapped by its first entry.
+ *
+ * Entries that are none, or whose address lies in hypervisor space, are
+ * noted with an empty protection value.  Large or non-present entries
+ * are noted at level 3 with their flag bits.  Everything else is
+ * descended into with walk_pte_level().
+ */
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+                                                       unsigned long P)
+{
+       int i;
+       pmd_t *start;
+
+       start = (pmd_t *) pud_page_vaddr(addr);
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+               if (!hypervisor_space(st->current_address)
+                   && !pmd_none(*start)) {
+                       pgprotval_t prot = __pmd_val(*start) & PTE_FLAGS_MASK;
+
+                       if (pmd_large(*start) || !pmd_present(*start))
+                               note_page(m, st, __pgprot(prot), 3);
+                       else
+                               walk_pte_level(m, st, *start,
+                                              P + i * PMD_LEVEL_MULT);
+               } else
+                       note_page(m, st, __pgprot(0), 3);
+               start++;
+       }
+}
+
+#else
+/* Single-entry PMD level: treat the pud as a pmd and walk PTEs directly. */
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+/*
+ * Walk the PTRS_PER_PUD entries of the PUD table referenced by @addr;
+ * @P is the (un-normalized) address mapped by its first entry.
+ *
+ * Entries that are none, or whose address lies in hypervisor space, are
+ * noted with an empty protection value.  Large or non-present entries
+ * are noted at level 2 with their flag bits.  Everything else is
+ * descended into with walk_pmd_level().
+ */
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+                                                       unsigned long P)
+{
+       int i;
+       pud_t *start;
+
+       start = (pud_t *) pgd_page_vaddr(addr);
+
+       for (i = 0; i < PTRS_PER_PUD; i++) {
+               st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+               if (!hypervisor_space(st->current_address)
+                   && !pud_none(*start)) {
+                       pgprotval_t prot = __pud_val(*start) & PTE_FLAGS_MASK;
+
+                       if (pud_large(*start) || !pud_present(*start))
+                               note_page(m, st, __pgprot(prot), 2);
+                       else
+                               walk_pmd_level(m, st, *start,
+                                              P + i * PUD_LEVEL_MULT);
+               } else
+                       note_page(m, st, __pgprot(0), 2);
+
+               start++;
+       }
+}
+
+#else
+/* Single-entry PUD level: treat the pgd as a pud and walk PMDs directly. */
+#define __pud_ma(x) ((pud_t){ __pgd_ma(x) })
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud_ma(__pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud_ma(__pgd_val(a)))
+#define pgd_none(a)  pud_none(__pud_ma(__pgd_val(a)))
+#endif
+
+/*
+ * Walk all PTRS_PER_PGD top-level entries (init_level4_pgt on 64-bit,
+ * swapper_pg_dir on 32-bit) and print the resulting ranges to @m.
+ * The walk state lives on the stack and starts out zeroed, which
+ * note_page() recognises as "first entry".
+ */
+static void walk_pgd_level(struct seq_file *m)
+{
+#ifdef CONFIG_X86_64
+       pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+       pgd_t *start = swapper_pg_dir;
+#endif
+       int i;
+       struct pg_state st;
+
+       memset(&st, 0, sizeof(st));
+
+       for (i = 0; i < PTRS_PER_PGD; i++) {
+               st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+               if (!pgd_none(*start)) {
+                       pgprotval_t prot = __pgd_val(*start) & PTE_FLAGS_MASK;
+
+                       if (pgd_large(*start) || !pgd_present(*start))
+                               note_page(m, &st, __pgprot(prot), 1);
+                       else
+                               walk_pud_level(m, &st, *start,
+                                              i * PGD_LEVEL_MULT);
+               } else
+                       note_page(m, &st, __pgprot(0), 1);
+
+               start++;
+       }
+
+       /* Flush out the last page */
+       st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+       note_page(m, &st, __pgprot(0), 0);
+}
+
+/* seq_file show callback: dump the page tables into @m; always returns 0. */
+static int ptdump_show(struct seq_file *m, void *v)
+{
+       walk_pgd_level(m);
+       return 0;
+}
+
+/* open() handler: bind ptdump_show() to the file via single_open(). */
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, ptdump_show, NULL);
+}
+
+/* File operations for the debugfs entry created in pt_dump_init(). */
+static const struct file_operations ptdump_fops = {
+       .open           = ptdump_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+/*
+ * Fill in the 32-bit address markers that are not compile-time constants
+ * and create the "kernel_page_tables" debugfs file.  Returns 0, or
+ * -ENOMEM when debugfs_create_file() hands back NULL.
+ */
+static int __init pt_dump_init(void)
+{
+       struct dentry *dentry;
+
+#ifdef CONFIG_X86_32
+       /* Not a compile-time constant on x86-32 */
+       address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+       address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+       address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
+# endif
+       address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+       address_markers[XEN_SPACE_NR].start_address = hypervisor_virt_start;
+#endif
+
+       dentry = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
+                                    &ptdump_fops);
+
+       return dentry ? 0 : -ENOMEM;
+}
+
+__initcall(pt_dump_init);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/fault-xen.c b/arch/x86/mm/fault-xen.c

new file mode 100644 (file)

index 0000000..eb107cc
--- /dev/null
+++ b/arch/x86/mm/fault-xen.c
@@ -0,0 +1,1241 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
+ */
+#include <linux/magic.h>               /* STACK_END_MAGIC              */
+#include <linux/sched.h>               /* test_thread_flag(), ...      */
+#include <linux/kdebug.h>              /* oops_begin/end, ...          */
+#include <linux/module.h>              /* search_exception_table       */
+#include <linux/bootmem.h>             /* max_low_pfn                  */
+#include <linux/kprobes.h>             /* __kprobes, ...               */
+#include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
+#include <linux/perf_event.h>          /* perf_sw_event                */
+#include <linux/hugetlb.h>             /* hstate_index_to_shift        */
+#include <linux/prefetch.h>            /* prefetchw                    */
+
+#include <asm/traps.h>                 /* dotraplinkage, ...           */
+#include <asm/pgalloc.h>               /* pgd_*(), ...                 */
+#include <asm/kmemcheck.h>             /* kmemcheck_*(), ...           */
+#include <asm/fixmap.h>                        /* VSYSCALL_START               */
+
+/*
+ * Page fault error code bits:
+ *
+ *   bit 0 ==   0: no page found       1: protection fault
+ *   bit 1 ==   0: read access         1: write access
+ *   bit 2 ==   0: kernel-mode access  1: user-mode access
+ *   bit 3 ==                          1: use of reserved bit detected
+ *   bit 4 ==                          1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+
+       PF_PROT         =               1 << 0, /* protection fault (else: no page found) */
+       PF_WRITE        =               1 << 1, /* write access (else: read) */
+       PF_USER         =               1 << 2, /* user-mode access (else: kernel-mode) */
+       PF_RSVD         =               1 << 3, /* use of reserved bit detected */
+       PF_INSTR        =               1 << 4, /* fault was an instruction fetch */
+};
+
+/*
+ * Offer the fault to mmiotrace.  The result is -1 only when mmiotrace is
+ * active and kmmio_handler() returned 1; it is 0 if mmiotrace is
+ * disabled, or if the fault is not handled by mmiotrace.
+ */
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+       if (likely(!is_kmmio_active()))
+               return 0;
+
+       return kmmio_handler(regs, addr) == 1 ? -1 : 0;
+}
+
+/*
+ * Let kprobes look at a fault that did not come from user mode.
+ * Returns 1 when a kprobe is running and kprobe_fault_handler() returned
+ * non-zero for trap number 14, otherwise 0.
+ */
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
+{
+       int ret = 0;
+
+       /* kprobe_running() needs smp_processor_id() */
+       if (kprobes_built_in() && !user_mode_vm(regs)) {
+               preempt_disable();
+               if (kprobe_running() && kprobe_fault_handler(regs, 14))
+                       ret = 1;
+               preempt_enable();
+       }
+
+       return ret;
+}
+
+/*
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
+ *
+ *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ *   Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ *   Sometimes the CPU reports invalid exceptions on prefetch.
+ *   Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
+ *
+ * check_prefetch_opcode() classifies one byte of the instruction stream.
+ * @opcode is the byte just read and @instr points at the byte after it.
+ * A non-zero return means @opcode is a prefix and the caller should keep
+ * scanning; zero means stop.  *@prefetch is written only on the 0x0x
+ * path, when the following byte could be read.
+ */
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+                     unsigned char opcode, int *prefetch)
+{
+       unsigned char instr_hi = opcode & 0xf0;
+       unsigned char instr_lo = opcode & 0x0f;
+
+       switch (instr_hi) {
+       case 0x20:
+       case 0x30:
+               /*
+                * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+                * In X86_64 long mode, the CPU will signal invalid
+                * opcode if some of these prefixes are present so
+                * X86_64 will never get here anyway
+                */
+               return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+       case 0x40:
+               /*
+                * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+                * Need to figure out under what instruction mode the
+                * instruction was issued. Could check the LDT for lm,
+                * but for now it's good enough to assume that long
+                * mode only uses well known segments or kernel.
+                */
+               return (!user_mode(regs) || user_64bit_mode(regs));
+#endif
+       case 0x60:
+               /* 0x64 thru 0x67 are valid prefixes in all modes. */
+               return (instr_lo & 0xC) == 0x4;
+       case 0xF0:
+               /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+               return !instr_lo || (instr_lo>>1) == 1;
+       case 0x00:
+               /* Prefetch instruction is 0x0F0D or 0x0F18 */
+               if (probe_kernel_address(instr, opcode))
+                       return 0;
+
+               /* opcode now holds the second byte; instr_lo is from the first */
+               *prefetch = (instr_lo == 0xF) &&
+                       (opcode == 0x0D || opcode == 0x18);
+               return 0;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Decide whether the faulting instruction is a prefetch.
+ *
+ * Scans up to 15 bytes starting at the linear instruction pointer,
+ * stepping over prefix bytes with check_prefetch_opcode().  Returns
+ * non-zero if a prefetch opcode was recognised, 0 otherwise (including
+ * for instruction-fetch faults and for user-mode faults whose
+ * instruction pointer is at or above TASK_SIZE).
+ */
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+{
+       unsigned char *max_instr;
+       unsigned char *instr;
+       int prefetch = 0;
+
+       /*
+        * If it was a exec (instruction fetch) fault on NX page, then
+        * do not ignore the fault:
+        */
+       if (error_code & PF_INSTR)
+               return 0;
+
+       instr = (void *)convert_ip_to_linear(current, regs);
+       max_instr = instr + 15;
+
+       if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+               return 0;
+
+       while (instr < max_instr) {
+               unsigned char opcode;
+
+               /* Stop if the instruction byte cannot be read. */
+               if (probe_kernel_address(instr, opcode))
+                       break;
+
+               instr++;
+
+               if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
+                       break;
+       }
+       return prefetch;
+}
+
+/*
+ * Build a siginfo for a fault at @address and force signal @si_signo on
+ * @tsk.  si_addr_lsb is 0 unless @fault carries a hwpoison flag:
+ * VM_FAULT_HWPOISON_LARGE selects the shift of the huge page index
+ * encoded in @fault, and VM_FAULT_HWPOISON (checked last, so it wins if
+ * both are set) selects PAGE_SHIFT.
+ */
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+                    struct task_struct *tsk, int fault)
+{
+       unsigned lsb = 0;
+       siginfo_t info;
+
+       info.si_signo   = si_signo;
+       info.si_errno   = 0;
+       info.si_code    = si_code;
+       info.si_addr    = (void __user *)address;
+       if (fault & VM_FAULT_HWPOISON_LARGE)
+               lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+       if (fault & VM_FAULT_HWPOISON)
+               lsb = PAGE_SHIFT;
+       info.si_addr_lsb = lsb;
+
+       force_sig_info(si_signo, &info, tsk);
+}
+
+/* List of pgd pages; the 32-bit vmalloc_sync_all() walks it under pgd_lock. */
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+/*
+ * Make the page table rooted at @pgd agree with init_mm's tables for the
+ * PMD entry covering @address.
+ *
+ * If the target pmd is not present it is set from the reference entry;
+ * if it is present it must point at the same page (BUG otherwise).
+ * Returns the reference pmd, or NULL when init_mm has no present pgd,
+ * pud or pmd entry for @address.
+ */
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+       unsigned index = pgd_index(address);
+       pgd_t *pgd_k;
+       pud_t *pud, *pud_k;
+       pmd_t *pmd, *pmd_k;
+
+       pgd += index;
+       pgd_k = init_mm.pgd + index;
+
+       if (!pgd_present(*pgd_k))
+               return NULL;
+
+       /*
+        * set_pgd(pgd, *pgd_k); here would be useless on PAE
+        * and redundant with the set_pmd() on non-PAE. As would
+        * set_pud.
+        */
+       pud = pud_offset(pgd, address);
+       pud_k = pud_offset(pgd_k, address);
+       if (!pud_present(*pud_k))
+               return NULL;
+
+       pmd = pmd_offset(pud, address);
+       pmd_k = pmd_offset(pud_k, address);
+       if (!pmd_present(*pmd_k))
+               return NULL;
+
+       if (!pmd_present(*pmd))
+#if CONFIG_XEN_COMPAT > 0x030002
+               set_pmd(pmd, *pmd_k);
+#else
+               /*
+                * When running on older Xen we must launder *pmd_k through
+                * pmd_val() to ensure that _PAGE_PRESENT is correctly set.
+                */
+               set_pmd(pmd, __pmd(pmd_val(*pmd_k)));
+#endif
+       else
+               BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+
+       return pmd_k;
+}
+
+/*
+ * 32-bit: propagate kernel PMD entries to every page table on pgd_list.
+ *
+ * Nothing to do when SHARED_KERNEL_PMD is set.  Otherwise step in
+ * PMD_SIZE units from VMALLOC_START (rounded down to a PMD boundary)
+ * while the address stays within [TASK_SIZE, FIXADDR_TOP), calling
+ * vmalloc_sync_one() for each listed pgd.  The list walk for an address
+ * stops early once vmalloc_sync_one() returns NULL.
+ */
+void vmalloc_sync_all(void)
+{
+       unsigned long address;
+
+       if (SHARED_KERNEL_PMD)
+               return;
+
+       for (address = VMALLOC_START & PMD_MASK;
+            address >= TASK_SIZE && address < FIXADDR_TOP;
+            address += PMD_SIZE) {
+               struct page *page;
+
+               spin_lock(&pgd_lock);
+               list_for_each_entry(page, &pgd_list, lru) {
+                       spinlock_t *pgt_lock;
+                       pmd_t *ret;
+
+                       /* the pgt_lock only for Xen */
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+                       spin_lock(pgt_lock);
+                       ret = vmalloc_sync_one(page_address(page), address);
+                       spin_unlock(pgt_lock);
+
+                       if (!ret)
+                               break;
+               }
+               spin_unlock(&pgd_lock);
+       }
+}
+
+/*
+ * 32-bit:
+ *
+ *   Handle a fault on the vmalloc or module mapping area
+ *
+ * Returns 0 when the page table referenced by CR3 has been synchronized
+ * and the reference pte for @address is present; returns -1 when
+ * @address is outside [VMALLOC_START, VMALLOC_END) or the reference
+ * tables have no present mapping for it.
+ */
+static noinline __kprobes int vmalloc_fault(unsigned long address)
+{
+       unsigned long pgd_paddr;
+       pmd_t *pmd_k;
+       pte_t *pte_k;
+
+       /* Make sure we are in vmalloc area: */
+       if (!(address >= VMALLOC_START && address < VMALLOC_END))
+               return -1;
+
+       WARN_ON_ONCE(in_nmi());
+
+       /*
+        * Synchronize this task's top level page-table
+        * with the 'reference' page table.
+        *
+        * Do _not_ use "current" here. We might be inside
+        * an interrupt in the middle of a task switch..
+        */
+       pgd_paddr = read_cr3();
+       pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+       if (!pmd_k)
+               return -1;
+
+       pte_k = pte_offset_kernel(pmd_k, address);
+       if (!pte_present(*pte_k))
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ *
+ * When @regs describe vm86 mode, the page index of @address relative to
+ * 0xA0000 is computed; if it is one of the first 32 pages, the matching
+ * bit is set in tsk->thread.screen_bitmap.  Addresses below 0xA0000 wrap
+ * to a large unsigned index and are therefore ignored.
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+                struct task_struct *tsk)
+{
+       unsigned long bit;
+
+       if (!v8086_mode(regs))
+               return;
+
+       bit = (address - 0xA0000) >> PAGE_SHIFT;
+       if (bit < 32)
+               /* unsigned shift: bit may be 31, which overflows a signed int */
+               tsk->thread.screen_bitmap |= 1UL << bit;
+}
+
+/* True when @pfn is below max_low_pfn. */
+static bool low_pfn(unsigned long pfn)
+{
+       return pfn < max_low_pfn;
+}
+
+/*
+ * 32-bit: print the page-table entries that cover @address in the page
+ * table referenced by CR3, stopping at the first level that cannot be
+ * followed safely.  Output is a single line ("*pdpt" on PAE, then
+ * "*pde", then "*pte"), always ended with a newline.
+ */
+static void dump_pagetable(unsigned long address)
+{
+       pgd_t *base = __va(read_cr3());
+       pgd_t *pgd = &base[pgd_index(address)];
+       pmd_t *pmd;
+       pte_t *pte;
+
+#ifdef CONFIG_X86_PAE
+       printk("*pdpt = %016Lx ", __pgd_val(*pgd));
+       if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
+               goto out;
+#endif
+       pmd = pmd_offset(pud_offset(pgd, address), address);
+       printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)__pmd_val(*pmd));
+
+       /*
+        * We must not directly access the pte in the highpte
+        * case if the page table is located in highmem.
+        * And let's rather not kmap-atomic the pte, just in case
+        * it's allocated already:
+        */
+       if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+               goto out;
+
+       pte = pte_offset_kernel(pmd, address);
+       printk(KERN_CONT "*pte = %0*Lx ", sizeof(*pte) * 2, (u64)__pte_val(*pte));
+out:
+       printk(KERN_CONT "\n");
+}
+/* Accept and drop the second argument that the 64-bit variant takes. */
+#define dump_pagetable(addr, krnl) dump_pagetable(addr)
+
+#else /* CONFIG_X86_64: */
+
+/*
+ * 64-bit: delegate to sync_global_pgds() for the range from
+ * VMALLOC_START (rounded down to a PGDIR boundary) to VMALLOC_END.
+ */
+void vmalloc_sync_all(void)
+{
+       sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+}
+
+/*
+ * 64-bit:
+ *
+ *   Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ *
+ * Returns 0 when the tables of current->active_mm agree with the
+ * reference (kernel) tables for @address, copying the pgd entry over if
+ * it was missing.  Returns -1 when @address is outside
+ * [VMALLOC_START, VMALLOC_END) or the reference tables do not map it.
+ * Any mismatch between the two sets of tables is a BUG.
+ */
+static noinline __kprobes int vmalloc_fault(unsigned long address)
+{
+       pgd_t *pgd, *pgd_ref;
+       pud_t *pud, *pud_ref;
+       pmd_t *pmd, *pmd_ref;
+       pte_t *pte, *pte_ref;
+
+       /* Make sure we are in vmalloc area: */
+       if (!(address >= VMALLOC_START && address < VMALLOC_END))
+               return -1;
+
+       WARN_ON_ONCE(in_nmi());
+
+       /*
+        * Copy kernel mappings over when needed. This can also
+        * happen within a race in page table update. In the later
+        * case just flush:
+        */
+       pgd = pgd_offset(current->active_mm, address);
+       pgd_ref = pgd_offset_k(address);
+       if (pgd_none(*pgd_ref))
+               return -1;
+
+       if (pgd_none(*pgd))
+               set_pgd(pgd, *pgd_ref);
+       else
+               BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+       /*
+        * Below here mismatches are bugs because these lower tables
+        * are shared:
+        */
+
+       pud = pud_offset(pgd, address);
+       pud_ref = pud_offset(pgd_ref, address);
+       if (pud_none(*pud_ref))
+               return -1;
+
+       if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+               BUG();
+
+       pmd = pmd_offset(pud, address);
+       pmd_ref = pmd_offset(pud_ref, address);
+       if (pmd_none(*pmd_ref))
+               return -1;
+
+       if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+               BUG();
+
+       pte_ref = pte_offset_kernel(pmd_ref, address);
+       if (!pte_present(*pte_ref))
+               return -1;
+
+       pte = pte_offset_kernel(pmd, address);
+
+       /*
+        * Don't use pte_page here, because the mappings can point
+        * outside mem_map, and the NUMA hash lookup cannot handle
+        * that:
+        */
+       if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+               BUG();
+
+       return 0;
+}
+
+#ifdef CONFIG_CPU_SUP_AMD
+static const char errata93_warning[] =
+KERN_ERR
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
+#endif /* CONFIG_CPU_SUP_AMD */
+
+/*
+ * No vm86 mode in 64-bit mode, so this is an empty stub:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+                struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+       unsigned long word;     /* receives the probed value; never read here */
+
+       return probe_kernel_address((unsigned long *)p, word);
+}
+
+static void dump_pagetable(unsigned long address, bool kernel)
+{
+       pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+       pgd_t *pgd = base + pgd_index(address);
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       if (!kernel)    /* not a kernel-mode fault: start from the user pgd */
+               pgd = __user_pgd(base) + pgd_index(address);
+
+       if (bad_address(pgd))   /* the entry itself cannot be read */
+               goto bad;
+
+       printk("PGD %lx ", pgd_val(*pgd));
+
+       if (!pgd_present(*pgd))
+               goto out;
+
+       pud = pud_offset(pgd, address);
+       if (bad_address(pud))
+               goto bad;
+
+       printk(KERN_CONT "PUD %lx ", pud_val(*pud));
+       if (!pud_present(*pud) || pud_large(*pud))      /* no lower level to walk */
+               goto out;
+
+       pmd = pmd_offset(pud, address);
+       if (bad_address(pmd))
+               goto bad;
+
+       printk(KERN_CONT "PMD %lx ", pmd_val(*pmd));
+       if (!pmd_present(*pmd) || pmd_large(*pmd))      /* no lower level to walk */
+               goto out;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (bad_address(pte))
+               goto bad;
+
+       printk(KERN_CONT "PTE %lx", pte_val(*pte));
+out:
+       printk(KERN_CONT "\n");         /* terminate the line built up above */
+       return;
+bad:
+       printk("BAD\n");
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ * Many BIOSes that were not tested properly lack it.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * If setting those bits again yields an address in kernel text or the
+ * module area, regs->ip is corrected and 1 is returned; otherwise 0.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+           || boot_cpu_data.x86 != 0xf)
+               return 0;
+
+       if (address != regs->ip)
+               return 0;
+
+       if ((address >> 32) != 0)
+               return 0;
+
+       address |= 0xffffffffUL << 32;
+       if ((address >= (u64)_stext && address <= (u64)_etext) ||
+           (address >= MODULES_VADDR && address <= MODULES_END)) {
+               printk_once(errata93_warning);
+               regs->ip = address;
+               return 1;
+       }
+#endif
+       return 0;
+}
+
+/*
+ * Work around K8 erratum #100: K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return.  Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+       if ((regs->cs == __USER32_CS || regs->cs == FLAT_USER_CS32 ||
+            (regs->cs & (1<<2))) && (address >> 32))
+               return 1;
+#endif /* CONFIG_X86_64 */
+       return 0;
+}
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+       unsigned long slot;
+
+       /* Only CPUs flagged with the Pentium F0 0F C7 C8 bug need this: */
+       if (!boot_cpu_data.f00f_bug)
+               return 0;
+
+       /* Offset from idt_descr.address in 8-byte steps; slot 6 is the hit: */
+       slot = (address - idt_descr.address) >> 3;
+
+       if (slot == 6) {
+               do_invalid_op(regs, 0);
+               return 1;
+       }
+#endif
+       return 0;
+}
+
+static const char nx_warning[] = KERN_CRIT /* printk format taking one uid (%d) */
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+               unsigned long address)
+{
+       if (!oops_may_print())
+               return;
+
+       if (error_code & PF_INSTR) {
+               unsigned int level;
+               /* level is only handed to lookup_address(); it is not read here */
+               pte_t *pte = lookup_address(address, &level);
+
+               if (pte && pte_present(*pte) && !pte_exec(*pte))
+                       printk(nx_warning, current_uid());
+       }
+
+       printk(KERN_ALERT "BUG: unable to handle kernel ");
+       if (address < PAGE_SIZE)        /* fault within the first page */
+               printk(KERN_CONT "NULL pointer dereference");
+       else
+               printk(KERN_CONT "paging request");
+
+       printk(KERN_CONT " at %p\n", (void *) address);
+       printk(KERN_ALERT "IP:");
+       printk_address(regs->ip, 1);
+
+       dump_pagetable(address, !(error_code & PF_USER));
+}
+
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+           unsigned long address)
+{
+       struct task_struct *task = current;
+       unsigned long oops_flags;
+       int signr = SIGKILL;
+
+       /* Report the corruption inside an oops_begin()/oops_end() section: */
+       oops_flags = oops_begin();
+
+       printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+              task->comm, address);
+       dump_pagetable(address, !(error_code & PF_USER));
+
+       /* Record the fault details in the task's thread state: */
+       task->thread.cr2        = address;
+       task->thread.trap_nr    = X86_TRAP_PF;
+       task->thread.error_code = error_code;
+
+       /* SIGKILL goes to oops_end() unless __die() returns non-zero: */
+       if (__die("Bad pagetable", regs, error_code))
+               signr = 0;
+       oops_end(oops_flags, regs, signr);
+}
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+          unsigned long address, int signal, int si_code)
+{
+       struct task_struct *tsk = current;
+       unsigned long *stackend;
+       unsigned long flags;
+       int sig;
+
+       /* Are we prepared to handle this kernel fault (via fixup_exception)? */
+       if (fixup_exception(regs)) {
+               if (current_thread_info()->sig_on_uaccess_error && signal) {
+                       tsk->thread.trap_nr = X86_TRAP_PF;
+                       tsk->thread.error_code = error_code | PF_USER;
+                       tsk->thread.cr2 = address;
+
+                       /* XXX: hwpoison faults will set the wrong code. */
+                       force_sig_info_fault(signal, si_code, address, tsk, 0);
+               }
+               return;
+       }
+
+       /*
+        * 32-bit:
+        *
+        *   Valid to do another page fault here, because if this fault
+        *   had been triggered by is_prefetch fixup_exception would have
+        *   handled it.
+        *
+        * 64-bit:
+        *
+        *   Hall of shame of CPU/BIOS bugs.
+        */
+       if (is_prefetch(regs, error_code, address))
+               return;
+
+       if (is_errata93(regs, address))
+               return;
+
+       /*
+        * Oops. The kernel tried to access some bad page. We'll have to
+        * terminate things with extreme prejudice:
+        */
+       flags = oops_begin();
+
+       show_fault_oops(regs, error_code, address);
+       /* A clobbered end-of-stack marker is reported as a stack overrun: */
+       stackend = end_of_stack(tsk);
+       if (tsk != &init_task && *stackend != STACK_END_MAGIC)
+               printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+       tsk->thread.cr2         = address;
+       tsk->thread.trap_nr     = X86_TRAP_PF;
+       tsk->thread.error_code  = error_code;
+
+       sig = SIGKILL;
+       if (__die("Oops", regs, error_code))
+               sig = 0;
+
+       /* Executive summary in case the body of the oops scrolled away */
+       printk(KERN_DEFAULT "CR2: %016lx\n", address);
+
+       oops_end(flags, regs, sig);
+}
+
+/*
+ * Log one rate-limited line describing a fatal user-mode segfault; the
+ * caller only invokes this when the show_unhandled_signals sysctl is set.
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+               unsigned long address, struct task_struct *tsk)
+{
+       const char *loglevel;
+
+       /* Nothing to log unless SIGSEGV is unhandled and the rate limit allows: */
+       if (!unhandled_signal(tsk, SIGSEGV) || !printk_ratelimit())
+               return;
+
+       loglevel = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+       printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+               loglevel, tsk->comm, task_pid_nr(tsk), address,
+               (void *)regs->ip, (void *)regs->sp, error_code);
+
+       print_vma_addr(KERN_CONT " in ", regs->ip);
+
+       printk(KERN_CONT "\n");
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+                      unsigned long address, int si_code)
+{
+       struct task_struct *tsk = current;
+
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & PF_USER) {
+               /*
+                * It's possible to have interrupts off here:
+                */
+               local_irq_enable();
+
+               /*
+                * Valid to do another page fault here because this one came
+                * from user space:
+                */
+               if (is_prefetch(regs, error_code, address))
+                       return;
+
+               if (is_errata100(regs, address))
+                       return;
+
+#ifdef CONFIG_X86_64
+               /*
+                * Instruction fetch faults in the vsyscall page might need
+                * emulation.
+                */
+               if (unlikely((error_code & PF_INSTR) &&
+                            ((address & ~0xfff) == VSYSCALL_START))) {
+                       if (emulate_vsyscall(regs, address))
+                               return;
+               }
+#endif /* CONFIG_X86_64 */
+
+               if (unlikely(show_unhandled_signals))
+                       show_signal_msg(regs, error_code, address, tsk);
+
+               /* Kernel addresses are always protection faults (bit 0 forced on): */
+               tsk->thread.cr2         = address;
+               tsk->thread.error_code  = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_nr     = X86_TRAP_PF;
+
+               force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+
+               return;
+       }
+
+       if (is_f00f_bug(regs, address))
+               return;
+
+       no_context(regs, error_code, address, SIGSEGV, si_code);
+}
+
+static noinline void /* SEGV_MAPERR flavour of __bad_area_nosemaphore() */
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+                    unsigned long address)
+{
+       __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+          unsigned long address, int si_code)
+{
+       struct mm_struct *mm = current->mm;
+
+       /*
+        * Something tried to access memory that isn't in our memory map.
+        * Release mmap_sem first; kernel vs. user is sorted out by the callee.
+        */
+       up_read(&mm->mmap_sem);
+
+       __bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void /* SEGV_MAPERR flavour of __bad_area() */
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void /* SEGV_ACCERR flavour of __bad_area() */
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+                     unsigned long address)
+{
+       __bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
+static void
+out_of_memory(struct pt_regs *regs, unsigned long error_code,
+             unsigned long address)
+{
+       /*
+        * We ran out of memory, call the OOM killer, and return to userspace
+        * (which will retry the fault, or kill us if we got oom-killed):
+        */
+       up_read(&current->mm->mmap_sem);
+
+       pagefault_out_of_memory();
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+         unsigned int fault)
+{
+       struct task_struct *tsk = current;
+       struct mm_struct *mm = tsk->mm;
+       int code = BUS_ADRERR;  /* default si_code; replaced for hwpoison faults */
+
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die: */
+       if (!(error_code & PF_USER)) {
+               no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+               return;
+       }
+
+       /* User-space => ok to do another page fault: */
+       if (is_prefetch(regs, error_code, address))
+               return;
+
+       tsk->thread.cr2         = address;
+       tsk->thread.error_code  = error_code;
+       tsk->thread.trap_nr     = X86_TRAP_PF;
+
+#ifdef CONFIG_MEMORY_FAILURE
+       if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+               printk(KERN_ERR
+       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+                       tsk->comm, tsk->pid, address);
+               code = BUS_MCEERR_AR;
+       }
+#endif /* CONFIG_MEMORY_FAILURE */
+       force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+}
+
+static noinline int
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+              unsigned long address, unsigned int fault)
+{
+       /*
+        * A fatal signal is pending for the current task, so there is
+        * no reason to continue handling the page fault.
+        */
+       if (fatal_signal_pending(current)) {
+               if (!(fault & VM_FAULT_RETRY))
+                       up_read(&current->mm->mmap_sem);
+               if (!(error_code & PF_USER))
+                       no_context(regs, error_code, address, 0, 0);
+               return 1;
+       }
+       if (!(fault & VM_FAULT_ERROR))
+               return 0;       /* no error bit set: the caller carries on */
+
+       if (fault & VM_FAULT_OOM) {
+               /* Kernel mode? Handle exceptions or die: */
+               if (!(error_code & PF_USER)) {
+                       up_read(&current->mm->mmap_sem);
+                       no_context(regs, error_code, address,
+                                  SIGSEGV, SEGV_MAPERR);
+                       return 1;
+               }
+
+               out_of_memory(regs, error_code, address);
+       } else {
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+                            VM_FAULT_HWPOISON_LARGE))
+                       do_sigbus(regs, error_code, address, fault);
+               else
+                       BUG();
+       }
+       return 1;
+}
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+       /*
+        * The fault is genuine if it was a write to a non-writable entry
+        * or an instruction fetch from a non-executable one; in every
+        * other case the entry already permits the access.
+        */
+       return !(((error_code & PF_WRITE) && !pte_write(*pte)) ||
+                ((error_code & PF_INSTR) && !pte_exec(*pte)));
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.  Returns 1 if spurious, else 0.
+ */
+static noinline __kprobes int
+spurious_fault(unsigned long error_code, unsigned long address)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int ret;
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_USER | PF_RSVD))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       if (pud_large(*pud))
+               return spurious_fault_check(error_code, (pte_t *) pud);
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       if (pmd_large(*pmd))
+               return spurious_fault_check(error_code, (pte_t *) pmd);
+
+       /*
+        * Note: don't use pte_present() here, since it returns true
+        * if the _PAGE_PROTNONE bit is set.  However, this aliases the
+        * _PAGE_GLOBAL bit, which for kernel pages gives false positives
+        * when CONFIG_DEBUG_PAGEALLOC is used.
+        */
+       pte = pte_offset_kernel(pmd, address);
+       if (!(pte_flags(*pte) & _PAGE_PRESENT))
+               return 0;
+
+       ret = spurious_fault_check(error_code, pte);
+       if (!ret)
+               return 0;
+
+       /*
+        * Make sure we have permissions in PMD.
+        * If not, then there's a bug in the page tables:
+        */
+       ret = spurious_fault_check(error_code, (pte_t *) pmd);
+       WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+       return ret;
+}
+
+int show_unhandled_signals = 1; /* non-zero: user-mode bad-area faults call show_signal_msg() */
+
+static inline int
+access_error(unsigned long error_code, struct vm_area_struct *vma)
+{
+       /*
+        * Writes (page present or not) are an error exactly when the
+        * vma is not writable:
+        */
+       if (error_code & PF_WRITE)
+               return unlikely(!(vma->vm_flags & VM_WRITE)) ? 1 : 0;
+
+       /* A read that hit a present page is a protection violation: */
+       if (unlikely(error_code & PF_PROT))
+               return 1;
+
+       /*
+        * A read of a non-present page is fine as long as the vma
+        * grants at least one of read, exec or write:
+        */
+       return !(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE));
+}
+
+static int fault_in_kernel_space(unsigned long address)
+{
+       return address >= TASK_SIZE_MAX;        /* non-zero at or above TASK_SIZE_MAX */
+}
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+dotraplinkage void __kprobes
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+       struct vm_area_struct *vma;
+       struct task_struct *tsk;
+       unsigned long address;
+       struct mm_struct *mm;
+       int fault;
+       int write = error_code & PF_WRITE;
+       unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
+                                       (write ? FAULT_FLAG_WRITE : 0);
+
+       /* Set the "privileged fault" bit to something sane. */
+       if (user_mode_vm(regs))
+               error_code |= PF_USER;
+       else
+               error_code &= ~PF_USER;
+
+       tsk = current;
+       mm = tsk->mm;
+
+       /* Get the faulting address: */
+       address = read_cr2();
+
+       /*
+        * Detect and handle instructions that would cause a page fault for
+        * both a tracked kernel page and a userspace page.
+        */
+       if (kmemcheck_active(regs))
+               kmemcheck_hide(regs);
+       prefetchw(&mm->mmap_sem);
+
+       if (unlikely(kmmio_fault(regs, address)))
+               return;
+
+       /*
+        * We fault-in kernel-space virtual memory on-demand. The
+        * 'reference' page table is init_mm.pgd.
+        *
+        * NOTE! We MUST NOT take any locks for this case. We may
+        * be in an interrupt or a critical region, and should
+        * only copy the information from the master page table,
+        * nothing more.
+        *
+        * The vmalloc fixup below is only attempted when none of
+        * PF_RSVD, PF_USER or PF_PROT is set in error_code, i.e. for
+        * a kernel-mode fault that was not a protection error.
+        */
+       if (unlikely(fault_in_kernel_space(address))) {
+               /* Faults in hypervisor area can never be patched up. */
+#if defined(CONFIG_X86_XEN)
+               if (address >= hypervisor_virt_start) {
+#elif defined(CONFIG_X86_64_XEN)
+               if (address >= HYPERVISOR_VIRT_START
+                   && address < HYPERVISOR_VIRT_END) {
+#endif
+                       bad_area_nosemaphore(regs, error_code, address);
+                       return;
+               }
+
+               if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+                       if (vmalloc_fault(address) >= 0)
+                               return;
+
+                       if (kmemcheck_fault(regs, address, error_code))
+                               return;
+               }
+
+               /* Can handle a stale RO->RW TLB: */
+               if (spurious_fault(error_code, address))
+                       return;
+
+               /* kprobes don't want to hook the spurious faults: */
+               if (notify_page_fault(regs))
+                       return;
+               /*
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock:
+                */
+               bad_area_nosemaphore(regs, error_code, address);
+
+               return;
+       }
+
+       /* kprobes don't want to hook the spurious faults: */
+       if (unlikely(notify_page_fault(regs)))
+               return;
+       /*
+        * It's safe to allow irqs after cr2 has been saved and the
+        * vmalloc fault has been handled.
+        *
+        * User-mode registers count as a user access even for any
+        * potential system fault or CPU buglet:
+        */
+       if (user_mode_vm(regs)) {
+               local_irq_enable();
+               error_code |= PF_USER;
+       } else {
+               if (regs->flags & X86_EFLAGS_IF)
+                       local_irq_enable();
+       }
+
+       if (unlikely(error_code & PF_RSVD))
+               pgtable_bad(regs, error_code, address);
+
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+       /*
+        * If we're in an interrupt, have no user context or are running
+        * in an atomic region then we must not take the fault:
+        */
+       if (unlikely(in_atomic() || !mm)) {
+               bad_area_nosemaphore(regs, error_code, address);
+               return;
+       }
+
+       /*
+        * When running in the kernel we expect faults to occur only to
+        * addresses in user space.  All other faults represent errors in
+        * the kernel and should generate an OOPS.  Unfortunately, in the
+        * case of an erroneous fault occurring in a code path which already
+        * holds mmap_sem we will deadlock attempting to validate the fault
+        * against the address space.  Luckily the kernel only validly
+        * references user space from well defined areas of code, which are
+        * listed in the exceptions table.
+        *
+        * As the vast majority of faults will be valid we will only perform
+        * the source reference check when there is a possibility of a
+        * deadlock. Attempt to lock the address space, if we cannot we then
+        * validate the source. If this is invalid we can skip the address
+        * space check, thus avoiding the deadlock:
+        */
+       if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+               if ((error_code & PF_USER) == 0 &&
+                   !search_exception_tables(regs->ip)) {
+                       bad_area_nosemaphore(regs, error_code, address);
+                       return;
+               }
+retry:
+               down_read(&mm->mmap_sem);
+       } else {
+               /*
+                * The above down_read_trylock() might have succeeded in
+                * which case we'll have missed the might_sleep() from
+                * down_read():
+                */
+               might_sleep();
+       }
+
+       vma = find_vma(mm, address);
+       if (unlikely(!vma)) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+       if (likely(vma->vm_start <= address))
+               goto good_area;
+       if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+       if (error_code & PF_USER) {
+               /*
+                * Accessing the stack below %sp is always a bug.
+                * The large cushion allows instructions like enter
+                * and pusha to work. ("enter $65535, $31" pushes
+                * 32 pointers and then decrements %sp by 65535.)
+                */
+               if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+                       bad_area(regs, error_code, address);
+                       return;
+               }
+       }
+       if (unlikely(expand_stack(vma, address))) {
+               bad_area(regs, error_code, address);
+               return;
+       }
+
+       /*
+        * Ok, we have a good vm_area for this memory access, so
+        * we can handle it..
+        */
+good_area:
+       if (unlikely(access_error(error_code, vma))) {
+               bad_area_access_error(regs, error_code, address);
+               return;
+       }
+
+       /*
+        * If for any reason at all we couldn't handle the fault,
+        * make sure we exit gracefully rather than endlessly redo
+        * the fault:
+        */
+       fault = handle_mm_fault(mm, vma, address, flags);
+
+       if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
+               if (mm_fault_error(regs, error_code, address, fault))
+                       return;
+       }
+
+       /*
+        * Major/minor page fault accounting is only done on the
+        * initial attempt. If we go through a retry, it is extremely
+        * likely that the page will be found in page cache at that point.
+        */
+       if (flags & FAULT_FLAG_ALLOW_RETRY) {
+               if (fault & VM_FAULT_MAJOR) {
+                       tsk->maj_flt++;
+                       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+                                     regs, address);
+               } else {
+                       tsk->min_flt++;
+                       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+                                     regs, address);
+               }
+               if (fault & VM_FAULT_RETRY) {
+                       /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+                        * of starvation. */
+                       flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       goto retry;
+               }
+       }
+
+       check_v8086_mode(regs, address, tsk);
+
+       up_read(&mm->mmap_sem);
+}
diff --git a/arch/x86/mm/highmem_32-xen.c b/arch/x86/mm/highmem_32-xen.c

new file mode 100644 (file)

index 0000000..9166ffd
--- /dev/null
+++ b/arch/x86/mm/highmem_32-xen.c
@@ -0,0 +1,196 @@
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
+
+void *kmap(struct page *page)
+{
+       might_sleep();
+
+       /* Only highmem pages go through kmap_high(): */
+       return PageHighMem(page) ? kmap_high(page) : page_address(page);
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+       /* Calling this from interrupt context is a bug: */
+       BUG_ON(in_interrupt());
+       /* Only highmem pages go through kunmap_high(): */
+       if (PageHighMem(page))
+               kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+       unsigned long vaddr;
+       int idx, type;
+
+       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       pagefault_disable();
+
+       if (!PageHighMem(page))
+               return page_address(page);
+
+       type = kmap_atomic_idx_push();
+       idx = type + KM_TYPE_NR*smp_processor_id();     /* slot index for this CPU */
+       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+       BUG_ON(!pte_none(*(kmap_pte-idx)));     /* the slot must be empty */
+       set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
+       /*arch_flush_lazy_mmu_mode();*/
+
+       return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *kmap_atomic(struct page *page) /* kmap_atomic_prot() with kmap_prot */
+{
+       return kmap_atomic_prot(page, kmap_prot);
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+/*
+ * Same as kmap_atomic(), but takes a page frame number and so can map
+ * memory that doesn't have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn)
+{
+       return kmap_atomic_prot_pfn(pfn, kmap_prot);
+}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
+
+void __kunmap_atomic(void *kvaddr)
+{
+       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+       if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+           vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+               int idx, type;
+
+               type = kmap_atomic_idx();
+               idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+               WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif /* CONFIG_DEBUG_HIGHMEM */
+               /*
+                * Force other mappings to Oops if they try to access this
+                * pte without first remapping it.  Keeping stale mappings
+                * around is a bad idea also, in case the page changes cacheability
+                * attributes or becomes a protected page in a hypervisor.
+                */
+               kpte_clear_flush(kmap_pte-idx, vaddr);
+               kmap_atomic_idx_pop();
+               /*arch_flush_lazy_mmu_mode();*/
+       }
+#ifdef CONFIG_DEBUG_HIGHMEM
+       else {
+               BUG_ON(vaddr < PAGE_OFFSET);
+               BUG_ON(vaddr >= (unsigned long)high_memory);
+       }
+#endif /* CONFIG_DEBUG_HIGHMEM */
+
+       pagefault_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+       unsigned long addr = (unsigned long)ptr;
+       pte_t *entry;
+
+       if (addr < FIXADDR_START)
+               return virt_to_page(ptr);
+
+       /* Fixmap address: read the page out of the matching kmap pte. */
+       entry = kmap_pte - (virt_to_fix(addr) - FIX_KMAP_BEGIN);
+       return pte_page(*entry);
+}
+EXPORT_SYMBOL(kmap_atomic_to_page);
+
+void clear_highpage(struct page *page)
+{
+       void *kaddr;
+
+       if (likely(xen_feature(XENFEAT_highmem_assist))
+           && PageHighMem(page)) {
+               struct mmuext_op meo;
+
+               meo.cmd = MMUEXT_CLEAR_PAGE;
+               meo.arg1.mfn = pfn_to_mfn(page_to_pfn(page));
+               if (HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
+                       return; /* the hypercall succeeded */
+       }
+       /* No assist, not highmem, or the hypercall failed: clear via a mapping. */
+       kaddr = kmap_atomic(page);
+       clear_page(kaddr);
+       kunmap_atomic(kaddr);
+}
+EXPORT_SYMBOL(clear_highpage);
+
+void copy_highpage(struct page *to, struct page *from)
+{
+       void *vfrom, *vto;
+
+       if (likely(xen_feature(XENFEAT_highmem_assist))
+           && (PageHighMem(from) || PageHighMem(to))) {
+               unsigned long from_pfn = page_to_pfn(from);
+               unsigned long to_pfn = page_to_pfn(to);
+               struct mmuext_op meo;
+
+               meo.cmd = MMUEXT_COPY_PAGE;
+               meo.arg1.mfn = pfn_to_mfn(to_pfn);
+               meo.arg2.src_mfn = pfn_to_mfn(from_pfn);
+               if (mfn_to_pfn(meo.arg2.src_mfn) == from_pfn
+                   && mfn_to_pfn(meo.arg1.mfn) == to_pfn
+                   && HYPERVISOR_mmuext_op(&meo, 1, NULL, DOMID_SELF) == 0)
+                       return; /* the hypercall did the copy */
+       }
+       /* Copy by hand; atomic kmap slots are a stack, so unmap in reverse order. */
+       vfrom = kmap_atomic(from);
+       vto = kmap_atomic(to);
+       copy_page(vto, vfrom);
+       kunmap_atomic(vto);
+       kunmap_atomic(vfrom);
+}
+EXPORT_SYMBOL(copy_highpage);
+
+void __init set_highmem_pages_init(void)
+{
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               unsigned long first_pfn, end_pfn, pfn;
+               int node;
+
+               if (!is_highmem(zone))
+                       continue;
+
+               first_pfn = zone->zone_start_pfn;
+               end_pfn = first_pfn + zone->spanned_pages;
+               node = zone_to_nid(zone);
+               printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+                               zone->name, node, first_pfn, end_pfn);
+               add_highpages_with_active_regions(node, first_pfn, end_pfn);
+
+               /* XEN: init high-mem pages outside initial allocation. */
+               pfn = first_pfn;
+               if (pfn < xen_start_info->nr_pages)
+                       pfn = xen_start_info->nr_pages;
+               for (; pfn < end_pfn; pfn++) {
+                       struct page *page = pfn_to_page(pfn);
+
+                       ClearPageReserved(page);
+                       init_page_count(page);
+               }
+       }
+       totalram_pages += totalhigh_pages;
+}
diff --git a/arch/x86/mm/hypervisor.c b/arch/x86/mm/hypervisor.c

new file mode 100644 (file)

index 0000000..60cda7f
--- /dev/null
+++ b/arch/x86/mm/hypervisor.c
@@ -0,0 +1,1314 @@
+/******************************************************************************
+ * mm/hypervisor.c
+ * 
+ * Update page tables via the hypervisor.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/features.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/vcpu.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include <linux/highmem.h>
+#ifdef CONFIG_X86_32
+#include <linux/bootmem.h> /* for max_pfn */
+#endif
+
+/* Export the hypercall page symbol. */
+EXPORT_SYMBOL(hypercall_page);
+
+/*
+ * Pointer to the shared info area.  It is initialised to empty_zero_page
+ * (presumably redirected to the real shared info page during setup --
+ * that code is not in this part of the file).
+ *
+ * Without CONFIG_XEN_VCPU_INFO_PLACEMENT the pointer itself is exported;
+ * with it, a per-CPU vcpu_info (aligned to its own size) is defined and
+ * exported instead.
+ */
+shared_info_t *__read_mostly HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+#ifndef CONFIG_XEN_VCPU_INFO_PLACEMENT
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+#else
+DEFINE_PER_CPU(struct vcpu_info, vcpu_info) __aligned(sizeof(struct vcpu_info));
+EXPORT_PER_CPU_SYMBOL(vcpu_info);
+
+/*
+ * Register the per-CPU vcpu_info of @cpu with the hypervisor using
+ * VCPUOP_register_vcpu_info.  The request carries the machine frame
+ * holding the structure plus its offset within that page.  BUG()s if the
+ * hypercall returns non-zero.
+ */
+void __ref setup_vcpu_info(unsigned int cpu)
+{
+       struct vcpu_info *v = &per_cpu(vcpu_info, cpu);
+       struct vcpu_register_vcpu_info info;
+#ifdef CONFIG_X86_64
+       static bool first = true;
+
+       /*
+        * On 64-bit only, the very first invocation translates the address
+        * with the early_ variant; every later call (and every call on
+        * 32-bit) takes the "else" arm below, which spans the #endif.
+        */
+       if (first) {
+               first = false;
+               info.mfn = early_arbitrary_virt_to_mfn(v);
+       } else
+#endif
+               info.mfn = arbitrary_virt_to_mfn(v);
+       info.offset = offset_in_page(v);
+
+       if (HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info))
+               BUG();
+}
+
+/*
+ * Swap the machine frames backing two pages: the one holding vcpu_info
+ * inside the per-CPU load image ("l") and the one holding CPU 0's
+ * per-CPU vcpu_info ("r").  Afterwards the rest of the page contents is
+ * copied from the "l" PFN to the "r" PFN.  Returns early when both
+ * already map the same MFN.  Any failing hypercall ends in BUG().
+ */
+void __init adjust_boot_vcpu_info(void)
+{
+       unsigned long lpfn, rpfn, lmfn, rmfn;
+       pte_t *lpte, *rpte;
+       unsigned int level;
+       mmu_update_t mmu[2];
+
+       /*
+        * setup_vcpu_info() cannot be used more than once for a given (v)CPU,
+        * hence we must swap the underlying MFNs of the two pages holding old
+        * and new vcpu_info of the boot CPU.
+        *
+        * Do *not* use __get_cpu_var() or this_cpu_{write,...}() here, as the
+        * per-CPU segment didn't get reloaded yet. Using this_cpu_read(), as
+        * in arch_use_lazy_mmu_mode(), though undesirable, is safe except for
+        * the accesses to variables that were updated in setup_percpu_areas().
+        */
+       lpte = lookup_address((unsigned long)&vcpu_info
+                             + (__per_cpu_load - __per_cpu_start),
+                             &level);
+       rpte = lookup_address((unsigned long)&per_cpu(vcpu_info, 0), &level);
+       BUG_ON(!lpte || !(pte_flags(*lpte) & _PAGE_PRESENT));
+       BUG_ON(!rpte || !(pte_flags(*rpte) & _PAGE_PRESENT));
+       lmfn = __pte_mfn(*lpte);
+       rmfn = __pte_mfn(*rpte);
+
+       /* Both addresses already share a machine frame: nothing to swap. */
+       if (lmfn == rmfn)
+               return;
+
+       lpfn = mfn_to_local_pfn(lmfn);
+       rpfn = mfn_to_local_pfn(rmfn);
+
+       pr_info("Swapping MFNs for PFN %lx and %lx (MFN %lx and %lx)\n",
+               lpfn, rpfn, lmfn, rmfn);
+
+       /* Point each of the two PTEs at the other page's machine frame. */
+       xen_l1_entry_update(lpte, pfn_pte_ma(rmfn, pte_pgprot(*lpte)));
+       xen_l1_entry_update(rpte, pfn_pte_ma(lmfn, pte_pgprot(*rpte)));
+       /*
+        * Redo the __va() mappings of both PFNs to match; the "l" one is
+        * remapped (read-only) on 64-bit only.  The second call also
+        * requests a TLB flush.
+        */
+#ifdef CONFIG_X86_64
+       if (HYPERVISOR_update_va_mapping((unsigned long)__va(lpfn<<PAGE_SHIFT),
+                                        pfn_pte_ma(rmfn, PAGE_KERNEL_RO), 0))
+               BUG();
+#endif
+       if (HYPERVISOR_update_va_mapping((unsigned long)__va(rpfn<<PAGE_SHIFT),
+                                        pfn_pte_ma(lmfn, PAGE_KERNEL),
+                                        UVMF_TLB_FLUSH))
+               BUG();
+
+       /* Record the swap in the physical-to-machine table ... */
+       set_phys_to_machine(lpfn, rmfn);
+       set_phys_to_machine(rpfn, lmfn);
+
+       /* ... and in the machine-to-physical one (MMU_MACHPHYS_UPDATE). */
+       mmu[0].ptr = ((uint64_t)lmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+       mmu[0].val = rpfn;
+       mmu[1].ptr = ((uint64_t)rmfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+       mmu[1].val = lpfn;
+       if (HYPERVISOR_mmu_update(mmu, 2, NULL, DOMID_SELF))
+               BUG();
+
+       /*
+        * Copy over all contents of the page just replaced, except for the
+        * vcpu_info itself, as it may have got updated after having been
+        * copied from __per_cpu_load[].
+        */
+       memcpy(__va(rpfn << PAGE_SHIFT),
+              __va(lpfn << PAGE_SHIFT),
+              (unsigned long)&vcpu_info & (PAGE_SIZE - 1));
+       /*
+        * "level" is reused here as the in-page offset of the first byte
+        * after vcpu_info; zero means vcpu_info ends exactly at the page
+        * boundary and there is no tail to copy.
+        */
+       level = (unsigned long)(&vcpu_info + 1) & (PAGE_SIZE - 1);
+       if (level)
+               memcpy(__va(rpfn << PAGE_SHIFT) + level,
+                      __va(lpfn << PAGE_SHIFT) + level,
+                      PAGE_SIZE - level);
+}
+#endif
+
+/* Capacities of the per-CPU batching arrays in struct lazy_mmu below. */
+#define NR_MC     BITS_PER_LONG
+#define NR_MMU    BITS_PER_LONG
+#define NR_MMUEXT (BITS_PER_LONG / 4)
+
+DEFINE_PER_CPU(bool, xen_lazy_mmu);
+/*
+ * Per-CPU queue of deferred hypercalls.
+ *
+ * nr_mc, nr_mmu and nr_mmuext count the used slots of mc[], mmu[] and
+ * mmuext[] respectively.  Entries in mc[] are what eventually gets issued;
+ * mmu[] and mmuext[] hold copies of request arrays that queued
+ * mmu_update / mmuext_op entries point at.
+ */
+struct lazy_mmu {
+       unsigned int nr_mc, nr_mmu, nr_mmuext;
+       multicall_entry_t mc[NR_MC];
+       mmu_update_t mmu[NR_MMU];
+       struct mmuext_op mmuext[NR_MMUEXT];
+};
+static DEFINE_PER_CPU(struct lazy_mmu, lazy_mmu);
+
+/*
+ * Tell whether hypercalls may be queued in the per-CPU lazy_mmu batch:
+ * never while irq_count() is non-zero, and, on CONFIG_PREEMPT kernels,
+ * never while preempt_count() is zero.
+ */
+static inline bool use_lazy_mmu_mode(void)
+{
+#ifdef CONFIG_PREEMPT
+       if (preempt_count() == 0)
+               return false;
+#endif
+       if (irq_count())
+               return false;
+       return true;
+}
+
+/*
+ * Report a failed hypercall and BUG().
+ *
+ * @mc: the multicall entry that failed; its op and first four arguments
+ *      are logged, and args[5] is printed as the caller address (the
+ *      queueing functions in this file store __builtin_return_address(0)
+ *      there).
+ * @rc: the failure code to log.
+ */
+static void multicall_failed(const multicall_entry_t *mc, int rc)
+{
+       pr_emerg("hypercall#%lu(%lx, %lx, %lx, %lx) failed: %d"
+                " (caller %lx)\n",
+              mc->op, mc->args[0], mc->args[1], mc->args[2], mc->args[3],
+              rc, mc->args[5]);
+       BUG();
+}
+
+/*
+ * Issue every multicall entry queued on this CPU and reset the queue
+ * counters.
+ *
+ * An entry reporting a non-zero result is passed to multicall_failed(),
+ * with one exception: when @ret_last is set, the result of the final
+ * entry is returned to the caller instead of being checked here.
+ * Returns 0 in all other cases, including an empty queue.
+ */
+static int _xen_multicall_flush(bool ret_last)
+{
+       struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
+       multicall_entry_t *batch = lazy->mc;
+       unsigned int pending = lazy->nr_mc;
+       unsigned int idx, checked;
+
+       if (pending == 0)
+               return 0;
+
+       lazy->nr_mc = 0;
+       lazy->nr_mmu = 0;
+       lazy->nr_mmuext = 0;
+
+       if (pending == 1) {
+               /* A lone entry goes out as a direct hypercall. */
+               int rc = _hypercall(int, batch->op, batch->args[0],
+                                   batch->args[1], batch->args[2],
+                                   batch->args[3], batch->args[4]);
+
+               if (unlikely(rc)) {
+                       if (ret_last)
+                               return rc;
+                       multicall_failed(batch, rc);
+               }
+               return 0;
+       }
+
+       if (HYPERVISOR_multicall(batch, pending))
+               BUG();
+
+       /* With ret_last the final entry is left for the caller to judge. */
+       checked = ret_last ? pending - 1 : pending;
+       for (idx = 0; idx < checked; idx++)
+               if (unlikely(batch[idx].result))
+                       multicall_failed(&batch[idx], batch[idx].result);
+
+       if (ret_last)
+               return batch[checked].result;
+
+       return 0;
+}
+
+/*
+ * Flush this CPU's queued multicalls, but only when use_lazy_mmu_mode()
+ * says batching is in effect; otherwise do nothing.
+ */
+void xen_multicall_flush(void)
+{
+       if (!use_lazy_mmu_mode())
+               return;
+
+       _xen_multicall_flush(false);
+}
+
+/*
+ * Queue (or directly issue) an update_va_mapping hypercall.
+ *
+ * @va:   virtual address whose mapping is to be changed
+ * @pte:  new page table entry
+ * @uvmf: UVMF_* flags passed along with the request
+ *
+ * When use_lazy_mmu_mode() is false the hypercall is made immediately and
+ * its result returned.  Otherwise the request is appended to the per-CPU
+ * multicall queue (flushing it first if full) and 0 is returned.
+ */
+int xen_multi_update_va_mapping(unsigned long va, pte_t pte,
+                               unsigned long uvmf)
+{
+       struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
+       multicall_entry_t *mc;
+
+       /* PAE passes the PTE as two words, hence the extra argument. */
+       if (unlikely(!use_lazy_mmu_mode()))
+#ifdef CONFIG_X86_PAE
+               return _hypercall4(int, update_va_mapping, va,
+                                  pte.pte_low, pte.pte_high, uvmf);
+#else
+               return _hypercall3(int, update_va_mapping, va,
+                                  pte.pte, uvmf);
+#endif
+
+       /* No free slot left: push out what is queued first. */
+       if (unlikely(lazy->nr_mc == NR_MC))
+               _xen_multicall_flush(false);
+
+       mc = lazy->mc + lazy->nr_mc++;
+       mc->op = __HYPERVISOR_update_va_mapping;
+       mc->args[0] = va;
+#ifndef CONFIG_X86_PAE
+       mc->args[1] = pte.pte;
+#else
+       mc->args[1] = pte.pte_low;
+       mc->args[2] = pte.pte_high;
+#endif
+       mc->args[MULTI_UVMFLAGS_INDEX] = uvmf;
+       /* args[5] records the caller; multicall_failed() prints it. */
+       mc->args[5] = (long)__builtin_return_address(0);
+
+       return 0;
+}
+
+/*
+ * Check whether a further request array may be appended to the queued
+ * entry @mc: it must carry hypercall number @op, have a zero args[2]
+ * (the slot the queueing functions use for the success_count pointer),
+ * and target the same @domid.
+ */
+static inline bool mmu_may_merge(const multicall_entry_t *mc,
+                                unsigned int op, domid_t domid)
+{
+       if (mc->op != op)
+               return false;
+       if (mc->args[2])
+               return false;
+       return mc->args[3] == domid;
+}
+
+/*
+ * Queue (or directly issue) an mmu_update hypercall.
+ *
+ * @src:           array of update requests
+ * @count:         number of entries in @src
+ * @success_count: optional out pointer handed through to the hypercall
+ * @domid:         domain the updates apply to
+ *
+ * Outside lazy MMU mode the hypercall is made right away.  Otherwise the
+ * requests are copied into the per-CPU batch, merged into the preceding
+ * queued mmu_update entry where mmu_may_merge() permits.  The queue is
+ * flushed before returning ("commit") when the batch array would
+ * overflow, when @success_count is non-NULL, or when any request is of a
+ * type other than MMU_NORMAL_PT_UPDATE / MMU_PT_UPDATE_PRESERVE_AD.
+ *
+ * Returns the hypercall / flush result, or 0 if the request was only
+ * queued.
+ */
+int xen_multi_mmu_update(mmu_update_t *src, unsigned int count,
+                        unsigned int *success_count, domid_t domid)
+{
+       struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
+       multicall_entry_t *mc = lazy->mc + lazy->nr_mc;
+       mmu_update_t *dst;
+       bool commit, merge;
+
+       if (unlikely(!use_lazy_mmu_mode()))
+               return _hypercall4(int, mmu_update, src, count,
+                                  success_count, domid);
+
+       commit = (lazy->nr_mmu + count) > NR_MMU || success_count;
+       merge = lazy->nr_mc && !commit
+               && mmu_may_merge(mc - 1, __HYPERVISOR_mmu_update, domid);
+       /*
+        * No free multicall slot and nothing to merge into: flush, then
+        * re-evaluate "commit" against the now empty batch array.
+        */
+       if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
+               _xen_multicall_flush(false);
+               mc = lazy->mc;
+               commit = count > NR_MMU || success_count;
+       }
+
+       /* Nothing queued and an immediate flush needed: skip the queue. */
+       if (!lazy->nr_mc && unlikely(commit))
+               return _hypercall4(int, mmu_update, src, count,
+                                  success_count, domid);
+
+       dst = lazy->mmu + lazy->nr_mmu;
+       lazy->nr_mmu += count;
+       if (merge) {
+               mc[-1].args[1] += count;
+               memcpy(dst, src, count * sizeof(*src));
+       } else {
+               ++lazy->nr_mc;
+               mc->op = __HYPERVISOR_mmu_update;
+               /*
+                * When committing, the entry references the caller's array
+                * directly (no copy); the flush happens before we return.
+                */
+               if (!commit) {
+                       mc->args[0] = (unsigned long)dst;
+                       memcpy(dst, src, count * sizeof(*src));
+               } else
+                       mc->args[0] = (unsigned long)src;
+               mc->args[1] = count;
+               mc->args[2] = (unsigned long)success_count;
+               mc->args[3] = domid;
+               mc->args[5] = (long)__builtin_return_address(0);
+       }
+
+       /* The request type lives in the low bits of each ptr field. */
+       while (!commit && count--)
+               switch (src++->ptr & (sizeof(pteval_t) - 1)) {
+               case MMU_NORMAL_PT_UPDATE:
+               case MMU_PT_UPDATE_PRESERVE_AD:
+                       break;
+               default:
+                       commit = true;
+                       break;
+               }
+
+       return commit ? _xen_multicall_flush(true) : 0;
+}
+
+/*
+ * Queue (or directly issue) an mmuext_op hypercall.
+ *
+ * @src:           array of extended MMU operations
+ * @count:         number of entries in @src
+ * @success_count: optional out pointer handed through to the hypercall
+ * @domid:         domain the operations apply to
+ *
+ * Outside lazy MMU mode the hypercall is made right away.  Otherwise the
+ * operations are copied into the per-CPU batch, merged into the preceding
+ * queued mmuext_op entry where mmu_may_merge() permits.  The queue is
+ * flushed before returning ("commit") when the batch array would
+ * overflow, when @success_count is non-NULL, or when any operation is
+ * something other than a pin/unpin or TLB flush/invlpg command.
+ *
+ * Returns the hypercall / flush result, or 0 if the request was only
+ * queued.
+ */
+int xen_multi_mmuext_op(struct mmuext_op *src, unsigned int count,
+                       unsigned int *success_count, domid_t domid)
+{
+       struct lazy_mmu *lazy = &__get_cpu_var(lazy_mmu);
+       multicall_entry_t *mc;
+       struct mmuext_op *dst;
+       bool commit, merge;
+
+       if (unlikely(!use_lazy_mmu_mode()))
+               return _hypercall4(int, mmuext_op, src, count,
+                                  success_count, domid);
+
+       /*
+        * While it could be useful in theory, I've never seen the body of
+        * this conditional to be reached, hence it seems more reasonable
+        * to disable it for the time being.
+        */
+       /*
+        * Disabled via the leading "0 &&": would fold a leading flush
+        * operation into the UVMF flags of an immediately preceding queued
+        * update_va_mapping entry.
+        */
+       if (0 && likely(count)
+           && likely(!success_count)
+           && likely(domid == DOMID_SELF)
+           && likely(lazy->nr_mc)
+           && lazy->mc[lazy->nr_mc - 1].op == __HYPERVISOR_update_va_mapping) {
+               unsigned long oldf, newf = UVMF_NONE;
+
+               switch (src->cmd) {
+               case MMUEXT_TLB_FLUSH_ALL:
+                       newf = UVMF_TLB_FLUSH | UVMF_ALL;
+                       break;
+               case MMUEXT_INVLPG_ALL:
+                       newf = UVMF_INVLPG | UVMF_ALL;
+                       break;
+               case MMUEXT_TLB_FLUSH_MULTI:
+                       newf = UVMF_TLB_FLUSH | UVMF_MULTI
+                              | (unsigned long)src->arg2.vcpumask.p;
+                       break;
+               case MMUEXT_INVLPG_MULTI:
+                       newf = UVMF_INVLPG | UVMF_MULTI
+                              | (unsigned long)src->arg2.vcpumask.p;
+                       break;
+               case MMUEXT_TLB_FLUSH_LOCAL:
+                       newf = UVMF_TLB_FLUSH | UVMF_LOCAL;
+                       break;
+               case MMUEXT_INVLPG_LOCAL:
+                       newf = UVMF_INVLPG | UVMF_LOCAL;
+                       break;
+               }
+               mc = lazy->mc + lazy->nr_mc - 1;
+               oldf = mc->args[MULTI_UVMFLAGS_INDEX];
+               /*
+                * Combine old and new flags; newf ends up as UVMF_NONE
+                * whenever the two cannot be expressed as a single flag set.
+                */
+               if (newf == UVMF_NONE || oldf == UVMF_NONE
+                   || newf == (UVMF_TLB_FLUSH | UVMF_ALL))
+                       ;
+               else if (oldf == (UVMF_TLB_FLUSH | UVMF_ALL))
+                       newf = UVMF_TLB_FLUSH | UVMF_ALL;
+               else if ((newf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
+                        && (oldf & UVMF_FLUSHTYPE_MASK) == UVMF_INVLPG
+                        && ((src->arg1.linear_addr ^ mc->args[0])
+                            >> PAGE_SHIFT))
+                       newf = UVMF_NONE;
+               else if (((oldf | newf) & UVMF_ALL)
+                        && !((oldf ^ newf) & UVMF_FLUSHTYPE_MASK))
+                       newf |= UVMF_ALL;
+               else if ((oldf ^ newf) & ~UVMF_FLUSHTYPE_MASK)
+                       newf = UVMF_NONE;
+               else if ((oldf & UVMF_FLUSHTYPE_MASK) == UVMF_TLB_FLUSH)
+                       newf = (newf & ~UVMF_FLUSHTYPE_MASK) | UVMF_TLB_FLUSH;
+               else if ((newf & UVMF_FLUSHTYPE_MASK) != UVMF_TLB_FLUSH
+                        && ((newf ^ oldf) & UVMF_FLUSHTYPE_MASK))
+                       newf = UVMF_NONE;
+               /* Folded successfully: consume the first operation. */
+               if (newf != UVMF_NONE) {
+                       mc->args[MULTI_UVMFLAGS_INDEX] = newf;
+                       ++src;
+                       if (!--count)
+                               return 0;
+               }
+       }
+
+       mc = lazy->mc + lazy->nr_mc;
+       commit = (lazy->nr_mmuext + count) > NR_MMUEXT || success_count;
+       merge = lazy->nr_mc && !commit
+               && mmu_may_merge(mc - 1, __HYPERVISOR_mmuext_op, domid);
+       /*
+        * No free multicall slot and nothing to merge into: flush, then
+        * re-evaluate "commit" against the now empty batch array.
+        */
+       if (unlikely(lazy->nr_mc == NR_MC) && !merge) {
+               _xen_multicall_flush(false);
+               mc = lazy->mc;
+               commit = count > NR_MMUEXT || success_count;
+       }
+
+       /* Nothing queued and an immediate flush needed: skip the queue. */
+       if (!lazy->nr_mc && unlikely(commit))
+               return _hypercall4(int, mmuext_op, src, count,
+                                  success_count, domid);
+
+       dst = lazy->mmuext + lazy->nr_mmuext;
+       lazy->nr_mmuext += count;
+       if (merge) {
+               mc[-1].args[1] += count;
+               memcpy(dst, src, count * sizeof(*src));
+       } else {
+               ++lazy->nr_mc;
+               mc->op = __HYPERVISOR_mmuext_op;
+               /*
+                * When committing, the entry references the caller's array
+                * directly (no copy); the flush happens before we return.
+                */
+               if (!commit) {
+                       mc->args[0] = (unsigned long)dst;
+                       memcpy(dst, src, count * sizeof(*src));
+               } else
+                       mc->args[0] = (unsigned long)src;
+               mc->args[1] = count;
+               mc->args[2] = (unsigned long)success_count;
+               mc->args[3] = domid;
+               mc->args[5] = (long)__builtin_return_address(0);
+       }
+
+       /* Only the commands listed here may stay queued. */
+       while (!commit && count--)
+               switch (src++->cmd) {
+               case MMUEXT_PIN_L1_TABLE:
+               case MMUEXT_PIN_L2_TABLE:
+               case MMUEXT_PIN_L3_TABLE:
+               case MMUEXT_PIN_L4_TABLE:
+               case MMUEXT_UNPIN_TABLE:
+               case MMUEXT_TLB_FLUSH_LOCAL:
+               case MMUEXT_INVLPG_LOCAL:
+               case MMUEXT_TLB_FLUSH_MULTI:
+               case MMUEXT_INVLPG_MULTI:
+               case MMUEXT_TLB_FLUSH_ALL:
+               case MMUEXT_INVLPG_ALL:
+                       break;
+               default:
+                       commit = true;
+                       break;
+               }
+
+       return commit ? _xen_multicall_flush(true) : 0;
+}
+
+/*
+ * Write @val into the page table entry at @ptr by way of a single
+ * mmu_update request; BUG()s if the hypercall returns a negative value.
+ */
+void xen_l1_entry_update(pte_t *ptr, pte_t val)
+{
+       mmu_update_t req;
+
+       req.ptr = ptep_to_machine(ptr);
+       req.val = __pte_val(val);
+
+       BUG_ON(HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_l1_entry_update);
+
+/*
+ * Common back end of the L2/L3/L4 entry update helpers.
+ *
+ * @mmu, @mmu_count: the mmu_update request(s) to submit
+ * @page: if non-NULL, the page is first remapped read-only at its __va()
+ *        address and flagged pinned, both in one multicall together with
+ *        the update itself; if NULL only the update is issued.
+ *
+ * Any failure ends in BUG().
+ */
+static void do_lN_entry_update(mmu_update_t *mmu, unsigned int mmu_count,
+                               struct page *page)
+{
+       multicall_entry_t calls[2];
+       unsigned long pfn;
+
+       if (unlikely(!page)) {
+               if (unlikely(HYPERVISOR_mmu_update(mmu, mmu_count,
+                                                  NULL, DOMID_SELF) < 0))
+                       BUG();
+               return;
+       }
+
+       pfn = page_to_pfn(page);
+
+       MULTI_update_va_mapping(&calls[0],
+                               (unsigned long)__va(pfn << PAGE_SHIFT),
+                               pfn_pte(pfn, PAGE_KERNEL_RO), 0);
+       SetPagePinned(page);
+       MULTI_mmu_update(&calls[1], mmu, mmu_count, NULL, DOMID_SELF);
+
+       if (unlikely(HYPERVISOR_multicall_check(calls, 2, NULL)))
+               BUG();
+}
+
+/*
+ * Install @val at the PMD slot @ptr.
+ *
+ * If the new entry is present, not large, mem_map is set up and the page
+ * containing @ptr is pinned, the page @val refers to is passed on to
+ * do_lN_entry_update() so that it gets pinned too -- unless it already is,
+ * or it is a highmem page (in which case kmap_flush_unused() is called;
+ * without CONFIG_HIGHPTE a highmem page here is a BUG()).
+ */
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
+{
+       struct page *to_pin = NULL;
+       mmu_update_t req;
+
+       if (likely(pmd_present(val)) && likely(!pmd_large(val))
+           && likely(mem_map)
+           && likely(PagePinned(virt_to_page(ptr)))) {
+               struct page *target = pmd_page(val);
+
+               if (unlikely(PagePinned(target))) {
+                       /* Already pinned: plain update only. */
+               } else if (PageHighMem(target)) {
+#ifndef CONFIG_HIGHPTE
+                       BUG();
+#endif
+                       kmap_flush_unused();
+               } else {
+                       to_pin = target;
+               }
+       }
+
+       req.ptr = virt_to_machine(ptr);
+       req.val = __pmd_val(val);
+       do_lN_entry_update(&req, 1, to_pin);
+}
+
+#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
+/*
+ * Install @val at the PUD slot @ptr.
+ *
+ * If the new entry is present (and, on 64-bit, not large), mem_map is set
+ * up and the page containing @ptr is pinned, the page @val refers to is
+ * passed on to do_lN_entry_update() for pinning unless it is pinned
+ * already.
+ */
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+       struct page *to_pin = NULL;
+       mmu_update_t req;
+
+       if (likely(pud_present(val))
+#ifdef CONFIG_X86_64
+           && likely(!pud_large(val))
+#endif
+           && likely(mem_map)
+           && likely(PagePinned(virt_to_page(ptr)))) {
+               struct page *target = pud_page(val);
+
+               if (likely(!PagePinned(target)))
+                       to_pin = target;
+       }
+
+       req.ptr = virt_to_machine(ptr);
+       req.val = __pud_val(val);
+       do_lN_entry_update(&req, 1, to_pin);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Install @val at the PGD slot @ptr.
+ *
+ * When the slot's offset within its page does not exceed that of
+ * pgd_index(TASK_SIZE_MAX), the same value is also written to the
+ * corresponding slot obtained through __user_pgd() (which must exist).
+ * The page @val refers to is handed to do_lN_entry_update() for pinning
+ * under the same conditions as for the lower levels.
+ */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
+{
+       struct page *to_pin = NULL;
+       unsigned int nr_req = 1;
+       mmu_update_t req[2];
+
+       if (likely(pgd_present(val)) && likely(mem_map)
+           && likely(PagePinned(virt_to_page(ptr)))) {
+               struct page *target = pgd_page(val);
+
+               if (likely(!PagePinned(target)))
+                       to_pin = target;
+       }
+
+       req[0].ptr = virt_to_machine(ptr);
+       req[0].val = __pgd_val(val);
+
+       if (((unsigned long)ptr & ~PAGE_MASK)
+           <= pgd_index(TASK_SIZE_MAX) * sizeof(*ptr)) {
+               ptr = __user_pgd(ptr);
+               BUG_ON(!ptr);
+               req[1].ptr = virt_to_machine(ptr);
+               req[1].val = __pgd_val(val);
+               nr_req = 2;
+       }
+
+       do_lN_entry_update(req, nr_req, to_pin);
+}
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Issue MMUEXT_NEW_BASEPTR with the machine frame of @pgd; BUG()s if the
+ * hypercall returns a negative value.
+ */
+void xen_pt_switch(pgd_t *pgd)
+{
+       struct mmuext_op req;
+
+       req.cmd = MMUEXT_NEW_BASEPTR;
+       req.arg1.mfn = virt_to_mfn(pgd);
+
+       BUG_ON(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0);
+}
+
+/*
+ * Issue MMUEXT_NEW_USER_BASEPTR with the machine frame of the table
+ * returned by __user_pgd(@pgd), or with 0 when there is none; BUG()s if
+ * the hypercall returns a negative value.
+ */
+void xen_new_user_pt(pgd_t *pgd)
+{
+       struct mmuext_op req;
+
+       pgd = __user_pgd(pgd);
+
+       req.cmd = MMUEXT_NEW_USER_BASEPTR;
+       if (pgd)
+               req.arg1.mfn = virt_to_mfn(pgd);
+       else
+               req.arg1.mfn = 0;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0);
+}
+#endif
+
+/*
+ * Issue an MMUEXT_TLB_FLUSH_LOCAL operation; BUG()s if the hypercall
+ * returns a negative value.
+ */
+void xen_tlb_flush(void)
+{
+       struct mmuext_op flush;
+
+       flush.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&flush, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL(xen_tlb_flush);
+
+/*
+ * Issue an MMUEXT_INVLPG_LOCAL operation for the page containing @ptr
+ * (the address is masked with PAGE_MASK first); BUG()s if the hypercall
+ * returns a negative value.
+ */
+void xen_invlpg(unsigned long ptr)
+{
+       struct mmuext_op inval;
+
+       inval.cmd = MMUEXT_INVLPG_LOCAL;
+       inval.arg1.linear_addr = ptr & PAGE_MASK;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&inval, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL(xen_invlpg);
+
+#ifdef CONFIG_SMP
+
+/*
+ * Issue an MMUEXT_TLB_FLUSH_ALL operation; BUG()s if the hypercall
+ * returns a negative value.
+ */
+void xen_tlb_flush_all(void)
+{
+       struct mmuext_op flush;
+
+       flush.cmd = MMUEXT_TLB_FLUSH_ALL;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&flush, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_tlb_flush_all);
+
+/*
+ * Issue an MMUEXT_TLB_FLUSH_MULTI operation with @mask as the vcpumask
+ * argument.  An empty mask is a no-op.  BUG()s if the hypercall returns
+ * a negative value.
+ */
+void xen_tlb_flush_mask(const cpumask_t *mask)
+{
+       struct mmuext_op flush;
+
+       if (cpus_empty(*mask))
+               return;
+
+       flush.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       set_xen_guest_handle(flush.arg2.vcpumask, cpus_addr(*mask));
+
+       BUG_ON(HYPERVISOR_mmuext_op(&flush, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_tlb_flush_mask);
+
+/*
+ * Issue an MMUEXT_INVLPG_ALL operation for the page containing @ptr
+ * (the address is masked with PAGE_MASK first); BUG()s if the hypercall
+ * returns a negative value.
+ */
+void xen_invlpg_all(unsigned long ptr)
+{
+       struct mmuext_op inval;
+
+       inval.cmd = MMUEXT_INVLPG_ALL;
+       inval.arg1.linear_addr = ptr & PAGE_MASK;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&inval, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_invlpg_all);
+
+/*
+ * Issue an MMUEXT_INVLPG_MULTI operation for the page containing @ptr,
+ * with @mask as the vcpumask argument.  An empty mask is a no-op.
+ * BUG()s if the hypercall returns a negative value.
+ */
+void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr)
+{
+       struct mmuext_op inval;
+
+       if (cpus_empty(*mask))
+               return;
+
+       inval.cmd = MMUEXT_INVLPG_MULTI;
+       inval.arg1.linear_addr = ptr & PAGE_MASK;
+       set_xen_guest_handle(inval.arg2.vcpumask, cpus_addr(*mask));
+
+       BUG_ON(HYPERVISOR_mmuext_op(&inval, 1, NULL, DOMID_SELF) < 0);
+}
+EXPORT_SYMBOL_GPL(xen_invlpg_mask);
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Number of mmuext operations xen_pgd_pin()/xen_pgd_unpin() submit per
+ * call: two on 64-bit (a second table is handled alongside @pgd), one
+ * otherwise.
+ */
+#ifdef CONFIG_X86_64
+#define NR_PGD_PIN_OPS 2
+#else
+#define NR_PGD_PIN_OPS 1
+#endif
+
+/*
+ * Pin the page directory @pgd.
+ *
+ * Without CONFIG_X86_64 this is a single MMUEXT_PIN_L3_TABLE operation.
+ * On 64-bit, @pgd is pinned as an L4 table instead, and a second
+ * operation pins either the table returned by __user_pgd(@pgd) (as L4)
+ * or, if there is none, level3_user_pgt (as L3).
+ *
+ * BUG()s if the hypercall returns a negative value.
+ */
+void xen_pgd_pin(pgd_t *pgd)
+{
+       struct mmuext_op op[NR_PGD_PIN_OPS];
+
+       op[0].cmd = MMUEXT_PIN_L3_TABLE;
+       op[0].arg1.mfn = virt_to_mfn(pgd);
+#ifdef CONFIG_X86_64
+       /* Overrides the L3 command set just above. */
+       op[1].cmd = op[0].cmd = MMUEXT_PIN_L4_TABLE;
+       pgd = __user_pgd(pgd);
+       if (pgd)
+               op[1].arg1.mfn = virt_to_mfn(pgd);
+       else {
+               op[1].cmd = MMUEXT_PIN_L3_TABLE;
+               op[1].arg1.mfn = pfn_to_mfn(__pa_symbol(level3_user_pgt)
+                                           >> PAGE_SHIFT);
+       }
+#endif
+       if (HYPERVISOR_mmuext_op(op, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
+               BUG();
+}
+
+/*
+ * Unpin the page directory @pgd and, on 64-bit, also the table returned
+ * by __user_pgd(@pgd), which must exist.  BUG()s if the hypercall
+ * returns a negative value.
+ */
+void xen_pgd_unpin(pgd_t *pgd)
+{
+       struct mmuext_op unpin[NR_PGD_PIN_OPS];
+
+       unpin[0].cmd = MMUEXT_UNPIN_TABLE;
+       unpin[0].arg1.mfn = virt_to_mfn(pgd);
+
+#ifdef CONFIG_X86_64
+       pgd = __user_pgd(pgd);
+       BUG_ON(!pgd);
+
+       unpin[1].cmd = MMUEXT_UNPIN_TABLE;
+       unpin[1].arg1.mfn = virt_to_mfn(pgd);
+#endif
+
+       if (HYPERVISOR_mmuext_op(unpin, NR_PGD_PIN_OPS, NULL, DOMID_SELF) < 0)
+               BUG();
+}
+
+/*
+ * Issue an MMUEXT_SET_LDT operation with @ptr as the linear address and
+ * @ents as the number of entries; BUG()s if the hypercall returns a
+ * negative value.
+ */
+void xen_set_ldt(const void *ptr, unsigned int ents)
+{
+       struct mmuext_op req;
+
+       req.cmd = MMUEXT_SET_LDT;
+       req.arg1.linear_addr = (unsigned long)ptr;
+       req.arg2.nr_ents = ents;
+
+       BUG_ON(HYPERVISOR_mmuext_op(&req, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Protected by balloon_lock. */
+/*
+ * Scratch buffers for the contiguous-region functions: discontig_frames
+ * holds one frame number and cr_mcl one multicall entry per page of a
+ * region of up to 2^max_contig_order pages.  Both initially point at
+ * static __initdata arrays sized for INIT_CONTIG_ORDER;
+ * init_contig_order() and check_contig_order() repoint them at
+ * vmalloc()ed memory.
+ */
+#define INIT_CONTIG_ORDER 6 /* 256kB */
+static unsigned int __read_mostly max_contig_order = INIT_CONTIG_ORDER;
+static unsigned long __initdata init_df[1U << INIT_CONTIG_ORDER];
+static unsigned long *__refdata discontig_frames = init_df;
+static multicall_entry_t __initdata init_mc[1U << INIT_CONTIG_ORDER];
+static multicall_entry_t *__refdata cr_mcl = init_mc;
+
+/*
+ * Early initcall: move the contiguous-region scratch buffers off the
+ * static __initdata arrays onto one vmalloc()ed area that holds
+ * 2^INIT_CONTIG_ORDER frame numbers followed by the same number of
+ * multicall entries.  BUG()s if the allocation fails.  Always returns 0.
+ */
+static int __init init_contig_order(void)
+{
+       const unsigned int nr_slots = 1U << INIT_CONTIG_ORDER;
+
+       discontig_frames = vmalloc((sizeof(*discontig_frames)
+                                   + sizeof(*cr_mcl)) << INIT_CONTIG_ORDER);
+       BUG_ON(!discontig_frames);
+
+       /* The multicall entries live right behind the frame array. */
+       cr_mcl = (void *)(discontig_frames + nr_slots);
+
+       return 0;
+}
+early_initcall(init_contig_order);
+
+/*
+ * Make sure the contiguous-region scratch buffers can hold 2^@order
+ * entries, replacing them with a larger vmalloc()ed area if needed.
+ *
+ * The new area is allocated outside balloon_lock; the need to grow is
+ * re-checked with the lock held, so a concurrent caller may already have
+ * enlarged the buffers.  Whichever area ends up unused (the old buffers,
+ * or our fresh allocation if we lost that race) is freed afterwards.
+ *
+ * Returns 0 on success, -ENOMEM if @order is out of range or the
+ * allocation fails.
+ *
+ * NOTE(review): if this ever grows the buffers before init_contig_order()
+ * has run, the "old" area handed to vfree() would be the static init_df
+ * array -- confirm no caller can get here that early with a large order.
+ */
+static int check_contig_order(unsigned int order)
+{
+       unsigned long *df;
+       unsigned long flags;
+       bool adjusted = false;
+
+#ifdef CONFIG_64BIT
+       if (unlikely(order >= 32))
+#else
+       if (unlikely(order > BITS_PER_LONG - fls(sizeof(*cr_mcl))))
+#endif
+               return -ENOMEM;
+
+       if (likely(order <= max_contig_order))
+               return 0;
+
+       df = __vmalloc((sizeof(*discontig_frames) + sizeof(*cr_mcl)) << order,
+                      GFP_ATOMIC, PAGE_KERNEL);
+       if (!df)
+               return -ENOMEM;
+
+       balloon_lock(flags);
+       if (order > max_contig_order) {
+               void *temp = discontig_frames;
+
+               discontig_frames = df;
+               cr_mcl = (void *)(df + (1U << order));
+               df = temp;
+
+               /* Publish the new buffers before the new limit. */
+               wmb();
+               max_contig_order = order;
+               adjusted = true;
+       }
+       balloon_unlock(flags);
+
+       /* df is now the replaced area, or our own unused allocation. */
+       vfree(df);
+
+       /* Only claim an adjustment when one actually took place. */
+       if (adjusted)
+               pr_info("Adjusted maximum contiguous region order to %u\n",
+                       order);
+
+       return 0;
+}
+
+/* Ensure multi-page extents are contiguous in machine memory. */
+/*
+ * @vstart:       virtual start address of the region
+ * @order:        the region spans 2^order pages
+ * @address_bits: passed to the hypervisor as exchange.out.address_bits
+ *
+ * Under balloon_lock: unmaps the pages, asks the hypervisor to exchange
+ * their 2^order single frames for one extent of order @order, then maps
+ * the result (or, on failure, the original frames) back in place.
+ *
+ * Returns 0 on success (and immediately for auto-translated guests),
+ * -EINVAL if the region fails the address range checks, or -ENOMEM if
+ * check_contig_order() fails or no contiguous extent could be obtained.
+ */
+int xen_create_contiguous_region(
+       unsigned long vstart, unsigned int order, unsigned int address_bits)
+{
+       unsigned long *in_frames, out_frame, frame, flags;
+       unsigned int   i;
+       int            rc, success;
+#ifdef CONFIG_64BIT
+       pte_t         *ptep = NULL;
+#endif
+       struct xen_memory_exchange exchange = {
+               .in = {
+                       .nr_extents   = 1UL << order,
+                       .extent_order = 0,
+                       .domid        = DOMID_SELF
+               },
+               .out = {
+                       .nr_extents   = 1,
+                       .extent_order = order,
+                       .address_bits = address_bits,
+                       .domid        = DOMID_SELF
+               }
+       };
+
+       /*
+        * Currently an auto-translated guest will not perform I/O, nor will
+        * it require PAE page directories below 4GB. Therefore any calls to
+        * this function are redundant and can be ignored.
+        */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+
+       rc = check_contig_order(order);
+       if (unlikely(rc))
+               return rc;
+
+#ifdef CONFIG_64BIT
+       /*
+        * Region above PAGE_OFFSET + MAXMEM: it must lie between
+        * __START_KERNEL_map and _brk_end.  If its __va(__pa()) alias has a
+        * non-empty PTE, ptep is set and that alias is updated as well,
+        * which doubles the number of multicall entries needed (hence the
+        * check for order + 1).
+        */
+       if (unlikely(vstart > PAGE_OFFSET + MAXMEM)) {
+               unsigned int level;
+
+               if (vstart < __START_KERNEL_map
+                   || vstart + (PAGE_SIZE << order) > _brk_end)
+                       return -EINVAL;
+               ptep = lookup_address((unsigned long)__va(__pa(vstart)),
+                                     &level);
+               if (ptep && pte_none(*ptep))
+                       ptep = NULL;
+               if (vstart < __START_KERNEL && ptep)
+                       return -EINVAL;
+               rc = check_contig_order(order + 1);
+               if (unlikely(rc))
+                       return rc;
+       }
+#else
+       if (unlikely(vstart + (PAGE_SIZE << order) > (unsigned long)high_memory))
+               return -EINVAL;
+#endif
+
+       set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+
+       xen_scrub_pages((void *)vstart, 1 << order);
+
+       balloon_lock(flags);
+
+       /* The shared scratch buffers may only be used with the lock held. */
+       in_frames = discontig_frames;
+       set_xen_guest_handle(exchange.in.extent_start, in_frames);
+
+       /* 1. Zap current PTEs, remembering MFNs. */
+       for (i = 0; i < (1U<<order); i++) {
+               in_frames[i] = pfn_to_mfn((__pa(vstart) >> PAGE_SHIFT) + i);
+               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+                                       __pte_ma(0), 0);
+#ifdef CONFIG_64BIT
+               if (ptep)
+                       MULTI_update_va_mapping(cr_mcl + i + (1U << order),
+                               (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
+                               __pte_ma(0), 0);
+#endif
+               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
+                       INVALID_P2M_ENTRY);
+       }
+#ifdef CONFIG_64BIT
+       /* The alias entries fill the second half of cr_mcl: double i. */
+       if (ptep)
+               i += i;
+#endif
+       if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+               BUG();
+
+       /* 2. Get a new contiguous memory extent. */
+       out_frame = __pa(vstart) >> PAGE_SHIFT;
+       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+       success = (exchange.nr_exchanged == (1UL << order));
+       /* The exchange is expected to be all-or-nothing. */
+       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+       BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               /* Compatibility when XENMEM_exchange is unsupported. */
+               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                        &exchange.in) != (1UL << order))
+                       BUG();
+               success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                               &exchange.out) == 1);
+               if (!success) {
+                       /* Couldn't get special memory: fall back to normal. */
+                       for (i = 0; i < (1U<<order); i++)
+                               in_frames[i] = (__pa(vstart)>>PAGE_SHIFT) + i;
+                       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                                &exchange.in) != (1UL<<order))
+                               BUG();
+               }
+       }
+#endif
+
+       /* 3. Map the new extent in place of old pages. */
+       for (i = 0; i < (1U<<order); i++) {
+               /* On failure the frames in in_frames[] are mapped back. */
+               frame = success ? (out_frame + i) : in_frames[i];
+               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+                                       pfn_pte_ma(frame, PAGE_KERNEL), 0);
+#ifdef CONFIG_64BIT
+               if (ptep)
+                       MULTI_update_va_mapping(cr_mcl + i + (1U << order),
+                               (unsigned long)__va(__pa(vstart)) + (i*PAGE_SIZE),
+                               pfn_pte_ma(frame, PAGE_KERNEL_RO), 0);
+#endif
+               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
+       }
+#ifdef CONFIG_64BIT
+       if (ptep)
+               i += i;
+#endif
+       /*
+        * Attach the flush request to the last entry only: a full TLB flush
+        * for multi-page regions, a single-page invalidation for order 0.
+        */
+       cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
+                                                  ? UVMF_TLB_FLUSH|UVMF_ALL
+                                                  : UVMF_INVLPG|UVMF_ALL;
+       if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+               BUG();
+
+       balloon_unlock(flags);
+
+       return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
+
+/*
+ * xen_destroy_contiguous_region - give a machine-contiguous extent back to
+ * Xen in exchange for individual frames.
+ * @vstart: kernel virtual address of the first page of the region
+ * @order:  log2 of the number of pages in the region
+ *
+ * Does nothing for auto-translated guests or when @order exceeds
+ * max_contig_order.  Otherwise the pages are scrubbed and, under the
+ * balloon lock, unmapped, exchanged via XENMEM_exchange for 2^@order
+ * order-0 frames, and mapped again.  If the exchange fails, the region is
+ * first remapped onto its original frames and then handed back one page
+ * at a time (see the !success path at the end of the function).
+ */
+void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
+{
+       unsigned long *out_frames, in_frame, frame, flags;
+       unsigned int   i;
+       int            rc, success;
+       /* One order-@order extent goes in, 2^@order single frames come out. */
+       struct xen_memory_exchange exchange = {
+               .in = {
+                       .nr_extents   = 1,
+                       .extent_order = order,
+                       .domid        = DOMID_SELF
+               },
+               .out = {
+                       .nr_extents   = 1UL << order,
+                       .extent_order = 0,
+                       .domid        = DOMID_SELF
+               }
+       };
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return;
+
+       if (unlikely(order > max_contig_order))
+               return;
+
+       set_xen_guest_handle(exchange.in.extent_start, &in_frame);
+
+       xen_scrub_pages((void *)vstart, 1 << order);
+
+       balloon_lock(flags);
+
+       /* discontig_frames and cr_mcl are shared; used here under balloon_lock. */
+       out_frames = discontig_frames;
+       set_xen_guest_handle(exchange.out.extent_start, out_frames);
+
+       /* 1. Find start MFN of contiguous extent. */
+       in_frame = pfn_to_mfn(__pa(vstart) >> PAGE_SHIFT);
+
+       /* 2. Zap current PTEs. */
+       for (i = 0; i < (1U<<order); i++) {
+               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+                                       __pte_ma(0), 0);
+               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i,
+                       INVALID_P2M_ENTRY);
+               /* The exchange output slots are seeded with the region's PFNs. */
+               out_frames[i] = (__pa(vstart) >> PAGE_SHIFT) + i;
+       }
+       if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+               BUG();
+
+       /* 3. Do the exchange for non-contiguous MFNs. */
+       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+       success = (exchange.nr_exchanged == 1);
+       /* A failure must be complete (nothing exchanged) and report an error. */
+       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+       BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               /* Compatibility when XENMEM_exchange is unsupported. */
+               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                        &exchange.in) != 1)
+                       BUG();
+               if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                        &exchange.out) != (1UL << order))
+                       BUG();
+               success = 1;
+       }
+#endif
+
+       /* 4. Map new pages in place of old pages. */
+       for (i = 0; i < (1U<<order); i++) {
+               /* On failure fall back to the original contiguous frames. */
+               frame = success ? out_frames[i] : (in_frame + i);
+               MULTI_update_va_mapping(cr_mcl + i, vstart + (i*PAGE_SIZE),
+                                       pfn_pte_ma(frame, PAGE_KERNEL), 0);
+               set_phys_to_machine((__pa(vstart)>>PAGE_SHIFT)+i, frame);
+       }
+
+       /*
+        * Only the last entry of the batch carries flush flags: a full TLB
+        * flush for multi-page regions, a single INVLPG for order 0.
+        */
+       cr_mcl[i - 1].args[MULTI_UVMFLAGS_INDEX] = order
+                                                  ? UVMF_TLB_FLUSH|UVMF_ALL
+                                                  : UVMF_INVLPG|UVMF_ALL;
+       if (HYPERVISOR_multicall_check(cr_mcl, i, NULL))
+               BUG();
+
+       balloon_unlock(flags);
+
+       if (unlikely(!success)) {
+               /* Try hard to get the special memory back to Xen. */
+               /*
+                * Page by page: allocate an ordinary page, move its machine
+                * frame under the region's virtual address, and release the
+                * region's original frame with XENMEM_decrease_reservation.
+                */
+               exchange.in.extent_order = 0;
+               set_xen_guest_handle(exchange.in.extent_start, &in_frame);
+
+               for (i = 0; i < (1U<<order); i++) {
+                       struct page *page = alloc_page(__GFP_HIGHMEM|__GFP_COLD);
+                       unsigned long pfn;
+                       mmu_update_t mmu;
+                       /* Number of multicall entries queued for this page. */
+                       unsigned int j = 0;
+
+                       if (!page) {
+                               pr_warn("Xen and kernel out of memory"
+                                       " while trying to release an order"
+                                       " %u contiguous region\n", order);
+                               break;
+                       }
+                       pfn = page_to_pfn(page);
+
+                       balloon_lock(flags);
+
+                       if (!PageHighMem(page)) {
+                               void *v = __va(pfn << PAGE_SHIFT);
+
+                               /* Drop the donor page's own kernel mapping. */
+                               xen_scrub_pages(v, 1);
+                               MULTI_update_va_mapping(cr_mcl + j, (unsigned long)v,
+                                                       __pte_ma(0), UVMF_INVLPG|UVMF_ALL);
+                               ++j;
+                       }
+#ifdef CONFIG_XEN_SCRUB_PAGES
+                       else {
+                               xen_scrub_pages(kmap(page), 1);
+                               kunmap(page);
+                               kmap_flush_unused();
+                       }
+#endif
+
+                       /* Detach the donor page's frame from its PFN ... */
+                       frame = pfn_to_mfn(pfn);
+                       set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+
+                       /* ... and map it at the current page of the region. */
+                       MULTI_update_va_mapping(cr_mcl + j, vstart,
+                                               pfn_pte_ma(frame, PAGE_KERNEL),
+                                               UVMF_INVLPG|UVMF_ALL);
+                       ++j;
+
+                       pfn = __pa(vstart) >> PAGE_SHIFT;
+                       set_phys_to_machine(pfn, frame);
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /* Queue the matching machine-to-physical update. */
+                               mmu.ptr = ((uint64_t)frame << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+                               mmu.val = pfn;
+                               cr_mcl[j].op = __HYPERVISOR_mmu_update;
+                               cr_mcl[j].args[0] = (unsigned long)&mmu;
+                               cr_mcl[j].args[1] = 1;
+                               cr_mcl[j].args[2] = 0;
+                               cr_mcl[j].args[3] = DOMID_SELF;
+                               ++j;
+                       }
+
+                       /* Last entry: release in_frame (exchange.in points at it). */
+                       cr_mcl[j].op = __HYPERVISOR_memory_op;
+                       cr_mcl[j].args[0] = XENMEM_decrease_reservation;
+                       cr_mcl[j].args[1] = (unsigned long)&exchange.in;
+
+                       if (HYPERVISOR_multicall(cr_mcl, j + 1))
+                               BUG();
+                       /* decrease_reservation must report one extent; the rest 0. */
+                       BUG_ON(cr_mcl[j].result != 1);
+                       while (j--)
+                               BUG_ON(cr_mcl[j].result != 0);
+
+                       balloon_unlock(flags);
+
+                       free_empty_pages(&page, 1);
+
+                       in_frame++;
+                       vstart += PAGE_SIZE;
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
+
+/*
+ * early_create_contiguous_region - boot-time variant that makes a PFN range
+ * machine-contiguous.
+ * @pfn:          first pseudo-physical frame of the range
+ * @order:        log2 of the number of frames
+ * @address_bits: passed to Xen as the address width limit of the new extent
+ *
+ * Exchanges the 2^@order frames currently behind @pfn for one order-@order
+ * extent and records the result in the phys-to-machine table.  Unlike the
+ * run-time variants, no page table entries are touched here and the balloon
+ * lock is not taken.
+ *
+ * Returns 0 on success or for auto-translated guests, -ENOMEM if @order
+ * exceeds max_contig_order or the exchange failed (in which case the
+ * phys-to-machine entries are restored from in_frames[]).
+ */
+int __init early_create_contiguous_region(unsigned long pfn,
+                                         unsigned int order,
+                                         unsigned int address_bits)
+{
+       /* out_frame is seeded with the PFN the new extent is to appear at. */
+       unsigned long *in_frames = discontig_frames, out_frame = pfn;
+       unsigned int i;
+       int rc, success;
+       struct xen_memory_exchange exchange = {
+               .in = {
+                       .nr_extents   = 1UL << order,
+                       .extent_order = 0,
+                       .domid        = DOMID_SELF
+               },
+               .out = {
+                       .nr_extents   = 1,
+                       .extent_order = order,
+                       .address_bits = address_bits,
+                       .domid        = DOMID_SELF
+               }
+       };
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+
+       if (unlikely(order > max_contig_order))
+               return -ENOMEM;
+
+       /* Remember the current MFNs and invalidate their P2M entries. */
+       for (i = 0; i < (1U << order); ++i) {
+               in_frames[i] = pfn_to_mfn(pfn + i);
+               set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY);
+       }
+
+       set_xen_guest_handle(exchange.in.extent_start, in_frames);
+       set_xen_guest_handle(exchange.out.extent_start, &out_frame);
+
+       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+       success = (exchange.nr_exchanged == (1UL << order));
+       /* A failure must be complete (nothing exchanged) and report an error. */
+       BUG_ON(!success && (exchange.nr_exchanged || !rc));
+       BUG_ON(success && rc);
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               /* Compatibility when XENMEM_exchange is unavailable. */
+               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                        &exchange.in) != (1UL << order))
+                       BUG();
+               success = (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                               &exchange.out) == 1);
+               if (!success) {
+                       /*
+                        * No contiguous extent available: repopulate the range
+                        * with ordinary frames.  NOTE(review): presumably Xen
+                        * overwrites in_frames[] with the new MFNs - confirm.
+                        */
+                       for (i = 0; i < (1U << order); ++i)
+                               in_frames[i] = pfn + i;
+                       if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                                &exchange.in) != (1UL << order))
+                               BUG();
+               }
+       }
+#endif
+
+       /*
+        * Record the final frames: consecutive MFNs starting at out_frame on
+        * success, otherwise whatever in_frames[] holds.
+        */
+       for (i = 0; i < (1U << order); ++i, ++out_frame) {
+               if (!success)
+                       out_frame = in_frames[i];
+               set_phys_to_machine(pfn + i, out_frame);
+       }
+
+       return success ? 0 : -ENOMEM;
+}
+
+/*
+ * Handler that xen_limit_pages_to_max_mfn() registers through
+ * SetPageForeign() on pages it has restricted.  It runs the exchange again
+ * with address_bits == 0 (the pages to exchange are then taken from the
+ * bitmap saved in pages[1..].index), clears the foreign marking, resets the
+ * page count and frees the block.
+ */
+static void undo_limit_pages(struct page *pages, unsigned int order)
+{
+       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+       BUG_ON(order > max_contig_order);
+       /* NOTE(review): the return value is ignored - confirm failure is benign. */
+       xen_limit_pages_to_max_mfn(pages, order, 0);
+       ClearPageForeign(pages);
+       init_page_count(pages);
+       __free_pages(pages, order);
+}
+
+int xen_limit_pages_to_max_mfn(
+       struct page *pages, unsigned int order, unsigned int address_bits)
+{
+       unsigned long flags, frame, *limit_map, _limit_map;
+       unsigned long *in_frames, *out_frames;
+       struct page *page;
+       unsigned int i, n, nr_mcl;
+       int rc, success;
+
+       struct xen_memory_exchange exchange = {
+               .in = {
+                       .extent_order = 0,
+                       .domid        = DOMID_SELF
+               },
+               .out = {
+                       .extent_order = 0,
+                       .address_bits = address_bits,
+                       .domid        = DOMID_SELF
+               }
+       };
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+
+       if (address_bits && address_bits < PAGE_SHIFT)
+               return -EINVAL;
+
+       rc = check_contig_order(order + 1);
+       if (unlikely(rc))
+               return rc;
+
+       if (BITS_PER_LONG >> order) {
+               limit_map = kmalloc(BITS_TO_LONGS(1U << order)
+                                   * sizeof(*limit_map), GFP_ATOMIC);
+               if (unlikely(!limit_map))
+                       return -ENOMEM;
+       } else
+               limit_map = &_limit_map;
+
+       if (address_bits)
+               bitmap_zero(limit_map, 1U << order);
+       else if (order) {
+               BUILD_BUG_ON(sizeof(pages->index) != sizeof(*limit_map));
+               for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
+                       limit_map[i] = pages[i + 1].index;
+       } else
+               __set_bit(0, limit_map);
+
+       /* 0. Scrub the pages. */
+       for (i = 0, n = 0; i < 1U<<order ; i++) {
+               page = &pages[i];
+               if (address_bits) {
+                       if (!(pfn_to_mfn(page_to_pfn(page)) >> (address_bits - PAGE_SHIFT)))
+                               continue;
+                       __set_bit(i, limit_map);
+               }
+
+               if (!PageHighMem(page))
+                       xen_scrub_pages(page_address(page), 1);
+#ifdef CONFIG_XEN_SCRUB_PAGES
+               else {
+                       xen_scrub_pages(kmap(page), 1);
+                       kunmap(page);
+                       ++n;
+               }
+#endif
+       }
+       if (bitmap_empty(limit_map, 1U << order)) {
+               if (limit_map != &_limit_map)
+                       kfree(limit_map);
+               return 0;
+       }
+
+       if (n)
+               kmap_flush_unused();
+
+       balloon_lock(flags);
+
+       in_frames = discontig_frames;
+       set_xen_guest_handle(exchange.in.extent_start, in_frames);
+       out_frames = in_frames + (1U << order);
+       set_xen_guest_handle(exchange.out.extent_start, out_frames);
+
+       /* 1. Zap current PTEs (if any), remembering MFNs. */
+       for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
+               if(!test_bit(i, limit_map))
+                       continue;
+               page = &pages[i];
+
+               out_frames[n] = page_to_pfn(page);
+               in_frames[n] = pfn_to_mfn(out_frames[n]);
+
+               if (!PageHighMem(page))
+                       MULTI_update_va_mapping(cr_mcl + nr_mcl++,
+                                               (unsigned long)page_address(page),
+                                               __pte_ma(0), 0);
+
+               set_phys_to_machine(out_frames[n], INVALID_P2M_ENTRY);
+               ++n;
+       }
+       if (nr_mcl && HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
+               BUG();
+
+       /* 2. Get new memory below the required limit. */
+       exchange.in.nr_extents = n;
+       exchange.out.nr_extents = n;
+       rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
+       success = (exchange.nr_exchanged == n);
+       BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
+       BUG_ON(success && (rc != 0));
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (unlikely(rc == -ENOSYS)) {
+               /* Compatibility when XENMEM_exchange is unsupported. */
+               if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                        &exchange.in) != n)
+                       BUG();
+               if (HYPERVISOR_memory_op(XENMEM_populate_physmap,
+                                        &exchange.out) != n)
+                       BUG();
+               success = 1;
+       }
+#endif
+
+       /* 3. Map the new pages in place of old pages. */
+       for (i = 0, n = 0, nr_mcl = 0; i < (1U<<order); i++) {
+               if(!test_bit(i, limit_map))
+                       continue;
+               page = &pages[i];
+
+               frame = success ? out_frames[n] : in_frames[n];
+
+               if (!PageHighMem(page))
+                       MULTI_update_va_mapping(cr_mcl + nr_mcl++,
+                                               (unsigned long)page_address(page),
+                                               pfn_pte_ma(frame, PAGE_KERNEL), 0);
+
+               set_phys_to_machine(page_to_pfn(page), frame);
+               ++n;
+       }
+       if (nr_mcl) {
+               cr_mcl[nr_mcl - 1].args[MULTI_UVMFLAGS_INDEX] = order
+                                                               ? UVMF_TLB_FLUSH|UVMF_ALL
+                                                               : UVMF_INVLPG|UVMF_ALL;
+               if (HYPERVISOR_multicall_check(cr_mcl, nr_mcl, NULL))
+                       BUG();
+       }
+
+       balloon_unlock(flags);
+
+       if (success && address_bits) {
+               if (order) {
+                       BUILD_BUG_ON(sizeof(*limit_map) != sizeof(pages->index));
+                       for (i = 0; i < BITS_TO_LONGS(1U << order); ++i)
+                               pages[i + 1].index = limit_map[i];
+               }
+               SetPageForeign(pages, undo_limit_pages);
+       }
+
+       if (limit_map != &_limit_map)
+               kfree(limit_map);
+
+       return success ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(xen_limit_pages_to_max_mfn);
+
+/*
+ * Emit a one-time warning that the hypervisor ran out of memory.
+ * Always returns false for now (the original author marked this as
+ * temporary).
+ */
+bool hypervisor_oom(void)
+{
+       WARN_ONCE(true, "Hypervisor is out of memory");
+
+       /* temp */
+       return false;
+}
+
+/*
+ * Invoke @func once on the part of [start_pfn, start_pfn + nr_pages) that
+ * lies below max_pfn and return its result.  Returns -1 without calling
+ * @func when the range is empty or starts at or beyond max_pfn.
+ */
+int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+                         void *arg, int (*func)(unsigned long, unsigned long,
+                                                void *))
+{
+       if (start_pfn >= max_pfn || !nr_pages)
+               return -1;
+
+       /* Clip the range so that it does not extend past max_pfn. */
+       return func(start_pfn, min(max_pfn - start_pfn, nr_pages), arg);
+}
+
+/*
+ * Write the 8-byte descriptor at @desc into slot @entry of @ldt by asking
+ * the hypervisor to update the descriptor at the slot's machine address.
+ * Returns the result of HYPERVISOR_update_descriptor().
+ */
+int write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
+{
+       const u64 *raw = desc;
+       maddr_t slot_maddr = arbitrary_virt_to_machine(&ldt[entry]);
+
+       return HYPERVISOR_update_descriptor(slot_maddr, *raw);
+}
+
+/*
+ * Write the 8-byte descriptor at @desc into slot @entry of @gdt by asking
+ * the hypervisor to update the descriptor at the slot's machine address.
+ * @type is accepted for interface compatibility and not used here.
+ * Returns the result of HYPERVISOR_update_descriptor().
+ */
+int write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc,
+                   int type)
+{
+       const u64 *raw = desc;
+       maddr_t slot_maddr = arbitrary_virt_to_machine(&gdt[entry]);
+
+       return HYPERVISOR_update_descriptor(slot_maddr, *raw);
+}
diff --git a/arch/x86/mm/init-xen.c b/arch/x86/mm/init-xen.c

new file mode 100644 (file)

index 0000000..70a07ff
--- /dev/null
+++ b/arch/x86/mm/init-xen.c
@@ -0,0 +1,502 @@
+#include <linux/gfp.h>
+#include <linux/initrd.h>
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/proto.h>
+#include <asm/dma.h>           /* for MAX_DMA_PFN */
+
+/*
+ * Early page-table buffer, expressed in page frame numbers:
+ * [pgt_buf_start, pgt_buf_top) is the space set aside by
+ * find_early_table_space(); pgt_buf_end is the next frame to hand out.
+ */
+unsigned long __meminitdata pgt_buf_start;
+unsigned long __meminitdata pgt_buf_end;
+unsigned long __meminitdata pgt_buf_top;
+
+/* init_memory_mapping() only uses the early table buffer while this is 0. */
+int after_bootmem;
+
+/*
+ * direct_gbpages: a real variable on native builds, constant 0 on 32-bit
+ * Xen.  NOTE(review): for 64-bit Xen it is not defined in this file and is
+ * presumably declared elsewhere - confirm.
+ */
+#if !defined(CONFIG_XEN)
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+                               = 1
+#endif
+;
+#elif defined(CONFIG_X86_32)
+#define direct_gbpages 0
+extern unsigned long extend_init_mapping(unsigned long tables_space);
+#else
+extern void xen_finish_init_mapping(void);
+#endif
+
+/*
+ * Size and place the buffer for the page tables needed to direct-map
+ * everything up to @end.
+ * @end:         end of the range that is going to be mapped
+ * @use_pse:     non-zero if 2M pages will be used (PTEs are then only
+ *               needed for the tail beyond the last PMD boundary)
+ * @use_gbpages: non-zero if 1G pages will be used (PMDs are then only
+ *               needed for the tail beyond the last PUD boundary)
+ *
+ * Sets pgt_buf_start and pgt_buf_top (and, where a fresh buffer is started,
+ * pgt_buf_end).  Panics if no start address could be obtained.
+ */
+static void __init find_early_table_space(unsigned long end, int use_pse,
+                                         int use_gbpages)
+{
+       unsigned long puds, pmds, ptes, tables;
+
+       /* Each level's table size is rounded up to whole pages. */
+       puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+       if (use_gbpages) {
+               unsigned long extra;
+
+               extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+               pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+       } else
+               pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+       if (use_pse) {
+               unsigned long extra;
+
+               extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+               /* 32-bit budgets PTEs for one additional PMD-sized area. */
+               extra += PMD_SIZE;
+#endif
+               ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       } else
+               ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+       /* for fixmap */
+       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+
+       pgt_buf_start = extend_init_mapping(tables);
+       pgt_buf_end = pgt_buf_start;
+#else /* CONFIG_X86_64 */
+       if (!pgt_buf_top) {
+               /* First call: start right behind the page tables Xen built. */
+               pgt_buf_start = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT) +
+                       xen_start_info->nr_pt_frames;
+               pgt_buf_end = pgt_buf_start;
+       } else {
+               /*
+                * [table_start, table_top) gets passed to reserve_early(),
+                * so we must not use table_end here, despite continuing
+                * to allocate from there. table_end possibly being below
+                * table_start is otoh not a problem.
+                *
+                * NOTE(review): table_start/table_end/table_top appear to
+                * be the former names of pgt_buf_start/pgt_buf_end/
+                * pgt_buf_top - confirm.
+                */
+               pgt_buf_start = pgt_buf_top;
+       }
+#endif
+       if (pgt_buf_start == -1UL)
+               panic("Cannot find space for the kernel page tables");
+
+       pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+
+       printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+               end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+}
+
+/*
+ * Reserve the physical range [start, end) with memblock; an empty or
+ * inverted range is silently ignored.
+ */
+void __init xen_pagetable_reserve(u64 start, u64 end)
+{
+       if (end <= start)
+               return;
+
+       memblock_reserve(start, end - start);
+}
+
+/* One piece of the direct mapping, as planned by init_memory_mapping(). */
+struct map_range {
+       unsigned long start;            /* first byte address of the range */
+       unsigned long end;              /* byte address just past the range */
+       unsigned page_size_mask;        /* bits (1 << PG_LEVEL_*) of allowed large pages */
+};
+
+/* Capacity of the map_range array; save_mr() panics when it is exceeded. */
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+/*
+ * Append the pfn range [start_pfn, end_pfn) to @mr, converted to byte
+ * addresses, unless it is empty.
+ * @mr:             array of NR_RANGE_MR entries
+ * @nr_range:       number of entries already in use
+ * @page_size_mask: page sizes allowed for this range
+ *
+ * Returns the new number of entries in use.  Panics if the array is full.
+ */
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+                            unsigned long start_pfn, unsigned long end_pfn,
+                            unsigned long page_size_mask)
+{
+       struct map_range *slot;
+
+       /* Nothing to record for an empty range. */
+       if (start_pfn >= end_pfn)
+               return nr_range;
+
+       if (nr_range >= NR_RANGE_MR)
+               panic("run out of range for init_memory_mapping\n");
+
+       slot = &mr[nr_range];
+       slot->start = start_pfn<<PAGE_SHIFT;
+       slot->end = end_pfn<<PAGE_SHIFT;
+       slot->page_size_mask = page_size_mask;
+
+       return nr_range + 1;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ *
+ * @start: first byte address of the range to map
+ * @end:   byte address just past the range to map
+ *
+ * The range is split into up to NR_RANGE_MR pieces according to the page
+ * sizes that may be used (4k head, 2M, 1G on 64-bit, 2M and 4k tails),
+ * adjacent pieces with the same page size mask are merged, and each piece
+ * is handed to kernel_physical_mapping_init().
+ *
+ * Returns the value of the last kernel_physical_mapping_init() call
+ * shifted right by PAGE_SHIFT (0 if no piece was mapped).
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+                                              unsigned long end)
+{
+       unsigned long page_size_mask = 0;
+       unsigned long start_pfn, end_pfn;
+       unsigned long ret = 0;
+       /* Byte address up to which ranges have been planned so far. */
+       unsigned long pos;
+
+       struct map_range mr[NR_RANGE_MR];
+       int nr_range, i;
+       int use_pse, use_gbpages;
+
+       printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
+       /*
+        * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+        * This will simplify cpa(), which otherwise needs to support splitting
+        * large pages into small in interrupt context, etc.
+        */
+       use_pse = use_gbpages = 0;
+#else
+       use_pse = cpu_has_pse;
+       use_gbpages = direct_gbpages;
+#endif
+
+       /* Enable PSE if available */
+       if (cpu_has_pse)
+               set_in_cr4(X86_CR4_PSE);
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __supported_pte_mask |= _PAGE_GLOBAL;
+       }
+
+       if (use_gbpages)
+               page_size_mask |= 1 << PG_LEVEL_1G;
+       if (use_pse)
+               page_size_mask |= 1 << PG_LEVEL_2M;
+
+       memset(mr, 0, sizeof(mr));
+       nr_range = 0;
+
+       /* head if not big page alignment ? */
+       start_pfn = start >> PAGE_SHIFT;
+       pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+       /*
+        * Don't use a large page for the first 2/4MB of memory
+        * because there are often fixed size MTRRs in there
+        * and overlapping MTRRs into large pages can cause
+        * slowdowns.
+        */
+       if (pos == 0)
+               end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+       else
+               end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                                << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+       end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+                       << (PMD_SHIFT - PAGE_SHIFT);
+#endif
+       if (end_pfn > (end >> PAGE_SHIFT))
+               end_pfn = end >> PAGE_SHIFT;
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+       /* big page (2M) range */
+       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                        << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+       end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+       /* Stop at the next 1G boundary, but never beyond the aligned end. */
+       end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+                        << (PUD_SHIFT - PAGE_SHIFT);
+       if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+               end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask & (1<<PG_LEVEL_2M));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+#ifdef CONFIG_X86_64
+       /* big page (1G) range */
+       start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+                        << (PUD_SHIFT - PAGE_SHIFT);
+       end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask &
+                                ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+
+       /* tail is not big page (1G) alignment */
+       start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+                        << (PMD_SHIFT - PAGE_SHIFT);
+       end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+       if (start_pfn < end_pfn) {
+               nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+                               page_size_mask & (1<<PG_LEVEL_2M));
+               pos = end_pfn << PAGE_SHIFT;
+       }
+#endif
+
+       /* tail is not big page (2M) alignment */
+       start_pfn = pos>>PAGE_SHIFT;
+       end_pfn = end>>PAGE_SHIFT;
+       nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+       /* try to merge same page size and continuous */
+       for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+               unsigned long old_start;
+               if (mr[i].end != mr[i+1].start ||
+                   mr[i].page_size_mask != mr[i+1].page_size_mask)
+                       continue;
+               /* move it */
+               old_start = mr[i].start;
+               memmove(&mr[i], &mr[i+1],
+                       (nr_range - 1 - i) * sizeof(struct map_range));
+               /* i-- so that the merged entry is compared with its new neighbour. */
+               mr[i--].start = old_start;
+               nr_range--;
+       }
+
+       for (i = 0; i < nr_range; i++)
+               printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+                               mr[i].start, mr[i].end,
+                       (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+                        (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+       /*
+        * Find space for the kernel direct mapping tables.
+        *
+        * Later we should allocate these tables in the local node of the
+        * memory mapped. Unfortunately this is done currently before the
+        * nodes are discovered.
+        */
+       if (!after_bootmem)
+               find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_64
+/* Turn a page table entry value into a pointer to the table it refers to. */
+#define addr_to_page(addr)                                             \
+       ((unsigned long *)                                              \
+        ((mfn_to_pfn(((addr) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)      \
+          << PAGE_SHIFT) + __START_KERNEL_map))
+
+       /* Only done for the mapping that starts at physical address 0. */
+       if (!start) {
+               unsigned long addr, va = __START_KERNEL_map;
+               unsigned long *page = (unsigned long *)init_level4_pgt;
+
+               /* Kill mapping of memory below _text. */
+               while (va < (unsigned long)&_text) {
+                       if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+                               BUG();
+                       va += PAGE_SIZE;
+               }
+
+               /* Blow away any spurious initial mappings. */
+               va = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
+
+               /* Walk pgd -> pud to reach the pmd table covering va. */
+               addr = page[pgd_index(va)];
+               page = addr_to_page(addr);
+               addr = page[pud_index(va)];
+               page = addr_to_page(addr);
+               /* Stop at the first empty pmd or at the next pud boundary. */
+               while (pmd_index(va) | pte_index(va)) {
+                       if (pmd_none(*(pmd_t *)&page[pmd_index(va)]))
+                               break;
+                       if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+                               BUG();
+                       va += PAGE_SIZE;
+               }
+       }
+#undef addr_to_page
+#endif
+
+       for (i = 0; i < nr_range; i++)
+               ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+                                                  mr[i].page_size_mask);
+
+#ifdef CONFIG_X86_32
+       early_ioremap_page_table_range_init();
+#endif
+
+       /*
+        * Note the dangling "else" below: on 64-bit the buffer is only
+        * truncated for mappings that do not start at 0; on 32-bit the
+        * truncation is unconditional.
+        */
+#ifdef CONFIG_X86_64
+       BUG_ON(pgt_buf_end > pgt_buf_top);
+       if (!start)
+               xen_finish_init_mapping();
+       else
+#endif
+       if (pgt_buf_end < pgt_buf_top)
+               /* Disable the 'table_end' allocator. */
+               pgt_buf_top = pgt_buf_end;
+
+       __flush_tlb_all();
+
+       /*
+        * Reserve the kernel pagetable pages we used (pgt_buf_start -
+        * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
+        * so that they can be reused for other purposes.
+        *
+        * On native it just means calling memblock_reserve, on Xen it also
+        * means marking RW the pagetable pages that we allocated before
+        * but that haven't been used.
+        *
+        * In fact on xen we mark RO the whole range pgt_buf_start -
+        * pgt_buf_top, because we have to make sure that when
+        * init_memory_mapping reaches the pagetable pages area, it maps
+        * RO all the pagetable pages, including the ones that are beyond
+        * pgt_buf_end at that time.
+        */
+       if (!after_bootmem && pgt_buf_top > pgt_buf_start) {
+#ifdef CONFIG_X86_64
+               reserve_pgtable_low();
+#endif
+               x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+                               PFN_PHYS(pgt_buf_top));
+       }
+
+       if (!after_bootmem)
+               early_memtest(start, end);
+
+       return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ *
+ * Returns 1 if access to the page is allowed, 0 otherwise.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+       /*
+        * The first megabyte consists of pages 0..255; page 256 is already
+        * above it and has to pass the checks below like any other page.
+        */
+       if (pagenr < 256)
+               return 1;
+       /* Regions claimed exclusively by a driver are never exposed. */
+       if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+               return 0;
+       /* Frames that do not translate to a local pfn below max_pfn are allowed. */
+       if (mfn_to_local_pfn(pagenr) >= max_pfn)
+               return 1;
+       return 0;
+}
+
+/*
+ * free_init_pages - release a range of init memory to the page allocator.
+ * @what:  description used in the "Freeing ..." log message
+ * @begin: virtual start address of the range
+ * @end:   virtual end address of the range (exclusive)
+ *
+ * Misaligned boundaries trigger a warning and are shrunk inwards to page
+ * boundaries.  With CONFIG_DEBUG_PAGEALLOC the range is only marked not
+ * present instead of being freed.
+ */
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+       unsigned long addr;
+       unsigned long begin_aligned, end_aligned;
+
+       /* Make sure boundaries are page aligned */
+       begin_aligned = PAGE_ALIGN(begin);
+       end_aligned   = end & PAGE_MASK;
+
+       if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+               begin = begin_aligned;
+               end   = end_aligned;
+       }
+
+       if (begin >= end)
+               return;
+
+       addr = begin;
+
+       /*
+        * If debugging page accesses then do not free this memory but
+        * mark them not present - any buggy init-section access will
+        * create a kernel page fault:
+        */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+               begin, end);
+       set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+       /*
+        * We just marked the kernel text read only above, now that
+        * we are going to free part of that, we need to make that
+        * writeable and non-executable first.
+        */
+       set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+       set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+       printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
+
+       for (; addr < end; addr += PAGE_SIZE) {
+               ClearPageReserved(virt_to_page(addr));
+               init_page_count(virt_to_page(addr));
+               /* Poison the page so that stale users of init memory show up. */
+               memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
+#ifdef CONFIG_X86_64
+               if (addr >= __START_KERNEL_map) {
+                       /* make_readonly() reports all kernel addresses. */
+                       /*
+                        * Re-establish the direct-mapping (__va) alias with
+                        * PAGE_KERNEL, then drop the kernel-image alias.
+                        */
+                       if (HYPERVISOR_update_va_mapping((unsigned long)__va(__pa(addr)),
+                                                        pfn_pte(__pa(addr) >> PAGE_SHIFT,
+                                                                PAGE_KERNEL),
+                                                        0))
+                               BUG();
+                       if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
+                               BUG();
+               }
+#endif
+               free_page(addr);
+               totalram_pages++;
+       }
+#endif
+}
+
+/* Free the kernel's init sections, i.e. [__init_begin, __init_end). */
+void free_initmem(void)
+{
+       unsigned long init_start = (unsigned long)(&__init_begin);
+       unsigned long init_stop = (unsigned long)(&__init_end);
+
+       free_init_pages("unused kernel memory", init_start, init_stop);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Free the memory occupied by the initrd, given its virtual @start and
+ * @end addresses.
+ */
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+       /*
+        * "end" may not be page aligned, and it could not be aligned
+        * earlier because the decompressor could be confused by an
+        * aligned initrd_end.  The partial page at the end has already
+        * been reserved in
+        *   - i386_start_kernel()
+        *   - x86_64_start_kernel()
+        *   - relocate_initrd()
+        * so PAGE_ALIGN() can safely be applied here to free that
+        * partial page as well.
+        */
+#ifdef CONFIG_ACPI_INITRD_TABLE_OVERRIDE
+       /* A non-zero acpi_initrd_offset moves the start of the freed range down. */
+       if (acpi_initrd_offset)
+               free_init_pages("initrd memory", start - acpi_initrd_offset,
+                               PAGE_ALIGN(end));
+       else
+#endif
+       free_init_pages("initrd memory", start, PAGE_ALIGN(end));
+}
+#endif
+
+/*
+ * Fill in the upper pfn bound of every configured zone, pass the table to
+ * free_area_init_nodes() and then call xen_init_pgd_pin().
+ */
+void __init zone_sizes_init(void)
+{
+       /* Zones without an explicit entry keep an upper bound of zero. */
+       unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+#ifdef CONFIG_ZONE_DMA
+               [ZONE_DMA]      = MAX_DMA_PFN,
+#endif
+#ifdef CONFIG_ZONE_DMA32
+               [ZONE_DMA32]    = MAX_DMA32_PFN,
+#endif
+               [ZONE_NORMAL]   = max_low_pfn,
+#ifdef CONFIG_HIGHMEM
+               [ZONE_HIGHMEM]  = max_pfn,
+#endif
+       };
+
+       free_area_init_nodes(max_zone_pfns);
+
+       xen_init_pgd_pin();
+}
+
diff --git a/arch/x86/mm/init_32-xen.c b/arch/x86/mm/init_32-xen.c

new file mode 100644 (file)

index 0000000..dee54f2
--- /dev/null
+++ b/arch/x86/mm/init_32-xen.c
@@ -0,0 +1,1018 @@
+/*
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+
+#include <asm/asm.h>
+#include <asm/bios_ebda.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/bugs.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+#include <asm/hypervisor.h>
+#include <asm/swiotlb.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/init.h>
+
+unsigned long highstart_pfn, highend_pfn;
+
+static noinline int do_test_wp_bit(void);
+
+bool __read_mostly __vmalloc_start_set = false;
+
+static __init void *alloc_low_page(void)
+{
+       unsigned long pfn = pgt_buf_end++;
+       void *adr;
+
+       if (pfn >= pgt_buf_top)
+               panic("alloc_low_page: ran out of memory");
+
+       adr = __va(pfn * PAGE_SIZE);
+       clear_page(adr);
+       return adr;
+}
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+       pud_t *pud;
+       pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+       if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
+               if (after_bootmem)
+                       pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
+               else
+                       pmd_table = (pmd_t *)alloc_low_page();
+               paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+               make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
+               set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+               pud = pud_offset(pgd, 0);
+               BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+               return pmd_table;
+       }
+#endif
+       pud = pud_offset(pgd, 0);
+       pmd_table = pmd_offset(pud, 0);
+
+       return pmd_table;
+}
+
+/*
+ * Create a page table (unless one is already installed) and place a
+ * pointer to it in a middle page directory entry:
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+#if CONFIG_XEN_COMPAT <= 0x030002
+       if (pmd_none(*pmd)) {
+#else
+       if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
+#endif
+               pte_t *page_table = NULL;
+
+               if (after_bootmem) {
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
+                       page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+#endif
+                       if (!page_table) /* NOTE(review): same call as above */
+                               page_table =
+                               (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
+               } else
+                       page_table = (pte_t *)alloc_low_page();
+
+               paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+               make_lowmem_page_readonly(page_table,
+                                         XENFEAT_writable_page_tables);
+               set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+               BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+       }
+
+       return pte_offset_kernel(pmd, 0);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+       int pgd_idx = pgd_index(vaddr);
+       int pmd_idx = pmd_index(vaddr);
+
+       return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+       int pte_idx = pte_index(vaddr);
+       pmd_t *pmd;
+
+       pmd = populate_extra_pmd(vaddr);
+       return one_page_table_init(pmd) + pte_idx;
+}
+
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+                                          unsigned long vaddr, pte_t *lastpte)
+{
+#ifdef CONFIG_HIGHMEM
+       /*
+        * Something (early fixmap) may already have put a pte
+        * page here, which causes the page table allocation
+        * to become nonlinear. Attempt to fix it, and if it
+        * is still nonlinear then we have to bug.
+        */
+       int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+       int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+       if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+           && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+           && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
+           && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
+               || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+               pte_t *newpte;
+               int i;
+
+               BUG_ON(after_bootmem);
+               newpte = alloc_low_page();
+               for (i = 0; i < PTRS_PER_PTE; i++)
+                       set_pte(newpte + i, pte[i]);
+
+               paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+               make_lowmem_page_readonly(newpte,
+                                         XENFEAT_writable_page_tables);
+               set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+               BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+               __flush_tlb_all();
+
+               paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+               make_lowmem_page_writable(pte,
+                                         XENFEAT_writable_page_tables);
+               pte = newpte;
+       }
+       BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+              && vaddr > fix_to_virt(FIX_KMAP_END)
+              && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+       return pte;
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, wherever page tables are missing in
+ * the given range.
+ *
+ * NOTE: The pagetables are allocated contiguously in physical space
+ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+       int pgd_idx, pmd_idx;
+       unsigned long vaddr;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte = NULL;
+
+       vaddr = start;
+       pgd_idx = pgd_index(vaddr);
+       pmd_idx = pmd_index(vaddr);
+       pgd = pgd_base + pgd_idx;
+
+       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+               pmd = one_md_table_init(pgd);
+               pmd = pmd + pmd_index(vaddr);
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+                                                       pmd++, pmd_idx++) {
+                       if (vaddr >= hypervisor_virt_start)
+                               break; /* XEN: no pte pages from here up */
+                       pte = page_table_kmap_check(one_page_table_init(pmd),
+                                                   pmd, vaddr, pte);
+
+                       vaddr += PMD_SIZE;
+               }
+               pmd_idx = 0;
+       }
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+       if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
+ */
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask)
+{
+       int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+       unsigned long last_map_addr = end;
+       unsigned long start_pfn, end_pfn;
+       pgd_t *pgd_base = swapper_pg_dir;
+       int pgd_idx, pmd_idx, pte_ofs;
+       unsigned long pfn;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+       unsigned pages_2m, pages_4k;
+       int mapping_iter;
+
+       start_pfn = start >> PAGE_SHIFT;
+       end_pfn = end >> PAGE_SHIFT;
+
+       /*
+        * First iteration will setup identity mapping using large/small pages
+        * based on use_pse, with other attributes same as set by
+        * the early code in head_32.S
+        *
+        * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+        * as desired for the kernel identity mapping.
+        *
+        * This two pass mechanism conforms to the TLB app note which says:
+        *
+        *     "Software should not write to a paging-structure entry in a way
+        *      that would change, for any linear address, both the page size
+        *      and either the page frame or attributes."
+        */
+       mapping_iter = 1;
+
+       if (!cpu_has_pse) {
+               use_pse = 0;
+               mapping_iter = 0;
+       }
+
+repeat:
+       pages_2m = pages_4k = 0;
+       pfn = start_pfn;
+       pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+       pgd = pgd_base + pgd_idx;
+       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+               /*
+                * Native Linux has not enabled PAE paging yet at this
+                * point.  When running as a Xen domain we are in PAE
+                * mode already, thus we can't simply hook in an empty
+                * pmd.  That would kill the mappings we are currently
+                * using ...
+                */
+               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
+               pmd = one_md_table_init(pgd);
+#endif
+
+               if (pfn >= end_pfn)
+                       continue;
+#ifdef CONFIG_X86_PAE
+               pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+               pmd += pmd_idx;
+#else
+               pmd_idx = 0;
+#endif
+               for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
+                    pmd++, pmd_idx++) {
+                       unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+                       if (addr >= hypervisor_virt_start)
+                               continue;
+
+                       /*
+                        * Map with big pages if possible, otherwise
+                        * create normal page tables:
+                        */
+                       if (use_pse) {
+                               unsigned int addr2;
+                               pgprot_t prot = PAGE_KERNEL_LARGE;
+                               /*
+                                * first pass will use the same initial
+                                * identity mapping attribute + _PAGE_PSE.
+                                */
+                               pgprot_t init_prot =
+                                       __pgprot(PTE_IDENT_ATTR |
+                                                _PAGE_PSE);
+
+                               addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+                                       PAGE_OFFSET + PAGE_SIZE-1;
+
+                               if (is_kernel_text(addr) ||
+                                   is_kernel_text(addr2))
+                                       prot = PAGE_KERNEL_LARGE_EXEC;
+
+                               pages_2m++;
+                               if (mapping_iter == 1)
+                                       set_pmd(pmd, pfn_pmd(pfn, init_prot));
+                               else
+                                       set_pmd(pmd, pfn_pmd(pfn, prot));
+
+                               pfn += PTRS_PER_PTE;
+                               continue;
+                       }
+                       pte = one_page_table_init(pmd);
+
+                       pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+                       pte += pte_ofs;
+                       for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
+                            pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+                               pgprot_t prot = PAGE_KERNEL;
+                               /*
+                                * first pass will use the same initial
+                                * identity mapping attribute.
+                                */
+                               pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
+
+                               /* XEN: Only map initial RAM allocation. */
+                               if (pfn >= xen_start_info->nr_pages || pte_present(*pte))
+                                       continue;
+                               if (is_kernel_text(addr))
+                                       prot = PAGE_KERNEL_EXEC;
+
+                               pages_4k++;
+                               if (mapping_iter == 1) {
+                                       set_pte(pte, pfn_pte(pfn, init_prot));
+                                       last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+                               } else
+                                       set_pte(pte, pfn_pte(pfn, prot));
+                       }
+               }
+       }
+       if (mapping_iter <= 1) {
+               /*
+                * update direct mapping page count only in the first
+                * iteration.
+                */
+               update_page_count(PG_LEVEL_2M, pages_2m);
+               update_page_count(PG_LEVEL_4K, pages_4k);
+       }
+       if (mapping_iter == 1) {
+               /*
+                * local global flush tlb, which will flush the previous
+                * mappings present in both small and large page TLB's.
+                */
+               __flush_tlb_all();
+
+               /*
+                * Second iteration will set the actual desired PTE attributes.
+                */
+               mapping_iter = 2;
+               goto repeat;
+       }
+       return last_map_addr;
+}
+
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+       return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+                       vaddr), vaddr), vaddr);
+}
+
+static void __init kmap_init(void)
+{
+       unsigned long kmap_vstart;
+
+       /*
+        * Cache the first kmap pte:
+        */
+       kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+       kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+       kmap_prot = PAGE_KERNEL;
+}
+
+#ifdef CONFIG_HIGHMEM
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+       unsigned long vaddr;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       vaddr = PKMAP_BASE;
+       page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       pte = pte_offset_kernel(pmd, vaddr);
+       pkmap_page_table = pte;
+}
+
+static void __init add_one_highpage_init(struct page *page)
+{
+       ClearPageReserved(page);
+       init_page_count(page);
+       __free_page(page);
+       totalhigh_pages++;
+}
+
+void __init add_highpages_with_active_regions(int nid,
+                        unsigned long start_pfn, unsigned long end_pfn)
+{
+       phys_addr_t start, end;
+       u64 i;
+
+       for_each_free_mem_range(i, nid, &start, &end, NULL) {
+               unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+                                           start_pfn, end_pfn);
+               unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+                                             start_pfn, end_pfn);
+               for ( ; pfn < e_pfn; pfn++)
+                       if (pfn_valid(pfn))
+                               add_one_highpage_init(pfn_to_page(pfn));
+       }
+}
+#else
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+#endif /* CONFIG_HIGHMEM */
+
+pgd_t *swapper_pg_dir;
+
+/*
+ * Build a proper pagetable for the kernel mappings.  Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/x86/kernel/head_32.S.  The root of the
+ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir.  In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+void __init early_ioremap_page_table_range_init(void)
+{
+       pgd_t *pgd_base = swapper_pg_dir;
+       unsigned long vaddr, end;
+
+       /*
+        * Fixed mappings, only the page table structure has to be
+        * created - mappings will be set by set_fixmap():
+        */
+       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+       end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+       page_table_range_init(vaddr, end, pgd_base);
+       early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+       pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
+
+       permanent_kmaps_init(pgd_base);
+}
+
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+       if (!arg)
+               return -EINVAL;
+
+       highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+       return 0;
+}
+early_param("highmem", parse_highmem);
+
+#define MSG_HIGHMEM_TOO_BIG \
+       "highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+       "highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
+/*
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
+ */
+void __init lowmem_pfn_init(void)
+{
+       /* max_low_pfn is 0, we already have early_res support */
+       max_low_pfn = max_pfn;
+
+       if (highmem_pages == -1)
+               highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+       if (highmem_pages >= max_pfn) {
+               printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+                       pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+               highmem_pages = 0;
+       }
+       if (highmem_pages) {
+               if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+                       printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
+                               pages_to_mb(highmem_pages));
+                       highmem_pages = 0;
+               }
+               max_low_pfn -= highmem_pages;
+       }
+#else
+       if (highmem_pages)
+               printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+       "only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+       "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+void __init highmem_pfn_init(void)
+{
+       max_low_pfn = MAXMEM_PFN;
+
+       if (highmem_pages == -1)        /* still at its initial value */
+               highmem_pages = max_pfn - MAXMEM_PFN;
+
+       if (highmem_pages + MAXMEM_PFN < max_pfn)
+               max_pfn = MAXMEM_PFN + highmem_pages;
+
+       if (highmem_pages + MAXMEM_PFN > max_pfn) {
+               printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+                       pages_to_mb(max_pfn - MAXMEM_PFN),
+                       pages_to_mb(highmem_pages));
+               highmem_pages = 0;
+       }
+#ifndef CONFIG_HIGHMEM
+       /* Maximum memory usable is what is directly addressable */
+       printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+       if (max_pfn > MAX_NONPAE_PFN)
+               printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+       else
+               printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+       max_pfn = MAXMEM_PFN;
+#else /* CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+       if (max_pfn > MAX_NONPAE_PFN) {
+               max_pfn = MAX_NONPAE_PFN;
+               printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+       }
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+       /* it could update max_pfn */
+
+       if (max_pfn <= MAXMEM_PFN)
+               lowmem_pfn_init();
+       else
+               highmem_pfn_init();
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+       highstart_pfn = highend_pfn = max_pfn;
+       if (max_pfn > max_low_pfn)
+               highstart_pfn = max_low_pfn;
+       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+               pages_to_mb(highend_pfn - highstart_pfn));
+       num_physpages = highend_pfn;
+       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+       num_physpages = max_low_pfn;
+       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+       memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+       sparse_memory_present_with_active_regions(0);
+
+#ifdef CONFIG_FLATMEM
+       max_mapnr = num_physpages;
+#endif
+       __vmalloc_start_set = true;
+
+       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+                       pages_to_mb(max_low_pfn));
+
+       setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+       printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+                max_pfn_mapped<<PAGE_SHIFT);
+       printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+
+       after_bootmem = 1;
+}
+
+unsigned long __init extend_init_mapping(unsigned long tables_space)
+{
+       unsigned long start_pfn = (__pa(xen_start_info->pt_base) >> PAGE_SHIFT)
+                                 + xen_start_info->nr_pt_frames;
+       unsigned long start = start_pfn, va = (unsigned long)&_text;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Ensure init mappings cover kernel text/data and initial tables. */
+       while (va < PAGE_OFFSET + (start_pfn << PAGE_SHIFT) + tables_space) {
+               pgd = pgd_offset_k(va);
+               pud = pud_offset(pgd, va);
+               pmd = pmd_offset(pud, va);
+               if (pmd_none(*pmd)) {
+                       unsigned long pa = start_pfn++ << PAGE_SHIFT;
+
+                       clear_page(__va(pa));
+                       make_lowmem_page_readonly(__va(pa),
+                                                 XENFEAT_writable_page_tables);
+                       xen_l2_entry_update(pmd, __pmd(pa | _KERNPG_TABLE));
+               }
+               pte = pte_offset_kernel(pmd, va);
+               if (pte_none(*pte)) {
+                       pte_t new_pte = __pte(__pa(va) | _KERNPG_TABLE);
+
+                       if (HYPERVISOR_update_va_mapping(va, new_pte, 0))
+                               BUG();
+               }
+               va += PAGE_SIZE;
+       }
+
+       /* Finally, blow away any spurious initial mappings. */
+       while (1) {
+               pgd = pgd_offset_k(va);
+               pud = pud_offset(pgd, va);
+               pmd = pmd_offset(pud, va);
+               if (pmd_none(*pmd))
+                       break;
+               if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
+                       BUG();
+               va += PAGE_SIZE;
+       }
+
+       if (start_pfn > start)
+               memblock_reserve(PFN_PHYS(start), PFN_PHYS(start_pfn - start));
+
+       return start_pfn;
+}
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routine also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+       pagetable_init();
+
+       __flush_tlb_all();
+
+       kmap_init();
+
+       /*
+        * NOTE: at this point the bootmem allocator is fully available.
+        */
+       olpc_dt_build_devicetree();
+       sparse_memory_present_with_active_regions(MAX_NUMNODES);
+       sparse_init();
+       zone_sizes_init();
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
+ */
+static void __init test_wp_bit(void)
+{
+       printk(KERN_INFO
+  "Checking if this processor honours the WP bit even in supervisor mode...");
+
+       /* Any page-aligned address will do, the test is non-destructive */
+       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+       boot_cpu_data.wp_works_ok = do_test_wp_bit();
+       clear_fixmap(FIX_WP_TEST);
+
+       if (!boot_cpu_data.wp_works_ok) {
+               printk(KERN_CONT "No.\n");
+#ifdef CONFIG_X86_WP_WORKS_OK
+               panic(
+  "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!");
+#endif
+       } else {
+               printk(KERN_CONT "Ok.\n");
+       }
+}
+
+void __init mem_init(void)
+{
+       int codesize, reservedpages, datasize, initsize;
+       int tmp;
+       unsigned long pfn;
+
+       pci_iommu_alloc();
+
+#ifdef CONFIG_FLATMEM
+       BUG_ON(!mem_map);
+#endif
+       /*
+        * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+        * be done before free_all_bootmem(). Memblock uses free low memory for
+        * temporary data (see find_range_array()) and for this purpose can use
+        * pages that were already passed to the buddy allocator, hence marked as
+        * not accessible in the page tables when compiled with
+        * CONFIG_DEBUG_PAGEALLOC. Otherwise the order of initialization is not
+        * important here.
+        */
+       set_highmem_pages_init();
+
+       /* this will put all low memory onto the freelists */
+       totalram_pages += free_all_bootmem();
+       /* XEN: init low-mem pages outside initial allocation. */
+       for (pfn = xen_start_info->nr_pages; pfn < max_low_pfn; pfn++) {
+               ClearPageReserved(pfn_to_page(pfn));
+               init_page_count(pfn_to_page(pfn));
+       }
+
+       reservedpages = 0;
+       for (tmp = 0; tmp < max_low_pfn; tmp++)
+               /*
+                * Only count reserved RAM pages:
+                */
+               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+                       reservedpages++;
+
+       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
+                       "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
+               nr_free_pages() << (PAGE_SHIFT-10),
+               num_physpages << (PAGE_SHIFT-10),
+               codesize >> 10,
+               reservedpages << (PAGE_SHIFT-10),
+               datasize >> 10,
+               initsize >> 10,
+               totalhigh_pages << (PAGE_SHIFT-10));
+
+       printk(KERN_INFO "virtual kernel memory layout:\n"
+               "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+               "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#endif
+               "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+               "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+               "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+               FIXADDR_START, FIXADDR_TOP,
+               (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+               (LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+               VMALLOC_START, VMALLOC_END,
+               (VMALLOC_END - VMALLOC_START) >> 20,
+
+               (unsigned long)__va(0), (unsigned long)high_memory,
+               ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+               (unsigned long)&__init_begin, (unsigned long)&__init_end,
+               ((unsigned long)&__init_end -
+                (unsigned long)&__init_begin) >> 10,
+
+               (unsigned long)&_etext, (unsigned long)&_edata,
+               ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+               (unsigned long)&_text, (unsigned long)&_etext,
+               ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+       /*
+        * Check boundaries twice: Some fundamental inconsistencies can
+        * be detected at build time already.
+        */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+       BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);
+       BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+       BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+
+#ifdef CONFIG_HIGHMEM
+       BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);
+       BUG_ON(VMALLOC_END                              > PKMAP_BASE);
+#endif
+       BUG_ON(VMALLOC_START                            >= VMALLOC_END);
+       BUG_ON((unsigned long)high_memory               > VMALLOC_START);
+
+       if (boot_cpu_data.wp_works_ok < 0)
+               test_wp_bit();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+       struct pglist_data *pgdata = NODE_DATA(nid);
+       struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+       unsigned long start_pfn = start >> PAGE_SHIFT;
+       unsigned long nr_pages = size >> PAGE_SHIFT;
+
+       return __add_pages(nid, zone, start_pfn, nr_pages);
+}
+#endif
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section.  Put this after the callers, so that it cannot be inlined.
+ */
+static noinline int do_test_wp_bit(void)
+{
+       char tmp_reg;
+       int flag;
+
+       __asm__ __volatile__(
+               "       movb %0, %1     \n"
+               "1:     movb %1, %0     \n"
+               "       xorl %2, %2     \n"
+               "2:                     \n"
+               _ASM_EXTABLE(1b,2b)
+               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+                "=q" (tmp_reg),
+                "=r" (flag)
+               :"2" (1)
+               :"memory");
+
+       return flag;
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly __read_mostly;
+
+void set_kernel_text_rw(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       pr_debug("Set kernel text: %lx - %lx for read write\n",
+                start, start+size);
+
+       set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       pr_debug("Set kernel text: %lx - %lx for read only\n",
+                start, start+size);
+
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+static void mark_nxdata_nx(void)
+{
+       /*
+        * When this is called, init has already been executed and released,
+        * so everything past _etext should be NX.
+        */
+       unsigned long start = PFN_ALIGN(_etext);
+       /*
+        * Upper limit as in is_kernel_text(), up to the next HPAGE boundary:
+        */
+       unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+       if (__supported_pte_mask & _PAGE_NX)
+               printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+       set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       unsigned long size = PFN_ALIGN(_etext) - start;
+
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+       printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+               size >> 10);
+
+       kernel_set_to_readonly = 1;
+
+#ifdef CONFIG_CPA_DEBUG
+       printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+               start, start+size);
+       set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+       printk(KERN_INFO "Testing CPA: write protecting again\n");
+       set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
+
+       start += size;
+       size = (unsigned long)__end_rodata - start;
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+               size >> 10);
+       rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+       set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+       printk(KERN_INFO "Testing CPA: write protecting again\n");
+       set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
+       mark_nxdata_nx();
+}
+#endif
+
diff --git a/arch/x86/mm/init_64-xen.c b/arch/x86/mm/init_64-xen.c

new file mode 100644 (file)

index 0000000..1daecf3
--- /dev/null
+++ b/arch/x86/mm/init_64-xen.c
@@ -0,0 +1,1378 @@
+/*
+ *  linux/arch/x86_64/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
+ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ *
+ *  Jun Nakajima <jun.nakajima@intel.com>
+ *     Modified for Xen.
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
+#include <linux/gfp.h>
+
+#include <asm/processor.h>
+#include <asm/bios_ebda.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/cacheflush.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+
+#include <xen/features.h>
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+/*
+ * Only built when compatibility with Xen 3.0.2 and older is configured.
+ * xen_init_pt() sets this to _PAGE_USER when it determines that kernel
+ * PTEs need that bit specified explicitly.
+ */
+unsigned int __kernel_page_user;
+EXPORT_SYMBOL(__kernel_page_user);
+#endif
+
+extern pmd_t level2_fixmap_pgt[PTRS_PER_PMD];
+extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+/*
+ * Use this until direct mapping is established, i.e. before __va() is
+ * available in init_memory_mapping().
+ *
+ * 'addr' holds a page table entry value.  It is masked in place with
+ * PHYSICAL_PAGE_MASK, its frame number is translated with mfn_to_pfn(),
+ * and 'page' is set to the address of that frame inside the
+ * __START_KERNEL_map mapping.
+ *
+ * Both arguments must be plain lvalues: 'addr' is modified and is
+ * evaluated more than once.  The body is wrapped in do { } while (0) so
+ * that the macro always expands to exactly one statement, even when used
+ * as the unbraced body of an if/else.
+ */
+
+#define addr_to_page(addr, page)                               \
+       do {                                                    \
+               (addr) &= PHYSICAL_PAGE_MASK;                   \
+               (page) = ((unsigned long *) ((unsigned long)    \
+               (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
+               __START_KERNEL_map)));                          \
+       } while (0)
+
+/*
+ * Return a pointer to the PMD entry covering 'va', found by walking down
+ * from init_level4_pgt with addr_to_page() instead of __va().
+ * The entries traversed are not checked for presence.
+ */
+pmd_t *__init early_get_pmd(unsigned long va)
+{
+       unsigned long addr;
+       unsigned long *page = (unsigned long *)init_level4_pgt;
+
+       /* PGD entry -> PUD table */
+       addr = page[pgd_index(va)];
+       addr_to_page(addr, page);
+
+       /* PUD entry -> PMD table */
+       addr = page[pud_index(va)];
+       addr_to_page(addr, page);
+
+       return (pmd_t *)&page[pmd_index(va)];
+}
+
+/*
+ * Remove write permission from the mapping of 'va' while bootmem is still
+ * in use (BUGs if after_bootmem is set).
+ *
+ * Nothing is done when the hypervisor advertises 'feature'.  Otherwise the
+ * PTE is located by walking down from init_level4_pgt with addr_to_page(),
+ * and a copy with _PAGE_RW cleared is installed through
+ * HYPERVISOR_update_va_mapping(); a failing hypercall is fatal.
+ */
+void __meminit early_make_page_readonly(void *va, unsigned int feature)
+{
+       unsigned long addr, _va = (unsigned long)va;
+       pte_t pte, *ptep;
+       unsigned long *page = (unsigned long *) init_level4_pgt;
+
+       BUG_ON(after_bootmem);
+
+       if (xen_feature(feature))
+               return;
+
+       /* PGD -> PUD -> PMD -> PTE table */
+       addr = (unsigned long) page[pgd_index(_va)];
+       addr_to_page(addr, page);
+
+       addr = page[pud_index(_va)];
+       addr_to_page(addr, page);
+
+       addr = page[pmd_index(_va)];
+       addr_to_page(addr, page);
+
+       ptep = (pte_t *) &page[pte_index(_va)];
+
+       /* Same entry, minus the writable bit. */
+       pte.pte = ptep->pte & ~_PAGE_RW;
+       if (HYPERVISOR_update_va_mapping(_va, pte, 0))
+               BUG();
+}
+
+/*
+ * Look up the frame number stored in the PTE that maps 'v' (a machine
+ * frame number, going by the function name).
+ *
+ * 'v' must not lie below __START_KERNEL_map (BUG otherwise).  The walk
+ * starts at the table referenced by xen_read_cr3(), accessed through the
+ * __START_KERNEL_map mapping, and uses addr_to_page() at each level.
+ * The entries traversed are not checked for presence.
+ */
+unsigned long __init early_arbitrary_virt_to_mfn(void *v)
+{
+       unsigned long va = (unsigned long)v, addr, *page;
+
+       BUG_ON(va < __START_KERNEL_map);
+
+       page = (void *)(xen_read_cr3() + __START_KERNEL_map);
+
+       addr = page[pgd_index(va)];
+       addr_to_page(addr, page);
+
+       addr = page[pud_index(va)];
+       addr_to_page(addr, page);
+
+       addr = page[pmd_index(va)];
+       addr_to_page(addr, page);
+
+       /* Strip the flag bits and convert the address to a frame number. */
+       return (page[pte_index(va)] & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT;
+}
+
+#ifndef CONFIG_XEN
+/* "nogbpages" early parameter: clear direct_gbpages. */
+static int __init parse_direct_gbpages_off(char *arg)
+{
+       direct_gbpages = 0;
+       return 0;
+}
+early_param("nogbpages", parse_direct_gbpages_off);
+
+/* "gbpages" early parameter: set direct_gbpages. */
+static int __init parse_direct_gbpages_on(char *arg)
+{
+       direct_gbpages = 1;
+       return 0;
+}
+early_param("gbpages", parse_direct_gbpages_on);
+#endif
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+pteval_t __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on  PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off PROT_READ implies PROT_EXEC
+ *
+ * Any other value leaves force_personality32 untouched.  Always returns 1.
+ */
+static int __init nonx32_setup(char *str)
+{
+       if (strcmp(str, "on") == 0) {
+               force_personality32 &= ~READ_IMPLIES_EXEC;
+               return 1;
+       }
+
+       if (strcmp(str, "off") == 0)
+               force_personality32 |= READ_IMPLIES_EXEC;
+
+       return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+/*
+ * When memory was added/removed make sure all the processes' MMs have
+ * suitable PGD entries in their local PGD level page.
+ *
+ * Steps through [start, end] in PGDIR_SIZE increments.  For every populated
+ * reference entry (taken from pgd_offset_k()), each page on pgd_list gets
+ * the entry copied in if its own slot is empty; a slot that is already
+ * populated must refer to the same page as the reference entry (BUG
+ * otherwise).
+ *
+ * pgd_lock is held while pgd_list is walked; the page_table_lock of the mm
+ * owning each PGD page is taken inside it around the check/update.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+       unsigned long address;
+
+       for (address = start; address <= end; address += PGDIR_SIZE) {
+               const pgd_t *pgd_ref = pgd_offset_k(address);
+               struct page *page;
+
+               /* Nothing to propagate for an empty reference entry. */
+               if (pgd_none(*pgd_ref))
+                       continue;
+
+               spin_lock(&pgd_lock);
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       spinlock_t *pgt_lock;
+
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       /* the pgt_lock only for Xen */
+                       pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+                       spin_lock(pgt_lock);
+
+                       if (pgd_none(*pgd))
+                               set_pgd(pgd, *pgd_ref);
+                       else
+                               BUG_ON(pgd_page_vaddr(*pgd)
+                                      != pgd_page_vaddr(*pgd_ref));
+
+                       spin_unlock(pgt_lock);
+               }
+               spin_unlock(&pgd_lock);
+       }
+}
+
+/*
+ * PFN ranges registered through reserve_pfn_range().  A slot with nr == 0
+ * is unused; used slots come first and are kept in ascending pfn order.
+ */
+static struct reserved_pfn_range {
+       unsigned long pfn, nr;
+} reserved_pfn_ranges[3] __meminitdata;
+
+/*
+ * Record the range of 'nr' frames starting at 'pfn' in reserved_pfn_ranges
+ * and reserve the corresponding physical memory with memblock_reserve().
+ *
+ * The new range goes into the first unused slot, or is inserted in front
+ * of the first recorded range that starts at a higher pfn (shifting the
+ * later entries up by one).  BUGs if it overlaps an entry it is compared
+ * against, or if no slot is left.
+ */
+void __init reserve_pfn_range(unsigned long pfn, unsigned long nr)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+               struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+               /* First unused slot: append here. */
+               if (!range->nr) {
+                       range->pfn = pfn;
+                       range->nr = nr;
+                       break;
+               }
+               BUG_ON(range->pfn < pfn + nr && pfn < range->pfn + range->nr);
+               if (range->pfn > pfn) {
+                       /*
+                        * Insert before 'range'.  If the last slot is in use
+                        * the table is full: 'continue' bumps i past the end,
+                        * which trips the BUG_ON() below.
+                        */
+                       i = ARRAY_SIZE(reserved_pfn_ranges) - 1;
+                       if (reserved_pfn_ranges[i].nr)
+                               continue;
+                       for (; reserved_pfn_ranges + i > range; --i)
+                               reserved_pfn_ranges[i]
+                                        = reserved_pfn_ranges[i - 1];
+                       range->pfn = pfn;
+                       range->nr = nr;
+                       break;
+               }
+       }
+       BUG_ON(i >= ARRAY_SIZE(reserved_pfn_ranges));
+       memblock_reserve(PFN_PHYS(pfn), PFN_PHYS(nr));
+}
+
+/*
+ * For every recorded reserved range that starts inside
+ * [pgt_buf_start, pgt_buf_top), hand the part of the page table buffer in
+ * front of it ([pgt_buf_start, range start)) to
+ * x86_init.mapping.pagetable_reserve() and move pgt_buf_start to just past
+ * the reserved range.
+ */
+void __init reserve_pgtable_low(void)
+{
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+               struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+               /* Unused slot: no further ranges recorded. */
+               if (!range->nr)
+                       break;
+               if (pgt_buf_start <= range->pfn && pgt_buf_top > range->pfn) {
+                       x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
+                                       PFN_PHYS(range->pfn));
+                       pgt_buf_start = range->pfn + range->nr;
+               }
+       }
+}
+
+/*
+ * Hand out the next frame of the page table buffer: returns the current
+ * pgt_buf_end and advances it by one.
+ *
+ * If pgt_buf_end has reached the start of a recorded reserved range, both
+ * pgt_buf_end and pgt_buf_top are first moved up by the size of that range
+ * so that the reserved frames are skipped.  BUGs if pgt_buf_end is zero.
+ */
+static __init unsigned long get_table_end(void)
+{
+       unsigned int i;
+
+       BUG_ON(!pgt_buf_end);
+       for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+               struct reserved_pfn_range *range = reserved_pfn_ranges + i;
+
+               if (!range->nr)
+                       break;
+               if (pgt_buf_end == range->pfn) {
+                       pgt_buf_end += range->nr;
+                       pgt_buf_top += range->nr;
+               }
+       }
+       return pgt_buf_end++;
+}
+
+/*
+ * Allocate one zeroed, page-aligned page for use as a page table.
+ *
+ * Source of the page:
+ *  - after bootmem: get_zeroed_page() with GFP_ATOMIC | __GFP_NOTRACK;
+ *  - otherwise, while the page table buffer has room
+ *    (pgt_buf_end < pgt_buf_top): the next buffer frame from
+ *    get_table_end(), cleared here;
+ *  - otherwise: alloc_bootmem_pages().
+ *
+ * Panics if no page was obtained or the result is not page aligned.
+ *
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
+{
+       void *ptr;
+
+       if (after_bootmem)
+               ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+       else if (pgt_buf_end < pgt_buf_top) {
+               ptr = __va(get_table_end() << PAGE_SHIFT);
+               clear_page(ptr);
+       } else
+               ptr = alloc_bootmem_pages(PAGE_SIZE);
+
+       if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+               panic("set_pte_phys: cannot allocate page data %s\n",
+                       after_bootmem ? "after bootmem" : "");
+       }
+
+       pr_debug("spp_getpage %p\n", ptr);
+
+       return ptr;
+}
+
+/*
+ * Return the PUD entry for 'vaddr' below 'pgd', allocating the PUD table
+ * with spp_getpage() first if the PGD entry is empty.
+ *
+ * Before bootmem is finished the new table is passed to
+ * make_page_readonly() and installed with xen_l4_entry_update(); afterwards
+ * pgd_populate() on init_mm is used.  A mismatch between the new table and
+ * what pud_offset() then reports is logged as an error but not treated as
+ * fatal.
+ */
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+{
+       if (pgd_none(*pgd)) {
+               pud_t *pud = (pud_t *)spp_getpage();
+               if (!after_bootmem) {
+                       make_page_readonly(pud, XENFEAT_writable_page_tables);
+                       xen_l4_entry_update(pgd, __pgd(__pa(pud) | _PAGE_TABLE));
+               } else
+                       pgd_populate(&init_mm, pgd, pud);
+               if (pud != pud_offset(pgd, 0))
+                       printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+                              pud, pud_offset(pgd, 0));
+       }
+       return pud_offset(pgd, vaddr);
+}
+
+/*
+ * Return the PMD entry for 'vaddr' below 'pud', allocating the PMD table
+ * with spp_getpage() first if the PUD entry is empty.
+ *
+ * Before bootmem is finished the new table is passed to
+ * make_page_readonly() and installed with xen_l3_entry_update(); afterwards
+ * pud_populate() on init_mm is used.  A mismatch between the new table and
+ * what pmd_offset() then reports is logged as an error but not treated as
+ * fatal.
+ */
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+       if (pud_none(*pud)) {
+               pmd_t *pmd = (pmd_t *) spp_getpage();
+               if (!after_bootmem) {
+                       make_page_readonly(pmd, XENFEAT_writable_page_tables);
+                       xen_l3_entry_update(pud, __pud(__pa(pmd) | _PAGE_TABLE));
+               } else
+                       pud_populate(&init_mm, pud, pmd);
+               if (pmd != pmd_offset(pud, 0))
+                       printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+                              pmd, pmd_offset(pud, 0));
+       }
+       return pmd_offset(pud, vaddr);
+}
+
+/*
+ * Return the PTE for 'vaddr' below 'pmd', allocating the PTE table with
+ * spp_getpage() first if the PMD entry is empty.
+ *
+ * Unlike fill_pud()/fill_pmd() there is no after_bootmem distinction: the
+ * new table is always passed to make_page_readonly() and installed with
+ * pmd_populate_kernel() on init_mm.  A mismatch with what
+ * pte_offset_kernel() then reports is logged as an error only.
+ */
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+       if (pmd_none(*pmd)) {
+               pte_t *pte = (pte_t *) spp_getpage();
+               make_page_readonly(pte, XENFEAT_writable_page_tables);
+               pmd_populate_kernel(&init_mm, pmd, pte);
+               if (pte != pte_offset_kernel(pmd, 0))
+                       printk(KERN_ERR "PAGETABLE BUG #02!\n");
+       }
+       return pte_offset_kernel(pmd, vaddr);
+}
+
+/*
+ * Install 'new_pte' as the mapping of 'vaddr', starting from the PUD table
+ * 'pud_page'.  Missing PMD and PTE tables are created on the way through
+ * fill_pmd() and fill_pte(); the TLB entry for 'vaddr' is flushed
+ * afterwards.
+ */
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pud = pud_page + pud_index(vaddr);
+       pmd = fill_pmd(pud, vaddr);
+       pte = fill_pte(pmd, vaddr);
+
+       set_pte(pte, new_pte);
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Install 'pteval' as the kernel mapping of 'vaddr'.
+ *
+ * The PGD entry for 'vaddr' must already be populated; if it is not, an
+ * error is logged and nothing is mapped.  The remaining levels are handled
+ * by set_pte_vaddr_pud().
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+       pgd_t *pgd;
+       pud_t *pud_page;
+
+       pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, __pte_val(pteval));
+
+       pgd = pgd_offset_k(vaddr);
+       if (pgd_none(*pgd)) {
+               printk(KERN_ERR
+                       "PGD FIXMAP MISSING, it should be setup in head.S!\n");
+               return;
+       }
+       pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+       set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+/*
+ * Return the kernel PMD entry for 'vaddr', creating the PUD and PMD tables
+ * above it via fill_pud()/fill_pmd() where they do not exist yet.
+ */
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+       pgd_t *pgd = pgd_offset_k(vaddr);
+
+       return fill_pmd(fill_pud(pgd, vaddr), vaddr);
+}
+
+/*
+ * Return the kernel PTE for 'vaddr', creating every missing table level
+ * above it via populate_extra_pmd() and fill_pte().
+ */
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+       return fill_pte(populate_extra_pmd(vaddr), vaddr);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Create large page table mappings for a range of physical addresses.
+ *
+ * 'phys' and 'size' must both be multiples of PMD_SIZE (BUG otherwise).
+ * Each PMD_SIZE piece is mapped at __va(phys) by writing 'phys' combined
+ * with 'prot' straight into the PMD entry; missing PUD/PMD tables come from
+ * spp_getpage().  BUGs if a PMD entry in the range is already populated.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+                                               pgprot_t prot)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+       for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+               pgd = pgd_offset_k((unsigned long)__va(phys));
+               if (pgd_none(*pgd)) {
+                       pud = (pud_t *) spp_getpage();
+                       set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+                                               _PAGE_USER));
+               }
+               pud = pud_offset(pgd, (unsigned long)__va(phys));
+               if (pud_none(*pud)) {
+                       pmd = (pmd_t *) spp_getpage();
+                       set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+                                               _PAGE_USER));
+               }
+               pmd = pmd_offset(pud, phys);
+               BUG_ON(!pmd_none(*pmd));
+               set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+       }
+}
+
+/* Map [phys, phys + size) with large pages using PAGE_KERNEL_LARGE. */
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+       __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
+}
+
+/* Map [phys, phys + size) with large pages using PAGE_KERNEL_LARGE_NOCACHE. */
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+       __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
+}
+
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_addr holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+       unsigned long vaddr = __START_KERNEL_map;
+       unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+       /* Last byte of the PMD_SIZE-rounded region ending at _brk_end. */
+       unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+       pmd_t *pmd = level2_kernel_pgt;
+
+       /* Only PMD_SIZE pieces lying entirely below vaddr_end are visited. */
+       for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
+               if (pmd_none(*pmd))
+                       continue;
+               /* Clear populated entries outside [_text, end]. */
+               if (vaddr < (unsigned long) _text || vaddr > end)
+                       set_pmd(pmd, __pmd(0));
+       }
+}
+#endif
+
+/*
+ * Allocate a zeroed page for a new page table and store its physical
+ * address in *phys.
+ *
+ * After bootmem the page comes from get_zeroed_page() and its address is
+ * returned directly (the result is not checked for NULL here).
+ * Before that, the next frame of the page table buffer is taken with
+ * get_table_end() (panic if that frame is at or beyond pgt_buf_top), mapped
+ * with early_memremap() and cleared; the returned pointer is that temporary
+ * mapping, which the callers in this file release with unmap_low_page().
+ */
+static __ref void *alloc_low_page(unsigned long *phys)
+{
+       unsigned long pfn;
+       void *adr;
+
+       if (after_bootmem) {
+               adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+               *phys = __pa(adr);
+
+               return adr;
+       }
+
+       pfn = get_table_end();
+       if (pfn >= pgt_buf_top)
+               panic("alloc_low_page: ran out of memory");
+
+       adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
+       clear_page(adr);
+       *phys  = pfn * PAGE_SIZE;
+       return adr;
+}
+
+/*
+ * Obtain a usable pointer to the existing page table at 'virt'.
+ *
+ * After bootmem 'virt' is returned unchanged.  Before that, the page
+ * containing __pa(virt) is mapped read-only with early_memremap_ro() and
+ * the returned pointer carries the same offset into the page as 'virt'.
+ */
+static __ref void *map_low_page(void *virt)
+{
+       void *adr;
+       unsigned long phys, left;
+
+       if (after_bootmem)
+               return virt;
+
+       phys = __pa(virt);
+       /* Offset of 'virt' within its page. */
+       left = phys & (PAGE_SIZE - 1);
+       adr = early_memremap_ro(phys & PAGE_MASK, PAGE_SIZE);
+       adr = (void *)(((unsigned long)adr) | left);
+
+       return adr;
+}
+
+/*
+ * Release a temporary mapping handed out by alloc_low_page() or
+ * map_low_page().  Does nothing after bootmem; otherwise the page
+ * containing 'adr' is unmapped with early_iounmap().
+ */
+static __ref void unmap_low_page(void *adr)
+{
+       if (after_bootmem)
+               return;
+
+       early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
+}
+
+/*
+ * Decide whether the page at physical address 'paddr' has to be mapped
+ * read-only.  Returns 1 if so, 0 otherwise.
+ *
+ * The first three checks only apply when the hypervisor does not offer
+ * XENFEAT_writable_page_tables; the kernel image check at the end applies
+ * unconditionally.
+ */
+static inline int __meminit make_readonly(unsigned long paddr)
+{
+       int readonly = 0;
+
+       /* Make new page tables read-only on the first pass. */
+       if (!xen_feature(XENFEAT_writable_page_tables)
+           && !max_pfn_mapped
+           && (paddr >= (pgt_buf_start << PAGE_SHIFT))) {
+               unsigned long top = pgt_buf_top;
+               unsigned int i;
+
+               /* Account for the ranges get_table_end() skips. */
+               for (i = 0; i < ARRAY_SIZE(reserved_pfn_ranges); ++i) {
+                       const struct reserved_pfn_range *range;
+
+                       range = reserved_pfn_ranges + i;
+                       if (!range->nr)
+                               continue;
+                       if (pgt_buf_end <= range->pfn && top > range->pfn) {
+                               /*
+                                * paddr inside a range still to be skipped:
+                                * leave the loop early so that the index
+                                * test below yields 0.
+                                */
+                               if (paddr > (range->pfn << PAGE_SHIFT)
+                                   && paddr < ((range->pfn + range->nr)
+                                               << PAGE_SHIFT))
+                                       break;
+                               top += range->nr;
+                       }
+               }
+               /* Read-only only if the loop above ran to completion. */
+               if (paddr < (top << PAGE_SHIFT))
+                       readonly = (i >= ARRAY_SIZE(reserved_pfn_ranges));
+       }
+       /* Make old page tables read-only. */
+       if (!xen_feature(XENFEAT_writable_page_tables)
+           && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
+           && (paddr < (pgt_buf_end << PAGE_SHIFT)))
+               readonly = 1;
+       /* Make P->M table (and its page tables) read-only. */
+       if (!xen_feature(XENFEAT_writable_page_tables)
+           && xen_start_info->mfn_list < __START_KERNEL_map
+           && paddr >= (xen_start_info->first_p2m_pfn << PAGE_SHIFT)
+           && paddr < (xen_start_info->first_p2m_pfn
+                       + xen_start_info->nr_p2m_frames) << PAGE_SHIFT)
+               readonly = 1;
+
+       /*
+        * No need for writable mapping of kernel image. This also ensures that
+        * page and descriptor tables embedded inside don't have writable
+        * mappings. The range must be in sync with that passed to
+        * reserve_early() (as "TEXT DATA BSS"), since all other regions can be
+        * allocated from under CONFIG_NO_BOOTMEM and thus must be writable.
+        */
+       if ((paddr >= __pa_symbol(&_text))
+            && (paddr < (__pa_symbol(__bss_stop) & PAGE_MASK)))
+               readonly = 1;
+
+       return readonly;
+}
+
+/*
+ * Fill the PTE table 'pte_page' with 4k mappings for the physical range
+ * [addr, end), stopping at the end of the table.
+ *
+ * Before bootmem is finished the walk also stops at the first frame at or
+ * beyond xen_start_info->nr_pages.  Entries that are already populated are
+ * left as they are but still counted.  New entries lose _PAGE_RW when
+ * make_readonly() says so and are masked with __supported_pte_mask.
+ *
+ * Returns the address just past the last page newly mapped here, or 'end'
+ * if no new entry was written.
+ */
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+             pgprot_t prot)
+{
+       unsigned pages = 0;
+       unsigned long last_map_addr = end;
+       int i;
+
+       pte_t *pte = pte_page + pte_index(addr);
+
+       for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {
+               unsigned long pteval = addr | pgprot_val(prot);
+
+               if (addr >= end ||
+                   (!after_bootmem &&
+                    (addr >> PAGE_SHIFT) >= xen_start_info->nr_pages))
+                       break;
+
+               /*
+                * We will re-use the existing mapping.
+                * Xen for example has some special requirements, like mapping
+                * pagetable pages as RO. So assume someone who pre-setup
+                * these mappings are more intelligent.
+                */
+               if (__pte_val(*pte)) {
+                       pages++;
+                       continue;
+               }
+
+               if (make_readonly(addr))
+                       pteval &= ~_PAGE_RW;
+               /* Disabled debug output. */
+               if (0)
+                       printk("   pte=%p addr=%lx pte=%016lx\n",
+                              pte, addr, pteval);
+               pages++;
+               /* Plain store before bootmem is finished, set_pte() after. */
+               if (!after_bootmem)
+                       *pte = __pte(pteval & __supported_pte_mask);
+               else
+                       set_pte(pte, __pte(pteval & __supported_pte_mask));
+               last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+       }
+
+       update_page_count(PG_LEVEL_4K, pages);
+
+       return last_map_addr;
+}
+
+/*
+ * Populate the PMD table 'pmd_page' for the physical range [address, end),
+ * stopping at the end of the table.
+ *
+ * For each PMD entry:
+ *  - an existing PTE table is descended into with phys_pte_init();
+ *  - an existing large mapping is kept if 2M pages are allowed by
+ *    'page_size_mask', otherwise it is replaced by a PTE table whose
+ *    protection bits are taken from the old entry;
+ *  - an empty entry gets a 2M mapping if allowed, else a new PTE table from
+ *    alloc_low_page().
+ *
+ * Besides the PG_LEVEL_* page size bits, bit PG_LEVEL_NUM of
+ * 'page_size_mask' is used as a flag: when set (and bootmem is still in
+ * use) a new PTE table is hooked in through HYPERVISOR_mmu_update() rather
+ * than by writing the PMD entry directly.
+ *
+ * Returns the last_map_addr value of the most recent mapping step, or 'end'
+ * if nothing was mapped.
+ */
+static unsigned long __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+             unsigned long page_size_mask, pgprot_t prot)
+{
+       unsigned long pages = 0;
+       unsigned long last_map_addr = end;
+
+       int i = pmd_index(address);
+
+       for (; i < PTRS_PER_PMD; i++, address = (address & PMD_MASK) + PMD_SIZE) {
+               unsigned long pte_phys;
+               pmd_t *pmd = pmd_page + pmd_index(address);
+               pte_t *pte;
+               pgprot_t new_prot = prot;
+
+               if (address >= end)
+                       break;
+
+               if (__pmd_val(*pmd)) {
+                       if (!pmd_large(*pmd)) {
+                               spin_lock(&init_mm.page_table_lock);
+                               pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+                               last_map_addr = phys_pte_init(pte, address,
+                                                               end, prot);
+                               unmap_low_page(pte);
+                               spin_unlock(&init_mm.page_table_lock);
+                               continue;
+                       }
+                       /*
+                        * If we are ok with PG_LEVEL_2M mapping, then we will
+                        * use the existing mapping,
+                        *
+                        * Otherwise, we will split the large page mapping but
+                        * use the same existing protection bits except for
+                        * large page, so that we don't violate Intel's TLB
+                        * Application note (317080) which says, while changing
+                        * the page sizes, new and old translations should
+                        * not differ with respect to page frame and
+                        * attributes.
+                        */
+                       if (page_size_mask & (1 << PG_LEVEL_2M)) {
+                               pages++;
+                               continue;
+                       }
+                       new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
+               }
+
+               if (page_size_mask & (1<<PG_LEVEL_2M)) {
+                       pages++;
+                       spin_lock(&init_mm.page_table_lock);
+                       set_pte((pte_t *)pmd,
+                               pfn_pte(address >> PAGE_SHIFT,
+                                       __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+                       spin_unlock(&init_mm.page_table_lock);
+                       last_map_addr = (address & PMD_MASK) + PMD_SIZE;
+                       continue;
+               }
+
+               /* Build a fresh PTE table, then hook it into the PMD. */
+               pte = alloc_low_page(&pte_phys);
+               last_map_addr = phys_pte_init(pte, address, end, new_prot);
+               unmap_low_page(pte);
+
+               if (!after_bootmem) {
+                       if (max_pfn_mapped)
+                               make_page_readonly(__va(pte_phys),
+                                                  XENFEAT_writable_page_tables);
+                       if (page_size_mask & (1 << PG_LEVEL_NUM)) {
+                               mmu_update_t u;
+
+                               u.ptr = arbitrary_virt_to_machine(pmd);
+                               u.val = phys_to_machine(pte_phys) | _PAGE_TABLE;
+                               if (HYPERVISOR_mmu_update(&u, 1, NULL,
+                                                         DOMID_SELF) < 0)
+                                       BUG();
+                       } else
+                               *pmd = __pmd(pte_phys | _PAGE_TABLE);
+               } else {
+                       spin_lock(&init_mm.page_table_lock);
+                       pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+                       spin_unlock(&init_mm.page_table_lock);
+               }
+       }
+       update_page_count(PG_LEVEL_2M, pages);
+       return last_map_addr;
+}
+
+/*
+ * Populate the PUD table 'pud_page' for the physical range [addr, end),
+ * stopping at the end of the table.
+ *
+ * For each PUD entry:
+ *  - an existing PMD table is descended into with phys_pmd_init(), with the
+ *    PG_LEVEL_NUM flag bit set in the mask passed down;
+ *  - an existing large mapping is kept if 1G pages are allowed by
+ *    'page_size_mask', otherwise it is replaced by a PMD table whose
+ *    protection bits are taken from the old entry;
+ *  - an empty entry gets a 1G mapping if allowed, else a new PMD table from
+ *    alloc_low_page(), filled by phys_pmd_init() with the PG_LEVEL_NUM flag
+ *    bit cleared.
+ *
+ * Bit PG_LEVEL_NUM of the incoming 'page_size_mask' selects, while bootmem
+ * is still in use, whether a new PMD table is hooked in through
+ * HYPERVISOR_mmu_update() or by writing the PUD entry directly
+ * (presumably set by the caller when 'pud_page' is already part of the live
+ * page tables -- confirm against kernel_physical_mapping_init()).
+ *
+ * Returns the last_map_addr value of the most recent mapping step, or 'end'
+ * if nothing was mapped.
+ */
+static unsigned long __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+                        unsigned long page_size_mask)
+{
+       unsigned long pages = 0;
+       unsigned long last_map_addr = end;
+       int i = pud_index(addr);
+
+       for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
+               unsigned long pmd_phys;
+               pud_t *pud = pud_page + pud_index(addr);
+               pmd_t *pmd;
+               pgprot_t prot = PAGE_KERNEL;
+
+               if (addr >= end)
+                       break;
+
+               if (__pud_val(*pud)) {
+                       if (!pud_large(*pud)) {
+                               pmd = map_low_page(pmd_offset(pud, 0));
+                               last_map_addr = phys_pmd_init(pmd, addr, end,
+                                       page_size_mask | (1 << PG_LEVEL_NUM),
+                                       prot);
+                               unmap_low_page(pmd);
+                               __flush_tlb_all();
+                               continue;
+                       }
+                       /*
+                        * If we are ok with PG_LEVEL_1G mapping, then we will
+                        * use the existing mapping.
+                        *
+                        * Otherwise, we will split the gbpage mapping but use
+                        * the same existing protection  bits except for large
+                        * page, so that we don't violate Intel's TLB
+                        * Application note (317080) which says, while changing
+                        * the page sizes, new and old translations should
+                        * not differ with respect to page frame and
+                        * attributes.
+                        */
+                       if (page_size_mask & (1 << PG_LEVEL_1G)) {
+                               pages++;
+                               continue;
+                       }
+                       prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
+               }
+
+               if (page_size_mask & (1<<PG_LEVEL_1G)) {
+                       pages++;
+                       spin_lock(&init_mm.page_table_lock);
+                       set_pte((pte_t *)pud,
+                               pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+                       spin_unlock(&init_mm.page_table_lock);
+                       last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
+                       continue;
+               }
+
+               /* Build a fresh PMD table, then hook it into the PUD. */
+               pmd = alloc_low_page(&pmd_phys);
+               last_map_addr = phys_pmd_init(pmd, addr, end,
+                                             page_size_mask & ~(1 << PG_LEVEL_NUM),
+                                             prot);
+               unmap_low_page(pmd);
+
+               if (!after_bootmem) {
+                       if (max_pfn_mapped)
+                               make_page_readonly(__va(pmd_phys),
+                                                  XENFEAT_writable_page_tables);
+                       if (page_size_mask & (1 << PG_LEVEL_NUM)) {
+                               mmu_update_t u;
+
+                               u.ptr = arbitrary_virt_to_machine(pud);
+                               u.val = phys_to_machine(pmd_phys) | _PAGE_TABLE;
+                               if (HYPERVISOR_mmu_update(&u, 1, NULL,
+                                                         DOMID_SELF) < 0)
+                                       BUG();
+                       } else
+                               *pud = __pud(pmd_phys | _PAGE_TABLE);
+               } else {
+                       spin_lock(&init_mm.page_table_lock);
+                       pud_populate(&init_mm, pud, __va(pmd_phys));
+                       spin_unlock(&init_mm.page_table_lock);
+               }
+       }
+       __flush_tlb_all();
+
+       update_page_count(PG_LEVEL_1G, pages);
+
+       return last_map_addr;
+}
+
+/*
+ * Take over from the page tables Xen built for us (xen_start_info->pt_base):
+ *  - optionally probe whether kernel PTEs need an explicit _PAGE_USER
+ *    (only when compatibility with Xen 3.0.2 and older is configured);
+ *  - hook level3_kernel_pgt into init_level4_pgt and copy the initial PUD
+ *    entries from __START_KERNEL_map upwards into it;
+ *  - copy the PGD entry covering the initial P->M table if that table lives
+ *    below __START_KERNEL_map;
+ *  - set up level2_fixmap_pgt/level1_fixmap_pgt for the fixmap area;
+ *  - make the kernel's own page table pages read-only and, unless the
+ *    hypervisor offers writable page tables, pin init_level4_pgt.
+ */
+void __init xen_init_pt(void)
+{
+       unsigned long addr, *page;
+
+       /* Find the initial pte page that was built for us. */
+       page = (unsigned long *)xen_start_info->pt_base;
+       addr = page[pgd_index(__START_KERNEL_map)];
+       addr_to_page(addr, page);
+
+#if CONFIG_XEN_COMPAT <= 0x030002
+       /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
+          in kernel PTEs. We check that here. */
+       if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
+               unsigned long *pg;
+               pte_t pte;
+
+               /* Mess with the initial mapping of page 0. It's not needed. */
+               BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
+               addr = page[pud_index(__START_KERNEL_map)];
+               addr_to_page(addr, pg);
+               addr = pg[pmd_index(__START_KERNEL_map)];
+               addr_to_page(addr, pg);
+               pte.pte = pg[pte_index(__START_KERNEL_map)];
+               BUG_ON(!(pte.pte & _PAGE_PRESENT));
+
+               /* If _PAGE_USER isn't set, we obviously do not need it. */
+               if (pte.pte & _PAGE_USER) {
+                       /* _PAGE_USER is needed, but is it set implicitly? */
+                       pte.pte &= ~_PAGE_USER;
+                       if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
+                                                         pte, 0) != 0) ||
+                           !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
+                               /* We need to explicitly specify _PAGE_USER. */
+                               __kernel_page_user = _PAGE_USER;
+               }
+       }
+#endif
+
+       /* Construct mapping of initial pte page in our own directories. */
+       init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
+               __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
+       memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map),
+              page + pud_index(__START_KERNEL_map),
+              (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
+              * sizeof(*level3_kernel_pgt));
+
+       /* Copy the initial P->M table mappings if necessary. */
+       addr = pgd_index(xen_start_info->mfn_list);
+       if (addr < pgd_index(__START_KERNEL_map))
+               init_level4_pgt[addr] =
+                       ((pgd_t *)xen_start_info->pt_base)[addr];
+
+       /* Do an early initialization of the fixmap area. */
+       addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE);
+       if (pud_present(level3_kernel_pgt[pud_index(addr)])) {
+               /* Preserve what Xen already mapped under this PUD entry. */
+               unsigned long adr = page[pud_index(addr)];
+
+               addr_to_page(adr, page);
+               copy_page(level2_fixmap_pgt, page);
+       }
+       level3_kernel_pgt[pud_index(addr)] =
+               __pud(__pa_symbol(level2_fixmap_pgt) | _PAGE_TABLE);
+       level2_fixmap_pgt[pmd_index(addr)] =
+               __pmd(__pa_symbol(level1_fixmap_pgt) | _PAGE_TABLE);
+
+       early_make_page_readonly(init_level4_pgt,
+                                XENFEAT_writable_page_tables);
+       early_make_page_readonly(level3_kernel_pgt,
+                                XENFEAT_writable_page_tables);
+       early_make_page_readonly(level3_user_pgt,
+                                XENFEAT_writable_page_tables);
+       early_make_page_readonly(level2_fixmap_pgt,
+                                XENFEAT_writable_page_tables);
+       early_make_page_readonly(level1_fixmap_pgt,
+                                XENFEAT_writable_page_tables);
+
+       if (!xen_feature(XENFEAT_writable_page_tables))
+               xen_pgd_pin(init_level4_pgt);
+}
+
+/*
+ * Leave the Xen-provided initial mappings behind: re-vector pointers that
+ * still reference the initial mapping, unpin the page tables Xen built,
+ * and remove the initial mappings between the page-aligned end of the brk
+ * area and the start of the page table buffer.
+ */
+void __init xen_finish_init_mapping(void)
+{
+       unsigned long start, end;
+       struct mmuext_op mmuext;
+
+       /* Re-vector virtual addresses pointing into the initial
+          mapping to the just-established permanent ones. */
+       xen_start_info = __va(__pa(xen_start_info));
+       xen_start_info->pt_base = (unsigned long)
+               __va(__pa(xen_start_info->pt_base));
+       /*
+        * phys_to_machine_mapping is only re-pointed when the physmap is
+        * not auto-translated and mfn_list lies at or above
+        * __START_KERNEL_map.
+        */
+       if (!xen_feature(XENFEAT_auto_translated_physmap)
+           && xen_start_info->mfn_list >= __START_KERNEL_map)
+               phys_to_machine_mapping =
+                       __va(__pa(xen_start_info->mfn_list));
+
+       /* Unpin the no longer used Xen provided page tables. */
+       mmuext.cmd = MMUEXT_UNPIN_TABLE;
+       mmuext.arg1.mfn = virt_to_mfn(xen_start_info->pt_base);
+       if (HYPERVISOR_mmuext_op(&mmuext, 1, NULL, DOMID_SELF))
+               BUG();
+
+       /* Destroy the Xen-created mappings beyond the kernel image. */
+       start = PAGE_ALIGN(_brk_end);
+       end   = __START_KERNEL_map + (pgt_buf_start << PAGE_SHIFT);
+       for (; start < end; start += PAGE_SIZE)
+               if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
+                       BUG();
+
+       /*
+        * Warn when the page table buffer was not consumed exactly; if
+        * more than the reserved space was used, raise pgt_buf_top to
+        * match pgt_buf_end.
+        */
+       WARN(pgt_buf_end != pgt_buf_top, "start=%lx cur=%lx top=%lx\n",
+            pgt_buf_start, pgt_buf_end, pgt_buf_top);
+       if (pgt_buf_end > pgt_buf_top)
+               pgt_buf_top = pgt_buf_end;
+}
+
+/*
+ * Populate the kernel mapping (as obtained through __va()) for the
+ * physical range [start, end), walking it one PGD entry at a time.
+ * PGD entries that are already populated have their PUD page extended;
+ * for empty ones a new PUD page is allocated, filled and installed.
+ *
+ * Returns the value handed back by the last phys_pud_init() call, or the
+ * physical 'end' if the range is empty.
+ */
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long start,
+                            unsigned long end,
+                            unsigned long page_size_mask)
+{
+       bool pgd_changed = false;
+       unsigned long next, last_map_addr = end;
+       unsigned long addr;
+
+       /* From here on start/end are virtual; addr remembers the origin. */
+       start = (unsigned long)__va(start);
+       end = (unsigned long)__va(end);
+       addr = start;
+
+       for (; start < end; start = next) {
+               pgd_t *pgd = pgd_offset_k(start);
+               unsigned long pud_phys;
+               pud_t *pud;
+
+               /* Advance to the next PGD boundary, clamped to 'end'. */
+               next = (start + PGDIR_SIZE) & PGDIR_MASK;
+               if (next > end)
+                       next = end;
+
+               /*
+                * PGD entry already populated: extend the existing PUD page.
+                * NOTE(review): the extra (1 << PG_LEVEL_NUM) bit is
+                * presumably a flag interpreted by phys_pud_init() -- confirm
+                * against its definition.
+                */
+               if (__pgd_val(*pgd)) {
+                       pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+                       last_map_addr = phys_pud_init(pud, __pa(start),
+                               __pa(end), page_size_mask | (1 << PG_LEVEL_NUM));
+                       unmap_low_page(pud);
+                       continue;
+               }
+
+               /* Empty PGD entry: build a fresh PUD page first. */
+               pud = alloc_low_page(&pud_phys);
+               last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+                                                page_size_mask);
+               unmap_low_page(pud);
+
+               if (!after_bootmem) {
+                       /*
+                        * Before after_bootmem is set: make the new PUD page
+                        * read-only (only once max_pfn_mapped is non-zero)
+                        * and install it via xen_l4_entry_update().
+                        */
+                       if (max_pfn_mapped)
+                               make_page_readonly(__va(pud_phys),
+                                                  XENFEAT_writable_page_tables);
+                       xen_l4_entry_update(pgd, __pgd(pud_phys | _PAGE_TABLE));
+               } else {
+                       /* Otherwise populate under init_mm's page table lock. */
+                       spin_lock(&init_mm.page_table_lock);
+                       pgd_populate(&init_mm, pgd, __va(pud_phys));
+                       spin_unlock(&init_mm.page_table_lock);
+                       pgd_changed = true;
+               }
+       }
+
+       /* Only the pgd_populate() path requires syncing the global PGDs. */
+       if (pgd_changed)
+               sync_global_pgds(addr, end);
+
+       return last_map_addr;
+}
+
+#ifndef CONFIG_NUMA
+/*
+ * Non-NUMA build: hand the range starting at 0 with size ULLONG_MAX to
+ * memblock_set_node() with node id 0.
+ */
+void __init initmem_init(void)
+{
+       memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+}
+#endif
+
+/*
+ * Set up sparse memory, clear node 0's default N_NORMAL_MEMORY state and
+ * initialise the zone sizes.
+ */
+void __init paging_init(void)
+{
+       sparse_memory_present_with_active_regions(MAX_NUMNODES);
+       sparse_init();
+
+       /*
+        * clear the default setting with node 0
+        * note: don't use nodes_clear here, that is really clearing when
+        *       numa support is not compiled in, and later node_set_state
+        *       will not set it back.
+        */
+       node_clear_state(0, N_NORMAL_MEMORY);
+
+       zone_sizes_init();
+}
+
+/*
+ * Memory hotplug specific functions
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Memory hotplug may have extended memory: when the added range ends
+ * above max_pfn, raise max_pfn, max_low_pfn and high_memory to match.
+ */
+static void update_end_of_memory_vars(u64 start, u64 size)
+{
+       unsigned long new_end_pfn = PFN_UP(start + size);
+
+       /* Nothing to do unless the range reaches past the current end. */
+       if (new_end_pfn <= max_pfn)
+               return;
+
+       max_pfn = new_end_pfn;
+       max_low_pfn = new_end_pfn;
+       high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+}
+
+/*
+ * Hot-added memory always goes into the node's NORMAL zone, so it never
+ * provides additional DMA/DMA32 memory.
+ *
+ * Maps [start, start + size), adds the pages to the zone and updates the
+ * end-of-memory variables.  Returns the result of __add_pages().
+ */
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+       struct zone *zone = NODE_DATA(nid)->node_zones + ZONE_NORMAL;
+       unsigned long mapped_end_pfn;
+       int ret;
+
+       mapped_end_pfn = init_memory_mapping(start, start + size);
+       if (max_pfn_mapped < mapped_end_pfn)
+               max_pfn_mapped = mapped_end_pfn;
+
+       ret = __add_pages(nid, zone, start >> PAGE_SHIFT, size >> PAGE_SHIFT);
+       WARN_ON_ONCE(ret);
+
+       /* update max_pfn, max_low_pfn and high_memory */
+       update_end_of_memory_vars(start, size);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static struct kcore_list kcore_vsyscall;
+
+/*
+ * Hand boot memory over to the page allocator, un-reserve the pages that
+ * lie beyond Xen's initial allocation, register the vsyscall area with
+ * /proc/kcore and print the memory summary line.
+ */
+void __init mem_init(void)
+{
+       long codesize, reservedpages, datasize, initsize;
+       unsigned long absent_pages;
+       unsigned long pfn;
+
+       pci_iommu_alloc();
+
+       /* clear_bss() has already cleared the empty_zero_page */
+
+       /* this will put all low memory onto the freelists */
+#ifdef CONFIG_NUMA
+       totalram_pages = numa_free_all_bootmem();
+#else
+       totalram_pages = free_all_bootmem();
+#endif
+
+       /* XEN: init pages outside initial allocation. */
+       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+               struct page *page = pfn_to_page(pfn);
+
+               ClearPageReserved(page);
+               init_page_count(page);
+       }
+
+       absent_pages = absent_pages_in_range(0, max_pfn);
+       reservedpages = max_pfn - totalram_pages - absent_pages;
+       after_bootmem = 1;
+
+       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+       /* Register memory areas for /proc/kcore */
+       kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
+                        VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);
+
+       /* absent_pages is unsigned long, hence %lu for that field. */
+       printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
+                        "%luk absent, %ldk reserved, %ldk data, %ldk init)\n",
+               nr_free_pages() << (PAGE_SHIFT-10),
+               max_pfn << (PAGE_SHIFT-10),
+               codesize >> 10,
+               absent_pages << (PAGE_SHIFT-10),
+               reservedpages << (PAGE_SHIFT-10),
+               datasize >> 10,
+               initsize >> 10);
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly;
+
+/*
+ * Make the range from _text up to __stop___ex_table writable again.
+ * Does nothing while kernel_set_to_readonly is still clear.
+ */
+void set_kernel_text_rw(void)
+{
+       unsigned long text_start, text_end;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       text_start = PFN_ALIGN(_text);
+       text_end = PFN_ALIGN(__stop___ex_table);
+
+       pr_debug("Set kernel text: %lx - %lx for read write\n",
+                text_start, text_end);
+
+       /*
+        * Make the kernel identity mapping for text RW. Kernel text
+        * mapping will always be RO. Refer to the comment in
+        * static_protections() in pageattr.c
+        */
+       set_memory_rw(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Write-protect the range from _text up to __stop___ex_table.
+ * Does nothing while kernel_set_to_readonly is still clear.
+ */
+void set_kernel_text_ro(void)
+{
+       unsigned long text_start, text_end;
+
+       if (!kernel_set_to_readonly)
+               return;
+
+       text_start = PFN_ALIGN(_text);
+       text_end = PFN_ALIGN(__stop___ex_table);
+
+       pr_debug("Set kernel text: %lx - %lx for read only\n",
+                text_start, text_end);
+
+       /* Set the kernel identity mapping for text RO. */
+       set_memory_ro(text_start, (text_end - text_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Write-protect everything from _text up to __end_rodata, mark the rodata
+ * part non-executable, run rodata_test(), and free the pages between the
+ * end of text and the start of rodata as well as between the end of
+ * rodata and the start of data.
+ */
+void mark_rodata_ro(void)
+{
+       unsigned long start = PFN_ALIGN(_text);
+       /* __start_rodata rounded up to a page boundary */
+       unsigned long rodata_start =
+               ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+       unsigned long end = (unsigned long) &__end_rodata;
+       unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
+       unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
+       unsigned long data_start = (unsigned long) &_sdata;
+
+       printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+              (end - start) >> 10);
+       set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+       /* From now on set_kernel_text_rw()/_ro() take effect. */
+       kernel_set_to_readonly = 1;
+
+       /*
+        * The rodata section (but not the kernel text!) should also be
+        * not-executable.
+        */
+       set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+
+       rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+       /* Exercise change_page_attr by undoing and redoing the protection. */
+       printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+       set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+       printk(KERN_INFO "Testing CPA: again\n");
+       set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
+
+       /* Release the gap [text_end, rodata_start). */
+       free_init_pages("unused kernel memory",
+                       (unsigned long) page_address(virt_to_page(text_end)),
+                       (unsigned long)
+                                page_address(virt_to_page(rodata_start)));
+       /* Release the gap [rodata_end, data_start). */
+       free_init_pages("unused kernel memory",
+                       (unsigned long) page_address(virt_to_page(rodata_end)),
+                       (unsigned long) page_address(virt_to_page(data_start)));
+}
+
+#endif
+
+/*
+ * Report whether the kernel virtual address 'addr' is backed by a valid
+ * page frame.  Walks the kernel page tables and returns 0 as soon as a
+ * level is empty; otherwise returns pfn_valid() of the mapped frame.
+ */
+int kern_addr_valid(unsigned long addr)
+{
+       /* Bits above __VIRTUAL_MASK_SHIFT, obtained by arithmetic shift. */
+       unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* The upper bits must be all zeros or all ones. */
+       if (above != 0 && above != -1UL)
+               return 0;
+
+#ifdef CONFIG_XEN
+       /*
+        * Don't walk page tables for hypervisor addresses, but allow
+        * the M2P table to be accessed through e.g. /proc/kcore.
+        */
+       if (addr >= (unsigned long)machine_to_phys_mapping &&
+           addr < (unsigned long)(machine_to_phys_mapping +
+                                  machine_to_phys_nr))
+               return 1;
+       if (addr >= HYPERVISOR_VIRT_START && addr < HYPERVISOR_VIRT_END)
+               return 0;
+#endif
+
+       pgd = pgd_offset_k(addr);
+       if (pgd_none(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, addr);
+       if (pud_none(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd))
+               return 0;
+
+       /* A large PMD maps the frame directly; there is no PTE level. */
+       if (pmd_large(*pmd))
+               return pfn_valid(pmd_pfn(*pmd));
+
+       pte = pte_offset_kernel(pmd, addr);
+       if (pte_none(*pte))
+               return 0;
+
+       return pfn_valid(pte_pfn(*pte));
+}
+
+/*
+ * A pseudo VMA to allow ptrace access for the vsyscall page.  It only
+ * covers the 64bit vsyscall page; 32bit has a real VMA and does not need
+ * special handling anymore.
+ */
+static struct vm_area_struct gate_vma = {
+       .vm_start       = VSYSCALL_START,
+       .vm_end         = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
+       .vm_page_prot   = PAGE_READONLY_EXEC,
+       .vm_flags       = VM_READ | VM_EXEC
+};
+
+/*
+ * Return the pseudo gate VMA for 'mm'.  With CONFIG_IA32_EMULATION, NULL
+ * is returned when 'mm' is NULL or its context is marked ia32_compat.
+ */
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+#ifdef CONFIG_IA32_EMULATION
+       if (!mm || mm->context.ia32_compat)
+               return NULL;
+#endif
+       return &gate_vma;
+}
+
+/*
+ * Return non-zero when 'addr' lies inside the gate VMA of 'mm', and 0
+ * when it does not or when 'mm' has no gate VMA.
+ */
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+       struct vm_area_struct *gate = get_gate_vma(mm);
+
+       return gate && addr >= gate->vm_start && addr < gate->vm_end;
+}
+
+/*
+ * Variant for callers without a reliable mm, typically interrupt
+ * context.  It only checks 'addr' against the vsyscall range, so it is
+ * less reliable than using a task's mm and may give false positives.
+ */
+int in_gate_area_no_mm(unsigned long addr)
+{
+       if (addr < VSYSCALL_START)
+               return 0;
+
+       return addr < VSYSCALL_END;
+}
+
+/*
+ * Name the architecture-specific VMAs: "[vdso]" for the VMA starting at
+ * the mm's vdso address, "[vsyscall]" for the gate VMA, NULL otherwise.
+ */
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       struct mm_struct *mm = vma->vm_mm;
+
+       if (mm && vma->vm_start == (long)mm->context.vdso)
+               return "[vdso]";
+
+       return vma == &gate_vma ? "[vsyscall]" : NULL;
+}
+
+#ifdef CONFIG_X86_UV
+/*
+ * Memory block size: 2GB on UV systems (announced via printk),
+ * MIN_MEMORY_BLOCK_SIZE everywhere else.
+ */
+unsigned long memory_block_size_bytes(void)
+{
+       if (!is_uv_system())
+               return MIN_MEMORY_BLOCK_SIZE;
+
+       printk(KERN_INFO "UV: memory block size 2GB\n");
+       return 2UL * 1024 * 1024 * 1024;
+}
+#endif
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
+/*
+ * Back the vmemmap range covering 'size' struct pages starting at
+ * 'start_page' with memory allocated for 'node'.  Without cpu_has_pse
+ * the range is populated page by page; otherwise one PMD_SIZE block is
+ * allocated per empty PMD.  The file-static addr_start/addr_end,
+ * p_start/p_end and node_start track the current run of contiguous
+ * blocks so that it can be reported with a single printk.
+ *
+ * Returns 0 on success or -ENOMEM when any allocation fails.
+ */
+int __meminit
+vmemmap_populate(struct page *start_page, unsigned long size, int node)
+{
+       unsigned long addr = (unsigned long)start_page;
+       unsigned long end = (unsigned long)(start_page + size);
+       unsigned long next;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       for (; addr < end; addr = next) {
+               void *p = NULL;
+
+               pgd = vmemmap_pgd_populate(addr, node);
+               if (!pgd)
+                       return -ENOMEM;
+
+               pud = vmemmap_pud_populate(pgd, addr, node);
+               if (!pud)
+                       return -ENOMEM;
+
+               if (!cpu_has_pse) {
+                       /* Populate a single page at PTE granularity. */
+                       next = (addr + PAGE_SIZE) & PAGE_MASK;
+                       pmd = vmemmap_pmd_populate(pud, addr, node);
+
+                       if (!pmd)
+                               return -ENOMEM;
+
+                       p = vmemmap_pte_populate(pmd, addr, node);
+
+                       if (!p)
+                               return -ENOMEM;
+
+                       addr_end = addr + PAGE_SIZE;
+                       p_end = p + PAGE_SIZE;
+               } else {
+                       next = pmd_addr_end(addr, end);
+
+                       pmd = pmd_offset(pud, addr);
+                       if (pmd_none(*pmd)) {
+                               pte_t entry;
+
+                               p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+                               if (!p)
+                                       return -ENOMEM;
+
+                               /* Install the block as one large PMD mapping. */
+                               entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+                                               PAGE_KERNEL_LARGE);
+                               set_pmd(pmd, __pmd_ma(__pte_val(entry)));
+
+                               /* check to see if we have contiguous blocks */
+                               if (p_end != p || node_start != node) {
+                                       /* Report the finished run, start a new one. */
+                                       if (p_start)
+                                               printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+                                                      addr_start, addr_end-1, p_start, p_end-1, node_start);
+                                       addr_start = addr;
+                                       node_start = node;
+                                       p_start = p;
+                               }
+
+                               addr_end = addr + PMD_SIZE;
+                               p_end = p + PMD_SIZE;
+                       } else
+                               /* PMD already populated: only verify it. */
+                               vmemmap_verify((pte_t *)pmd, node, addr, next);
+               }
+
+       }
+       sync_global_pgds((unsigned long)start_page, end);
+       return 0;
+}
+
+/*
+ * Print the run of vmemmap blocks that is still pending, if any, and
+ * reset the tracking state.
+ */
+void __meminit vmemmap_populate_print_last(void)
+{
+       if (!p_start)
+               return;
+
+       printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+               addr_start, addr_end-1, p_start, p_end-1, node_start);
+
+       p_start = NULL;
+       p_end = NULL;
+       node_start = 0;
+}
+#endif
diff --git a/arch/x86/mm/iomap_32-xen.c b/arch/x86/mm/iomap_32-xen.c

new file mode 100644 (file)

index 0000000..bbd4134
--- /dev/null
+++ b/arch/x86/mm/iomap_32-xen.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/iomap.h>
+#include <asm/pat.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+
+/*
+ * Return 1 if the range [base, base + size) can be mapped, 0 otherwise.
+ * The only restriction checked applies to non-PAE builds with a 64-bit
+ * phys_addr_t, where the range must not extend beyond 1 << 32.
+ */
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
+       /* There is no way to map greater than 1 << 32 address without PAE */
+       if (base + size > 0x100000000ULL)
+               return 0;
+#endif
+       return 1;
+}
+
+/*
+ * Reserve the memory type for [base, base + size), requesting
+ * _PAGE_CACHE_WC, and store the resulting kernel page protection in
+ * *prot.
+ *
+ * Returns 0 on success, -EINVAL if the range cannot be mapped, or the
+ * error from io_reserve_memtype() (in which case *prot is untouched).
+ */
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+       unsigned long memtype = _PAGE_CACHE_WC;
+       int err;
+
+       if (!is_io_mapping_possible(base, size))
+               return -EINVAL;
+
+       err = io_reserve_memtype(base, base + size, &memtype);
+       if (!err)
+               *prot = __pgprot(__PAGE_KERNEL | memtype);
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+/*
+ * Release the memory type reservation for [base, base + size) via
+ * io_free_memtype().
+ */
+void iomap_free(resource_size_t base, unsigned long size)
+{
+       io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
+
+/*
+ * Map page frame 'pfn' with protection 'prot' into a per-CPU atomic kmap
+ * fixmap slot and return its virtual address.  Page faults are disabled
+ * on entry and stay disabled on return; iounmap_atomic() in this file
+ * re-enables them.
+ */
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+       unsigned long vaddr;
+       int idx, type;
+
+       pagefault_disable();
+
+       /* Slot index: per-CPU base plus the index handed out by the push. */
+       type = kmap_atomic_idx_push();
+       idx = type + KM_TYPE_NR * smp_processor_id();
+       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+       set_pte_at(&init_mm, vaddr, kmap_pte - idx, pfn_pte(pfn, prot));
+       /*arch_flush_lazy_mmu_mode();*/
+
+       return (void *)vaddr;
+}
+
+/*
+ * Map 'mfn' using protections 'prot'.
+ *
+ * _PAGE_IOMAP is always added to 'prot'; the mapping itself is set up by
+ * kmap_atomic_prot_pfn(), whose return value is handed back as an
+ * __iomem pointer.
+ */
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long mfn, pgprot_t prot)
+{
+       /*
+        * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+        * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+        * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
+        * user, which is "WC if the MTRR is WC, UC if you can't do that."
+        */
+       if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
+               prot = PAGE_KERNEL_UC_MINUS;
+
+       pgprot_val(prot) |= _PAGE_IOMAP;
+       return (void __force __iomem *) kmap_atomic_prot_pfn(mfn, prot);
+}
+EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
+
+/*
+ * Undo an atomic I/O mapping.  If 'kvaddr' falls inside the atomic kmap
+ * fixmap range, its PTE is cleared and flushed and the kmap index is
+ * popped.  Page faults are re-enabled in every case.
+ */
+void
+iounmap_atomic(void __iomem *kvaddr)
+{
+       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+       /* FIX_KMAP_END yields the lower bound, FIX_KMAP_BEGIN the upper. */
+       if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+           vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+               int idx, type;
+
+               type = kmap_atomic_idx();
+               idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+               WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+               /*
+                * Force other mappings to Oops if they try to access this
+                * pte without first remapping it.  Keeping stale mappings
+                * around is also a bad idea, in case the page changes
+                * cacheability attributes or becomes a protected page in a
+                * hypervisor.
+                */
+               kpte_clear_flush(kmap_pte-idx, vaddr);
+               kmap_atomic_idx_pop();
+       }
+
+       pagefault_enable();
+}
+EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap-xen.c b/arch/x86/mm/ioremap-xen.c

new file mode 100644 (file)

index 0000000..5e5e402
--- /dev/null
+++ b/arch/x86/mm/ioremap-xen.c
@@ -0,0 +1,827 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pat.h>
+
+#include "physaddr.h"
+
+/*
+ * apply_to_page_range() callback used by __direct_remap_pfn_range().
+ * 'data' points to a cursor into an array of mmu_update_t requests; the
+ * callback stores the machine address of the PTE slot 'pte' in the
+ * current request's ptr field and advances the cursor.  The PTE must
+ * still be empty.  Always returns 0.
+ */
+static int direct_remap_area_pte_fn(pte_t *pte,
+                                   struct page *pmd_page,
+                                   unsigned long address,
+                                   void *data)
+{
+       mmu_update_t **v = (mmu_update_t **)data;
+
+       BUG_ON(!pte_none(*pte));
+
+       /* Machine frame of the page table page plus the slot's offset in it. */
+       (*v)->ptr = ((u64)pfn_to_mfn(page_to_pfn(pmd_page)) <<
+                    PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
+       (*v)++;
+
+       return 0;
+}
+
+/*
+ * Map 'size' bytes of machine frames starting at 'mfn' at virtual address
+ * 'address' in 'mm'.  'domid' is passed through to
+ * HYPERVISOR_mmu_update().
+ *
+ * The new PTE values are collected in a one-page batch of mmu_update_t
+ * requests (u = batch start, v = next free request, w = cursor used by
+ * the PTE-pointer callback).  Whenever the batch is full, and once more
+ * for the remainder, apply_to_page_range() fills in the machine address
+ * of every PTE slot and the batch is submitted to the hypervisor.
+ *
+ * Returns -ENOMEM if the batch page cannot be allocated; otherwise the
+ * result of the first failing apply_to_page_range() or
+ * HYPERVISOR_mmu_update() call, or of the final hypercall (0 when
+ * 'size' is 0).
+ */
+static int __direct_remap_pfn_range(struct mm_struct *mm,
+                                   unsigned long address,
+                                   phys_addr_t mfn,
+                                   unsigned long size,
+                                   pgprot_t prot,
+                                   domid_t  domid)
+{
+       int rc = 0;
+       unsigned long i, start_address;
+       mmu_update_t *u, *v, *w;
+
+       u = v = w = (mmu_update_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+       if (u == NULL)
+               return -ENOMEM;
+
+       start_address = address;
+
+       /* Every PTE written here carries _PAGE_IOMAP; set it once up front. */
+       pgprot_val(prot) |= _PAGE_IOMAP;
+
+       flush_cache_all();
+
+       for (i = 0; i < size; i += PAGE_SIZE) {
+               if ((v - u) == (PAGE_SIZE / sizeof(mmu_update_t))) {
+                       /* Flush a full batch after filling in the PTE ptrs. */
+                       rc = apply_to_page_range(mm, start_address,
+                                                address - start_address,
+                                                direct_remap_area_pte_fn, &w);
+                       if (rc)
+                               goto out;
+                       rc = HYPERVISOR_mmu_update(u, v - u, NULL, domid);
+                       if (rc < 0)
+                               goto out;
+                       /* Start a new, empty batch at the current address. */
+                       v = w = u;
+                       start_address = address;
+               }
+
+               /*
+                * Fill in the machine address: PTE ptr is done later by
+                * apply_to_page_range().
+                */
+               v->val = __pte_val(pte_mkspecial(pfn_pte_ma(mfn, prot)));
+
+               mfn++;
+               address += PAGE_SIZE;
+               v++;
+       }
+
+       if (v != u) {
+               /* Final batch. */
+               rc = apply_to_page_range(mm, start_address,
+                                        address - start_address,
+                                        direct_remap_area_pte_fn, &w);
+               if (rc)
+                       goto out;
+               rc = HYPERVISOR_mmu_update(u, v - u, NULL, domid);
+       }
+
+ out:
+       /* Reached on success and on failure alike. */
+       flush_tlb_all();
+
+       free_page((unsigned long)u);
+
+       return rc;
+}
+
+/*
+ * Map 'size' bytes of machine frames starting at 'mfn', belonging to
+ * domain 'domid', at 'address' within 'vma'.
+ *
+ * With an auto-translated physmap this is a plain remap_pfn_range().
+ * Otherwise DOMID_SELF is rejected with -EINVAL, the VMA is flagged
+ * VM_IO | VM_RESERVED | VM_PFNMAP, the mm is marked as having foreign
+ * mappings, and the work is done by __direct_remap_pfn_range().
+ */
+int direct_remap_pfn_range(struct vm_area_struct *vma,
+                          unsigned long address,
+                          phys_addr_t mfn,
+                          unsigned long size,
+                          pgprot_t prot,
+                          domid_t  domid)
+{
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return remap_pfn_range(vma, address, mfn, size, prot);
+
+       if (domid == DOMID_SELF)
+               return -EINVAL;
+
+       vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+
+       vma->vm_mm->context.has_foreign_mappings = 1;
+
+       return __direct_remap_pfn_range(
+               vma->vm_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_remap_pfn_range);
+
+/*
+ * Kernel-space variant of direct_remap_pfn_range(): performs the mapping
+ * in init_mm and applies none of the VMA or domid checks.
+ */
+int direct_kernel_remap_pfn_range(unsigned long address,
+                                 unsigned long mfn,
+                                 unsigned long size,
+                                 pgprot_t prot,
+                                 domid_t  domid)
+{
+       return __direct_remap_pfn_range(
+               &init_mm, address, mfn, size, prot, domid);
+}
+EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
+
+/*
+ * apply_to_page_range() callback: when 'data' is non-NULL, store there
+ * (as a uint64_t) the machine address of the PTE slot 'pte'.  Always
+ * returns 0.
+ */
+static int lookup_pte_fn(pte_t *pte, struct page *pmd_page,
+                        unsigned long addr, void *data)
+{
+       uint64_t *out = data;
+       uint64_t table_maddr;
+
+       if (!out)
+               return 0;
+
+       /* Machine address of the page table page, then the slot offset. */
+       table_maddr = (uint64_t)pfn_to_mfn(page_to_pfn(pmd_page)) << PAGE_SHIFT;
+       *out = table_maddr | ((unsigned long)pte & ~PAGE_MASK);
+
+       return 0;
+}
+
+/*
+ * Run lookup_pte_fn() over the single page at 'address' in 'mm' through
+ * apply_to_page_range(), so that *ptep (if 'ptep' is non-NULL) receives
+ * the machine address of the PTE slot.  Returns the result of
+ * apply_to_page_range().
+ */
+int create_lookup_pte_addr(struct mm_struct *mm,
+                          unsigned long address,
+                          uint64_t *ptep)
+{
+       return apply_to_page_range(mm, address, PAGE_SIZE,
+                                  lookup_pte_fn, ptep);
+}
+
+EXPORT_SYMBOL(create_lookup_pte_addr);
+
+#ifdef CONFIG_MODULES
+/*
+ * Force the implementation of ioremap_page_range() to be pulled in from
+ * lib/lib.a even if there is no other reference from the core kernel to it
+ * (native uses it in __ioremap_caller()), so that it gets exported.
+ */
+static void *const __section(.discard.ioremap) __used
+_ioremap_page_range = ioremap_page_range;
+#endif
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts: apply the cache mode selected by 'prot_val' to the
+ * 'size' >> PAGE_SHIFT pages starting at 'vaddr'.  Anything other than
+ * WC or WB is treated as UC.
+ */
+static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+                              unsigned long prot_val)
+{
+       unsigned long nrpages = size >> PAGE_SHIFT;
+
+       if (prot_val == _PAGE_CACHE_WC)
+               return _set_memory_wc(vaddr, nrpages);
+
+       if (prot_val == _PAGE_CACHE_WB)
+               return _set_memory_wb(vaddr, nrpages);
+
+       /* _PAGE_CACHE_UC and every other value */
+       return _set_memory_uc(vaddr, nrpages);
+}
+
+/*
+ * For each page of the 'size' bytes of machine frames starting at 'mfn',
+ * change the cache attribute of the kernel mapping of the corresponding
+ * local pfn to 'prot_val'.  Only pfns below max_low_pfn_mapped, or from
+ * the 4GB boundary up to max_pfn_mapped, are touched; all others are
+ * skipped.  Stops at, and returns, the first error; 0 otherwise.
+ */
+int ioremap_check_change_attr(unsigned long mfn, unsigned long size,
+                             unsigned long prot_val)
+{
+       unsigned long done = 0;
+       int err = 0;
+
+       while (done < size && !err) {
+               unsigned long pfn = mfn_to_local_pfn(mfn);
+
+               if (pfn < max_low_pfn_mapped ||
+                   (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped))
+                       err = ioremap_change_attr(
+                               (unsigned long)__va(pfn << PAGE_SHIFT),
+                               PAGE_SIZE, prot_val);
+
+               ++mfn;
+               done += PAGE_SIZE;
+       }
+
+       return err;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ *
+ * 'prot_val' is the requested cache mode and 'caller' is recorded with
+ * the vm area.  Returns the mapped address (including the sub-page
+ * offset of 'phys_addr'), or NULL on any failure.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+               unsigned long size, unsigned long prot_val, void *caller)
+{
+       unsigned long offset, vaddr;
+       phys_addr_t mfn, last_mfn, last_addr;
+       const resource_size_t unaligned_phys_addr = phys_addr;
+       const unsigned long unaligned_size = size;
+       struct vm_struct *area;
+       unsigned long new_prot_val;
+       pgprot_t prot;
+       int retval;
+       domid_t domid = DOMID_IO;
+       void __iomem *ret_addr;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+       if (!phys_addr_valid(phys_addr)) {
+               printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+                      (unsigned long long)phys_addr);
+               WARN_ON_ONCE(1);
+               return NULL;
+       }
+
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (is_initial_xendomain() && is_ISA_range(phys_addr, last_addr))
+               return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
+       /*
+        * Don't allow anybody to remap normal RAM that we're using..
+        *
+        * A frame that translates to a valid local pfn is only accepted
+        * when its page is reserved; in that case the mapping is made
+        * against DOMID_SELF instead of DOMID_IO, which is warned about
+        * once below.
+        */
+       last_mfn = PFN_DOWN(last_addr);
+       for (mfn = PFN_DOWN(phys_addr); mfn <= last_mfn; mfn++) {
+               unsigned long pfn = mfn_to_local_pfn(mfn);
+
+               if (pfn_valid(pfn)) {
+                       if (!PageReserved(pfn_to_page(pfn)))
+                               return NULL;
+                       domid = DOMID_SELF;
+               }
+       }
+       WARN_ON_ONCE(domid == DOMID_SELF);
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PHYSICAL_PAGE_MASK;
+       size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+       retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+                                               prot_val, &new_prot_val);
+       if (retval) {
+               printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+               return NULL;
+       }
+
+       /*
+        * reserve_memtype() may hand back a different cache mode than the
+        * one requested; adopt it only if is_new_memtype_allowed() agrees.
+        */
+       if (prot_val != new_prot_val) {
+               if (!is_new_memtype_allowed(phys_addr, size,
+                                           prot_val, new_prot_val)) {
+                       printk(KERN_ERR
+               "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+                               (unsigned long long)phys_addr,
+                               (unsigned long long)(phys_addr + size),
+                               prot_val, new_prot_val);
+                       goto err_free_memtype;
+               }
+               prot_val = new_prot_val;
+       }
+
+       /* Translate the cache mode into the page protection to map with. */
+       switch (prot_val) {
+       case _PAGE_CACHE_UC:
+       default:
+               prot = PAGE_KERNEL_IO_NOCACHE;
+               break;
+       case _PAGE_CACHE_UC_MINUS:
+               prot = PAGE_KERNEL_IO_UC_MINUS;
+               break;
+       case _PAGE_CACHE_WC:
+               prot = PAGE_KERNEL_IO_WC;
+               break;
+       case _PAGE_CACHE_WB:
+               prot = PAGE_KERNEL_IO;
+               break;
+       }
+
+       /*
+        * Ok, go for it..
+        */
+       area = get_vm_area_caller(size, VM_IOREMAP, caller);
+       if (!area)
+               goto err_free_memtype;
+       area->phys_addr = phys_addr;
+       vaddr = (unsigned long) area->addr;
+
+       if (kernel_map_sync_memtype(phys_addr, size, prot_val))
+               goto err_free_area;
+
+       if (__direct_remap_pfn_range(&init_mm, vaddr, PFN_DOWN(phys_addr),
+                                    size, prot, domid))
+               goto err_free_area;
+
+       /* Re-apply the sub-page offset stripped off above. */
+       ret_addr = (void __iomem *) (vaddr + offset);
+       mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+       /*
+        * Check if the request spans more than any BAR in the iomem resource
+        * tree.
+        */
+       WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+                 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
+       return ret_addr;
+       /* Error unwinding: release in reverse order of acquisition. */
+err_free_area:
+       free_vm_area(area);
+err_free_memtype:
+       free_memtype(phys_addr, phys_addr + size);
+       return NULL;
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncacheable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+       /*
+        * Ideally, this should be:
+        *      pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
+        *
+        * Till we fix all X drivers to use ioremap_wc(), we will use
+        * UC MINUS.
+        */
+       unsigned long val = _PAGE_CACHE_UC_MINUS;
+
+       return __ioremap_caller(phys_addr, size, val,
+                               __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * ioremap_wc  -       map memory into CPU space write combined
+ * @phys_addr: bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * Maps the memory write combining, which allows faster writes to some
+ * hardware devices. If pat_enabled is clear, ioremap_nocache() is used.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
+{
+       if (pat_enabled)
+               return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+                                       __builtin_return_address(0));
+       else
+               return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+       return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+                               __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+                               unsigned long prot_val)
+{
+       return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
+                               __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+       struct vm_struct *p, *o;
+
+       if ((void __force *)addr <= high_memory)
+               return;
+
+       /*
+        * __ioremap special-cases the PCI/ISA range by not instantiating a
+        * vm_area and by simply returning an address into the kernel mapping
+        * of ISA space.   So handle that here.
+        */
+       if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+               return;
+
+       addr = (volatile void __iomem *)
+               (PAGE_MASK & (unsigned long __force)addr);
+
+       mmiotrace_iounmap(addr);
+
+       /* Use the vm area unlocked, assuming the caller
+          ensures there isn't another iounmap for the same address
+          in parallel. Reuse of the virtual address is prevented by
+          leaving it in the global lists until we're done with it.
+          cpa takes care of the direct mappings. */
+       read_lock(&vmlist_lock);
+       for (p = vmlist; p; p = p->next) {
+               if (p->addr == (void __force *)addr)
+                       break;
+       }
+       read_unlock(&vmlist_lock);
+
+       if (!p) {
+               printk(KERN_ERR "iounmap: bad address %p\n", addr);
+               dump_stack();
+               return;
+       }
+
+       free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
+       /* Finally remove it */
+       o = remove_vm_area((void __force *)addr);
+       BUG_ON(p != o || o == NULL);
+       kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+#ifndef CONFIG_XEN
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+       void *addr;
+       unsigned long start = phys & PAGE_MASK;
+
+       /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+       if (page_is_ram(start >> PAGE_SHIFT))
+               return __va(phys);
+
+       addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
+       if (addr)
+               addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+       return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+       if (page_is_ram(phys >> PAGE_SHIFT))
+               return;
+
+       iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+       return;
+}
+#endif
+
+static int __initdata early_ioremap_debug;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+       early_ioremap_debug = 1;
+
+       return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static __initdata int after_paging_init;
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
+
+#ifdef CONFIG_X86_32
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+       /* Don't assume we're using swapper_pg_dir at this point */
+       pgd_t *base = __va(read_cr3());
+       pgd_t *pgd = &base[pgd_index(addr)];
+       pud_t *pud = pud_offset(pgd, addr);
+       pmd_t *pmd = pmd_offset(pud, addr);
+
+       return pmd;
+}
+#else
+#define early_ioremap_pmd early_get_pmd
+#undef make_lowmem_page_readonly
+#define make_lowmem_page_readonly early_make_page_readonly
+#endif
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+       return &bm_pte[pte_index(addr)];
+}
+
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+       return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_init(void)
+{
+       pmd_t *pmd;
+       int i;
+
+       if (early_ioremap_debug)
+               printk(KERN_INFO "early_ioremap_init()\n");
+
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+               slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
+       pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+       memset(bm_pte, 0, sizeof(bm_pte));
+       make_lowmem_page_readonly(bm_pte, XENFEAT_writable_page_tables);
+       pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+       /*
+        * The boot-ioremap range spans multiple pmds, for which
+        * we are not prepared:
+        */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+       BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+                    != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
+       if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+               WARN_ON(1);
+               printk(KERN_WARNING "pmd %p != %p\n",
+                      pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+                       fix_to_virt(FIX_BTMAP_BEGIN));
+               printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
+                       fix_to_virt(FIX_BTMAP_END));
+
+               printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
+               printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
+                      FIX_BTMAP_BEGIN);
+       }
+}
+
+void __init early_ioremap_reset(void)
+{
+       after_paging_init = 1;
+}
+
+static void __init __early_set_fixmap(enum fixed_addresses idx,
+                                     phys_addr_t phys, pgprot_t flags)
+{
+       unsigned long addr = __fix_to_virt(idx);
+       pte_t *pte;
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       pte = early_ioremap_pte(addr);
+
+       if (pgprot_val(flags))
+               set_pte(pte, pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+       else
+               pte_clear(&init_mm, addr, pte);
+       __flush_tlb_one(addr);
+}
+
+static inline void __init early_set_fixmap(enum fixed_addresses idx,
+                                          phys_addr_t phys, pgprot_t prot)
+{
+       if (after_paging_init)
+               __set_fixmap(idx, phys, prot);
+       else
+               __early_set_fixmap(idx, phys, prot);
+}
+
+static inline void __init early_clear_fixmap(enum fixed_addresses idx)
+{
+       if (after_paging_init)
+               clear_fixmap(idx);
+       else
+               __early_set_fixmap(idx, 0, __pgprot(0));
+}
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
+#ifndef CONFIG_XEN
+void __init fixup_early_ioremap(void)
+{
+       int i;
+
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+               if (prev_map[i]) {
+                       WARN_ON(1);
+                       break;
+               }
+       }
+
+       early_ioremap_init();
+}
+#endif
+
+static int __init check_early_ioremap_leak(void)
+{
+       int count = 0;
+       int i;
+
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+               if (prev_map[i])
+                       count++;
+
+       if (!count)
+               return 0;
+       WARN(1, KERN_WARNING
+              "Debug warning: early ioremap leak of %d areas detected.\n",
+               count);
+       printk(KERN_WARNING
+               "please boot with early_ioremap_debug and report the dmesg.\n");
+
+       return 1;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+       unsigned long offset;
+       resource_size_t last_addr;
+       unsigned int nrpages;
+       enum fixed_addresses idx0, idx;
+       int i, slot;
+
+       WARN_ON(system_state != SYSTEM_BOOTING);
+
+       slot = -1;
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+               if (!prev_map[i]) {
+                       slot = i;
+                       break;
+               }
+       }
+
+       if (slot < 0) {
+               printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
+                        (u64)phys_addr, size);
+               WARN_ON(1);
+               return NULL;
+       }
+
+       if (early_ioremap_debug) {
+               printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
+                      (u64)phys_addr, size, slot);
+               dump_stack();
+       }
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr) {
+               WARN_ON(1);
+               return NULL;
+       }
+
+       prev_size[slot] = size;
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+       /*
+        * Mappings have to fit in the FIX_BTMAP area.
+        */
+       nrpages = size >> PAGE_SHIFT;
+       if (nrpages > NR_FIX_BTMAPS) {
+               WARN_ON(1);
+               return NULL;
+       }
+
+       /*
+        * Ok, go for it..
+        */
+       idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+       idx = idx0;
+       while (nrpages > 0) {
+               early_set_fixmap(idx, phys_addr, prot);
+               phys_addr += PAGE_SIZE;
+               --idx;
+               --nrpages;
+       }
+       if (early_ioremap_debug)
+               printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
+
+       prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+       return prev_map[slot];
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+       /*
+        * In the initial domain, use the existing low PCI/ISA mapping.
+        */
+       if (is_initial_xendomain() && is_ISA_range(phys_addr, phys_addr + size - 1))
+               return (__force void __iomem *)isa_bus_to_virt((unsigned long)phys_addr);
+
+       return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
+}
+
+/* Remap memory */
+void __init __iomem *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+       return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL);
+}
+
+void __init __iomem *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+       return __early_ioremap(phys_to_machine(phys_addr), size, PAGE_KERNEL_RO);
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+       unsigned long virt_addr;
+       unsigned long offset;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+       int i, slot;
+
+       /*
+        * early_ioremap special-cases the PCI/ISA range by not instantiating a
+        * vm_area and by simply returning an address into the kernel mapping
+        * of ISA space.   So handle that here.
+        */
+       if ((unsigned long)addr >= fix_to_virt(FIX_ISAMAP_BEGIN)
+           && (unsigned long)addr < fix_to_virt(FIX_ISAMAP_END - 1))
+               return;
+
+       slot = -1;
+       for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+               if (prev_map[i] == addr) {
+                       slot = i;
+                       break;
+               }
+       }
+
+       if (slot < 0) {
+               printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
+                        addr, size);
+               WARN_ON(1);
+               return;
+       }
+
+       if (prev_size[slot] != size) {
+               printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+                        addr, size, slot, prev_size[slot]);
+               WARN_ON(1);
+               return;
+       }
+
+       if (early_ioremap_debug) {
+               printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
+                      size, slot);
+               dump_stack();
+       }
+
+       virt_addr = (unsigned long)addr;
+       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
+               WARN_ON(1);
+               return;
+       }
+       offset = virt_addr & ~PAGE_MASK;
+       nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+       idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+       while (nrpages > 0) {
+               early_clear_fixmap(idx);
+               --idx;
+               --nrpages;
+       }
+       prev_map[slot] = NULL;
+}
diff --git a/arch/x86/mm/pageattr-xen.c b/arch/x86/mm/pageattr-xen.c

new file mode 100644 (file)

index 0000000..e22daa8
--- /dev/null
+++ b/arch/x86/mm/pageattr-xen.c
@@ -0,0 +1,1545 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pfn.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pat.h>
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+       unsigned long   *vaddr;
+       pgprot_t        mask_set;
+       pgprot_t        mask_clr;
+       int             numpages;
+       int             flags;
+       unsigned long   pfn;
+       unsigned        force_split : 1;
+       int             curpage;
+       struct page     **pages;
+};
+
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
+
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+       /* Protect against CPA */
+       spin_lock(&pgd_lock);
+       direct_pages_count[level] += pages;
+       spin_unlock(&pgd_lock);
+}
+
+static void split_page_count(int level)
+{
+       direct_pages_count[level]--;
+       direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+void arch_report_meminfo(struct seq_file *m)
+{
+       seq_printf(m, "DirectMap4k:    %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+       seq_printf(m, "DirectMap2M:    %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+       seq_printf(m, "DirectMap4M:    %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+#ifdef CONFIG_X86_64
+       if (direct_gbpages)
+               seq_printf(m, "DirectMap1G:    %8lu kB\n",
+                       direct_pages_count[PG_LEVEL_1G] << 20);
+#endif
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+       return __pa(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+       return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+       return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr:     virtual start address
+ * @size:      number of bytes to flush
+ *
+ * clflush is an unordered instruction which needs fencing with mfence
+ * to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+       void *vend = vaddr + size - 1;
+
+       mb();
+
+       for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+               clflush(vaddr);
+       /*
+        * Flush any possible final partial cacheline:
+        */
+       clflush(vend);
+
+       mb();
+}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
+
+static void __cpa_flush_all(void *arg)
+{
+       unsigned long cache = (unsigned long)arg;
+
+       /*
+        * Flush all to work around Errata in early athlons regarding
+        * large page flushing.
+        */
+       __flush_tlb_all();
+
+       if (cache && boot_cpu_data.x86 >= 4)
+               wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+       BUG_ON(irqs_disabled());
+
+       on_each_cpu(__cpa_flush_all, (void *) cache, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+       /*
+        * We could optimize that further and do individual per page
+        * tlb invalidates for a low number of pages. Caveat: we must
+        * flush the high aliases on 64bit as well.
+        */
+       __flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+       unsigned int i, level;
+       unsigned long addr;
+
+       BUG_ON(irqs_disabled());
+       WARN_ON(PAGE_ALIGN(start) != start);
+
+       on_each_cpu(__cpa_flush_range, NULL, 1);
+
+       if (!cache)
+               return;
+
+       /*
+        * We only need to flush on one CPU,
+        * clflush is a MESI-coherent instruction that
+        * will cause all other CPUs to flush the same
+        * cachelines:
+        */
+       for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+               pte_t *pte = lookup_address(addr, &level);
+
+               /*
+                * Only flush present addresses:
+                */
+               if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
+                       clflush_cache_range((void *) addr, PAGE_SIZE);
+       }
+}
+
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+                           int in_flags, struct page **pages)
+{
+       unsigned int i, level;
+       unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
+
+       BUG_ON(irqs_disabled());
+
+       on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
+
+       if (!cache || do_wbinvd)
+               return;
+
+       /*
+        * We only need to flush on one CPU,
+        * clflush is a MESI-coherent instruction that
+        * will cause all other CPUs to flush the same
+        * cachelines:
+        */
+       for (i = 0; i < numpages; i++) {
+               unsigned long addr;
+               pte_t *pte;
+
+               if (in_flags & CPA_PAGES_ARRAY)
+                       addr = (unsigned long)page_address(pages[i]);
+               else
+                       addr = start[i];
+
+               pte = lookup_address(addr, &level);
+
+               /*
+                * Only flush present addresses:
+                */
+               if (pte && (__pte_val(*pte) & _PAGE_PRESENT))
+                       clflush_cache_range((void *)addr, PAGE_SIZE);
+       }
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+                                  unsigned long pfn)
+{
+       pgprot_t forbidden = __pgprot(0);
+
+       /*
+        * The BIOS area between 640k and 1Mb needs to be executable for
+        * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+        */
+#ifdef CONFIG_PCI_BIOS
+       if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+               pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+       /*
+        * The kernel text needs to be executable for obvious reasons.
+        * Does not cover __inittext since that is gone later on. On
+        * 64bit we do not enforce !NX on the low mapping
+        */
+       if (within(address, (unsigned long)_text, (unsigned long)_etext))
+               pgprot_val(forbidden) |= _PAGE_NX;
+
+       /*
+        * The .rodata section needs to be read-only. Using the pfn
+        * catches all aliases.
+        */
+       if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
+                  __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
+               pgprot_val(forbidden) |= _PAGE_RW;
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) && !defined(CONFIG_XEN)
+       /*
+        * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+        * kernel text mappings for the large page aligned text, rodata sections
+        * will be always read-only. The kernel identity mappings covering
+        * the holes caused by this alignment can be anything the user asks.
+        *
+        * This will preserve the large page mappings for kernel text/data
+        * at no extra cost.
+        */
+       if (kernel_set_to_readonly &&
+           within(address, (unsigned long)_text,
+                  (unsigned long)__end_rodata_hpage_align)) {
+               unsigned int level;
+
+               /*
+                * Don't enforce the !RW mapping for the kernel text mapping,
+                * if the current mapping is already using small page mapping.
+                * No need to work hard to preserve large page mappings in this
+                * case.
+                *
+                * This also fixes the Linux Xen paravirt guest boot failure
+                * (because of unexpected read-only mappings for kernel identity
+                * mappings). In this paravirt guest case, the kernel text
+                * mapping and the kernel identity mapping share the same
+                * page-table pages. Thus we can't really use different
+                * protections for the kernel text and identity mappings. Also,
+                * these shared mappings are made of small page mappings.
+                * Thus not enforcing the !RW mapping for small page kernel
+                * text mappings helps the Linux Xen paravirt guest boot
+                * as well.
+                */
+               if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+                       pgprot_val(forbidden) |= _PAGE_RW;
+       }
+#endif
+
+       prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+       return prot;
+}
+
+/*
+ * Lookup the page table entry for a kernel virtual address. Returns the
+ * entry, or NULL if a level is none; the mapping level is stored in *level.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a non-existent mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+       pgd_t *pgd = pgd_offset_k(address);
+       pud_t *pud;
+       pmd_t *pmd;
+
+       *level = PG_LEVEL_NONE;
+
+       if (pgd_none(*pgd))
+               return NULL;
+
+       pud = pud_offset(pgd, address);
+       if (pud_none(*pud))
+               return NULL;
+
+       *level = PG_LEVEL_1G;
+       if (pud_large(*pud) || !pud_present(*pud))
+               return (pte_t *)pud;
+
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd))
+               return NULL;
+
+       *level = PG_LEVEL_2M;
+       if (pmd_large(*pmd) || !pmd_present(*pmd))
+               return (pte_t *)pmd;
+
+       *level = PG_LEVEL_4K;
+
+       return pte_offset_kernel(pmd, address);
+}
+EXPORT_SYMBOL_GPL(lookup_address);
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address,
+                         unsigned int level, pte_t pte)
+{
+       /* change init_mm */
+       switch(level) {
+       case PG_LEVEL_2M:
+               xen_l2_entry_update((pmd_t *)kpte, __pmd_ma(__pte_val(pte)));
+               break;
+#ifdef CONFIG_X86_64
+       case PG_LEVEL_1G:
+               xen_l3_entry_update((pud_t *)kpte, __pud_ma(__pte_val(pte)));
+               break;
+#endif
+       default:
+               BUG();
+       }
+#ifdef CONFIG_X86_32
+       if (!SHARED_KERNEL_PMD) {
+               struct page *page;
+
+               list_for_each_entry(page, &pgd_list, lru) {
+                       pgd_t *pgd;
+                       pud_t *pud;
+                       pmd_t *pmd;
+
+                       pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                       pud = pud_offset(pgd, address);
+                       pmd = pmd_offset(pud, address);
+                       xen_l2_entry_update(pmd, __pmd_ma(__pte_val(pte)));
+               }
+       }
+#endif
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+                       struct cpa_data *cpa)
+{
+       unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
+       pte_t new_pte, old_pte, *tmp;
+       pgprot_t old_prot, new_prot, req_prot;
+       int i, do_split = 1;
+       unsigned int level;
+
+       if (cpa->force_split)
+               return 1;
+
+       spin_lock(&pgd_lock);
+       /*
+        * Check for races, another CPU might have split this page
+        * up already:
+        */
+       tmp = lookup_address(address, &level);
+       if (tmp != kpte)
+               goto out_unlock;
+
+       switch (level) {
+       case PG_LEVEL_2M:
+               psize = PMD_PAGE_SIZE;
+               pmask = PMD_PAGE_MASK;
+               break;
+#ifdef CONFIG_X86_64
+       case PG_LEVEL_1G:
+               psize = PUD_PAGE_SIZE;
+               pmask = PUD_PAGE_MASK;
+               break;
+#endif
+       default:
+               do_split = -EINVAL;
+               goto out_unlock;
+       }
+
+       /*
+        * Calculate the number of pages, which fit into this large
+        * page starting at address:
+        */
+       nextpage_addr = (address + psize) & pmask;
+       numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+       if (numpages < cpa->numpages)
+               cpa->numpages = numpages;
+
+       /*
+        * We are safe now. Check whether the new pgprot is the same:
+        */
+       old_pte = *kpte;
+       old_prot = new_prot = req_prot = pte_pgprot(old_pte);
+
+       pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+       pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
+
+       /*
+        * old_pte points to the large page base address. So we need
+        * to add the offset of the virtual address:
+        */
+       pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+       cpa->pfn = pfn;
+
+       new_prot = static_protections(req_prot, address, pfn);
+
+       /*
+        * We need to check the full range, whether
+        * static_protections() requires a different pgprot for one of
+        * the pages in the range we try to preserve:
+        */
+       addr = address & pmask;
+       pfn = pte_pfn(old_pte);
+       for (i = 0; i < (psize >> PAGE_SHIFT) && pfn < max_mapnr;
+            i++, addr += PAGE_SIZE, pfn++) {
+               pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+
+               if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+                       goto out_unlock;
+       }
+
+       /*
+        * If there are no changes, return. cpa->numpages has been
+        * updated above:
+        */
+       if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+               do_split = 0;
+               goto out_unlock;
+       }
+
+       /*
+        * We need to change the attributes. Check, whether we can
+        * change the large page in one go. We request a split, when
+        * the address is not aligned or the number of pages is
+        * smaller than the number of pages in the large page. Note
+        * that we limited the number of possible pages already to
+        * the number of pages in the large page.
+        */
+       if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
+               /*
+                * The address is aligned and the number of pages
+                * covers the full page.
+                */
+               new_pte = pfn_pte_ma(__pte_mfn(old_pte), canon_pgprot(new_prot));
+               __set_pmd_pte(kpte, address, level, new_pte);
+               cpa->flags |= CPA_FLUSHTLB;
+               do_split = 0;
+       }
+
+out_unlock:
+       spin_unlock(&pgd_lock);
+
+       return do_split;
+}
+
+/*
+ * Split the large page mapped by @kpte (which covers @address) into a table
+ * of smaller entries that reproduce the original mapping.
+ *
+ * cpa_lock is dropped around the GFP_KERNEL allocation (unless
+ * debug_pagealloc is set) and re-taken before continuing.
+ *
+ * Returns -ENOMEM if no page could be allocated for the new table and 0
+ * otherwise - including the case where the lookup under pgd_lock no longer
+ * returns @kpte, in which case the freshly allocated page is freed again.
+ */
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+       unsigned long mfn, mfninc = 1;
+       unsigned int i, level;
+       pte_t *pbase, *tmp;
+       pgprot_t ref_prot;
+       struct page *base;
+
+       if (!debug_pagealloc)
+               spin_unlock(&cpa_lock);
+       base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+       if (!debug_pagealloc)
+               spin_lock(&cpa_lock);
+       if (!base)
+               return -ENOMEM;
+
+       spin_lock(&pgd_lock);
+       /*
+        * Check for races, another CPU might have split this page
+        * up for us already:
+        */
+       tmp = lookup_address(address, &level);
+       if (tmp != kpte)
+               goto out_unlock;
+
+       pbase = (pte_t *)page_address(base);
+       paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+       ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+       /*
+        * If we ever want to utilize the PAT bit, we need to
+        * update this function to make sure it's converted from
+        * bit 12 to bit 7 when we cross from the 2MB level to
+        * the 4K level:
+        */
+       WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
+
+#ifdef CONFIG_X86_64
+       /* A 1G page is split into PMD-sized entries, which stay large (PSE). */
+       if (level == PG_LEVEL_1G) {
+               mfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+               pgprot_val(ref_prot) |= _PAGE_PSE;
+       }
+#endif
+
+       if (address >= (unsigned long)__va(0) &&
+               address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+               split_page_count(level);
+
+#ifdef CONFIG_X86_64
+       if (address >= (unsigned long)__va(1UL<<32) &&
+               address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
+               split_page_count(level);
+#endif
+
+       /*
+        * Get the target mfn from the original entry:
+        */
+       mfn = __pte_mfn(*kpte);
+       for (i = 0; i < PTRS_PER_PTE; i++, mfn += mfninc)
+               set_pte(&pbase[i], pfn_pte_ma(mfn, ref_prot));
+
+       /*
+        * Install the new, split up pagetable.
+        *
+        * We use the standard kernel pagetable protections for the new
+        * pagetable protections, the actual ptes set above control the
+        * primary protection behavior:
+        */
+       /*
+        * Without the writable-page-tables feature, the page holding the
+        * new table is remapped read-only before it is installed.
+        */
+       if (!xen_feature(XENFEAT_writable_page_tables) &&
+           HYPERVISOR_update_va_mapping((unsigned long)pbase,
+                                        mk_pte(base, PAGE_KERNEL_RO), 0))
+               BUG();
+       __set_pmd_pte(kpte, address, level, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+       /*
+        * Intel Atom errata AAH41 workaround.
+        *
+        * The real fix should be in hw or in a microcode update, but
+        * we also probabilistically try to reduce the window of having
+        * a large TLB mixed with 4K TLBs while instruction fetches are
+        * going on.
+        */
+       __flush_tlb_all();
+
+       /* The page is in use as a page table now; do not free it below. */
+       base = NULL;
+
+out_unlock:
+       /*
+        * If we dropped out via the lookup_address check under
+        * pgd_lock then stick the page back into the pool:
+        */
+       if (base)
+               __free_page(base);
+       spin_unlock(&pgd_lock);
+
+       return 0;
+}
+
+/*
+ * Handle a missing or zero PTE found for @vaddr while processing @cpa.
+ *
+ * Non-primary (alias) requests are silently ignored. For a primary request
+ * inside the kernel identity mapping the hole is accepted and one page is
+ * accounted as processed; anywhere else a warning is issued and -EFAULT is
+ * returned.
+ */
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+                              int primary)
+{
+       /* Only the primary path reports anything. */
+       if (!primary)
+               return 0;
+
+       if (!within(vaddr, PAGE_OFFSET,
+                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+               WARN(1, KERN_WARNING "CPA: called for zero pte. "
+                       "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+                       *cpa->vaddr);
+
+               return -EFAULT;
+       }
+
+       /*
+        * The kernel identity mapping is expected to have holes, so a NULL
+        * PTE is fine here. Record that the request was processed for one
+        * virtual page and its pfn. TBD: numpages can be set based on the
+        * initial value and the level returned by lookup_address().
+        */
+       cpa->numpages = 1;
+       cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+
+       return 0;
+}
+
+/*
+ * Apply cpa->mask_set / cpa->mask_clr to the mapping of the request's
+ * current address (page array entry, address array entry or *cpa->vaddr,
+ * depending on cpa->flags). Highmem pages in a page array are skipped.
+ *
+ * A 4K mapping is rewritten through HYPERVISOR_mmu_update() and
+ * cpa->numpages is set to 1. A large mapping is either handled by
+ * try_preserve_large_page() or split, after which the lookup is repeated.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+       unsigned long address;
+       int do_split, err;
+       unsigned int level;
+       pte_t *kpte, old_pte;
+
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               address = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
+               address = cpa->vaddr[cpa->curpage];
+       else
+               address = *cpa->vaddr;
+repeat:
+       kpte = lookup_address(address, &level);
+       if (!kpte)
+               return __cpa_process_fault(cpa, address, primary);
+
+       old_pte = *kpte;
+       if (!__pte_val(old_pte))
+               return __cpa_process_fault(cpa, address, primary);
+
+       if (level == PG_LEVEL_4K) {
+               pte_t new_pte;
+               pgprot_t new_prot = pte_pgprot(old_pte);
+               unsigned long mfn = __pte_mfn(old_pte);
+
+               pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+               pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+               new_prot = static_protections(new_prot, address,
+                                             mfn_to_local_pfn(mfn));
+
+               /*
+                * We need to keep the mfn from the existing PTE,
+                * after all we're only going to change its attributes
+                * not the memory it points to
+                */
+               new_pte = pfn_pte_ma(mfn, canon_pgprot(new_prot));
+               cpa->pfn = mfn_to_local_pfn(mfn);
+               /*
+                * Do we really change anything ?
+                */
+               if (__pte_val(old_pte) != __pte_val(new_pte)) {
+                       mmu_update_t u;
+
+                       u.ptr = virt_to_machine(kpte);
+                       u.val = __pte_val(new_pte);
+                       WARN_ON_ONCE(arch_use_lazy_mmu_mode());
+                       /*
+                        * Retry the hypercall for as long as it fails with
+                        * -ENOMEM and hypervisor_oom() returns non-zero; any
+                        * other error is returned to the caller.
+                        */
+                       do {
+                               err = HYPERVISOR_mmu_update(&u, 1, NULL,
+                                                           DOMID_SELF);
+                               switch (err) {
+                               case 0:
+                                       break;
+                               case -ENOMEM:
+                                       BUG_ON(!primary);
+                                       BUG_ON(!((pgprot_val(cpa->mask_set) |
+                                                 pgprot_val(cpa->mask_clr)) &
+                                                _PAGE_CACHE_MASK));
+                                       if (hypervisor_oom())
+                                               continue;
+                                       /* fall through */
+                               default:
+                                       return err;
+                               }
+                       } while (err);
+                       cpa->flags |= CPA_FLUSHTLB;
+               }
+               cpa->numpages = 1;
+               return 0;
+       }
+
+       /*
+        * Check, whether we can keep the large page intact
+        * and just change the pte:
+        */
+       do_split = try_preserve_large_page(kpte, address, cpa);
+       /*
+        * When the range fits into the existing large page,
+        * return. cpa->numpages and cpa->flags have been updated in
+        * try_preserve_large_page():
+        */
+       if (do_split <= 0)
+               return do_split;
+
+       /*
+        * We have to split the large page:
+        */
+       err = split_large_page(kpte, address);
+       if (!err) {
+               /*
+                * Do a global flush tlb after splitting the large page
+                * and before we do the actual change page attribute in the PTE.
+                *
+                * Without this, we violate the TLB application note, that says
+                * "The TLBs may contain both ordinary and large-page
+                *  translations for a 4-KByte range of linear addresses. This
+                *  may occur if software modifies the paging structures so that
+                *  the page size used for the address range changes. If the two
+                *  translations differ with respect to page frame or attributes
+                *  (e.g., permissions), processor behavior is undefined and may
+                *  be implementation-specific."
+                *
+                * We do this global tlb flush inside the cpa_lock, so that we
+                * don't allow any other cpu, with stale tlb entries change the
+                * page attribute in parallel, that also falls into the
+                * just split large page entry.
+                */
+               flush_tlb_all();
+               goto repeat;
+       }
+
+       return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+/*
+ * After the primary change, apply the same attribute change to the other
+ * kernel mappings of cpa->pfn: the direct mapping, if the primary address
+ * was outside of it, and (64-bit only) the high kernel mapping.
+ *
+ * Returns 0, or the error from updating the direct-mapping alias; the
+ * result of the high-mapping update is ignored.
+ */
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+       struct cpa_data alias_cpa;
+       unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+       unsigned long vaddr;
+       int ret;
+
+       if (cpa->pfn >= max_pfn_mapped)
+               return 0;
+
+#ifdef CONFIG_X86_64
+       if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+               return 0;
+#endif
+       /*
+        * No need to redo, when the primary call touched the direct
+        * mapping already:
+        */
+       if (cpa->flags & CPA_PAGES_ARRAY) {
+               struct page *page = cpa->pages[cpa->curpage];
+               if (unlikely(PageHighMem(page)))
+                       return 0;
+               vaddr = (unsigned long)page_address(page);
+       } else if (cpa->flags & CPA_ARRAY)
+               vaddr = cpa->vaddr[cpa->curpage];
+       else
+               vaddr = *cpa->vaddr;
+
+       if (!(within(vaddr, PAGE_OFFSET,
+                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
+
+               /* Redo the request as a plain single-address one on laddr. */
+               alias_cpa = *cpa;
+               alias_cpa.vaddr = &laddr;
+               alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+               ret = __change_page_attr_set_clr(&alias_cpa, 0);
+               if (ret)
+                       return ret;
+       }
+
+#ifdef CONFIG_X86_64
+       /*
+        * If the primary call didn't touch the high mapping already
+        * and the physical address is inside the kernel map, we need
+        * to touch the high mapped kernel as well:
+        */
+       if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+           within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+               unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+                                              __START_KERNEL_map;
+               alias_cpa = *cpa;
+               alias_cpa.vaddr = &temp_cpa_vaddr;
+               alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+               /*
+                * The high mapping range is imprecise, so ignore the
+                * return value.
+                */
+               __change_page_attr_set_clr(&alias_cpa, 0);
+       }
+#endif
+
+       return 0;
+}
+
+/*
+ * Walk the range described by @cpa, calling __change_page_attr() for each
+ * step under cpa_lock (not taken when debug_pagealloc is set) and, if
+ * @checkalias is set, cpa_process_alias() afterwards.
+ *
+ * Each step consumes cpa->numpages pages; *cpa->vaddr (or cpa->curpage in
+ * the array modes) is advanced accordingly. Returns 0 or the first error.
+ */
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+       int ret, numpages = cpa->numpages;
+
+       while (numpages) {
+               /*
+                * Store the remaining nr of pages for the large page
+                * preservation check.
+                */
+               cpa->numpages = numpages;
+               /* for array changes, we can't use large page */
+               if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
+                       cpa->numpages = 1;
+
+               if (!debug_pagealloc)
+                       spin_lock(&cpa_lock);
+               ret = __change_page_attr(cpa, checkalias);
+               if (!debug_pagealloc)
+                       spin_unlock(&cpa_lock);
+               if (ret)
+                       return ret;
+
+               if (checkalias) {
+                       ret = cpa_process_alias(cpa);
+                       if (ret)
+                               return ret;
+               }
+
+               /*
+                * Adjust the number of pages with the result of the
+                * CPA operation. Either a large page has been
+                * preserved or a single page update happened.
+                */
+               BUG_ON(cpa->numpages > numpages);
+               numpages -= cpa->numpages;
+               if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+                       cpa->curpage++;
+               else
+                       *cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
+       }
+       return 0;
+}
+
+/* Extract the caching-related bits (PAT, PAT_LARGE, PWT, PCD) from @attr. */
+static inline int cache_attr(pgprot_t attr)
+{
+       return pgprot_val(attr) &
+               (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT | _PAGE_PAT_LARGE);
+}
+
+/*
+ * Common entry point for changing page attributes.
+ *
+ * @addr:        single start address, or an address array with CPA_ARRAY
+ * @numpages:    number of pages (or array entries) to process
+ * @mask_set:    protection bits to set
+ * @mask_clr:    protection bits to clear
+ * @force_split: stored in cpa.force_split; also keeps the call from being
+ *               a no-op when both masks are empty
+ * @in_flag:     CPA_ARRAY, CPA_PAGES_ARRAY or 0
+ * @pages:       page array, used with CPA_PAGES_ARRAY
+ *
+ * Unaligned addresses are rounded down to a page boundary (with a one-time
+ * warning). If anything was changed, caches/TLBs are flushed afterwards.
+ * Returns the result of __change_page_attr_set_clr().
+ */
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
+                                   pgprot_t mask_set, pgprot_t mask_clr,
+                                   int force_split, int in_flag,
+                                   struct page **pages)
+{
+       struct cpa_data cpa;
+       int ret, cache, checkalias;
+       unsigned long baddr = 0;
+
+       /*
+        * Check, if we are requested to change a not supported
+        * feature:
+        */
+       mask_set = canon_pgprot(mask_set);
+       mask_clr = canon_pgprot(mask_clr);
+       if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+               return 0;
+
+       /* Ensure we are PAGE_SIZE aligned */
+       if (in_flag & CPA_ARRAY) {
+               int i;
+               for (i = 0; i < numpages; i++) {
+                       if (addr[i] & ~PAGE_MASK) {
+                               addr[i] &= PAGE_MASK;
+                               WARN_ON_ONCE(1);
+                       }
+               }
+       } else if (!(in_flag & CPA_PAGES_ARRAY)) {
+               /*
+                * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+                * No need to check in that case
+                */
+               if (*addr & ~PAGE_MASK) {
+                       *addr &= PAGE_MASK;
+                       /*
+                        * People should not be passing in unaligned addresses:
+                        */
+                       WARN_ON_ONCE(1);
+               }
+               /*
+                * Save address for cache flush. *addr is modified in the call
+                * to __change_page_attr_set_clr() below.
+                */
+               baddr = *addr;
+       }
+
+       /* Must avoid aliasing mappings in the highmem code */
+       kmap_flush_unused();
+
+       vm_unmap_aliases();
+
+       cpa.vaddr = addr;
+       cpa.pages = pages;
+       cpa.numpages = numpages;
+       cpa.mask_set = mask_set;
+       cpa.mask_clr = mask_clr;
+       cpa.flags = 0;
+       cpa.curpage = 0;
+       cpa.force_split = force_split;
+
+       if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+               cpa.flags |= in_flag;
+
+       /* No alias checking for _NX bit modifications */
+       checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+       ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+       /*
+        * Check whether we really changed something:
+        */
+       if (!(cpa.flags & CPA_FLUSHTLB))
+               goto out;
+
+       /*
+        * No need to flush, when we did not set any of the caching
+        * attributes:
+        */
+       cache = cache_attr(mask_set);
+
+       /*
+        * On success we use clflush, when the CPU supports it to
+        * avoid the wbinvd. If the CPU does not support it and in the
+        * error case we fall back to cpa_flush_all (which uses
+        * wbinvd):
+        */
+       if (!ret && cpu_has_clflush) {
+               if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+                       cpa_flush_array(addr, numpages, cache,
+                                       cpa.flags, pages);
+               } else
+                       cpa_flush_range(baddr, numpages, cache);
+       } else
+               cpa_flush_all(cache);
+
+out:
+       return ret;
+}
+
+/*
+ * Set the protection bits in @mask on @numpages pages. A non-zero @array
+ * means @addr is an array of addresses (CPA_ARRAY) rather than a range start.
+ */
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+                                      pgprot_t mask, int array)
+{
+       int in_flag = array ? CPA_ARRAY : 0;
+
+       return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+                                       in_flag, NULL);
+}
+
+/*
+ * Clear the protection bits in @mask on @numpages pages. A non-zero @array
+ * means @addr is an array of addresses (CPA_ARRAY) rather than a range start.
+ */
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+                                        pgprot_t mask, int array)
+{
+       int in_flag = array ? CPA_ARRAY : 0;
+
+       return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+                                       in_flag, NULL);
+}
+
+/* Set the protection bits in @mask on each of the @numpages entries of @pages. */
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+                                      pgprot_t mask)
+{
+       pgprot_t none = __pgprot(0);
+
+       return change_page_attr_set_clr(NULL, numpages, mask, none, 0,
+                                       CPA_PAGES_ARRAY, pages);
+}
+
+/* Clear the protection bits in @mask on each of the @numpages entries of @pages. */
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+                                        pgprot_t mask)
+{
+       pgprot_t none = __pgprot(0);
+
+       return change_page_attr_set_clr(NULL, numpages, none, mask, 0,
+                                       CPA_PAGES_ARRAY, pages);
+}
+
+#ifdef CONFIG_XEN
+/*
+ * Release the memtype reservations for the pseudo-physical range
+ * [@pstart, @pend).
+ *
+ * The range is translated page by page with phys_to_machine(); every run of
+ * pages whose machine addresses are contiguous is released with a single
+ * free_memtype() call. free_memtype here still names the underlying
+ * function - the redirecting #define only follows this definition.
+ */
+static void _free_memtype(u64 pstart, u64 pend)
+{
+       u64 pa = pstart &= __PHYSICAL_MASK;
+       u64 ma = phys_to_machine(pa);
+
+       while ((pa += PAGE_SIZE) < pend) {
+               /* Machine address no longer contiguous: flush the run so far. */
+               if (phys_to_machine(pa) != ma + (pa - pstart)) {
+                       free_memtype(ma, ma + (pa - pstart));
+                       pstart = pa;
+                       ma = phys_to_machine(pa);
+               }
+       }
+       /* Release the final (or only) run. */
+       free_memtype(ma, ma + (pend - pstart));
+}
+#define free_memtype _free_memtype
+
+/*
+ * Reserve memory type @req_type for the pseudo-physical range
+ * [@pstart, @pend).
+ *
+ * The range is translated page by page with phys_to_machine(); every run of
+ * pages whose machine addresses are contiguous is reserved with a single
+ * reserve_memtype() call. reserve_memtype here still names the underlying
+ * function - the wrapper macro of the same name only follows this
+ * definition.
+ *
+ * Returns 0 on success. On failure the error of the failing reservation is
+ * returned and the runs that had already been reserved are released again.
+ */
+static int _reserve_memtype(u64 pstart, u64 pend, unsigned long req_type)
+{
+       u64 pcur = pstart &= __PHYSICAL_MASK, pa = pcur;
+       u64 ma = phys_to_machine(pa);
+       int rc = 0;
+
+       while ((pa += PAGE_SIZE) < pend) {
+               /* Machine address no longer contiguous: reserve the run so far. */
+               if (phys_to_machine(pa) != ma + (pa - pcur)) {
+                       rc = reserve_memtype(ma, ma + (pa - pcur),
+                                            req_type, NULL);
+                       if (rc)
+                               break;
+                       pcur = pa;
+                       ma = phys_to_machine(pa);
+               }
+       }
+       /* Reserve the final (or only) run. */
+       if (likely(!rc))
+               rc = reserve_memtype(ma, ma + (pend - pcur), req_type, NULL);
+
+       /*
+        * Roll back on failure: [pstart, pcur) covers exactly the runs that
+        * were reserved successfully before the failing one. (This used to
+        * test !rc, which dropped the reservations on success and leaked
+        * them on failure.)
+        */
+       if (unlikely(rc) && pstart < pcur)
+               _free_memtype(pstart, pcur);
+
+       return rc;
+}
+#define reserve_memtype(s, e, r, n) \
+       _reserve_memtype(s, e, BUILD_BUG_ON_ZERO(n) ?: (r))
+#endif
+
+/*
+ * Change @numpages pages starting at @addr to uncached, without doing any
+ * memtype reservation.
+ */
+int _set_memory_uc(unsigned long addr, int numpages)
+{
+       /* for now UC MINUS. see comments in ioremap_nocache() */
+       pgprot_t uc_minus = __pgprot(_PAGE_CACHE_UC_MINUS);
+
+       return change_page_attr_set(&addr, numpages, uc_minus, 0);
+}
+
+/*
+ * Reserve the UC- memory type for @numpages pages starting at @addr and
+ * change their mapping accordingly. The reservation is dropped again if
+ * changing the mapping fails. Returns 0 or a negative error code.
+ */
+int set_memory_uc(unsigned long addr, int numpages)
+{
+       int ret;
+
+       /* for now UC MINUS. see comments in ioremap_nocache() */
+       ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+                           _PAGE_CACHE_UC_MINUS, NULL);
+       if (ret)
+               return ret;
+
+       ret = _set_memory_uc(addr, numpages);
+       if (ret) {
+               /* Mapping change failed: undo the reservation. */
+               free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+/*
+ * Reserve memory type @new_type for each of the @addrinarray single pages
+ * listed in @addr and change their mappings: first to UC-, then - if
+ * @new_type is _PAGE_CACHE_WC - on to WC.
+ *
+ * On any failure the reservations made so far are released and the error
+ * is returned.
+ */
+static int _set_memory_array(unsigned long *addr, int addrinarray,
+               unsigned long new_type)
+{
+       int reserved, k;
+       int ret;
+
+       for (reserved = 0; reserved < addrinarray; reserved++) {
+               ret = reserve_memtype(__pa(addr[reserved]),
+                                     __pa(addr[reserved]) + PAGE_SIZE,
+                                     new_type, NULL);
+               if (ret)
+                       goto out_free;
+       }
+
+       /* for now UC MINUS. see comments in ioremap_nocache() */
+       ret = change_page_attr_set(addr, addrinarray,
+                                   __pgprot(_PAGE_CACHE_UC_MINUS), 1);
+
+       if (!ret && new_type == _PAGE_CACHE_WC)
+               ret = change_page_attr_set_clr(addr, addrinarray,
+                                              __pgprot(_PAGE_CACHE_WC),
+                                              __pgprot(_PAGE_CACHE_MASK),
+                                              0, CPA_ARRAY, NULL);
+       if (!ret)
+               return 0;
+
+out_free:
+       /* Release the first 'reserved' entries, i.e. all that succeeded. */
+       for (k = 0; k < reserved; k++)
+               free_memtype(__pa(addr[k]), __pa(addr[k]) + PAGE_SIZE);
+
+       return ret;
+}
+
+/* Make each of the @addrinarray pages listed in @addr UC-. */
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+       const unsigned long type = _PAGE_CACHE_UC_MINUS;
+
+       return _set_memory_array(addr, addrinarray, type);
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
+/* Make each of the @addrinarray pages listed in @addr write-combining. */
+int set_memory_array_wc(unsigned long *addr, int addrinarray)
+{
+       const unsigned long type = _PAGE_CACHE_WC;
+
+       return _set_memory_array(addr, addrinarray, type);
+}
+EXPORT_SYMBOL(set_memory_array_wc);
+
+/*
+ * Change @numpages pages starting at @addr to write-combining, without
+ * doing any memtype reservation. The range is switched to UC- first and
+ * to WC in a second pass.
+ */
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+       /* The first pass is handed &addr, so keep the start for the second. */
+       unsigned long addr_copy = addr;
+       int ret;
+
+       ret = change_page_attr_set(&addr, numpages,
+                                   __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+       if (ret)
+               return ret;
+
+       return change_page_attr_set_clr(&addr_copy, numpages,
+                                       __pgprot(_PAGE_CACHE_WC),
+                                       __pgprot(_PAGE_CACHE_MASK),
+                                       0, 0, NULL);
+}
+
+/*
+ * Reserve the WC memory type for @numpages pages starting at @addr and
+ * change their mapping accordingly; falls back to set_memory_uc() when PAT
+ * is not enabled. The reservation is dropped again if changing the mapping
+ * fails. Returns 0 or a negative error code.
+ */
+int set_memory_wc(unsigned long addr, int numpages)
+{
+       int ret;
+
+       if (!pat_enabled)
+               return set_memory_uc(addr, numpages);
+
+       ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+               _PAGE_CACHE_WC, NULL);
+       if (ret)
+               return ret;
+
+       ret = _set_memory_wc(addr, numpages);
+       if (ret) {
+               /* Mapping change failed: undo the reservation. */
+               free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+/*
+ * Clear all cache-attribute bits on @numpages pages starting at @addr,
+ * without touching the memtype reservations.
+ */
+int _set_memory_wb(unsigned long addr, int numpages)
+{
+       pgprot_t cache_bits = __pgprot(_PAGE_CACHE_MASK);
+
+       return change_page_attr_clear(&addr, numpages, cache_bits, 0);
+}
+
+/*
+ * Clear the cache-attribute bits on @numpages pages starting at @addr and,
+ * if that succeeded, release the memtype reservation for the range.
+ */
+int set_memory_wb(unsigned long addr, int numpages)
+{
+       int ret = _set_memory_wb(addr, numpages);
+
+       if (!ret)
+               free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+       return ret;
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+/*
+ * Clear the cache-attribute bits on each of the @addrinarray pages listed
+ * in @addr and, if that succeeded, release their memtype reservations.
+ */
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+       int idx;
+       int ret;
+
+       ret = change_page_attr_clear(addr, addrinarray,
+                                     __pgprot(_PAGE_CACHE_MASK), 1);
+       if (!ret) {
+               for (idx = 0; idx < addrinarray; idx++)
+                       free_memtype(__pa(addr[idx]),
+                                    __pa(addr[idx]) + PAGE_SIZE);
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
+/*
+ * Clear the NX bit on @numpages pages starting at @addr. Does nothing and
+ * returns 0 when _PAGE_NX is not in __supported_pte_mask.
+ */
+int set_memory_x(unsigned long addr, int numpages)
+{
+       if (__supported_pte_mask & _PAGE_NX)
+               return change_page_attr_clear(&addr, numpages,
+                                             __pgprot(_PAGE_NX), 0);
+
+       return 0;
+}
+EXPORT_SYMBOL(set_memory_x);
+
+/*
+ * Set the NX bit on @numpages pages starting at @addr. Does nothing and
+ * returns 0 when _PAGE_NX is not in __supported_pte_mask.
+ */
+int set_memory_nx(unsigned long addr, int numpages)
+{
+       if (__supported_pte_mask & _PAGE_NX)
+               return change_page_attr_set(&addr, numpages,
+                                           __pgprot(_PAGE_NX), 0);
+
+       return 0;
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+/* Clear the RW bit on @numpages pages starting at @addr. */
+int set_memory_ro(unsigned long addr, int numpages)
+{
+       pgprot_t rw = __pgprot(_PAGE_RW);
+
+       return change_page_attr_clear(&addr, numpages, rw, 0);
+}
+EXPORT_SYMBOL_GPL(set_memory_ro);
+
+/* Set the RW bit on @numpages pages starting at @addr. */
+int set_memory_rw(unsigned long addr, int numpages)
+{
+       pgprot_t rw = __pgprot(_PAGE_RW);
+
+       return change_page_attr_set(&addr, numpages, rw, 0);
+}
+EXPORT_SYMBOL_GPL(set_memory_rw);
+
+/* Clear the PRESENT bit on @numpages pages starting at @addr. */
+int set_memory_np(unsigned long addr, int numpages)
+{
+       pgprot_t present = __pgprot(_PAGE_PRESENT);
+
+       return change_page_attr_clear(&addr, numpages, present, 0);
+}
+
+/*
+ * Run a change request over @numpages pages starting at @addr that sets and
+ * clears no bits but has force_split set.
+ */
+int set_memory_4k(unsigned long addr, int numpages)
+{
+       pgprot_t none = __pgprot(0);
+
+       return change_page_attr_set_clr(&addr, numpages, none, none,
+                                       1, 0, NULL);
+}
+
+/* set_memory_uc() on the page_address() mapping of @page. */
+int set_pages_uc(struct page *page, int numpages)
+{
+       return set_memory_uc((unsigned long)page_address(page), numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+/*
+ * Reserve memory type @new_type for each non-highmem page in @pages and
+ * change the mappings of the array: first to UC-, then - if @new_type is
+ * _PAGE_CACHE_WC - on to WC.
+ *
+ * Returns 0 on success. On any failure the reservations made so far are
+ * released and -EINVAL is returned.
+ * NOTE(review): the actual error code (from reserve_memtype() or the
+ * attribute change) is discarded in favour of -EINVAL - confirm that callers
+ * depend on this before changing it.
+ */
+static int _set_pages_array(struct page **pages, int addrinarray,
+               unsigned long new_type)
+{
+       unsigned long start;
+       unsigned long end;
+       int i;
+       int free_idx;
+       int ret;
+
+       for (i = 0; i < addrinarray; i++) {
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+               end = start + PAGE_SIZE;
+               if (reserve_memtype(start, end, new_type, NULL))
+                       goto err_out;
+       }
+
+       ret = cpa_set_pages_array(pages, addrinarray,
+                       __pgprot(_PAGE_CACHE_UC_MINUS));
+       if (!ret && new_type == _PAGE_CACHE_WC)
+               ret = change_page_attr_set_clr(NULL, addrinarray,
+                                              __pgprot(_PAGE_CACHE_WC),
+                                              __pgprot(_PAGE_CACHE_MASK),
+                                              0, CPA_PAGES_ARRAY, pages);
+       if (ret)
+               goto err_out;
+       return 0; /* Success */
+err_out:
+       /*
+        * i is the index of the page whose reservation failed, or
+        * addrinarray if the attribute change failed; free everything
+        * reserved before it.
+        */
+       free_idx = i;
+       for (i = 0; i < free_idx; i++) {
+               if (PageHighMem(pages[i]))
+                       continue;
+               start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+               end = start + PAGE_SIZE;
+               free_memtype(start, end);
+       }
+       return -EINVAL;
+}
+
+/* Make each of the @addrinarray pages in @pages UC-. */
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+       const unsigned long type = _PAGE_CACHE_UC_MINUS;
+
+       return _set_pages_array(pages, addrinarray, type);
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
+/* Make each of the @addrinarray pages in @pages write-combining. */
+int set_pages_array_wc(struct page **pages, int addrinarray)
+{
+       const unsigned long type = _PAGE_CACHE_WC;
+
+       return _set_pages_array(pages, addrinarray, type);
+}
+EXPORT_SYMBOL(set_pages_array_wc);
+
+/* set_memory_wb() on the page_address() mapping of @page. */
+int set_pages_wb(struct page *page, int numpages)
+{
+       return set_memory_wb((unsigned long)page_address(page), numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+/*
+ * Clear the cache-attribute bits on each of the @addrinarray pages in
+ * @pages and, if that succeeded, release the memtype reservation of every
+ * non-highmem page.
+ */
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+       int idx;
+       int ret;
+
+       ret = cpa_clear_pages_array(pages, addrinarray,
+                       __pgprot(_PAGE_CACHE_MASK));
+       if (ret)
+               return ret;
+
+       for (idx = 0; idx < addrinarray; idx++) {
+               unsigned long start;
+               unsigned long end;
+
+               if (PageHighMem(pages[idx]))
+                       continue;
+               start = page_to_pfn(pages[idx]) << PAGE_SHIFT;
+               end = start + PAGE_SIZE;
+               free_memtype(start, end);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
+/* set_memory_x() on the page_address() mapping of @page. */
+int set_pages_x(struct page *page, int numpages)
+{
+       return set_memory_x((unsigned long)page_address(page), numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+/* set_memory_nx() on the page_address() mapping of @page. */
+int set_pages_nx(struct page *page, int numpages)
+{
+       return set_memory_nx((unsigned long)page_address(page), numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+/* set_memory_ro() on the page_address() mapping of @page. */
+int set_pages_ro(struct page *page, int numpages)
+{
+       return set_memory_ro((unsigned long)page_address(page), numpages);
+}
+
+/* set_memory_rw() on the page_address() mapping of @page. */
+int set_pages_rw(struct page *page, int numpages)
+{
+       return set_memory_rw((unsigned long)page_address(page), numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+/* Set PRESENT and RW on @numpages pages starting at @page. */
+static int __set_pages_p(struct page *page, int numpages)
+{
+       unsigned long start = (unsigned long) page_address(page);
+       struct cpa_data cpa = {
+               .vaddr    = &start,
+               .numpages = numpages,
+               .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+               .mask_clr = __pgprot(0),
+               .flags    = 0,
+       };
+
+       /*
+        * No alias checking needed for setting present flag. otherwise,
+        * we may need to break large pages for 64-bit kernel text
+        * mappings (this adds to complexity if we want to do this from
+        * atomic context especially). Let's keep it simple!
+        */
+       return __change_page_attr_set_clr(&cpa, 0);
+}
+
+/* Clear PRESENT and RW on @numpages pages starting at @page. */
+static int __set_pages_np(struct page *page, int numpages)
+{
+       unsigned long start = (unsigned long) page_address(page);
+       struct cpa_data cpa = {
+               .vaddr    = &start,
+               .numpages = numpages,
+               .mask_set = __pgprot(0),
+               .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+               .flags    = 0,
+       };
+
+       /*
+        * No alias checking needed for setting not present flag. otherwise,
+        * we may need to break large pages for 64-bit kernel text
+        * mappings (this adds to complexity if we want to do this from
+        * atomic context especially). Let's keep it simple!
+        */
+       return __change_page_attr_set_clr(&cpa, 0);
+}
+
+/*
+ * Map (@enable != 0) or unmap (@enable == 0) @numpages pages starting at
+ * @page in the kernel mapping. Highmem pages are ignored.
+ */
+void kernel_map_pages(struct page *page, int numpages, int enable)
+{
+       if (PageHighMem(page))
+               return;
+
+       /*
+        * The return value is ignored as the calls cannot fail.
+        * Large pages for identity mappings are not used at boot time
+        * and hence no memory allocations during large page split.
+        */
+       if (enable) {
+               __set_pages_p(page, numpages);
+       } else {
+               debug_check_no_locks_freed(page_address(page),
+                                          numpages * PAGE_SIZE);
+               __set_pages_np(page, numpages);
+       }
+
+       /*
+        * We should perform an IPI and flush all tlbs,
+        * but that can deadlock->flush only current cpu:
+        */
+       __flush_tlb_all();
+}
+
+#ifdef CONFIG_HIBERNATION
+
+/*
+ * Report whether @page is currently mapped present in the kernel mapping.
+ * Highmem pages, and pages for which lookup_address() finds no entry, are
+ * reported as not present.
+ */
+bool kernel_page_present(struct page *page)
+{
+       unsigned int level;
+       pte_t *pte;
+
+       if (PageHighMem(page))
+               return false;
+
+       pte = lookup_address((unsigned long)page_address(page), &level);
+       /*
+        * lookup_address() may return NULL (the other callers in this file
+        * check for it); treat that as "not present" rather than
+        * dereferencing it.
+        */
+       if (!pte)
+               return false;
+
+       return (__pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+/*
+ * Non-zero if @va lies in the range for which __make_page_readonly() and
+ * __make_page_writable() additionally process the direct-mapping alias of
+ * the page: the vmalloc area on 64-bit, everything at or above high_memory
+ * on 32-bit.
+ */
+static inline int in_secondary_range(unsigned long va)
+{
+#ifdef CONFIG_X86_64
+       return va >= VMALLOC_START && va < VMALLOC_END;
+#else
+       return va >= (unsigned long)high_memory;
+#endif
+}
+
+/*
+ * Write-protect the mapping of @va through HYPERVISOR_update_va_mapping().
+ * The address must be mapped by a 4K PTE; anything else, or a failing
+ * hypercall, is a BUG.
+ *
+ * If @va is in the secondary range, the page's direct-mapping alias is made
+ * read-only as well; with CONFIG_HIGHMEM, for pfns at or above
+ * highstart_pfn the unused kmaps are flushed instead.
+ */
+static void __make_page_readonly(unsigned long va)
+{
+       pte_t *pte;
+       unsigned int level;
+
+       pte = lookup_address(va, &level);
+       BUG_ON(!pte || level != PG_LEVEL_4K);
+       if (HYPERVISOR_update_va_mapping(va, pte_wrprotect(*pte), 0))
+               BUG();
+       if (in_secondary_range(va)) {
+               unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+               if (pfn >= highstart_pfn)
+                       kmap_flush_unused(); /* flush stale writable kmaps */
+               else
+#endif
+                       __make_page_readonly((unsigned long)__va(pfn << PAGE_SHIFT));
+       }
+}
+
+/*
+ * Make the mapping of @va writable through HYPERVISOR_update_va_mapping()
+ * (with UVMF_INVLPG). The address must be mapped by a 4K PTE; anything
+ * else, or a failing hypercall, is a BUG.
+ *
+ * If @va is in the secondary range, the page's direct-mapping alias is made
+ * writable as well; with CONFIG_HIGHMEM this is skipped for pfns at or
+ * above highstart_pfn.
+ */
+static void __make_page_writable(unsigned long va)
+{
+       pte_t *pte;
+       unsigned int level;
+
+       pte = lookup_address(va, &level);
+       BUG_ON(!pte || level != PG_LEVEL_4K);
+       if (HYPERVISOR_update_va_mapping(va, pte_mkwrite(*pte), UVMF_INVLPG))
+               BUG();
+       if (in_secondary_range(va)) {
+               unsigned long pfn = pte_pfn(*pte);
+
+#ifdef CONFIG_HIGHMEM
+               if (pfn < highstart_pfn)
+#endif
+                       __make_page_writable((unsigned long)__va(pfn << PAGE_SHIFT));
+       }
+}
+
+/* Write-protect the page at @va unless Xen feature @feature is available. */
+void make_page_readonly(void *va, unsigned int feature)
+{
+       if (xen_feature(feature))
+               return;
+
+       __make_page_readonly((unsigned long)va);
+}
+
+/* Make the page at @va writable unless Xen feature @feature is available. */
+void make_page_writable(void *va, unsigned int feature)
+{
+       if (xen_feature(feature))
+               return;
+
+       __make_page_writable((unsigned long)va);
+}
+
+/*
+ * Write-protect @nr consecutive pages starting at @va unless Xen feature
+ * @feature is available.
+ */
+void make_pages_readonly(void *va, unsigned int nr, unsigned int feature)
+{
+       unsigned long addr = (unsigned long)va;
+
+       if (xen_feature(feature))
+               return;
+
+       while (nr--) {
+               __make_page_readonly(addr);
+               addr += PAGE_SIZE;
+       }
+}
+
+/*
+ * Make @nr consecutive pages starting at @va writable unless Xen feature
+ * @feature is available.
+ */
+void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
+{
+       unsigned long addr = (unsigned long)va;
+
+       if (xen_feature(feature))
+               return;
+
+       while (nr--) {
+               __make_page_writable(addr);
+               addr += PAGE_SIZE;
+       }
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pat-xen.c b/arch/x86/mm/pat-xen.c

new file mode 100644 (file)

index 0000000..6207e52
--- /dev/null
+++ b/arch/x86/mm/pat-xen.c
@@ -0,0 +1,840 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rbtree.h>
+
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+#include <asm/pgtable.h>
+#include <asm/fcntl.h>
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/io.h>
+
+#include "pat_internal.h"
+
+#ifdef CONFIG_X86_PAT
+/* Non-zero while PAT handling is enabled; cleared by pat_disable(). */
+int __read_mostly pat_enabled = 1;
+
+/* Turn PAT handling off and log @reason at KERN_INFO. */
+static inline void pat_disable(const char *reason)
+{
+       pat_enabled = 0;
+       printk(KERN_INFO "%s\n", reason);
+}
+
+/* "nopat" early parameter handler: disables PAT; @str is not used. */
+static int __init nopat(char *str)
+{
+       pat_disable("PAT support disabled.");
+       return 0;
+}
+early_param("nopat", nopat);
+#else
+/* Stub for !CONFIG_X86_PAT builds: @reason is ignored. */
+static inline void pat_disable(const char *reason)
+{
+       (void)reason;
+}
+#endif
+
+
+int pat_debug_enable;
+
+/*
+ * "debugpat" boot parameter handler: sets pat_debug_enable.  @str is not
+ * used.
+ *
+ * A __setup() handler returns 1 once it has consumed its option; returning
+ * 0 reports the option as unhandled, so "debugpat" would be treated as an
+ * unknown parameter and handed on to init.
+ */
+static int __init pat_debug_setup(char *str)
+{
+       pat_debug_enable = 1;
+       return 1;
+}
+__setup("debugpat", pat_debug_setup);
+
+static u64 __read_mostly boot_pat_state;
+
+/* Memory type encodings, pasted into PAT entries by the PAT() macro. */
+enum {
+       PAT_UC = 0,             /* uncached */
+       PAT_WC = 1,             /* Write combining */
+       PAT_WT = 4,             /* Write Through */
+       PAT_WP = 5,             /* Write Protected */
+       PAT_WB = 6,             /* Write Back (default) */
+       PAT_UC_MINUS = 7,       /* UC, but can be overridden by MTRR */
+};
+
+#define PAT(x, y)      ((u64)PAT_ ## y << ((x)*8))
+
+/*
+ * Per-CPU PAT setup.
+ *
+ * Returns immediately when PAT is disabled.  A CPU without PAT support
+ * disables PAT if no boot state has been recorded yet (boot_pat_state is
+ * still zero), and is fatal (BUG) otherwise.  Without CONFIG_XEN the PAT MSR
+ * is programmed with the Linux layout; with CONFIG_XEN the MSR is only read
+ * and its value recorded in boot_pat_state.
+ */
+void pat_init(void)
+{
+       u64 pat;
+       /* boot_pat_state is still zero the first time through */
+       bool boot_cpu = !boot_pat_state;
+
+       if (!pat_enabled)
+               return;
+
+       if (!cpu_has_pat) {
+               if (!boot_pat_state) {
+                       pat_disable("PAT not supported by CPU.");
+                       return;
+               } else {
+                       /*
+                        * If this happens we are on a secondary CPU, but
+                        * switched to PAT on the boot CPU. We have no way to
+                        * undo PAT.
+                        */
+                       printk(KERN_ERR "PAT enabled, "
+                              "but not supported by secondary CPU\n");
+                       BUG();
+               }
+       }
+
+#ifndef CONFIG_XEN
+       /* Set PWT to Write-Combining. All other bits stay the same */
+       /*
+        * PTE encoding used in Linux:
+        *      PAT
+        *      |PCD
+        *      ||PWT
+        *      |||
+        *      000 WB          _PAGE_CACHE_WB
+        *      001 WC          _PAGE_CACHE_WC
+        *      010 UC-         _PAGE_CACHE_UC_MINUS
+        *      011 UC          _PAGE_CACHE_UC
+        * PAT bit unused
+        */
+       pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+             PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+
+       /* Boot CPU check */
+       if (!boot_pat_state)
+               rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+
+       wrmsrl(MSR_IA32_CR_PAT, pat);
+#else
+       /*
+        * PAT settings are part of the hypervisor interface, and their
+        * assignment cannot be changed.
+        */
+       rdmsrl(MSR_IA32_CR_PAT, pat);
+       if (!boot_pat_state)
+               boot_pat_state = pat;
+#endif
+
+       /* Report the old and new PAT values once, from the first CPU only. */
+       if (boot_cpu)
+               printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+                      smp_processor_id(), boot_pat_state, pat);
+}
+
+#undef PAT
+
+static DEFINE_SPINLOCK(memtype_lock);  /* protects memtype accesses */
+
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end);
+/*
+ * Xen replacement for mtrr_type_lookup().
+ *
+ * The initial domain defers to the real MTRR lookup.  Any other domain gets
+ * a synthesized answer: a range consisting entirely of RAM is reported as
+ * write-back, everything else (non-RAM or mixed) as uncachable.
+ *
+ * RAM must be reported as MTRR_TYPE_WRBACK: pat_x_mtrr_type() only keeps a
+ * WB request when the lookup yields exactly that value, so any other answer
+ * for RAM would downgrade every WB request to UC-.
+ */
+static inline u8 _mtrr_type_lookup(u64 start, u64 end)
+{
+       if (is_initial_xendomain())
+               return mtrr_type_lookup(start, end);
+       return pat_pagerange_is_ram(start, end) > 0
+              ? MTRR_TYPE_WRBACK : MTRR_TYPE_UNCACHABLE;
+}
+#define mtrr_type_lookup _mtrr_type_lookup
+
+/*
+ * Intersect the requested PAT memory type with the MTRR memory type and
+ * return the result as a PAT type (PAT and MTRR do not share encodings).
+ * The intersection follows the "Effective Memory Type" tables in the IA-32
+ * SDM vol 3a.
+ *
+ * Only a WB request consults the MTRR hint: it stays WB when the range is
+ * MTRR write-back and becomes UC- otherwise.  Every other request type is
+ * returned unchanged.
+ */
+static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
+{
+       if (req_type != _PAGE_CACHE_WB)
+               return req_type;
+
+       if (mtrr_type_lookup(start, end) == MTRR_TYPE_WRBACK)
+               return _PAGE_CACHE_WB;
+
+       return _PAGE_CACHE_UC_MINUS;
+}
+
+/*
+ * Classify the page frames in [start, end).
+ *
+ * Returns 1 if every frame counts as RAM, 0 if none does (including an
+ * empty range), and -1 as soon as both kinds have been seen.
+ */
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
+{
+       int ram_page = 0, not_rampage = 0;
+       unsigned long page_nr;
+
+       for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
+            ++page_nr) {
+               /*
+                * For legacy reasons, physical address range in the legacy ISA
+                * region is tracked as non-RAM. This will allow users of
+                * /dev/mem to map portions of legacy ISA region, even when
+                * some of those portions are listed(or not even listed) with
+                * different e820 types(RAM/reserved/..)
+                */
+               if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
+                   page_is_ram(mfn_to_local_pfn(page_nr)))
+                       ram_page = 1;
+               else
+                       not_rampage = 1;
+
+               /* Each pass sets one flag, so equal flags mean both are set. */
+               if (ram_page == not_rampage)
+                       return -1;
+       }
+
+       return ram_page;
+}
+
+/*
+ * For RAM pages, we use page flags to mark the pages with the appropriate
+ * type.  This is done in two passes:
+ * - Find the memtype of all the pages in the range, looking for conflicts
+ * - If there are no conflicts, set the new memtype for pages in the range
+ *
+ * A request for _PAGE_CACHE_UC warns once and is treated as UC-.
+ *
+ * Returns 0 on success with *new_type (if non-NULL) set to the type applied,
+ * or -EBUSY if a page already carries a memtype, in which case *new_type
+ * (if non-NULL) is set to that page's existing type.
+ */
+static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
+                                 unsigned long *new_type)
+{
+       struct page *page;
+       unsigned long mfn;
+
+       if (req_type == _PAGE_CACHE_UC) {
+               /* We do not support strong UC */
+               WARN_ON_ONCE(1);
+               req_type = _PAGE_CACHE_UC_MINUS;
+       }
+
+       /* Pass 1: fail on the first page that already has a memtype. */
+       for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) {
+               unsigned long type, pfn = mfn_to_local_pfn(mfn);
+
+               BUG_ON(!pfn_valid(pfn));
+               page = pfn_to_page(pfn);
+               type = get_page_memtype(page);
+               if (type != -1) {
+                       printk(KERN_INFO "reserve_ram_pages_type failed "
+                               "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
+                               start, end, type, req_type);
+                       if (new_type)
+                               *new_type = type;
+
+                       return -EBUSY;
+               }
+       }
+
+       if (new_type)
+               *new_type = req_type;
+
+       /* Pass 2: no conflicts were found, so tag every page. */
+       for (mfn = (start >> PAGE_SHIFT); mfn < (end >> PAGE_SHIFT); ++mfn) {
+               page = pfn_to_page(mfn_to_local_pfn(mfn));
+               set_page_memtype(page, req_type);
+       }
+       return 0;
+}
+
+/*
+ * Reset the memtype of every RAM page in [start, end) to -1.  BUGs on a
+ * frame whose local pfn is not valid.  Always returns 0.
+ */
+static int free_ram_pages_type(u64 start, u64 end)
+{
+       const u64 stop = end >> PAGE_SHIFT;
+       unsigned long mfn = start >> PAGE_SHIFT;
+
+       for (; mfn < stop; ++mfn) {
+               unsigned long pfn = mfn_to_local_pfn(mfn);
+
+               BUG_ON(!pfn_valid(pfn));
+               set_page_memtype(pfn_to_page(pfn), -1);
+       }
+
+       return 0;
+}
+
+/*
+ * Reserve the memory type for the physical range [start, end).
+ *
+ * req_type typically is one of:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * If new_type is NULL, the function returns an error if it cannot reserve
+ * the region with req_type. If new_type is non-NULL, the function returns
+ * the available type in new_type in case of no error. In case of any error
+ * it returns a negative value.
+ *
+ * Returns 0 on success, -EINVAL for a range mixing RAM and non-RAM, -ENOMEM
+ * if the tracking entry cannot be allocated, or the error reported by
+ * reserve_ram_pages_type() / rbt_memtype_check_insert().
+ */
+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
+                   unsigned long *new_type)
+{
+       struct memtype *new;
+       unsigned long actual_type;
+       int is_range_ram;
+       int err = 0;
+
+       BUG_ON(start >= end); /* end is exclusive */
+
+       if (!pat_enabled) {
+               /* This is identical to page table setting without PAT */
+               if (new_type) {
+                       if (req_type == _PAGE_CACHE_WC)
+                               *new_type = _PAGE_CACHE_UC_MINUS;
+                       else
+                               *new_type = req_type & _PAGE_CACHE_MASK;
+               }
+               return 0;
+       }
+
+       /* Low ISA region is always mapped WB in page table. No need to track */
+       if (x86_platform.is_untracked_pat_range(start, end)) {
+               if (new_type)
+                       *new_type = _PAGE_CACHE_WB;
+               return 0;
+       }
+
+       /*
+        * Call mtrr_lookup to get the type hint. This is an
+        * optimization for /dev/mem mmap'ers into WB memory (BIOS
+        * tools and ACPI tools). Use WB request for WB memory and use
+        * UC_MINUS otherwise.
+        */
+       actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
+
+       if (new_type)
+               *new_type = actual_type;
+
+       /* RAM is tracked in page flags; note it is passed req_type as given. */
+       is_range_ram = pat_pagerange_is_ram(start, end);
+       if (is_range_ram == 1) {
+
+               err = reserve_ram_pages_type(start, end, req_type, new_type);
+
+               return err;
+       } else if (is_range_ram < 0) {
+               return -EINVAL;
+       }
+
+       /* Non-RAM: record the range in the memtype rb-tree. */
+       new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       new->start      = start;
+       new->end        = end;
+       new->type       = actual_type;
+
+       spin_lock(&memtype_lock);
+
+       err = rbt_memtype_check_insert(new, new_type);
+       if (err) {
+               printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
+                      "track %s, req %s\n",
+                      start, end, cattr_name(new->type), cattr_name(req_type));
+               kfree(new);
+               spin_unlock(&memtype_lock);
+
+               return err;
+       }
+
+       spin_unlock(&memtype_lock);
+
+       dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+               start, end, cattr_name(new->type), cattr_name(req_type),
+               new_type ? cattr_name(*new_type) : "-");
+
+       return err;
+}
+
+/*
+ * Release the memory type reservation for the physical range [start, end).
+ *
+ * Returns 0 when PAT is disabled, when the range is untracked, or when the
+ * reservation was released; -EINVAL for a range mixing RAM and non-RAM or
+ * when no matching entry is found in the memtype rb-tree.
+ */
+int free_memtype(u64 start, u64 end)
+{
+       int err = -EINVAL;
+       int is_range_ram;
+       struct memtype *entry;
+
+       if (!pat_enabled)
+               return 0;
+
+       /* Low ISA region is always mapped WB. No need to track */
+       if (x86_platform.is_untracked_pat_range(start, end))
+               return 0;
+
+       /* RAM reservations live in page flags rather than in the rb-tree. */
+       is_range_ram = pat_pagerange_is_ram(start, end);
+       if (is_range_ram == 1) {
+
+               err = free_ram_pages_type(start, end);
+
+               return err;
+       } else if (is_range_ram < 0) {
+               return -EINVAL;
+       }
+
+       spin_lock(&memtype_lock);
+       entry = rbt_memtype_erase(start, end);
+       spin_unlock(&memtype_lock);
+
+       if (!entry) {
+               printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+                       current->comm, current->pid, start, end);
+               return -EINVAL;
+       }
+
+       kfree(entry);
+
+       dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+
+       return 0;
+}
+
+
+#ifndef CONFIG_XEN
+/**
+ * lookup_memtype - Look up the memory type for a physical address
+ * @paddr: physical address whose memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Untracked ranges are reported as WB.  RAM pages report the type stored in
+ * their page flags (WB when none is set).  Other addresses report the type
+ * of their memtype rb-tree entry, or UC- when there is no entry.
+ *
+ * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
+ * _PAGE_CACHE_UC
+ */
+static unsigned long lookup_memtype(u64 paddr)
+{
+       int rettype = _PAGE_CACHE_WB;
+       struct memtype *entry;
+
+       if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
+               return rettype;
+
+       if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+               struct page *page;
+               page = pfn_to_page(paddr >> PAGE_SHIFT);
+               rettype = get_page_memtype(page);
+               /*
+                * -1 from get_page_memtype() implies RAM page is in its
+                * default state and not reserved, and hence of type WB
+                */
+               if (rettype == -1)
+                       rettype = _PAGE_CACHE_WB;
+
+               return rettype;
+       }
+
+       spin_lock(&memtype_lock);
+
+       entry = rbt_memtype_lookup(paddr);
+       if (entry != NULL)
+               rettype = entry->type;
+       else
+               rettype = _PAGE_CACHE_UC_MINUS;
+
+       spin_unlock(&memtype_lock);
+       return rettype;
+}
+#endif
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero: the error from reserve_memtype(), or -EBUSY
+ * when the type obtained is not allowed or the kernel mapping cannot be
+ * synced (the reservation is released again in those two cases).
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+                       unsigned long *type)
+{
+       resource_size_t size = end - start;
+       unsigned long req_type = *type;
+       unsigned long new_type;
+       int ret;
+
+       WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+       ret = reserve_memtype(start, end, req_type, &new_type);
+       if (ret)
+               goto out_err;
+
+       if (!is_new_memtype_allowed(start, size, req_type, new_type))
+               goto out_free;
+
+       if (kernel_map_sync_memtype(start, size, new_type) < 0)
+               goto out_free;
+
+       *type = new_type;
+       return 0;
+
+out_free:
+       /* Undo the reservation made above before reporting failure. */
+       free_memtype(start, end);
+       ret = -EBUSY;
+out_err:
+       return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ *
+ * Thin wrapper around free_memtype(); its return value is discarded.
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+       free_memtype(start, end);
+}
+
+/*
+ * Returns @vma_prot unchanged; @file, @mfn and @size are not used here.
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long mfn,
+                               unsigned long size, pgprot_t vma_prot)
+{
+       return vma_prot;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+/* With STRICT_DEVMEM this check is already done in drivers/char/mem.c */
+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
+{
+       return 1;
+}
+#else
+/*
+ * This check is needed to avoid cache aliasing when PAT is enabled.
+ * Returns 1 if every page of the @size bytes starting at frame @mfn passes
+ * devmem_is_allowed() (or if PAT is disabled), 0 otherwise.
+ */
+static inline int range_is_allowed(unsigned long mfn, unsigned long size)
+{
+       const u64 from = ((u64)mfn) << PAGE_SHIFT;
+       const u64 to = from + size;
+       u64 cursor;
+
+       if (!pat_enabled)
+               return 1;
+
+       for (cursor = from; cursor < to; cursor += PAGE_SIZE, mfn++) {
+               if (devmem_is_allowed(mfn))
+                       continue;
+
+               printk(KERN_INFO
+               "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+                       current->comm, from, to);
+               return 0;
+       }
+
+       return 1;
+}
+#endif /* CONFIG_STRICT_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long mfn,
+                               unsigned long size, pgprot_t *vma_prot)
+{
+       unsigned long flags = _PAGE_CACHE_WB;
+
+       if (!range_is_allowed(mfn, size))
+               return 0;
+
+       if (file->f_flags & O_DSYNC)
+               flags = _PAGE_CACHE_UC_MINUS;
+
+#ifndef CONFIG_X86_32
+#ifndef CONFIG_XEN /* Xen sets correct MTRR type on non-RAM for us. */
+       /*
+        * On the PPro and successors, the MTRRs are used to set
+        * memory types for physical addresses outside main memory,
+        * so blindly setting UC or PWT on those pages is wrong.
+        * For Pentiums and earlier, the surround logic should disable
+        * caching for the high addresses through the KEN pin, but
+        * we maintain the tradition of paranoia in this code.
+        */
+       if (!pat_enabled &&
+           !(boot_cpu_has(X86_FEATURE_MTRR) ||
+             boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+             boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+             boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+           (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+               flags = _PAGE_CACHE_UC;
+       }
+#endif
+#endif
+
+       *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+                            flags);
+       return 1;
+}
+
+/*
+ * Change the memory type for the physical address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ *
+ * Passes the frame number of @ma, @size and @flags to
+ * ioremap_check_change_attr() and returns its result.
+ */
+int kernel_map_sync_memtype(u64 ma, unsigned long size, unsigned long flags)
+{
+       return ioremap_check_change_attr(ma >> PAGE_SHIFT, size, flags);
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserves non-RAM regions only; after a successful reserve_memtype, this
+ * function also keeps the identity mapping (if any) in sync with the new
+ * prot.
+ *
+ * For a RAM range nothing is reserved: the cache bits of *@vma_prot are
+ * overwritten with the type found for the first page when they differ.
+ *
+ * With @strict_prot set, getting a type other than the requested one is an
+ * error.  Returns 0 on success, -EINVAL on a type mismatch that is not
+ * allowed or when the kernel mapping cannot be synced, or the error from
+ * reserve_memtype().
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+                               int strict_prot)
+{
+       int is_ram = 0;
+       int ret;
+       unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
+       unsigned long flags = want_flags;
+
+       is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+
+       /*
+        * reserve_pfn_range() for RAM pages. We do not refcount to keep
+        * track of number of mappings of RAM pages. We can assert that
+        * the type requested matches the type of first page in the range.
+        */
+       if (is_ram) {
+               if (!pat_enabled)
+                       return 0;
+
+               flags = lookup_memtype(paddr);
+               if (want_flags != flags) {
+                       printk(KERN_WARNING
+                       "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
+                               current->comm, current->pid,
+                               cattr_name(want_flags),
+                               (unsigned long long)paddr,
+                               (unsigned long long)(paddr + size),
+                               cattr_name(flags));
+                       *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+                                             (~_PAGE_CACHE_MASK)) |
+                                            flags);
+               }
+               return 0;
+       }
+
+       ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
+       if (ret)
+               return ret;
+
+       if (flags != want_flags) {
+               if (strict_prot ||
+                   !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
+                       free_memtype(paddr, paddr + size);
+                       printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
+                               " for %Lx-%Lx, got %s\n",
+                               current->comm, current->pid,
+                               cattr_name(want_flags),
+                               (unsigned long long)paddr,
+                               (unsigned long long)(paddr + size),
+                               cattr_name(flags));
+                       return -EINVAL;
+               }
+               /*
+                * We allow returning different type than the one requested in
+                * non strict case.
+                */
+               *vma_prot = __pgprot((pgprot_val(*vma_prot) &
+                                     (~_PAGE_CACHE_MASK)) |
+                                    flags);
+       }
+
+       /* Drop the reservation again if the kernel mapping cannot follow. */
+       if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
+               free_memtype(paddr, paddr + size);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Only a range that contains no RAM at all is released; RAM and mixed
+ * ranges are left untouched.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+       if (pat_pagerange_is_ram(paddr, paddr + size) == 0)
+               free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_vma_copy is called when a vma covering a pfnmap gets copied
+ * through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, the starting
+ * address and protection are taken from the pte and the whole vma range is
+ * reserved with a single strict reserve_pfn_range call.  Any other vma is
+ * left alone and 0 is returned.
+ */
+int track_pfn_vma_copy(struct vm_area_struct *vma)
+{
+       resource_size_t phys;
+       unsigned long prot_bits;
+       pgprot_t prot;
+
+       if (!is_linear_pfn_mapping(vma))
+               return 0;
+
+       if (follow_phys(vma, vma->vm_start, 0, &prot_bits, &phys)) {
+               WARN_ON_ONCE(1);
+               return -EINVAL;
+       }
+
+       prot = __pgprot(prot_bits);
+       return reserve_pfn_range(phys, vma->vm_end - vma->vm_start, &prot, 1);
+}
+
+/*
+ * track_pfn_vma_new is called when a _new_ pfn mapping is being established
+ * for physical range indicated by pfn and size.
+ *
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ *
+ * Otherwise, when PAT is enabled, *prot is set to vma->vm_page_prot with its
+ * cache bits replaced by the type looked up for @pfn.  @size is not used in
+ * this function.
+ */
+int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
+                       unsigned long pfn, unsigned long size)
+{
+       unsigned long flags;
+       resource_size_t paddr;
+       unsigned long vma_size = vma->vm_end - vma->vm_start;
+
+       if (is_linear_pfn_mapping(vma)) {
+               /* reserve the whole chunk starting from vm_pgoff */
+               paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
+               return reserve_pfn_range(paddr, vma_size, prot, 0);
+       }
+
+       if (!pat_enabled)
+               return 0;
+
+       /* for vm_insert_pfn and friends, we set prot based on lookup */
+       flags = lookup_memtype(pfn << PAGE_SHIFT);
+       *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+                        flags);
+
+       return 0;
+}
+
+/*
+ * untrack_pfn_vma is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case size can be zero).
+ *
+ * Only a linear pfn mapping is acted on: the whole chunk starting at
+ * vm_pgoff is freed.  @pfn and @size are not used in this function.
+ */
+void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
+                       unsigned long size)
+{
+       if (!is_linear_pfn_mapping(vma))
+               return;
+
+       free_pfn_range((resource_size_t)vma->vm_pgoff << PAGE_SHIFT,
+                      vma->vm_end - vma->vm_start);
+}
+#endif /* CONFIG_XEN */
+
+/*
+ * Return @prot with the write-combining cache attribute ORed in when PAT is
+ * enabled; without PAT, return the non-cached variant of @prot instead.
+ */
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+       if (!pat_enabled)
+               return pgprot_noncached(prot);
+
+       return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+
+/*
+ * Allocate a memtype and fill it, under memtype_lock, with a copy of the
+ * entry at position @pos.  Returns the copy, which the caller must kfree,
+ * or NULL if the allocation fails or the copy reports an error.
+ */
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+       struct memtype *copy;
+       int err;
+
+       copy = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+       if (!copy)
+               return NULL;
+
+       spin_lock(&memtype_lock);
+       err = rbt_memtype_copy_nth_element(copy, pos);
+       spin_unlock(&memtype_lock);
+
+       if (err) {
+               kfree(copy);
+               return NULL;
+       }
+
+       return copy;
+}
+
+/*
+ * seq_file ->start: at position 0, emit the list header and advance *pos to
+ * 1; then return a copy of the entry at *pos (NULL if there is none).
+ */
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+       if (*pos == 0) {
+               ++*pos;
+               seq_printf(seq, "PAT memtype list:\n");
+       }
+
+       return memtype_get_idx(*pos);
+}
+
+/* seq_file ->next: advance *pos and return a copy of the entry found there. */
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       ++*pos;
+       return memtype_get_idx(*pos);
+}
+
+/* seq_file ->stop: intentionally empty. */
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * seq_file ->show: print one entry as "<type> @ 0x<start>-0x<end>" and then
+ * kfree the entry passed in as @v.  Always returns 0.
+ */
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+       struct memtype *print_entry = (struct memtype *)v;
+
+       seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+                       print_entry->start, print_entry->end);
+       kfree(print_entry);
+
+       return 0;
+}
+
+/* seq_file iterator callbacks for the PAT memtype list. */
+static const struct seq_operations memtype_seq_ops = {
+       .start = memtype_seq_start,
+       .next  = memtype_seq_next,
+       .stop  = memtype_seq_stop,
+       .show  = memtype_seq_show,
+};
+
+/* ->open: attach the memtype seq_file iterator to @file. */
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &memtype_seq_ops);
+}
+
+/* File operations for the "pat_memtype_list" debugfs file. */
+static const struct file_operations memtype_fops = {
+       .open    = memtype_seq_open,
+       .read    = seq_read,
+       .llseek  = seq_lseek,
+       .release = seq_release,
+};
+
+/*
+ * Late initcall: when PAT is enabled, create the owner-readable
+ * "pat_memtype_list" file under arch_debugfs_dir.  Always returns 0.
+ */
+static int __init pat_memtype_list_init(void)
+{
+       if (!pat_enabled)
+               return 0;
+
+       debugfs_create_file("pat_memtype_list", S_IRUSR,
+                           arch_debugfs_dir, NULL, &memtype_fops);
+       return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h

index 77e5ba1..dbed48d 100644 (file)
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -21,6 +21,10 @@ static inline char *cattr_name(unsigned long flags)
         case _PAGE_CACHE_UC_MINUS:      return "uncached-minus";
         case _PAGE_CACHE_WB:            return "write-back";
         case _PAGE_CACHE_WC:            return "write-combining";
+#ifdef CONFIG_XEN
+       case _PAGE_CACHE_WP:            return "write-protected";
+       case _PAGE_CACHE_WT:            return "write-through";
+#endif
         default:                        return "broken";
         }
  }
diff --git a/arch/x86/mm/pgtable-xen.c b/arch/x86/mm/pgtable-xen.c

new file mode 100644 (file)

index 0000000..682c44f
--- /dev/null
+++ b/arch/x86/mm/pgtable-xen.c
@@ -0,0 +1,970 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <xen/features.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/fixmap.h>
+#include <asm/hypervisor.h>
+#include <asm/mmu_context.h>
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+       pte_t *pte = (pte_t *)__get_free_page(PGALLOC_GFP);
+       if (pte)
+               make_lowmem_page_readonly(pte, XENFEAT_writable_page_tables);
+       return pte;
+}
+
+static void _pte_free(struct page *page, unsigned int order)
+{
+       BUG_ON(order);
+       __pte_free(page);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *page = alloc_pages(__userpte_alloc_gfp, 0);
+
+       if (!page)
+               return NULL;
+       /* Run the ctor, tag the page with _pte_free, then reset its count. */
+       pgtable_page_ctor(page);
+       SetPageForeign(page, _pte_free);
+       init_page_count(page);
+       return page;
+}
+
+static int __init setup_userpte(char *arg)
+{
+       /*
+        * The only value understood is "userpte=nohigh", which stops
+        * user pagetables from being allocated in high memory.
+        */
+       bool nohigh = arg && strcmp(arg, "nohigh") == 0;
+
+       /* A missing argument or any other value is rejected. */
+       if (!nohigh)
+               return -EINVAL;
+
+       __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+       return 0;
+}
+early_param("userpte", setup_userpte);
+
+void __pte_free(pgtable_t pte)
+{
+       if (!PageHighMem(pte)) {
+               if (PagePinned(pte)) {
+                       unsigned long pfn = page_to_pfn(pte);
+
+                       if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
+                                                        pfn_pte(pfn,
+                                                                PAGE_KERNEL),
+                                                        0))
+                               BUG();
+                       ClearPagePinned(pte);
+               }
+       } else
+#ifdef CONFIG_HIGHPTE
+               ClearPagePinned(pte);
+#else
+               BUG();
+#endif
+
+       ClearPageForeign(pte);
+       init_page_count(pte);
+       pgtable_page_dtor(pte);
+       __free_page(pte);
+}
+
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+       pgtable_page_dtor(pte);
+       paravirt_release_pte(page_to_pfn(pte));
+       tlb_remove_page(tlb, pte);
+}
+
+#if PAGETABLE_LEVELS > 2
+static void _pmd_free(struct page *page, unsigned int order)
+{
+       BUG_ON(order);
+       __pmd_free(page);
+}
+
+pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *page = alloc_pages(PGALLOC_GFP, 0);
+
+       if (page) {
+               /* Tag the page with _pmd_free and reset its count. */
+               SetPageForeign(page, _pmd_free);
+               init_page_count(page);
+       }
+       return page ? page_address(page) : NULL;
+}
+
+void __pmd_free(pgtable_t pmd)
+{
+       if (PagePinned(pmd)) {
+               unsigned long pfn = page_to_pfn(pmd);
+
+               if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
+                                                pfn_pte(pfn, PAGE_KERNEL),
+                                                0))
+                       BUG();
+               ClearPagePinned(pmd);
+       }
+
+       ClearPageForeign(pmd);
+       init_page_count(pmd);
+       __free_page(pmd);
+}
+
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+       paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+       tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+#if PAGETABLE_LEVELS > 3
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+       paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+       tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+static void _pin_lock(struct mm_struct *mm, int lock) {
+       if (lock)
+               spin_lock(&mm->page_table_lock);
+#if USE_SPLIT_PTLOCKS
+       /* While mm->page_table_lock protects us against insertions and
+        * removals of higher level page table pages, it doesn't protect
+        * against updates of pte-s. Such updates, however, require the
+        * pte pages to be in consistent state (unpinned+writable or
+        * pinned+readonly). The pinning and attribute changes, however
+        * cannot be done atomically, which is why such updates must be
+        * prevented from happening concurrently.
+        * Note that no pte lock can ever elsewhere be acquired nesting
+        * with an already acquired one in the same mm, or with the mm's
+        * page_table_lock already acquired, as that would break in the
+        * non-split case (where all these are actually resolving to the
+        * one page_table_lock). Thus acquiring all of them here is not
+        * going to result in deadlocks, and the order of acquires
+        * doesn't matter.
+        */
+       {
+               pgd_t *pgd = mm->pgd;
+               unsigned g;
+
+               for (g = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
+                       pud_t *pud;
+                       unsigned u;
+
+                       if (pgd_none(*pgd))
+                               continue;
+                       pud = pud_offset(pgd, 0);
+                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                               pmd_t *pmd;
+                               unsigned m;
+
+                               if (pud_none(*pud))
+                                       continue;
+                               pmd = pmd_offset(pud, 0);
+                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                                       spinlock_t *ptl;
+
+                                       if (pmd_none(*pmd))
+                                               continue;
+                                       ptl = pte_lockptr(0, pmd);
+                                       if (lock)
+                                               spin_lock(ptl);
+                                       else
+                                               spin_unlock(ptl);
+                               }
+                       }
+               }
+       }
+#endif
+       if (!lock)
+               spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
+#define PIN_BATCH sizeof(void *)
+static DEFINE_PER_CPU(multicall_entry_t[PIN_BATCH], pb_mcl);
+
+static inline unsigned int pgd_walk_set_prot(struct page *page, pgprot_t flags,
+                                            unsigned int cpu, unsigned int seq)
+{
+       unsigned long pfn = page_to_pfn(page);
+
+       if (pgprot_val(flags) & _PAGE_RW)
+               ClearPagePinned(page);
+       else
+               SetPagePinned(page);
+       if (PageHighMem(page))
+               return seq;
+       MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+                               (unsigned long)__va(pfn << PAGE_SHIFT),
+                               pfn_pte(pfn, flags), 0);
+       if (unlikely(++seq == PIN_BATCH)) {
+               if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+                                                       PIN_BATCH, NULL)))
+                       BUG();
+               seq = 0;
+       }
+
+       return seq;
+}
+
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+       pgd_t       *pgd = pgd_base;
+       pud_t       *pud;
+       pmd_t       *pmd;
+       int          g,u,m;
+       unsigned int cpu, seq;
+       multicall_entry_t *mcl;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return;
+
+       cpu = get_cpu();
+
+       /*
+        * Cannot iterate up to USER_PTRS_PER_PGD on x86-64 as these pagetables
+        * may not be the 'current' task's pagetables (e.g., current may be
+        * 32-bit, but the pagetables may be for a 64-bit task).
+        * Subtracting 1 from TASK_SIZE_MAX means the loop limit is correct
+        * regardless of whether TASK_SIZE_MAX is a multiple of PGDIR_SIZE.
+        */
+       for (g = 0, seq = 0; g <= ((TASK_SIZE_MAX-1) / PGDIR_SIZE); g++, pgd++) {
+               if (pgd_none(*pgd))
+                       continue;
+               pud = pud_offset(pgd, 0);
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       seq = pgd_walk_set_prot(virt_to_page(pud),flags,cpu,seq);
+               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                       if (pud_none(*pud))
+                               continue;
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */
+                               seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                               if (pmd_none(*pmd))
+                                       continue;
+                               seq = pgd_walk_set_prot(pmd_page(*pmd),flags,cpu,seq);
+                       }
+               }
+       }
+
+#ifdef CONFIG_X86_PAE
+       for (; g < PTRS_PER_PGD; g++, pgd++) {
+               BUG_ON(pgd_none(*pgd));
+               pud = pud_offset(pgd, 0);
+               BUG_ON(pud_none(*pud));
+               pmd = pmd_offset(pud, 0);
+               seq = pgd_walk_set_prot(virt_to_page(pmd),flags,cpu,seq);
+       }
+#endif
+
+       mcl = per_cpu(pb_mcl, cpu);
+#ifdef CONFIG_X86_64
+       if (unlikely(seq > PIN_BATCH - 2)) {
+               if (unlikely(HYPERVISOR_multicall_check(mcl, seq, NULL)))
+                       BUG();
+               seq = 0;
+       }
+       pgd = __user_pgd(pgd_base);
+       BUG_ON(!pgd);
+       MULTI_update_va_mapping(mcl + seq,
+              (unsigned long)pgd,
+              pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, flags),
+              0);
+       MULTI_update_va_mapping(mcl + seq + 1,
+              (unsigned long)pgd_base,
+              pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+              UVMF_TLB_FLUSH);
+       if (unlikely(HYPERVISOR_multicall_check(mcl, seq + 2, NULL)))
+               BUG();
+#else
+       if (likely(seq != 0)) {
+               MULTI_update_va_mapping(per_cpu(pb_mcl, cpu) + seq,
+                       (unsigned long)pgd_base,
+                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+                       UVMF_TLB_FLUSH);
+               if (unlikely(HYPERVISOR_multicall_check(per_cpu(pb_mcl, cpu),
+                                                       seq + 1, NULL)))
+                       BUG();
+       } else if(HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+                       pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+                       UVMF_TLB_FLUSH))
+               BUG();
+#endif
+
+       put_cpu();
+}
+
+void __init xen_init_pgd_pin(void)
+{
+       pgd_t       *pgd = init_mm.pgd;
+       pud_t       *pud;
+       pmd_t       *pmd;
+       unsigned int g, u, m;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return;
+
+       SetPagePinned(virt_to_page(pgd));
+       for (g = 0; g < PTRS_PER_PGD; g++, pgd++) {
+#ifndef CONFIG_X86_PAE
+               if (g >= pgd_index(HYPERVISOR_VIRT_START)
+                   && g <= pgd_index(HYPERVISOR_VIRT_END - 1))
+                       continue;
+#endif
+               if (!pgd_present(*pgd))
+                       continue;
+               pud = pud_offset(pgd, 0);
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       SetPagePinned(virt_to_page(pud));
+               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                       if (!pud_present(*pud) || pud_large(*pud))
+                               continue;
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */
+                               SetPagePinned(virt_to_page(pmd));
+                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+#ifdef CONFIG_X86_PAE
+                               if (g == pgd_index(HYPERVISOR_VIRT_START)
+                                   && m >= pmd_index(HYPERVISOR_VIRT_START))
+                                       continue;
+#endif
+                               if (!pmd_present(*pmd) || pmd_large(*pmd))
+                                       continue;
+                               SetPagePinned(pmd_page(*pmd));
+                       }
+               }
+       }
+#ifdef CONFIG_X86_64
+       SetPagePinned(virt_to_page(level3_user_pgt));
+#endif
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+       pgd_walk(pgd, PAGE_KERNEL_RO);
+       kmap_flush_unused();
+       xen_pgd_pin(pgd);
+       SetPagePinned(virt_to_page(pgd));
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+       xen_pgd_unpin(pgd);
+       pgd_walk(pgd, PAGE_KERNEL);
+       ClearPagePinned(virt_to_page(pgd));
+}
+
+static void pgd_test_and_unpin(pgd_t *pgd)
+{
+       if (PagePinned(virt_to_page(pgd)))
+               __pgd_unpin(pgd);
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+       if (xen_feature(XENFEAT_writable_page_tables))
+               return;
+
+       pin_lock(mm);
+       __pgd_pin(mm->pgd);
+       pin_unlock(mm);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+       if (xen_feature(XENFEAT_writable_page_tables))
+               return;
+
+       pin_lock(mm);
+       __pgd_unpin(mm->pgd);
+       pin_unlock(mm);
+}
+
+void mm_pin_all(void)
+{
+       struct page *page;
+
+       if (xen_feature(XENFEAT_writable_page_tables))
+               return;
+
+       /*
+        * Allow uninterrupted access to the pgd_list. Also protects
+        * __pgd_pin() by ensuring preemption is disabled.
+        * All other CPUs must be at a safe point (e.g., in stop_machine
+        * or offlined entirely).
+        */
+       BUG_ON(!irqs_disabled());
+       spin_lock(&pgd_lock);
+       list_for_each_entry(page, &pgd_list, lru) {
+               if (!PagePinned(page))
+                       __pgd_pin((pgd_t *)page_address(page));
+       }
+       spin_unlock(&pgd_lock);
+}
+
+void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+       if (!PagePinned(virt_to_page(mm->pgd)))
+               mm_pin(mm);
+}
+
+/*
+ * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() *much*
+ * faster this way, as no hypercalls are needed for the page table updates.
+ */
+static void leave_active_mm(struct task_struct *tsk, struct mm_struct *mm)
+       __releases(tsk->alloc_lock)
+{
+       if (tsk->active_mm == mm) {
+               tsk->active_mm = &init_mm;
+               atomic_inc(&init_mm.mm_count);
+
+               switch_mm(mm, &init_mm, tsk);
+
+               if (atomic_dec_and_test(&mm->mm_count))
+                       BUG();
+       }
+
+       task_unlock(tsk);
+}
+
+static void _leave_active_mm(void *mm)
+{
+       struct task_struct *tsk = current;
+
+       if (spin_trylock(&tsk->alloc_lock))
+               leave_active_mm(tsk, mm);
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
+       struct task_struct *tsk = current;
+
+       task_lock(tsk);
+       leave_active_mm(tsk, mm);
+
+       preempt_disable();
+       smp_call_function_many(mm_cpumask(mm), _leave_active_mm, mm, 1);
+       preempt_enable();
+
+       if (PagePinned(virt_to_page(mm->pgd))
+           && atomic_read(&mm->mm_count) == 1
+           && !mm->context.has_foreign_mappings)
+               mm_unpin(mm);
+}
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+
+       list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+
+       list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD                          \
+       (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+       BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+       virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+       return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+{
+       pgd_test_and_unpin(pgd);
+
+       /* If the pgd points to a shared pagetable level (either the
+          ptes in non-PAE, or shared PMD in PAE), then just copy the
+          references from swapper_pg_dir. */
+       if (PAGETABLE_LEVELS == 2 ||
+           (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+           PAGETABLE_LEVELS == 4) {
+               clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+                               swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+                               KERNEL_PGD_PTRS);
+       }
+
+#ifdef CONFIG_X86_64
+       /* set level3_user_pgt for vsyscall area */
+       __user_pgd(pgd)[pgd_index(VSYSCALL_START)] =
+               __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
+#endif
+
+       /* list required to sync kernel mapping updates */
+       if (!SHARED_KERNEL_PMD) {
+               pgd_set_mm(pgd, mm);
+               pgd_list_add(pgd);
+       }
+}
+
+static void pgd_dtor(pgd_t *pgd)
+{
+       if (!SHARED_KERNEL_PMD) {
+               spin_lock(&pgd_lock);
+               pgd_list_del(pgd);
+               spin_unlock(&pgd_lock);
+       }
+
+       pgd_test_and_unpin(pgd);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS      UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+       /* Note: almost everything apart from _PAGE_PRESENT is
+          reserved at the pmd (PDPT) level. */
+       pud_t pud = __pud(__pa(pmd) | _PAGE_PRESENT);
+
+       paravirt_alloc_pmd(mm, page_to_pfn(virt_to_page(pmd)));
+
+       if (likely(!PagePinned(virt_to_page(pudp)))) {
+               *pudp = pud;
+               return;
+       }
+
+       set_pud(pudp, pud);
+
+       /*
+        * According to Intel App note "TLBs, Paging-Structure Caches,
+        * and Their Invalidation", April 2007, document 317080-001,
+        * section 8.1: in PAE mode we explicitly have to flush the
+        * TLB via cr3 if the top-level pgd is changed...
+        */
+       flush_tlb_mm(mm);
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS      0
+
+#endif /* CONFIG_X86_PAE */
+
+static void free_pmds(pmd_t *pmds[], struct mm_struct *mm, bool contig)
+{
+       int idx;
+
+#ifdef CONFIG_X86_PAE
+       if (contig)
+               xen_destroy_contiguous_region((unsigned long)mm->pgd, 0);
+#endif
+       /* Hand back every slot that actually holds a pmd. */
+       for (idx = 0; idx < PREALLOCATED_PMDS; idx++)
+               if (pmds[idx])
+                       pmd_free(mm, pmds[idx]);
+}
+
+/* Allocate every preallocated pmd; free them all and fail on any error. */
+static int preallocate_pmds(pmd_t *pmds[], struct mm_struct *mm)
+{
+       bool failed = false;
+       int i;
+
+       for (i = 0; i < PREALLOCATED_PMDS; i++) {
+               /* Widen first: shifting the int index by PUD_SHIFT can overflow. */
+               pmds[i] = pmd_alloc_one(mm, (unsigned long)i << PUD_SHIFT);
+               if (pmds[i] == NULL)
+                       failed = true;
+       }
+
+       if (!failed)
+               return 0;
+       /* free_pmds() skips the NULL slots left by failed allocations. */
+       free_pmds(pmds, mm, false);
+       return -ENOMEM;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+       int i;
+
+       for(i = 0; i < PREALLOCATED_PMDS; i++) {
+               pgd_t pgd = pgdp[i];
+
+               if (__pgd_val(pgd) != 0) {
+                       pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+                       pgdp[i] = xen_make_pgd(0);
+
+                       paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+                       pmd_free(mm, pmd);
+               }
+       }
+
+#ifdef CONFIG_X86_PAE
+       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+               xen_destroy_contiguous_region((unsigned long)pgdp, 0);
+#endif
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+       pud_t *pud;
+       unsigned long addr;
+       int i;
+
+       if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+               return;
+
+       pud = pud_offset(pgd, 0);
+       for (addr = i = 0; i < PREALLOCATED_PMDS;
+            i++, pud++, addr += PUD_SIZE) {
+               pmd_t *pmd = pmds[i];
+
+               if (i >= KERNEL_PGD_BOUNDARY)
+                       memcpy(pmd,
+                              (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+                              sizeof(pmd_t) * PTRS_PER_PMD);
+
+               /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+               pud_populate(mm, pud, pmd);
+       }
+}
+
+static inline pgd_t *user_pgd_alloc(pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       if (pgd) {
+               pgd_t *upgd = (void *)__get_free_page(PGALLOC_GFP);
+
+               if (upgd)
+                       set_page_private(virt_to_page(pgd),
+                                        (unsigned long)upgd);
+               else {
+                       free_page((unsigned long)pgd);
+                       pgd = NULL;
+               }
+       }
+#endif
+       return pgd;
+}
+
+static inline void user_pgd_free(pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+       free_page(page_private(virt_to_page(pgd)));
+#endif
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       pgd_t *pgd;
+       pmd_t *pmds[PREALLOCATED_PMDS];
+
+       pgd = user_pgd_alloc((void *)__get_free_page(PGALLOC_GFP));
+
+       if (pgd == NULL)
+               goto out;
+
+       mm->pgd = pgd;
+
+       if (preallocate_pmds(pmds, mm) != 0)
+               goto out_free_pgd;
+
+       if (paravirt_pgd_alloc(mm) != 0)
+               goto out_free_pmds;
+
+       /*
+        * Make sure that pre-populating the pmds is atomic with
+        * respect to anything walking the pgd_list, so that they
+        * never see a partially populated pgd.
+        */
+       spin_lock(&pgd_lock);
+
+#ifdef CONFIG_X86_PAE
+       /* Protect against save/restore: move below 4GB under pgd_lock. */
+       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)
+           && xen_create_contiguous_region((unsigned long)pgd, 0, 32)) {
+               spin_unlock(&pgd_lock);
+               goto out_free_pmds;
+       }
+#endif
+
+       pgd_ctor(mm, pgd);
+       pgd_prepopulate_pmd(mm, pgd, pmds);
+
+       spin_unlock(&pgd_lock);
+
+       return pgd;
+
+out_free_pmds:
+       free_pmds(pmds, mm, !xen_feature(XENFEAT_pae_pgdir_above_4gb));
+out_free_pgd:
+       user_pgd_free(pgd);
+       free_page((unsigned long)pgd);
+out:
+       return NULL;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+       /*
+        * After this the pgd should not be pinned for the duration of this
+        * function's execution. We should never sleep and thus never race:
+        *  1. User pmds will not become write-protected under our feet due
+        *     to a concurrent mm_pin_all().
+        *  2. The machine addresses in PGD entries will not become invalid
+        *     due to a concurrent save/restore.
+        */
+       pgd_dtor(pgd);
+
+       pgd_mop_up_pmds(mm, pgd);
+       paravirt_pgd_free(mm, pgd);
+       user_pgd_free(pgd);
+       free_page((unsigned long)pgd);
+}
+
+/* blktap and gntdev need this, as otherwise they would implicitly (and
+ * needlessly, as they never use it) reference init_mm. */
+pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *vma,
+                                 unsigned long addr, pte_t *ptep, int full)
+{
+       return ptep_get_and_clear_full(vma ? vma->vm_mm : &init_mm,
+                                      addr, ptep, full);
+}
+EXPORT_SYMBOL_GPL(xen_ptep_get_and_clear_full);
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pte_t *ptep,
+                         pte_t entry, int dirty)
+{
+       int changed = !pte_same(*ptep, entry);
+
+       if (changed && dirty) {
+               if (likely(vma->vm_mm == current->mm)) {
+                       if (HYPERVISOR_update_va_mapping(address,
+                               entry,
+                               uvm_multi(mm_cpumask(vma->vm_mm))|UVMF_INVLPG))
+                               BUG();
+               } else {
+                       xen_l1_entry_update(ptep, entry);
+                       flush_tlb_page(vma, address);
+               }
+       }
+
+       return changed;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp,
+                         pmd_t entry, int dirty)
+{
+       int changed = !pmd_same(*pmdp, entry);
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       if (changed && dirty) {
+               *pmdp = entry;
+               pmd_update_defer(vma->vm_mm, address, pmdp);
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       }
+
+       return changed;
+}
+#endif
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long addr, pte_t *ptep)
+{
+       int was_young;
+
+       /* Nothing to clear when the pte is not marked young. */
+       if (!pte_young(*ptep))
+               return 0;
+       was_young = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+                                      (unsigned long *) &ptep->pte);
+       if (was_young)
+               pte_update(vma->vm_mm, addr, ptep);
+       return was_young;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+                             unsigned long addr, pmd_t *pmdp)
+{
+       int ret = 0;
+
+       if (pmd_young(*pmdp))
+               ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+                                        (unsigned long *)pmdp);
+
+       if (ret)
+               pmd_update(vma->vm_mm, addr, pmdp);
+
+       return ret;
+}
+#endif
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pte_t *ptep)
+{
+       pte_t pte = *ptep;
+       int young = pte_young(pte);
+
+       pte = pte_mkold(pte);
+       if (PagePinned(virt_to_page(vma->vm_mm->pgd)))
+               ptep_set_access_flags(vma, address, ptep, pte, young);
+       else if (young)
+               ptep->pte_low = pte.pte_low;
+
+       return young;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+                          unsigned long address, pmd_t *pmdp)
+{
+       int young;
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       young = pmdp_test_and_clear_young(vma, address, pmdp);
+       if (young)
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+       return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+                         unsigned long address, pmd_t *pmdp)
+{
+       int set;
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+       set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+                               (unsigned long *)pmdp);
+       if (set) {
+               pmd_update(vma->vm_mm, address, pmdp);
+               /* need tlb flush only to serialize against gup-fast */
+               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+       }
+}
+#endif
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve: size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+       BUG_ON(fixmaps_set > 0);
+       printk(KERN_INFO "Reserving virtual address space above 0x%08lx\n",
+              -reserve);
+       __FIXADDR_TOP = -reserve - PAGE_SIZE;
+#endif
+}
+
+int fixmaps_set;
+
+void xen_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
+{
+       unsigned long address = __fix_to_virt(idx);
+       pte_t pte;
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+
+       switch (idx) {
+#ifdef CONFIG_X86_64
+       extern pte_t level1_fixmap_pgt[PTRS_PER_PTE];
+
+       case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+       case VVAR_PAGE:
+               pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+               set_pte_vaddr_pud(level3_user_pgt, address, pte);
+               break;
+       case FIX_EARLYCON_MEM_BASE:
+       case FIX_SHARED_INFO:
+       case FIX_ISAMAP_END ... FIX_ISAMAP_BEGIN:
+               xen_l1_entry_update(level1_fixmap_pgt + pte_index(address),
+                                   pfn_pte_ma(phys >> PAGE_SHIFT, flags));
+               fixmaps_set++;
+               return;
+#else
+       case FIX_WP_TEST:
+       case FIX_VDSO:
+               pte = pfn_pte(phys >> PAGE_SHIFT, flags);
+               break;
+#endif
+       default:
+               pte = pfn_pte_ma(phys >> PAGE_SHIFT, flags);
+               break;
+       }
+       set_pte_vaddr(address, pte);
+       fixmaps_set++;
+}
diff --git a/arch/x86/mm/pgtable_32-xen.c b/arch/x86/mm/pgtable_32-xen.c

new file mode 100644 (file)

index 0000000..2976cbc
--- /dev/null
+++ b/arch/x86/mm/pgtable_32-xen.c
@@ -0,0 +1,178 @@
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+#include <xen/features.h>
+#include <asm/hypervisor.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ *
+ * Without CONFIG_XEN the kernel page tables (swapper_pg_dir) are walked and
+ * the PTE is written directly; with CONFIG_XEN the update is requested from
+ * the hypervisor instead.  Any failure on either path ends in BUG().
+ */
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+#ifndef CONFIG_XEN
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Every intermediate level must already be populated. */
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* A zero pteval means "remove the mapping". */
+       if (pte_val(pteval))
+               set_pte_at(&init_mm, vaddr, pte, pteval);
+       else
+               pte_clear(&init_mm, vaddr, pte);
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+#else
+       /* Hypervisor performs the update with UVMF_INVLPG|UVMF_ALL flushing. */
+       if (HYPERVISOR_update_va_mapping(vaddr, pteval,
+                                        UVMF_INVLPG|UVMF_ALL))
+               BUG();
+#endif
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned.
+ * The pmd must already be instantiated. Assumes PAE mode.
+ *
+ * A misaligned vaddr/pfn or an empty pgd entry is not fatal: a warning is
+ * printed and the function returns without changing anything.
+ */
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+               return; /* BUG(); */
+       }
+       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+               return; /* BUG(); */
+       }
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+               return; /* BUG(); */
+       }
+       /* The pud/pmd levels are not checked here (see header comment). */
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       set_pmd(pmd, pfn_pmd(pfn, flags));
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
+unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+/*
+ * Early-parameter handler for "vmalloc=size": makes the vmalloc area
+ * exactly 'size' bytes, growing or shrinking it from the 128m default.
+ * Returns -EINVAL when the option carries no value, 0 otherwise.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+       unsigned long long bytes;
+
+       if (arg == NULL)
+               return -EINVAL;
+
+       bytes = memparse(arg, &arg);
+
+       /* The vm area guard hole (VMALLOC_OFFSET) comes on top of the request. */
+       __VMALLOC_RESERVE = bytes + VMALLOC_OFFSET;
+
+       return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+#ifndef CONFIG_XEN
+/*
+ * Early-parameter handler for "reservetop=size".  The requested hole at the
+ * top of the kernel address space is for a hypervisor that gets loaded
+ * later; handling it this early lets the fixmap be relocated before paging
+ * is initialized.  Returns -EINVAL when no value was given, 0 otherwise.
+ */
+static int __init parse_reservetop(char *arg)
+{
+       if (arg == NULL)
+               return -EINVAL;
+
+       reserve_top_address(memparse(arg, &arg));
+       fixup_early_ioremap();
+
+       return 0;
+}
+early_param("reservetop", parse_reservetop);
+#endif
+
+/*
+ * Write-protect the 4K kernel mapping of @va through the hypervisor.
+ * Does nothing when xen_feature(@feature) is set.  The address must be
+ * backed by a present 4K PTE; anything else, or a failing hypercall, is a
+ * BUG.
+ */
+void make_lowmem_page_readonly(void *va, unsigned int feature)
+{
+       unsigned long vaddr = (unsigned long)va;
+       unsigned int lvl;
+       pte_t *ptep;
+
+       if (xen_feature(feature))
+               return;
+
+       ptep = lookup_address(vaddr, &lvl);
+       BUG_ON(!ptep || lvl != PG_LEVEL_4K || !pte_present(*ptep));
+
+       if (HYPERVISOR_update_va_mapping(vaddr, pte_wrprotect(*ptep), 0))
+               BUG();
+}
+
+/*
+ * Make the 4K kernel mapping of @va writable again through the hypervisor,
+ * asking for UVMF_INVLPG flushing.  Does nothing when xen_feature(@feature)
+ * is set.  The address must be backed by a present 4K PTE; anything else,
+ * or a failing hypercall, is a BUG.
+ */
+void make_lowmem_page_writable(void *va, unsigned int feature)
+{
+       unsigned long vaddr = (unsigned long)va;
+       unsigned int lvl;
+       pte_t *ptep;
+
+       if (xen_feature(feature))
+               return;
+
+       ptep = lookup_address(vaddr, &lvl);
+       BUG_ON(!ptep || lvl != PG_LEVEL_4K || !pte_present(*ptep));
+
+       if (HYPERVISOR_update_va_mapping(vaddr, pte_mkwrite(*ptep),
+                                        UVMF_INVLPG))
+               BUG();
+}
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c

index d2e2735..fdb369a 100644 (file)
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -8,6 +8,10 @@
  
  #ifdef CONFIG_X86_64
  
+#ifdef CONFIG_XEN
+#define phys_base 0
+#endif
+
  unsigned long __phys_addr(unsigned long x)
  {
         if (x >= __START_KERNEL_map) {
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile

index 1599f56..eaf5382 100644 (file)
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -6,6 +6,13 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
                 oprofilefs.o oprofile_stats.o  \
                 timer_int.o nmi_timer_int.o )
  
+ifdef CONFIG_XEN
+XENOPROF_COMMON_OBJS = $(addprefix ../../../drivers/xen/xenoprof/, \
+                        xenoprofile.o)
+oprofile-y                             := $(DRIVER_OBJS) \
+                                          $(XENOPROF_COMMON_OBJS) xenoprof.o
+else
  oprofile-y                             := $(DRIVER_OBJS) init.o backtrace.o
  oprofile-$(CONFIG_X86_LOCAL_APIC)      += nmi_int.o op_model_amd.o \
                                            op_model_ppro.o op_model_p4.o
+endif
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c

index d6aa6e8..8a40294 100644 (file)
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -17,6 +17,17 @@
  #include <asm/ptrace.h>
  #include <asm/stacktrace.h>
  
+/*
+ * stacktrace_ops.warning_symbol callback for oprofile backtraces: stack
+ * walker warnings are deliberately discarded.
+ */
+static void backtrace_warning_symbol(void *data, char *msg,
+                                    unsigned long symbol)
+{
+       /* Ignore warnings */
+}
+
+/*
+ * stacktrace_ops.warning callback for oprofile backtraces: stack walker
+ * warnings are deliberately discarded.
+ */
+static void backtrace_warning(void *data, char *msg)
+{
+       /* Ignore warnings */
+}
+
  static int backtrace_stack(void *data, char *name)
  {
         /* Yes, we want all stacks */
@@ -32,6 +43,8 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
  }
  
  static struct stacktrace_ops backtrace_ops = {
+       .warning        = backtrace_warning,
+       .warning_symbol = backtrace_warning_symbol,
         .stack          = backtrace_stack,
         .address        = backtrace_address,
         .walk_stack     = print_context_stack,
diff --git a/arch/x86/oprofile/xenoprof.c b/arch/x86/oprofile/xenoprof.c

new file mode 100644 (file)

index 0000000..dde0310
--- /dev/null
+++ b/arch/x86/oprofile/xenoprof.c
@@ -0,0 +1,179 @@
+/**
+ * @file xenoprof.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon and Jose Renato Santos for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
+ * x86-specific part
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include <linux/init.h>
+#include <linux/oprofile.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/xenoprof.h>
+#include <xen/xenoprof.h>
+#include "op_counter.h"
+
+static unsigned int num_events = 0;
+
+/*
+ * Record how many events the hypervisor reported in @init, limited to the
+ * size of the counter_config list.  When the limit applies, the clamped
+ * value is written back into @init as well.
+ */
+void __init xenoprof_arch_init_counter(struct xenoprof_init *init)
+{
+       num_events = init->num_events;
+       if (num_events <= OP_MAX_COUNTER)
+               return;
+
+       /* just in case - never index past the end of counter_config */
+       num_events = OP_MAX_COUNTER;
+       init->num_events = num_events;
+}
+
+/*
+ * Hand the settings of every configured counter (counter_config[0] up to
+ * num_events - 1) to the hypervisor via XENOPROF_counter.  A failing
+ * hypercall only triggers a WARN; the remaining counters are still sent.
+ */
+void xenoprof_arch_counter(void)
+{
+       struct xenoprof_counter req;
+       int idx;
+
+       for (idx = 0; idx < num_events; idx++) {
+               const struct op_counter_config *cfg = &counter_config[idx];
+
+               req.ind       = idx;
+               req.count     = (uint64_t)cfg->count;
+               req.enabled   = (uint32_t)cfg->enabled;
+               req.event     = (uint32_t)cfg->event;
+               req.kernel    = (uint32_t)cfg->kernel;
+               req.user      = (uint32_t)cfg->user;
+               req.unit_mask = (uint64_t)cfg->unit_mask;
+
+               WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_counter, &req));
+       }
+}
+
+/* Architecture hook for starting profiling; x86 has nothing to do here. */
+void xenoprof_arch_start(void) 
+{
+       /* nothing */
+}
+
+/* Architecture hook for stopping profiling; x86 has nothing to do here. */
+void xenoprof_arch_stop(void)
+{
+       /* nothing */
+}
+
+/*
+ * Tear down the kernel mapping of a shared profiling buffer, if one is
+ * currently mapped, and clear the pointer so a repeated call is harmless.
+ */
+void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer * sbuf)
+{
+       if (!sbuf->buffer)
+               return;
+
+       vunmap(sbuf->buffer);
+       sbuf->buffer = NULL;
+}
+
+/*
+ * Query the hypervisor for this domain's profiling buffer
+ * (XENOPROF_get_buffer) and map it into a freshly allocated vm area.
+ *
+ * On success returns 0 with sbuf->buffer pointing at the mapping.  On
+ * failure sbuf->buffer stays NULL and the hypercall/remap error code, or
+ * -ENOMEM when no vm area is available, is returned.
+ */
+int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer * get_buffer,
+                                   struct xenoprof_shared_buffer * sbuf)
+{
+       struct vm_struct *vm;
+       int rc, pages;
+
+       sbuf->buffer = NULL;
+
+       rc = HYPERVISOR_xenoprof_op(XENOPROF_get_buffer, get_buffer);
+       if (rc)
+               return rc;
+
+       /* Round the total buffer size up to whole pages. */
+       pages = (get_buffer->bufsize * get_buffer->nbuf - 1) / PAGE_SIZE + 1;
+
+       vm = alloc_vm_area(pages * PAGE_SIZE, NULL);
+       if (!vm)
+               return -ENOMEM;
+
+       rc = direct_kernel_remap_pfn_range((unsigned long)vm->addr,
+                                          get_buffer->buf_gmaddr >> PAGE_SHIFT,
+                                          pages * PAGE_SIZE,
+                                          __pgprot(_KERNPG_TABLE),
+                                          DOMID_SELF);
+       if (rc) {
+               vunmap(vm->addr);
+               return rc;
+       }
+
+       sbuf->buffer = vm->addr;
+       return rc;
+}
+
+/*
+ * Register @pdomain as a passive domain (XENOPROF_set_passive) and map its
+ * profiling buffer into a freshly allocated vm area.
+ *
+ * On success returns 0 with sbuf->buffer pointing at the mapping.  On
+ * failure sbuf->buffer stays NULL and the hypercall/remap error code, or
+ * -ENOMEM when no vm area is available, is returned.
+ */
+int xenoprof_arch_set_passive(struct xenoprof_passive * pdomain,
+                             struct xenoprof_shared_buffer * sbuf)
+{
+       struct vm_struct *vm;
+       int rc, pages;
+
+       sbuf->buffer = NULL;
+
+       rc = HYPERVISOR_xenoprof_op(XENOPROF_set_passive, pdomain);
+       if (rc)
+               return rc;
+
+       /* Round the total buffer size up to whole pages. */
+       pages = (pdomain->bufsize * pdomain->nbuf - 1) / PAGE_SIZE + 1;
+
+       vm = alloc_vm_area(pages * PAGE_SIZE, NULL);
+       if (vm == NULL)
+               return -ENOMEM;
+
+       rc = direct_kernel_remap_pfn_range((unsigned long)vm->addr,
+                                          pdomain->buf_gmaddr >> PAGE_SHIFT,
+                                          pages * PAGE_SIZE,
+                                          __pgprot(_KERNPG_TABLE),
+                                          DOMID_SELF);
+       if (rc) {
+               vunmap(vm->addr);
+               return rc;
+       }
+
+       sbuf->buffer = vm->addr;
+       return rc;
+}
+
+struct op_counter_config counter_config[OP_MAX_COUNTER];
+
+/*
+ * Populate oprofilefs with one directory per hardware event counter, named
+ * after the counter index, each holding the tunables of the matching
+ * counter_config[] entry (enabled, event, count, unit_mask, kernel, user).
+ *
+ * Always returns 0.
+ */
+int xenoprof_create_files(struct super_block * sb, struct dentry * root)
+{
+       unsigned int i;
+
+       for (i = 0; i < num_events; ++i) {
+               struct dentry * dir;
+               /*
+                * num_events is only bounded by OP_MAX_COUNTER, so the index
+                * may need more than one digit; a 2-byte buffer would
+                * truncate "10", "11", ... to "1" and make names collide.
+                */
+               char buf[4];
+
+               snprintf(buf, sizeof(buf), "%u", i);
+               dir = oprofilefs_mkdir(sb, root, buf);
+               oprofilefs_create_ulong(sb, dir, "enabled",
+                                       &counter_config[i].enabled);
+               oprofilefs_create_ulong(sb, dir, "event",
+                                       &counter_config[i].event);
+               oprofilefs_create_ulong(sb, dir, "count",
+                                       &counter_config[i].count);
+               oprofilefs_create_ulong(sb, dir, "unit_mask",
+                                       &counter_config[i].unit_mask);
+               oprofilefs_create_ulong(sb, dir, "kernel",
+                                       &counter_config[i].kernel);
+               oprofilefs_create_ulong(sb, dir, "user",
+                                       &counter_config[i].user);
+       }
+
+       return 0;
+}
+
+/*
+ * Architecture entry point for oprofile setup: delegates to
+ * xenoprofile_init() and passes its result through.
+ */
+int __init oprofile_arch_init(struct oprofile_operations * ops)
+{
+       return xenoprofile_init(ops);
+}
+
+/* Architecture entry point for oprofile teardown: calls xenoprofile_exit(). */
+void oprofile_arch_exit(void)
+{
+       xenoprofile_exit();
+}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile

index e76e18c..14d087d 100644 (file)
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,6 +5,9 @@ obj-$(CONFIG_PCI_MMCONFIG)      += mmconfig_$(BITS).o direct.o mmconfig-shared.o
  obj-$(CONFIG_PCI_DIRECT)       += direct.o
  obj-$(CONFIG_PCI_OLPC)         += olpc.o
  obj-$(CONFIG_PCI_XEN)          += xen.o
+# pcifront should be after mmconfig.o and direct.o as it should only
+# take over if direct access to the PCI bus is unavailable
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o
  
  obj-y                          += fixup.o
  obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c

index 0567df3..a9f0c26 100644 (file)
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -330,6 +330,7 @@ static int __init early_fill_mp_bus_info(void)
  
  #define ENABLE_CF8_EXT_CFG      (1ULL << 46)
  
+#ifndef CONFIG_XEN
  static void __cpuinit enable_pci_io_ecs(void *unused)
  {
         u64 reg;
@@ -358,6 +359,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
  static struct notifier_block __cpuinitdata amd_cpu_notifier = {
         .notifier_call  = amd_cpu_notify,
  };
+#endif /* CONFIG_XEN */
  
  static void __init pci_enable_pci_io_ecs(void)
  {
@@ -398,10 +400,19 @@ static int __init pci_io_ecs_init(void)
         if (early_pci_allowed())
                 pci_enable_pci_io_ecs();
  
+#ifndef CONFIG_XEN
         register_cpu_notifier(&amd_cpu_notifier);
         for_each_online_cpu(cpu)
                 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
                                (void *)(long)cpu);
+#else
+       if (cpu = 1, cpu) {
+               u64 reg;
+               rdmsrl(MSR_AMD64_NB_CFG, reg);
+               if (!(reg & ENABLE_CF8_EXT_CFG))
+                       return 0;
+       }
+#endif
         pci_probe |= PCI_HAS_IO_ECS;
  
         return 0;
@@ -409,6 +420,10 @@ static int __init pci_io_ecs_init(void)
  
  static int __init amd_postcore_init(void)
  {
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return 0;
+#endif
         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
                 return 0;
  
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c

index 831971e..8d634d3 100644 (file)
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -323,12 +323,14 @@ void __init pcibios_resource_survey(void)
         pcibios_allocate_resources(1);
  
         e820_reserve_resources_late();
+#ifndef CONFIG_XEN
         /*
          * Insert the IO APIC resources after PCI initialization has
          * occurred to handle IO APICS that are mapped in on a BAR in
          * PCI space, but before trying to assign unassigned pci res.
          */
         ioapic_insert_resources();
+#endif
  }
  
  /**
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c

index 372e9b8..c7a3f6a 100644 (file)
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -94,13 +94,18 @@ static struct irq_routing_table * __init pirq_find_routing_table(void)
         u8 *addr;
         struct irq_routing_table *rt;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return NULL;
+#endif
         if (pirq_table_addr) {
-               rt = pirq_check_routing_table((u8 *) __va(pirq_table_addr));
+               rt = pirq_check_routing_table((u8 *) isa_bus_to_virt(pirq_table_addr));
                 if (rt)
                         return rt;
                 printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n");
         }
-       for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
+       for (addr = (u8 *) isa_bus_to_virt(0xf0000);
+            addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
                 rt = pirq_check_routing_table(addr);
                 if (rt)
                         return rt;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c

index 301e325..b919e48 100644 (file)
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -21,6 +21,10 @@
  #include <asm/pci_x86.h>
  #include <asm/acpi.h>
  
+#ifdef CONFIG_XEN
+#include <xen/interface/physdev.h>
+#endif
+
  #define PREFIX "PCI: "
  
  /* Indicate if the mmcfg resources have been placed into the resource table. */
@@ -471,6 +475,31 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
                 }
         }
  
+#ifdef CONFIG_XEN
+       if (!with_e820) {
+               struct physdev_pci_mmcfg_reserved r = {
+                       .address = cfg->address,
+                       .segment = cfg->segment,
+                       .start_bus = cfg->start_bus,
+                       .end_bus = cfg->end_bus,
+                       .flags = valid ? XEN_PCI_MMCFG_RESERVED : 0
+               };
+               int rc;
+
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r);
+               switch (rc) {
+               case 0: case -ENOSYS:
+                       break;
+               default:
+                       pr_warn(PREFIX "Failed to report MMCONFIG reservation"
+                               " state for %04x [bus%02x-%02x] to hypervisor"
+                               " (%d)\n",
+                               cfg->segment, cfg->start_bus, cfg->end_bus,
+                               rc);
+               }
+       }
+#endif
+
         return valid;
  }
  
diff --git a/arch/x86/pci/pcifront.c b/arch/x86/pci/pcifront.c

new file mode 100644 (file)

index 0000000..5d08be1
--- /dev/null
+++ b/arch/x86/pci/pcifront.c
@@ -0,0 +1,59 @@
+/*
+ * PCI Frontend Stub - puts some "dummy" functions in to the Linux x86 PCI core
+ *                     to support the Xen PCI Frontend's operation
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <asm/acpi.h>
+#include <asm/pci_x86.h>
+#include <xen/evtchn.h>
+
+/*
+ * pcibios_enable_irq replacement installed by pcifront_x86_stub_init().
+ *
+ * Takes the IRQ number from the device's PCI_INTERRUPT_LINE config
+ * register, sets it up through alloc_irq_and_cfg_at() and
+ * evtchn_register_pirq(), and stores it in dev->irq.
+ *
+ * Returns 0 on success, -ENOMEM if alloc_irq_and_cfg_at() fails.
+ */
+static int pcifront_enable_irq(struct pci_dev *dev)
+{
+       u8 irq;
+       pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
+       if (!alloc_irq_and_cfg_at(irq, numa_node_id()))
+               return -ENOMEM;
+       evtchn_register_pirq(irq);
+       dev->irq = irq;
+
+       return 0;
+}
+
+extern u8 pci_cache_line_size;
+
+/*
+ * arch_initcall that prepares the x86 PCI core for the Xen PCI frontend.
+ *
+ * If no raw PCI access method has been registered yet (raw_pci_ops is
+ * NULL), pick a PCI cache line size from the boot CPU's vendor/family,
+ * redirect pcibios_enable_irq to pcifront_enable_irq, clear
+ * pcibios_disable_irq and, with CONFIG_ACPI, set acpi_noirq.
+ *
+ * Always returns 0.
+ */
+static int __init pcifront_x86_stub_init(void)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+
+       /* Only install our method if we haven't found real hardware already */
+       if (raw_pci_ops)
+               return 0;
+
+       pr_info("PCI: setting up Xen PCI frontend stub\n");
+
+       /* Copied from arch/i386/pci/common.c */
+       /* The value is kept in units of 4 bytes, hence the ">> 2". */
+       pci_cache_line_size = 32 >> 2;
+       if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD)
+               pci_cache_line_size = 64 >> 2;  /* K7 & K8 */
+       else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL)
+               pci_cache_line_size = 128 >> 2; /* P4 */
+
+       /* On x86, we need to disable the normal IRQ routing table and
+        * just ask the backend
+        */
+       pcibios_enable_irq = pcifront_enable_irq;
+       pcibios_disable_irq = NULL;
+
+#ifdef CONFIG_ACPI
+       /* Keep ACPI out of the picture */
+       acpi_noirq = 1;
+#endif
+
+       return 0;
+}
+
+arch_initcall(pcifront_x86_stub_init);
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile

index 73b8be0..088a961 100644 (file)
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1 +1,2 @@
  obj-$(CONFIG_EFI)              += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+disabled-obj-$(CONFIG_XEN)     := efi_%$(BITS).o
diff --git a/arch/x86/platform/efi/efi-xen.c b/arch/x86/platform/efi/efi-xen.c

new file mode 100644 (file)

index 0000000..257eafe
--- /dev/null
+++ b/arch/x86/platform/efi/efi-xen.c
@@ -0,0 +1,521 @@
+/*
+ * Common EFI (Extensible Firmware Interface) support functions
+ * Based on Extensible Firmware Interface Specification version 1.0
+ *
+ * Copyright (C) 1999 VA Linux Systems
+ * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ *     Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 2005-2008 Intel Co.
+ *     Fenghua Yu <fenghua.yu@intel.com>
+ *     Bibo Mao <bibo.mao@intel.com>
+ *     Chandramouli Narayanan <mouli@linux.intel.com>
+ *     Huang Ying <ying.huang@intel.com>
+ *
+ * Copied from efi_32.c to eliminate the duplicated code between EFI
+ * 32/64 support code. --ying 2007-10-26
+ *
+ * All EFI Runtime Services are not implemented yet as EFI only
+ * supports physical mode addressing on SoftSDV. This is to be fixed
+ * in a future version.  --drummond 1999-07-20
+ *
+ * Implemented EFI runtime services and virtual mode calls.  --davidm
+ *
+ * Goutham Rao: <goutham.rao@intel.com>
+ *     Skip non-WB memory and ignore empty memory ranges.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/efi.h>
+#include <linux/export.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/time.h>
+
+#include <asm/setup.h>
+#include <asm/efi.h>
+#include <asm/time.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+
+#include <xen/interface/platform.h>
+
+#define EFI_DEBUG      1
+
+int __read_mostly efi_enabled;
+EXPORT_SYMBOL(efi_enabled);
+
+/*
+ * Helpers for the xen_efi_*() wrappers below (both are #undef'd after the
+ * last wrapper).
+ *
+ * "call" is shorthand for the efi_runtime_call member of the local request
+ * "op".  DECLARE_CALL(what) declares that "op", selects
+ * XENPF_efi_runtime_call with function XEN_EFI_<what> and clears call.misc.
+ * Because it expands to a declaration followed by statements, it has to
+ * come after all other local declarations in a function.  No other field
+ * of "op" is initialized by the macro.
+ */
+#define call op.u.efi_runtime_call
+#define DECLARE_CALL(what) \
+       struct xen_platform_op op; \
+       op.cmd = XENPF_efi_runtime_call; \
+       call.function = XEN_EFI_##what; \
+       call.misc = 0
+
+/*
+ * GetTime() runtime service, forwarded to the hypervisor.
+ *
+ * @tm and @tc are both optional; whichever is non-NULL is filled in from
+ * the reply.  Returns EFI_UNSUPPORTED if the hypercall itself fails,
+ * otherwise the status reported in the reply.
+ */
+static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
+{
+       DECLARE_CALL(get_time);
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       if (tm != NULL) {
+               BUILD_BUG_ON(sizeof(*tm) != sizeof(call.u.get_time.time));
+               memcpy(tm, &call.u.get_time.time, sizeof(*tm));
+       }
+
+       if (tc != NULL) {
+               tc->resolution = call.u.get_time.resolution;
+               tc->accuracy = call.u.get_time.accuracy;
+               tc->sets_to_zero =
+                       (call.misc & XEN_EFI_GET_TIME_SET_CLEARS_NS) != 0;
+       }
+
+       return call.status;
+}
+
+/*
+ * SetTime() runtime service, forwarded to the hypervisor.  Returns
+ * EFI_UNSUPPORTED if the hypercall itself fails, otherwise the status
+ * reported in the reply.
+ */
+static efi_status_t xen_efi_set_time(efi_time_t *tm)
+{
+       DECLARE_CALL(set_time);
+
+       BUILD_BUG_ON(sizeof(*tm) != sizeof(call.u.set_time));
+       memcpy(&call.u.set_time, tm, sizeof(*tm));
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       return call.status;
+}
+
+/*
+ * GetWakeupTime() runtime service, forwarded to the hypervisor.
+ *
+ * All three output pointers are optional.  The enabled/pending state is
+ * decoded from the flag bits returned in call.misc.  Returns
+ * EFI_UNSUPPORTED if the hypercall itself fails, otherwise the status
+ * reported in the reply.
+ */
+static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled,
+                                           efi_bool_t *pending,
+                                           efi_time_t *tm)
+{
+       DECLARE_CALL(get_wakeup_time);
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       if (tm != NULL) {
+               BUILD_BUG_ON(sizeof(*tm) != sizeof(call.u.get_wakeup_time));
+               memcpy(tm, &call.u.get_wakeup_time, sizeof(*tm));
+       }
+
+       if (enabled != NULL)
+               *enabled = (call.misc & XEN_EFI_GET_WAKEUP_TIME_ENABLED) != 0;
+
+       if (pending != NULL)
+               *pending = (call.misc & XEN_EFI_GET_WAKEUP_TIME_PENDING) != 0;
+
+       return call.status;
+}
+
+/*
+ * SetWakeupTime() runtime service, forwarded to the hypervisor.
+ *
+ * @enabled selects XEN_EFI_SET_WAKEUP_TIME_ENABLE.  A NULL @tm sends no
+ * time and sets XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY instead.  Returns
+ * EFI_UNSUPPORTED if the hypercall itself fails, otherwise the status
+ * reported in the reply.
+ */
+static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
+{
+       DECLARE_CALL(set_wakeup_time);
+
+       BUILD_BUG_ON(sizeof(*tm) != sizeof(call.u.set_wakeup_time));
+
+       /* call.misc starts out as 0 (see DECLARE_CALL). */
+       if (enabled)
+               call.misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE;
+
+       if (tm == NULL)
+               call.misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY;
+       else
+               memcpy(&call.u.set_wakeup_time, tm, sizeof(*tm));
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       return call.status;
+}
+
+/*
+ * GetVariable() runtime service, forwarded to the hypervisor.
+ *
+ * @name:      variable name
+ * @vendor:    vendor GUID of the variable
+ * @attr:      optional; receives the variable's attributes when non-NULL
+ * @data_size: in: size of @data; out: size reported by the hypervisor
+ * @data:      buffer receiving the variable contents
+ *
+ * Returns EFI_UNSUPPORTED if the hypercall itself fails, otherwise the
+ * status reported in the reply.
+ */
+static efi_status_t xen_efi_get_variable(efi_char16_t *name,
+                                        efi_guid_t *vendor,
+                                        u32 *attr,
+                                        unsigned long *data_size,
+                                        void *data)
+{
+       int err;
+       DECLARE_CALL(get_variable);
+
+       set_xen_guest_handle(call.u.get_variable.name, name);
+       BUILD_BUG_ON(sizeof(*vendor) !=
+                    sizeof(call.u.get_variable.vendor_guid));
+       memcpy(&call.u.get_variable.vendor_guid, vendor, sizeof(*vendor));
+       call.u.get_variable.size = *data_size;
+       set_xen_guest_handle(call.u.get_variable.data, data);
+       err = HYPERVISOR_platform_op(&op);
+       if (err)
+               return EFI_UNSUPPORTED;
+
+       *data_size = call.u.get_variable.size;
+       /* The attributes output is optional; callers may pass NULL. */
+       if (attr)
+               *attr = call.misc;
+
+       return call.status;
+}
+
+/*
+ * GetNextVariableName() runtime service, forwarded to the hypervisor.
+ *
+ * @name_size and @vendor are in/out: they are sent with the request and
+ * overwritten from the reply.  Returns EFI_UNSUPPORTED if the hypercall
+ * itself fails, otherwise the status reported in the reply.
+ */
+static efi_status_t xen_efi_get_next_variable(unsigned long *name_size,
+                                             efi_char16_t *name,
+                                             efi_guid_t *vendor)
+{
+       DECLARE_CALL(get_next_variable_name);
+
+       BUILD_BUG_ON(sizeof(*vendor) !=
+                    sizeof(call.u.get_next_variable_name.vendor_guid));
+
+       call.u.get_next_variable_name.size = *name_size;
+       set_xen_guest_handle(call.u.get_next_variable_name.name, name);
+       memcpy(&call.u.get_next_variable_name.vendor_guid, vendor,
+              sizeof(*vendor));
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       *name_size = call.u.get_next_variable_name.size;
+       memcpy(vendor, &call.u.get_next_variable_name.vendor_guid,
+              sizeof(*vendor));
+
+       return call.status;
+}
+
+/*
+ * SetVariable() runtime service, forwarded to the hypervisor.  The
+ * attributes travel in call.misc.  Returns EFI_UNSUPPORTED if the
+ * hypercall itself fails, otherwise the status reported in the reply.
+ */
+static efi_status_t xen_efi_set_variable(efi_char16_t *name,
+                                        efi_guid_t *vendor,
+                                        u32 attr,
+                                        unsigned long data_size,
+                                        void *data)
+{
+       DECLARE_CALL(set_variable);
+
+       BUILD_BUG_ON(sizeof(*vendor) !=
+                    sizeof(call.u.set_variable.vendor_guid));
+
+       set_xen_guest_handle(call.u.set_variable.name, name);
+       call.misc = attr;
+       memcpy(&call.u.set_variable.vendor_guid, vendor, sizeof(*vendor));
+       call.u.set_variable.size = data_size;
+       set_xen_guest_handle(call.u.set_variable.data, data);
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       return call.status;
+}
+
+/*
+ * QueryVariableInfo() runtime service, forwarded to the hypervisor.
+ *
+ * Only attempted when the firmware's runtime services are at least
+ * revision 2.00; older firmware yields EFI_UNSUPPORTED, as does a failing
+ * hypercall.  Otherwise the three size outputs are filled from the reply
+ * and the reply's status is returned.
+ *
+ * NOTE(review): @attr is never copied into the request in this function -
+ * confirm whether the hypercall interface expects it to be passed.
+ */
+static efi_status_t xen_efi_query_variable_info(u32 attr,
+                                               u64 *storage_space,
+                                               u64 *remaining_space,
+                                               u64 *max_variable_size)
+{
+       int err;
+       DECLARE_CALL(query_variable_info);
+
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+
+       err = HYPERVISOR_platform_op(&op);
+       if (err)
+               return EFI_UNSUPPORTED;
+
+       *storage_space = call.u.query_variable_info.max_store_size;
+       *remaining_space = call.u.query_variable_info.remain_store_size;
+       *max_variable_size = call.u.query_variable_info.max_size;
+
+       return call.status;
+}
+
+/*
+ * GetNextHighMonotonicCount() runtime service, forwarded to the
+ * hypervisor; the count comes back in call.misc.  Returns EFI_UNSUPPORTED
+ * if the hypercall itself fails, otherwise the status reported in the
+ * reply.
+ */
+static efi_status_t xen_efi_get_next_high_mono_count(u32 *count)
+{
+       DECLARE_CALL(get_next_high_monotonic_count);
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       *count = call.misc;
+
+       return call.status;
+}
+
+/*
+ * UpdateCapsule() runtime service, forwarded to the hypervisor.  Only
+ * attempted when the firmware's runtime services are at least revision
+ * 2.00.  Returns EFI_UNSUPPORTED for older firmware or a failing
+ * hypercall, otherwise the status reported in the reply.
+ */
+static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules,
+                                          unsigned long count,
+                                          unsigned long sg_list)
+{
+       DECLARE_CALL(update_capsule);
+
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+
+       set_xen_guest_handle(call.u.update_capsule.capsule_header_array,
+                            capsules);
+       call.u.update_capsule.capsule_count = count;
+       call.u.update_capsule.sg_list = sg_list;
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       return call.status;
+}
+
+/*
+ * QueryCapsuleCapabilities() runtime service, forwarded to the
+ * hypervisor.  Only attempted when the firmware's runtime services are at
+ * least revision 2.00.  Returns EFI_UNSUPPORTED for older firmware or a
+ * failing hypercall; otherwise fills @max_size and @reset_type from the
+ * reply and returns the reply's status.
+ */
+static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+                                              unsigned long count,
+                                              u64 *max_size,
+                                              int *reset_type)
+{
+       DECLARE_CALL(query_capsule_capabilities);
+
+       if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+               return EFI_UNSUPPORTED;
+
+       set_xen_guest_handle(call.u.query_capsule_capabilities.capsule_header_array,
+                            capsules);
+       call.u.query_capsule_capabilities.capsule_count = count;
+
+       if (HYPERVISOR_platform_op(&op))
+               return EFI_UNSUPPORTED;
+
+       *max_size = call.u.query_capsule_capabilities.max_capsule_size;
+       *reset_type = call.u.query_capsule_capabilities.reset_type;
+
+       return call.status;
+}
+
+#undef DECLARE_CALL
+#undef call
+
+/*
+ * Global EFI state.  Every configuration table address starts out as
+ * EFI_INVALID_TABLE_ADDR, and all runtime service hooks point at the
+ * xen_efi_*() hypercall wrappers defined above.
+ */
+struct efi __read_mostly efi = {
+       .mps                      = EFI_INVALID_TABLE_ADDR,
+       .acpi                     = EFI_INVALID_TABLE_ADDR,
+       .acpi20                   = EFI_INVALID_TABLE_ADDR,
+       .smbios                   = EFI_INVALID_TABLE_ADDR,
+       .sal_systab               = EFI_INVALID_TABLE_ADDR,
+       .boot_info                = EFI_INVALID_TABLE_ADDR,
+       .hcdp                     = EFI_INVALID_TABLE_ADDR,
+       .uga                      = EFI_INVALID_TABLE_ADDR,
+       .uv_systab                = EFI_INVALID_TABLE_ADDR,
+       .get_time                 = xen_efi_get_time,
+       .set_time                 = xen_efi_set_time,
+       .get_wakeup_time          = xen_efi_get_wakeup_time,
+       .set_wakeup_time          = xen_efi_set_wakeup_time,
+       .get_variable             = xen_efi_get_variable,
+       .get_next_variable        = xen_efi_get_next_variable,
+       .set_variable             = xen_efi_set_variable,
+       .get_next_high_mono_count = xen_efi_get_next_high_mono_count,
+       .query_variable_info      = xen_efi_query_variable_info,
+       .update_capsule           = xen_efi_update_capsule,
+       .query_capsule_caps       = xen_efi_query_capsule_caps,
+};
+EXPORT_SYMBOL(efi);
+
+/* Early-parameter handler for "noefi": clears efi_enabled. */
+static int __init setup_noefi(char *arg)
+{
+       efi_enabled = 0;
+       return 0;
+}
+early_param("noefi", setup_noefi);
+
+
+/*
+ * Update only the minute and second fields of the EFI clock from @nowtime
+ * (a count of seconds); all other fields are written back as read.
+ *
+ * Returns 0 on success, -1 if the EFI time cannot be read or written.
+ */
+int efi_set_rtc_mmss(unsigned long nowtime)
+{
+       int real_seconds, real_minutes;
+       efi_status_t    status;
+       efi_time_t      eft;
+       efi_time_cap_t  cap;
+
+       status = efi.get_time(&eft, &cap);
+       if (status != EFI_SUCCESS) {
+               pr_err("Oops: efitime: can't read time!\n");
+               return -1;
+       }
+
+       real_seconds = nowtime % 60;
+       real_minutes = nowtime / 60;
+       /*
+        * If the clock's minute field is off from ours by an odd number of
+        * half hours (rounded), shift by 30 minutes - presumably to keep a
+        * clock running at a half-hour time zone offset consistent; confirm.
+        */
+       if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
+               real_minutes += 30;
+       real_minutes %= 60;
+       eft.minute = real_minutes;
+       eft.second = real_seconds;
+
+       status = efi.set_time(&eft);
+       if (status != EFI_SUCCESS) {
+               pr_err("Oops: efitime: can't write time!\n");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Read the wallclock through the EFI get_time service and convert it with
+ * mktime().  If the runtime call fails, log it and return the value of
+ * mach_get_cmos_time() instead.
+ */
+unsigned long efi_get_time(void)
+{
+       efi_time_cap_t caps;
+       efi_time_t now;
+
+       if (efi.get_time(&now, &caps) == EFI_SUCCESS)
+               return mktime(now.year, now.month, now.day, now.hour,
+                             now.minute, now.second);
+
+       pr_err("Oops: efitime: can't read time!\n");
+       return mach_get_cmos_time();
+}
+
+/*
+ * Ask the hypervisor for the EFI configuration table info; efi_enabled is
+ * set only when that platform op succeeds.
+ */
+void __init efi_probe(void)
+{
+       static struct xen_platform_op __initdata op = {
+               .cmd = XENPF_firmware_info,
+               .u.firmware_info.type = XEN_FW_EFI_INFO,
+               .u.firmware_info.index = XEN_FW_EFI_CONFIG_TABLE,
+       };
+       int rc = HYPERVISOR_platform_op(&op);
+
+       if (rc)
+               return;
+
+       efi_enabled = 1;
+}
+
+/* Intentionally empty in this implementation. */
+void __init efi_reserve_boot_services(void) { }
+
+/*
+ * Map the firmware configuration table array, record the address of every
+ * table we recognise (MPS, ACPI 2.0, ACPI, SMBIOS, HCDP, UGA) in the global
+ * "efi" descriptor and log each one found.
+ *
+ * @tables:    address of the configuration table array, handed to
+ *             early_ioremap()
+ * @nr_tables: number of entries in that array
+ *
+ * Returns 0 on success or -ENOMEM if the array cannot be mapped.
+ */
+static int __init efi_config_init(u64 tables, unsigned int nr_tables)
+{
+       const unsigned int entry_sz = sizeof(efi_config_table_t);
+       const unsigned int map_len = nr_tables * entry_sz;
+       efi_config_table_t *map, *entry;
+       unsigned int n;
+
+       /*
+        * Let's see what config tables the firmware passed to us.
+        */
+       map = early_ioremap(tables, map_len);
+       if (!map) {
+               pr_err("Could not map Configuration table!\n");
+               return -ENOMEM;
+       }
+
+       /* Open the log line; the matches below are appended to it. */
+       pr_info("");
+       for (n = 0, entry = map; n < nr_tables; n++, entry++) {
+               efi_guid_t guid = entry->guid;
+               unsigned long table = entry->table;
+
+               if (efi_guidcmp(guid, MPS_TABLE_GUID) == 0) {
+                       efi.mps = table;
+                       pr_cont(" MPS=0x%lx ", table);
+               } else if (efi_guidcmp(guid, ACPI_20_TABLE_GUID) == 0) {
+                       efi.acpi20 = table;
+                       pr_cont(" ACPI 2.0=0x%lx ", table);
+               } else if (efi_guidcmp(guid, ACPI_TABLE_GUID) == 0) {
+                       efi.acpi = table;
+                       pr_cont(" ACPI=0x%lx ", table);
+               } else if (efi_guidcmp(guid, SMBIOS_TABLE_GUID) == 0) {
+                       efi.smbios = table;
+                       pr_cont(" SMBIOS=0x%lx ", table);
+               } else if (efi_guidcmp(guid, HCDP_TABLE_GUID) == 0) {
+                       efi.hcdp = table;
+                       pr_cont(" HCDP=0x%lx ", table);
+               } else if (efi_guidcmp(guid, UGA_IO_PROTOCOL_GUID) == 0) {
+                       efi.uga = table;
+                       pr_cont(" UGA=0x%lx ", table);
+               }
+       }
+       pr_cont("\n");
+
+       early_iounmap(map, map_len);
+       return 0;
+}
+
+/*
+ * Query the hypervisor for the firmware vendor string, the EFI revision,
+ * the runtime services revision and the configuration table location, then
+ * install the EFI-based wallclock accessors.
+ *
+ * A single xen_platform_op is reused for all queries; only the
+ * firmware_info index changes between calls.  "info" points into "op", so
+ * each hypercall's result is read back through it.
+ */
+void __init efi_init(void)
+{
+       efi_char16_t c16[100];
+       char vendor[ARRAY_SIZE(c16)] = "unknown";
+       int ret, i;
+       struct xen_platform_op op;
+       union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info;
+
+       op.cmd = XENPF_firmware_info;
+       op.u.firmware_info.type = XEN_FW_EFI_INFO;
+
+       /*
+        * Show what we know for posterity
+        */
+       op.u.firmware_info.index = XEN_FW_EFI_VENDOR;
+       info->vendor.bufsz = sizeof(c16);
+       set_xen_guest_handle(info->vendor.name, c16);
+       ret = HYPERVISOR_platform_op(&op);
+       if (!ret) {
+               /* Narrow each 16-bit character to char, up to the first NUL. */
+               for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
+                       vendor[i] = c16[i];
+               vendor[i] = '\0';
+       } else
+               pr_err("Could not get the firmware vendor!\n");
+
+       op.u.firmware_info.index = XEN_FW_EFI_VERSION;
+       ret = HYPERVISOR_platform_op(&op);
+       if (!ret)
+               pr_info("EFI v%u.%.02u by %s\n",
+                       info->version >> 16,
+                       info->version & 0xffff, vendor);
+       else
+               pr_err("Could not get EFI revision!\n");
+
+       op.u.firmware_info.index = XEN_FW_EFI_RT_VERSION;
+       ret = HYPERVISOR_platform_op(&op);
+       if (!ret)
+               efi.runtime_version = info->version;
+       else
+               pr_warn("Could not get runtime services revision.\n");
+
+       /* Unlike the queries above, failure of this one is treated as fatal. */
+       op.u.firmware_info.index = XEN_FW_EFI_CONFIG_TABLE;
+       if (HYPERVISOR_platform_op(&op))
+               BUG();
+       /* Without a parsed config table, turn EFI off and keep the clock hooks. */
+       if (efi_config_init(info->cfg.addr, info->cfg.nent)) {
+               efi_enabled = 0;
+               return;
+       }
+
+       x86_platform.get_wallclock = efi_get_time;
+       x86_platform.set_wallclock = efi_set_rtc_mmss;
+}
+
+/* Intentionally empty in this implementation. */
+void __init efi_enter_virtual_mode(void) { }
+
+/* Platform device registered by rtc_init() when EFI is enabled. */
+static struct platform_device rtc_efi_dev = {
+       .id = -1,
+       .name = "rtc-efi",
+};
+
+/*
+ * Register the "rtc-efi" platform device if EFI is enabled.  A failed
+ * registration is logged but never propagated: the initcall always
+ * returns 0.
+ */
+static int __init rtc_init(void)
+{
+       if (!efi_enabled)
+               return 0;
+
+       /* not necessarily an error */
+       if (platform_device_register(&rtc_efi_dev) < 0)
+               pr_err("unable to register rtc device...\n");
+
+       return 0;
+}
+arch_initcall(rtc_init);
+
+/*
+ * Convenience functions to obtain memory types and attributes
+ */
+/*
+ * Return the EFI memory type the hypervisor reports for @phys_addr, or 0
+ * when the XEN_FW_EFI_MEM_INFO query fails.
+ */
+u32 efi_mem_type(unsigned long phys_addr)
+{
+       struct xen_platform_op op;
+       union xenpf_efi_info *efi_info = &op.u.firmware_info.u.efi_info;
+
+       op.cmd = XENPF_firmware_info;
+       op.u.firmware_info.type = XEN_FW_EFI_INFO;
+       op.u.firmware_info.index = XEN_FW_EFI_MEM_INFO;
+
+       efi_info->mem.size = 0;
+       efi_info->mem.addr = phys_addr;
+
+       if (HYPERVISOR_platform_op(&op))
+               return 0;
+
+       return efi_info->mem.type;
+}
+
+/*
+ * Return the EFI memory attributes the hypervisor reports for @phys_addr,
+ * or 0 when the XEN_FW_EFI_MEM_INFO query fails.
+ */
+u64 efi_mem_attributes(unsigned long phys_addr)
+{
+       struct xen_platform_op op;
+       union xenpf_efi_info *efi_info = &op.u.firmware_info.u.efi_info;
+
+       op.cmd = XENPF_firmware_info;
+       op.u.firmware_info.type = XEN_FW_EFI_INFO;
+       op.u.firmware_info.index = XEN_FW_EFI_MEM_INFO;
+
+       efi_info->mem.size = 0;
+       efi_info->mem.addr = phys_addr;
+
+       if (HYPERVISOR_platform_op(&op))
+               return 0;
+
+       return efi_info->mem.attr;
+}
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c

index 7785b72..f37ca26 100644 (file)
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -32,6 +32,7 @@
  #include <asm/apic.h>
  
  #ifdef CONFIG_X86_LOCAL_APIC
+#ifndef CONFIG_XEN
  static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  
  /* All CPUs enumerated by SFI must be present and enabled */
@@ -47,6 +48,9 @@ static void __cpuinit mp_sfi_register_lapic(u8 id)
  
         generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
  }
+#else
+#define mp_sfi_register_lapic(id)
+#endif
  
  static int __init sfi_parse_cpus(struct sfi_table_header *table)
  {
@@ -86,9 +90,12 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
                 pentry++;
         }
  
+#ifndef CONFIG_XEN
         WARN(pic_mode, KERN_WARNING
                 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
         pic_mode = 0;
+#endif
+
         return 0;
  }
  #endif /* CONFIG_X86_IO_APIC */
diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile

index a6a198c..0832f49 100644 (file)
--- a/arch/x86/power/Makefile
+++ b/arch/x86/power/Makefile
@@ -5,3 +5,5 @@ CFLAGS_cpu.o    := $(nostackp)
  
  obj-$(CONFIG_PM_SLEEP)         += cpu.o
  obj-$(CONFIG_HIBERNATION)      += hibernate_$(BITS).o hibernate_asm_$(BITS).o
+
+disabled-obj-$(CONFIG_XEN)     := cpu.o
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile

index fd14be1..8f010a6 100644 (file)
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -123,6 +123,7 @@ obj-$(VDSO32-y)                     += vdso32-syms.lds
  vdso32.so-$(VDSO32-y)          += int80
  vdso32.so-$(CONFIG_COMPAT)     += syscall
  vdso32.so-$(VDSO32-y)          += sysenter
+vdso32.so-$(CONFIG_X86_XEN)    += syscall
  
  vdso32-images                  = $(vdso32.so-y:%=vdso32-%.so)
  
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c

index 885eff4..4abf8cd 100644 (file)
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -25,6 +25,9 @@
  
  #define gtod (&VVAR(vsyscall_gtod_data))
  
+#ifdef CONFIG_XEN
+#define VCLOCK_NONE 0
+#else
  notrace static cycle_t vread_tsc(void)
  {
         cycle_t ret;
@@ -61,6 +64,7 @@ static notrace cycle_t vread_hpet(void)
  {
         return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
  }
+#endif /* CONFIG_XEN */
  
  notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
  {
@@ -80,6 +84,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
  }
  
  
+#ifndef CONFIG_XEN
  notrace static inline long vgetns(void)
  {
         long v;
@@ -128,6 +133,7 @@ notrace static int do_monotonic(struct timespec *ts)
  
         return mode;
  }
+#endif /* CONFIG_XEN */
  
  notrace static int do_realtime_coarse(struct timespec *ts)
  {
@@ -157,12 +163,14 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
         int ret = VCLOCK_NONE;
  
         switch (clock) {
+#ifndef CONFIG_XEN
         case CLOCK_REALTIME:
                 ret = do_realtime(ts);
                 break;
         case CLOCK_MONOTONIC:
                 ret = do_monotonic(ts);
                 break;
+#endif
         case CLOCK_REALTIME_COARSE:
                 return do_realtime_coarse(ts);
         case CLOCK_MONOTONIC_COARSE:
@@ -180,6 +188,7 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
  {
         long ret = VCLOCK_NONE;
  
+#ifndef CONFIG_XEN
         if (likely(tv != NULL)) {
                 BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
                              offsetof(struct timespec, tv_nsec) ||
@@ -192,6 +201,7 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
                 tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
                 tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
         }
+#endif
  
         if (ret == VCLOCK_NONE)
                 return vdso_fallback_gtod(tv, tz);
diff --git a/arch/x86/vdso/vdso32-setup-xen.c b/arch/x86/vdso/vdso32-setup-xen.c

new file mode 100644 (file)

index 0000000..94430b9
--- /dev/null
+++ b/arch/x86/vdso/vdso32-setup-xen.c
@@ -0,0 +1,483 @@
+/*
+ * (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
+ *
+ * This file contains the needed initializations to support sysenter.
+ */
+
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/thread_info.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+#include <asm/pgtable.h>
+#include <asm/unistd.h>
+#include <asm/elf.h>
+#include <asm/tlbflush.h>
+#include <asm/vdso.h>
+#include <asm/proto.h>
+
+#include <xen/interface/callback.h>
+
+enum {
+       VDSO_DISABLED = 0,
+       VDSO_ENABLED = 1,
+       VDSO_COMPAT = 2,
+};
+
+#ifdef CONFIG_COMPAT_VDSO
+#define VDSO_DEFAULT   VDSO_COMPAT
+#else
+#define VDSO_DEFAULT   VDSO_ENABLED
+#endif
+
+#ifdef CONFIG_X86_64
+#define vdso_enabled                   sysctl_vsyscall32
+#define arch_setup_additional_pages    syscall32_setup_pages
+#endif
+
+/*
+ * This is the difference between the prelinked addresses in the vDSO images
+ * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO
+ * in the user address space.
+ */
+#define VDSO_ADDR_ADJUST       (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK)
+
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
+
+/*
+ * Boot parameter handler: parse @s as a number (base auto-detected) and
+ * store it in vdso_enabled.  The value is not range-checked.  Always
+ * returns 1.
+ */
+static int __init vdso_setup(char *s)
+{
+       unsigned long requested = simple_strtoul(s, NULL, 0);
+
+       vdso_enabled = requested;
+       return 1;
+}
+
+/*
+ * For consistency, the argument vdso32=[012] affects the 32-bit vDSO
+ * behavior on both 64-bit and 32-bit kernels.
+ * On 32-bit kernels, vdso=[012] means the same thing.
+ */
+__setup("vdso32=", vdso_setup);
+
+#ifdef CONFIG_X86_32
+__setup_param("vdso=", vdso32_setup, vdso_setup, 0);
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+#endif
+
+/*
+ * Rebase a symbol table inside the vDSO image.
+ *
+ * @ehdr:   start of the image
+ * @offset: byte offset of the symbol table from @ehdr
+ * @size:   size of the symbol table in bytes
+ *
+ * Object, function, section and file symbols get VDSO_ADDR_ADJUST added to
+ * their value.  Undefined and absolute symbols are left alone; symbols with
+ * a section index above SHN_LORESERVE are reported and left alone.
+ */
+static __init void reloc_symtab(Elf32_Ehdr *ehdr,
+                               unsigned offset, unsigned size)
+{
+       Elf32_Sym *cur = (void *)ehdr + offset;
+       Elf32_Sym *end = cur + size / sizeof(*cur);
+
+       for (; cur < end; cur++) {
+               unsigned type;
+
+               if (cur->st_shndx == SHN_UNDEF)
+                       continue;  /* skip */
+               if (cur->st_shndx == SHN_ABS)
+                       continue;  /* skip */
+
+               if (cur->st_shndx > SHN_LORESERVE) {
+                       printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
+                              cur->st_shndx);
+                       continue;
+               }
+
+               type = ELF_ST_TYPE(cur->st_info);
+               if (type == STT_OBJECT || type == STT_FUNC ||
+                   type == STT_SECTION || type == STT_FILE)
+                       cur->st_value += VDSO_ADDR_ADJUST;
+       }
+}
+
+/*
+ * Rebase the dynamic section found at byte @offset inside the image that
+ * starts at @ehdr: every entry whose tag denotes an address gets
+ * VDSO_ADDR_ADJUST added, entries known to hold plain values are skipped,
+ * and unrecognised tags above DT_ENCODING are logged.  The walk ends at the
+ * DT_NULL terminator.
+ */
+static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
+{
+       Elf32_Dyn *dyn = (void *)ehdr + offset;
+
+       for(; dyn->d_tag != DT_NULL; dyn++)
+               switch(dyn->d_tag) {
+               case DT_PLTGOT:
+               case DT_HASH:
+               case DT_STRTAB:
+               case DT_SYMTAB:
+               case DT_RELA:
+               case DT_INIT:
+               case DT_FINI:
+               case DT_REL:
+               case DT_DEBUG:
+               case DT_JMPREL:
+               case DT_VERSYM:
+               case DT_VERDEF:
+               case DT_VERNEED:
+               case DT_ADDRRNGLO ... DT_ADDRRNGHI:
+                       /* definitely pointers needing relocation */
+                       dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+                       break;
+
+               case DT_ENCODING ... OLD_DT_LOOS-1:
+               case DT_LOOS ... DT_HIOS-1:
+                       /* Tags above DT_ENCODING are pointers if
+                          they're even */
+                       if (dyn->d_tag >= DT_ENCODING &&
+                           (dyn->d_tag & 1) == 0)
+                               dyn->d_un.d_ptr += VDSO_ADDR_ADJUST;
+                       break;
+
+               case DT_VERDEFNUM:
+               case DT_VERNEEDNUM:
+               case DT_FLAGS_1:
+               case DT_RELACOUNT:
+               case DT_RELCOUNT:
+               case DT_VALRNGLO ... DT_VALRNGHI:
+                       /* definitely not pointers */
+                       break;
+
+               case OLD_DT_LOOS ... DT_LOOS-1:
+               case DT_HIOS ... DT_VALRNGLO-1:
+               default:
+                       /* Anything else is left untouched and only reported. */
+                       if (dyn->d_tag > DT_ENCODING)
+                               printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
+                                      dyn->d_tag);
+                       break;
+               }
+}
+
+/*
+ * Rebase a complete vDSO image by VDSO_ADDR_ADJUST: the entry point, the
+ * virtual address of every program header (plus the dynamic section of any
+ * PT_DYNAMIC segment) and the address of every allocated section (plus the
+ * symbols of any SHT_SYMTAB/SHT_DYNSYM section among them).
+ *
+ * BUGs if @ehdr is not a 32-bit x86 ET_DYN ELF image.
+ */
+static __init void relocate_vdso(Elf32_Ehdr *ehdr)
+{
+       Elf32_Phdr *ph;
+       Elf32_Shdr *sh;
+       int n;
+
+       BUG_ON(memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0 ||
+              !elf_check_arch_ia32(ehdr) ||
+              ehdr->e_type != ET_DYN);
+
+       ehdr->e_entry += VDSO_ADDR_ADJUST;
+
+       /* rebase phdrs */
+       ph = (void *)ehdr + ehdr->e_phoff;
+       for (n = 0; n < ehdr->e_phnum; n++, ph++) {
+               ph->p_vaddr += VDSO_ADDR_ADJUST;
+
+               /* relocate dynamic stuff */
+               if (ph->p_type == PT_DYNAMIC)
+                       reloc_dyn(ehdr, ph->p_offset);
+       }
+
+       /* rebase sections */
+       sh = (void *)ehdr + ehdr->e_shoff;
+       for (n = 0; n < ehdr->e_shnum; n++, sh++) {
+               /* only sections that occupy memory carry an address */
+               if (sh->sh_flags & SHF_ALLOC) {
+                       sh->sh_addr += VDSO_ADDR_ADJUST;
+
+                       if (sh->sh_type == SHT_SYMTAB ||
+                           sh->sh_type == SHT_DYNSYM)
+                               reloc_symtab(ehdr, sh->sh_offset,
+                                            sh->sh_size);
+               }
+       }
+}
+
+static struct page *vdso32_pages[1];
+
+#ifdef CONFIG_X86_64
+
+#define        vdso32_sysenter()       (boot_cpu_has(X86_FEATURE_SYSENTER32))
+#define        vdso32_syscall()        (boot_cpu_has(X86_FEATURE_SYSCALL32))
+
+/*
+ * Register the 32-bit sysenter and syscall (cstar) entry points with the
+ * hypervisor.  For each registration that fails, the matching CPU
+ * capability (X86_FEATURE_SYSENTER32 / X86_FEATURE_SYSCALL32) is cleared.
+ */
+void __cpuinit syscall32_cpu_init(void)
+{
+       static const struct callback_register __cpuinitconst cstar = {
+               .type = CALLBACKTYPE_syscall32,
+               .address = (unsigned long)ia32_cstar_target
+       };
+       static const struct callback_register __cpuinitconst sysenter = {
+               .type = CALLBACKTYPE_sysenter,
+               .address = (unsigned long)ia32_sysenter_target
+       };
+
+       if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) < 0)
+               setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+       if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) < 0)
+               setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+#define compat_uses_vma                1
+
+/* 64-bit build: no-op counterpart of the 32-bit fixmap variant. */
+static inline void map_compat_vdso(int map)
+{
+}
+
+#else  /* CONFIG_X86_32 */
+
+#define vdso32_sysenter()      (boot_cpu_has(X86_FEATURE_SEP))
+#ifndef TIF_CSTAR
+#define vdso32_syscall()       0
+#else
+#define vdso32_syscall()       (boot_cpu_has(X86_FEATURE_SYSCALL32))
+
+extern asmlinkage void ia32pv_cstar_target(void);
+static const struct callback_register __cpuinitconst cstar = {
+       .type = CALLBACKTYPE_syscall32,
+       .address = { __KERNEL_CS, (unsigned long)ia32pv_cstar_target },
+};
+#endif
+
+/*
+ * Register the fast system call entry point with the hypervisor.
+ *
+ * With TIF_CSTAR defined and syscall available, only the cstar callback is
+ * registered (failure is fatal).  Otherwise, if sysenter is available, the
+ * sysenter callback is registered; if that cannot be done,
+ * X86_FEATURE_SEP is cleared.
+ */
+void __cpuinit enable_sep_cpu(void)
+{
+       extern asmlinkage void ia32pv_sysenter_target(void);
+       static struct callback_register __cpuinitdata sysenter = {
+               .type = CALLBACKTYPE_sysenter,
+               .address = { __KERNEL_CS, (unsigned long)ia32pv_sysenter_target },
+       };
+
+#ifdef TIF_CSTAR
+       if (vdso32_syscall()) {
+               if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) != 0)
+                       BUG();
+               return;
+       }
+#endif
+
+       if (!vdso32_sysenter())
+               return;
+
+       /* Use the native entry point when the kernel runs in supervisor mode. */
+       if (xen_feature(XENFEAT_supervisor_mode_kernel))
+               sysenter.address.eip = (unsigned long)ia32_sysenter_target;
+
+       switch (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter)) {
+       case 0:
+               break;
+#if CONFIG_XEN_COMPAT < 0x030200
+       case -ENOSYS:
+               /* Retry once with the deprecated callback type. */
+               sysenter.type = CALLBACKTYPE_sysenter_deprecated;
+               if (HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) == 0)
+                       break;
+               /* fall through */
+#endif
+       default:
+               setup_clear_cpu_cap(X86_FEATURE_SEP);
+               break;
+       }
+}
+
+static struct vm_area_struct gate_vma;
+
+/*
+ * Fill in the static gate_vma: it spans FIXADDR_USER_START..FIXADDR_USER_END,
+ * is readable and executable, and belongs to no mm.  Always returns 0.
+ */
+static int __init gate_vma_init(void)
+{
+       struct vm_area_struct *vma = &gate_vma;
+
+       vma->vm_start = FIXADDR_USER_START;
+       vma->vm_end = FIXADDR_USER_END;
+       vma->vm_mm = NULL;
+       vma->vm_page_prot = __P101;
+       vma->vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+
+       return 0;
+}
+
+#define compat_uses_vma                0
+
+/*
+ * Map (@map non-zero) or unmap (@map zero) the vDSO page at the FIX_VDSO
+ * fixmap slot.  The last requested state is remembered, so a repeated
+ * request for the same state does nothing.
+ */
+static void map_compat_vdso(int map)
+{
+       static int vdso_mapped;
+       unsigned long phys;
+
+       if (vdso_mapped == map)
+               return;
+       vdso_mapped = map;
+
+       phys = page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT;
+       __set_fixmap(FIX_VDSO, phys,
+                    map ? PAGE_READONLY_EXEC : PAGE_NONE);
+
+       /* flush stray tlbs */
+       flush_tlb_all();
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Allocate the page backing the 32-bit vDSO, choose the vDSO image that
+ * matches the available system call mechanism (syscall, then sysenter, then
+ * int80 as the fallback), copy it into the page and relocate it.
+ * Always returns 0.
+ *
+ * NOTE(review): the get_zeroed_page() result is used without a NULL check.
+ */
+int __init sysenter_setup(void)
+{
+       void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
+       const void *vsyscall;
+       size_t vsyscall_len;
+
+       vdso32_pages[0] = virt_to_page(syscall_page);
+
+#ifdef CONFIG_X86_32
+       gate_vma_init();
+
+       /*
+        * Keep SYSCALL only if TIF_CSTAR is defined, the CPU is from AMD and
+        * the cstar callback registers; otherwise clear both SYSCALL bits.
+        */
+       if (boot_cpu_has(X86_FEATURE_SYSCALL)) {
+# ifdef TIF_CSTAR
+               if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD
+                   && HYPERVISOR_callback_op(CALLBACKOP_register, &cstar) == 0)
+                       setup_force_cpu_cap(X86_FEATURE_SYSCALL32);
+               else
+# endif
+               {
+                       setup_clear_cpu_cap(X86_FEATURE_SYSCALL);
+                       setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+               }
+       }
+#endif
+       if (vdso32_syscall()) {
+               vsyscall = &vdso32_syscall_start;
+               vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
+       } else if (vdso32_sysenter()){
+               vsyscall = &vdso32_sysenter_start;
+               vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
+       } else {
+               vsyscall = &vdso32_int80_start;
+               vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
+       }
+
+       memcpy(syscall_page, vsyscall, vsyscall_len);
+       relocate_vdso(syscall_page);
+
+       return 0;
+}
+
+/*
+ * Setup a VMA at program startup for the vsyscall page.
+ *
+ * Does nothing when the vDSO is disabled.  In compat mode the vDSO lives at
+ * VDSO_HIGH_BASE; otherwise an unmapped page-sized area is picked.  A
+ * special mapping is installed unless compat mode is active and
+ * compat_uses_vma is 0.  On success mm->context.vdso and the thread's
+ * sysenter_return are set; on failure context.vdso is reset to NULL.
+ *
+ * Returns 0 on success or a negative error from get_unmapped_area() /
+ * install_special_mapping().
+ */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long addr;
+       int ret = 0;
+       bool compat;
+
+#ifdef CONFIG_X86_X32_ABI
+       if (test_thread_flag(TIF_X32))
+               return x32_setup_additional_pages(bprm, uses_interp);
+#endif
+
+       if (vdso_enabled == VDSO_DISABLED)
+               return 0;
+
+       down_write(&mm->mmap_sem);
+
+       /* Test compat mode once here, in case someone
+          changes it via sysctl */
+       compat = (vdso_enabled == VDSO_COMPAT);
+
+       map_compat_vdso(compat);
+
+       if (compat)
+               addr = VDSO_HIGH_BASE;
+       else {
+               addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+               if (IS_ERR_VALUE(addr)) {
+                       ret = addr;
+                       goto up_fail;
+               }
+       }
+
+       /* Published before the mapping exists; undone at up_fail on error. */
+       current->mm->context.vdso = (void *)addr;
+
+       if (compat_uses_vma || !compat) {
+               /*
+                * MAYWRITE to allow gdb to COW and set breakpoints
+                */
+               ret = install_special_mapping(mm, addr, PAGE_SIZE,
+                                             VM_READ|VM_EXEC|
+                                             VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+                                             vdso32_pages);
+
+               if (ret)
+                       goto up_fail;
+       }
+
+       current_thread_info()->sysenter_return =
+               VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+
+       /* Reached on success as well; ret decides whether to roll back. */
+  up_fail:
+       if (ret)
+               current->mm->context.vdso = NULL;
+
+       up_write(&mm->mmap_sem);
+
+       return ret;
+}
+
+#ifdef CONFIG_X86_64
+
+subsys_initcall(sysenter_setup);
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+/* "vsyscall32" entry: an int backed by sysctl_vsyscall32, mode 0644. */
+static ctl_table abi_table2[] = {
+       {
+               .procname       = "vsyscall32",
+               .data           = &sysctl_vsyscall32,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec
+       },
+       {}
+};
+
+/* Parent "abi" directory holding the entry above. */
+static ctl_table abi_root_table2[] = {
+       {
+               .procname = "abi",
+               .mode = 0555,
+               .child = abi_table2
+       },
+       {}
+};
+
+/*
+ * Register the abi/vsyscall32 sysctl.  The result of
+ * register_sysctl_table() is not checked; always returns 0.
+ */
+static __init int ia32_binfmt_init(void)
+{
+       register_sysctl_table(abi_root_table2);
+       return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
+
+#else  /* CONFIG_X86_32 */
+
+/*
+ * Name a VMA "[vdso]" when it starts at its mm's recorded vDSO address;
+ * return NULL for every other VMA, including those without an mm.
+ */
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+       struct mm_struct *mm = vma->vm_mm;
+
+       if (!mm)
+               return NULL;
+       if (vma->vm_start != (long)mm->context.vdso)
+               return NULL;
+
+       return "[vdso]";
+}
+
+/*
+ * Return the gate VMA for @mm, or NULL if @mm is NULL or its vDSO is not
+ * placed at VDSO_HIGH_BASE.
+ */
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+       /*
+        * Only tasks created in compat vdso mode have their vDSO at
+        * VDSO_HIGH_BASE; nobody else gets a gate VMA.
+        */
+       if (!mm || mm->context.vdso != (void *)VDSO_HIGH_BASE)
+               return NULL;
+
+       return &gate_vma;
+}
+
+/*
+ * Return non-zero if @addr lies inside the gate VMA of @mm, 0 if it does
+ * not or if @mm has no gate VMA.
+ */
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+       const struct vm_area_struct *gate = get_gate_vma(mm);
+
+       if (!gate)
+               return 0;
+
+       return gate->vm_start <= addr && addr < gate->vm_end;
+}
+
+/* Without an mm no address is treated as part of the gate area: always 0. */
+int in_gate_area_no_mm(unsigned long addr)
+{
+       return 0;
+}
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S

index 2ce5f82..8d4f773 100644 (file)
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -9,7 +9,7 @@ vdso32_int80_end:
  
         .globl vdso32_syscall_start, vdso32_syscall_end
  vdso32_syscall_start:
-#ifdef CONFIG_COMPAT
+#if defined(CONFIG_COMPAT) || defined(CONFIG_X86_XEN)
         .incbin "arch/x86/vdso/vdso32-syscall.so"
  #endif
  vdso32_syscall_end:
diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/vdso/vdso32/note.S

index c83f257..b6ed8cd 100644 (file)
--- a/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/vdso/vdso32/note.S
@@ -13,7 +13,7 @@ ELFNOTE_START(Linux, 0, "a")
         .long LINUX_VERSION_CODE
  ELFNOTE_END
  
-#ifdef CONFIG_XEN
+#if defined(CONFIG_X86_XEN) || defined(CONFIG_PARAVIRT_XEN)
  /*
   * Add a special note telling glibc's dynamic linker a fake hardware
   * flavor that it will use to choose the search path for libraries in the
@@ -37,8 +37,12 @@ ELFNOTE_END
  
  ELFNOTE_START(GNU, 2, "a")
         .long 1                 /* ncaps */
+#ifdef CONFIG_PARAVIRT_XEN
  VDSO32_NOTE_MASK:              /* Symbol used by arch/x86/xen/setup.c */
         .long 0                 /* mask */
+#else
+       .long 1 << VDSO_NOTE_NONEGSEG_BIT /* mask */
+#endif
         .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
  ELFNOTE_END
  #endif
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S

index 5415b56..0a27d17 100644 (file)
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -19,8 +19,10 @@ __kernel_vsyscall:
  .Lpush_ebp:
         movl    %ecx, %ebp
         syscall
+#ifndef CONFIG_XEN
         movl    $__USER32_DS, %ecx
         movl    %ecx, %ss
+#endif
         movl    %ebp, %ecx
         popl    %ebp
  .Lpop_ebp:
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig

index fdce49c..0bc8165 100644 (file)
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -2,7 +2,7 @@
  # This Kconfig describes xen options
  #
  
-config XEN
+config PARAVIRT_XEN
         bool "Xen guest support"
         select PARAVIRT
         select PARAVIRT_CLOCK
@@ -15,36 +15,39 @@ config XEN
  
  config XEN_DOM0
         def_bool y
-       depends on XEN && PCI_XEN && SWIOTLB_XEN
+       depends on PARAVIRT_XEN && PCI_XEN && SWIOTLB_XEN
         depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
  
  # Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
  # name in tools.
-config XEN_PRIVILEGED_GUEST
-       def_bool XEN_DOM0
+# This doesn't work together with our identical symbol in drivers/xen/Kconfig
+# (produces a recursive dependency), and renaming it is pointless given that
+# it's meant as a compatibility thing.
+#config XEN_PRIVILEGED_GUEST
+#      def_bool XEN_DOM0
  
  config XEN_PVHVM
         def_bool y
-       depends on XEN && PCI && X86_LOCAL_APIC
+       depends on PARAVIRT_XEN && PCI && X86_LOCAL_APIC
  
  config XEN_MAX_DOMAIN_MEMORY
         int
         default 500 if X86_64
         default 64 if X86_32
-       depends on XEN
+       depends on PARAVIRT_XEN
         help
           This only affects the sizing of some bss arrays, the unused
           portions of which are freed.
  
  config XEN_SAVE_RESTORE
         bool
-       depends on XEN
+       depends on PARAVIRT_XEN
         select HIBERNATE_CALLBACKS
         default y
  
  config XEN_DEBUG_FS
         bool "Enable Xen debug and tuning parameters in debugfs"
-       depends on XEN && DEBUG_FS
+       depends on PARAVIRT_XEN && DEBUG_FS
         default n
         help
           Enable statistics output and various tuning options in debugfs.
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c

index 95dccce..6fbd402 100644 (file)
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -125,8 +125,8 @@ static int have_vcpu_info_placement = 1;
  static void clamp_max_cpus(void)
  {
  #ifdef CONFIG_SMP
-       if (setup_max_cpus > MAX_VIRT_CPUS)
-               setup_max_cpus = MAX_VIRT_CPUS;
+       if (setup_max_cpus > XEN_LEGACY_MAX_VCPUS)
+               setup_max_cpus = XEN_LEGACY_MAX_VCPUS;
  #endif
  }
  
@@ -138,11 +138,11 @@ static void xen_vcpu_setup(int cpu)
  
         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
  
-       if (cpu < MAX_VIRT_CPUS)
+       if (cpu < XEN_LEGACY_MAX_VCPUS)
                 per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
  
         if (!have_vcpu_info_placement) {
-               if (cpu >= MAX_VIRT_CPUS)
+               if (cpu >= XEN_LEGACY_MAX_VCPUS)
                         clamp_max_cpus();
                 return;
         }
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S

index aaa7291..de87595 100644 (file)
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -1,7 +1,7 @@
  /* Xen-specific pieces of head.S, intended to be included in the right
         place in head.S */
  
-#ifdef CONFIG_XEN
+#ifdef CONFIG_PARAVIRT_XEN
  
  #include <linux/elfnote.h>
  #include <linux/init.h>
@@ -52,4 +52,4 @@ ENTRY(hypercall_page)
         ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
         ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
  
-#endif /*CONFIG_XEN */
+#endif /* CONFIG_PARAVIRT_XEN */
diff --git a/block/partitions/efi.c b/block/partitions/efi.c

index 6296b40..e4998a3 100644 (file)
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -319,6 +319,15 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
                 goto fail;
         }
  
+       /* Check the GUID Partition Table header size */
+       if (le32_to_cpu((*gpt)->header_size) >
+                       bdev_logical_block_size(state->bdev)) {
+               pr_debug("GUID Partition Table Header size is wrong: %u > %u\n",
+                       le32_to_cpu((*gpt)->header_size),
+                       bdev_logical_block_size(state->bdev));
+               goto fail;
+       }
+
         /* Check the GUID Partition Table CRC */
         origcrc = le32_to_cpu((*gpt)->header_crc32);
         (*gpt)->header_crc32 = 0;
diff --git a/drivers/Makefile b/drivers/Makefile

index 95952c8..52d2385 100644 (file)
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_ARM_AMBA)                += amba/
  obj-$(CONFIG_DMA_ENGINE)       += dma/
  
  obj-$(CONFIG_VIRTIO)           += virtio/
-obj-$(CONFIG_XEN)              += xen/
+obj-$(CONFIG_PARAVIRT_XEN)     += xen/
  
  # regulators early, since some subsystems rely on them to initialize
  obj-$(CONFIG_REGULATOR)                += regulator/
@@ -47,6 +47,7 @@ obj-$(CONFIG_PARPORT)         += parport/
  obj-y                          += base/ block/ misc/ mfd/ nfc/
  obj-$(CONFIG_NUBUS)            += nubus/
  obj-y                          += macintosh/
+obj-$(CONFIG_XEN)              += xen/
  obj-$(CONFIG_IDE)              += ide/
  obj-$(CONFIG_SCSI)             += scsi/
  obj-$(CONFIG_ATA)              += ata/
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig

index 47768ff..dd6fdcf 100644 (file)
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -184,7 +184,7 @@ config ACPI_DOCK
  config ACPI_PROCESSOR
         tristate "Processor"
         select THERMAL
-       select CPU_IDLE
+       select CPU_IDLE if !PROCESSOR_EXTERNAL_CONTROL
         default y
         help
           This driver installs ACPI as the idle handler for Linux and uses
@@ -216,7 +216,7 @@ config ACPI_PROCESSOR_AGGREGATOR
         tristate "Processor Aggregator"
         depends on ACPI_PROCESSOR
         depends on EXPERIMENTAL
-       depends on X86
+       depends on X86 && !XEN
         help
           ACPI 4.0 defines processor Aggregator, which enables OS to perform
           specific processor configuration and control that applies to all
@@ -308,6 +308,7 @@ config ACPI_PCI_SLOT
  config X86_PM_TIMER
         bool "Power Management Timer Support" if EXPERT
         depends on X86
+       depends on !XEN
         default y
         help
           The Power Management Timer is available on all ACPI-capable,
@@ -336,7 +337,7 @@ config ACPI_CONTAINER
  
  config ACPI_HOTPLUG_MEMORY
         tristate "Memory Hotplug"
-       depends on MEMORY_HOTPLUG
+       depends on MEMORY_HOTPLUG || XEN_PRIVILEGED_GUEST
         default n
         help
           This driver supports ACPI memory hotplug.  The driver
@@ -395,4 +396,13 @@ config ACPI_BGRT
  
  source "drivers/acpi/apei/Kconfig"
  
+config ACPI_PV_SLEEP
+       bool
+       depends on X86 && XEN && ACPI_SLEEP
+       default y
+
+config PROCESSOR_EXTERNAL_CONTROL
+       bool
+       depends on (X86 || IA64) && XEN
+       default y
  endif  # ACPI
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile

index 47199e2..edf53d8 100644 (file)
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -68,6 +68,7 @@ obj-$(CONFIG_ACPI_BGRT)               += bgrt.o
  processor-y                    := processor_driver.o processor_throttling.o
  processor-y                    += processor_idle.o processor_thermal.o
  processor-$(CONFIG_CPU_FREQ)   += processor_perflib.o
+processor-$(CONFIG_PROCESSOR_EXTERNAL_CONTROL) += processor_perflib.o processor_extcntl.o
  
  obj-$(CONFIG_ACPI_PROCESSOR_AGGREGATOR) += acpi_pad.o
  obj-$(CONFIG_ACPI_IPMI)                += acpi_ipmi.o
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c

index d985713..a63bc73 100644 (file)
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -88,6 +88,14 @@ struct acpi_memory_device {
  
  static int acpi_hotmem_initialized;
  
+/*
+ * Xen builds textually include the Xen-specific hot-add source file
+ * (presumably the provider of xen_hotadd_mem_init/_exit and
+ * xen_hotadd_memory - confirm there) and force
+ * memory_add_physaddr_to_nid() to evaluate to 0.
+ * All other builds get no-op stubs; the init stub reports success.
+ */
+#ifdef CONFIG_XEN
+#include "../xen/core/acpi_memhotplug.c"
+#define memory_add_physaddr_to_nid(start) 0
+#else
+static inline int xen_hotadd_mem_init(void) { return 0; }
+static inline void xen_hotadd_mem_exit(void) {}
+#endif
+
  static acpi_status
  acpi_memory_get_resource(struct acpi_resource *resource, void *context)
  {
@@ -229,6 +237,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
                 return result;
         }
  
+#ifdef CONFIG_XEN
+       return xen_hotadd_memory(mem_device);
+#endif
+
         node = acpi_get_node(mem_device->device->handle);
         /*
          * Tell the VM there is more memory here...
@@ -312,6 +324,10 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device)
         struct acpi_memory_info *info, *n;
  
  
+#ifdef CONFIG_XEN
+       return -EOPNOTSUPP;
+#endif
+
         /*
          * Ask the VM to offline this memory range.
          * Note: Assume that this function returns zero on success
@@ -531,6 +547,10 @@ static int __init acpi_memory_device_init(void)
         acpi_status status;
  
  
+       result = xen_hotadd_mem_init();
+       if (result < 0)
+               return result;
+
         result = acpi_bus_register_driver(&acpi_memory_device_driver);
  
         if (result < 0)
@@ -570,6 +590,8 @@ static void __exit acpi_memory_device_exit(void)
  
         acpi_bus_unregister_driver(&acpi_memory_device_driver);
  
+       xen_hotadd_mem_exit();
+
         return;
  }
  
diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c

index 762d059..24096be 100644 (file)
--- a/drivers/acpi/acpica/hwxfsleep.c
+++ b/drivers/acpi/acpica/hwxfsleep.c
@@ -167,6 +167,7 @@ ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector64)
   *              THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED
   *
   ******************************************************************************/
+#ifndef CONFIG_XEN
  acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void)
  {
         u32 in_value;
@@ -220,6 +221,7 @@ acpi_status asmlinkage acpi_enter_sleep_state_s4bios(void)
  }
  
  ACPI_EXPORT_SYMBOL(acpi_enter_sleep_state_s4bios)
+#endif
  #endif                         /* !ACPI_REDUCED_HARDWARE */
  /*******************************************************************************
   *
diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c

index 4c9c760..dc332b7 100644 (file)
--- a/drivers/acpi/acpica/tbfadt.c
+++ b/drivers/acpi/acpica/tbfadt.c
@@ -572,11 +572,12 @@ static void acpi_tb_validate_fadt(void)
                             (!address64->address && length)) {
                                 ACPI_WARNING((AE_INFO,
                                               "Optional field %s has zero address or length: "
-                                             "0x%8.8X%8.8X/0x%X",
+                                             "0x%8.8X%8.8X/0x%X - not using it",
                                               name,
                                               ACPI_FORMAT_UINT64(address64->
                                                                  address),
                                               length));
+                               address64->address = 0;
                         }
                 }
         }
diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c

index 8cf6c46..b6df328 100644 (file)
--- a/drivers/acpi/bgrt.c
+++ b/drivers/acpi/bgrt.c
@@ -10,6 +10,7 @@
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/device.h>
+#include <linux/io.h>
  #include <linux/sysfs.h>
  #include <acpi/acpi.h>
  #include <acpi/acpi_bus.h>
diff --git a/drivers/acpi/ec_sys.c b/drivers/acpi/ec_sys.c

index 7586544..4e7b798 100644 (file)
--- a/drivers/acpi/ec_sys.c
+++ b/drivers/acpi/ec_sys.c
@@ -12,6 +12,7 @@
  #include <linux/acpi.h>
  #include <linux/debugfs.h>
  #include <linux/module.h>
+#include <linux/uaccess.h>
  #include "internal.h"
  
  MODULE_AUTHOR("Thomas Renninger <trenn@suse.de>");
@@ -34,7 +35,6 @@ static ssize_t acpi_ec_read_io(struct file *f, char __user *buf,
          * struct acpi_ec *ec = ((struct seq_file *)f->private_data)->private;
          */
         unsigned int size = EC_SPACE_SIZE;
-       u8 *data = (u8 *) buf;
         loff_t init_off = *off;
         int err = 0;
  
@@ -47,9 +47,15 @@ static ssize_t acpi_ec_read_io(struct file *f, char __user *buf,
                 size = count;
  
         while (size) {
-               err = ec_read(*off, &data[*off - init_off]);
+               u8 byte_read;
+               err = ec_read(*off, &byte_read);
                 if (err)
                         return err;
+               if (put_user(byte_read, buf + *off - init_off)) {
+                       if (*off - init_off)
+                               return *off - init_off; /* partial read */
+                       return -EFAULT;
+               }
                 *off += 1;
                 size--;
         }
@@ -65,7 +71,6 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,
  
         unsigned int size = count;
         loff_t init_off = *off;
-       u8 *data = (u8 *) buf;
         int err = 0;
  
         if (*off >= EC_SPACE_SIZE)
@@ -76,7 +81,12 @@ static ssize_t acpi_ec_write_io(struct file *f, const char __user *buf,
         }
  
         while (size) {
-               u8 byte_write = data[*off - init_off];
+               u8 byte_write;
+               if (get_user(byte_write, buf + *off - init_off)) {
+                       if (*off - init_off)
+                               return *off - init_off; /* partial write */
+                       return -EFAULT;
+               }
                 err = ec_write(*off, byte_write);
                 if (err)
                         return err;
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c

index c3881b2..645ee18 100644 (file)
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -326,8 +326,12 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
  }
  
  #ifndef CONFIG_IA64
+#ifndef CONFIG_XEN
  #define should_use_kmap(pfn)   page_is_ram(pfn)
  #else
+/*
+ * Xen variant: the argument is translated with mfn_to_local_pfn() and the
+ * result checked with pfn_valid().  NOTE: the expansion assigns the
+ * translated value to a variable named "pfn", which therefore must exist
+ * (and be writable) wherever this macro is used.
+ */
+#define should_use_kmap(mfn)   pfn_valid(pfn = mfn_to_local_pfn(mfn))
+#endif
+#else
  /* ioremap will take care of cache attributes */
  #define should_use_kmap(pfn)   0
  #endif
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c

index 0eefa12..b4ad988 100644 (file)
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -527,3 +527,80 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
         dev_dbg(&dev->dev, "PCI INT %c disabled\n", pin_name(pin));
         acpi_unregister_gsi(gsi);
  }
+
+#if defined(CONFIG_XEN) && defined(CONFIG_PCI)
+/*
+ * xen_setup_gsi - pass the GSI of every PCI device to PHYSDEVOP_setup_gsi.
+ *
+ * Does nothing when acpi_noirq is set.  Otherwise walks all PCI devices
+ * that have an interrupt pin, resolves the pin to a GSI through the ACPI
+ * routing lookup and hands GSI, trigger mode and polarity to
+ * HYPERVISOR_physdev_op().  If no GSI can be determined but dev->irq
+ * holds a value in 1..0xF, that IRQ is passed instead.
+ *
+ * Always returns 0; hypercall failures are not propagated.
+ */
+static int __init xen_setup_gsi(void)
+{
+       struct pci_dev *dev = NULL;
+
+       if (acpi_noirq)
+               return 0;
+
+       /* Loop body is a clone of acpi_pci_irq_enable(). */
+       for_each_pci_dev(dev) {
+               const struct acpi_prt_entry *entry;
+               int gsi;
+               int triggering = ACPI_LEVEL_SENSITIVE;
+               int polarity = ACPI_ACTIVE_LOW;
+               struct physdev_setup_gsi setup_gsi;
+
+               /* Devices without an interrupt pin have nothing to set up. */
+               if (!dev->pin)
+                       continue;
+
+               entry = acpi_pci_irq_lookup(dev, dev->pin);
+               if (!entry) {
+                       /*
+                        * IDE legacy mode controller IRQs are magic. Why do
+                        * compat extensions always make such a nasty mess.
+                        */
+                       if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE &&
+                           (dev->class & 0x05) == 0)
+                               continue;
+               }
+
+               /*
+                * No entry: -1.  Entry with a link: ask the link code for
+                * an IRQ (which may also update triggering/polarity).
+                * Entry without a link: its index is used as the GSI.
+                */
+               gsi = entry
+                     ? entry->link
+                       ? acpi_pci_link_allocate_irq(entry->link,
+                                                    entry->index,
+                                                    &triggering, &polarity,
+                                                    NULL)
+                       : entry->index
+                     : -1;
+
+               if (gsi >= 0) {
+                       /* Encoding used here: 1 = level, 1 = active low. */
+                       setup_gsi.gsi = gsi;
+                       setup_gsi.triggering
+                               = (triggering == ACPI_LEVEL_SENSITIVE);
+                       setup_gsi.polarity = (polarity == ACPI_ACTIVE_LOW);
+                       /* On hypercall failure skip the log line below. */
+                       if (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
+                                                 &setup_gsi) < 0)
+                               continue;
+
+                       dev_info(&dev->dev, "GSI%d: %s-%s\n", gsi,
+                                triggering == ACPI_LEVEL_SENSITIVE ? "level"
+                                                                   : "edge",
+                                polarity == ACPI_ACTIVE_LOW ? "low" : "high");
+               } else {
+                       /*
+                        * No IRQ known to the ACPI subsystem - maybe the
+                        * BIOS / driver reported one, then use it.
+                        */
+                       /* The warning line is completed by pr_cont() below. */
+                       dev_warn(&dev->dev, "PCI INT %c: no GSI",
+                                pin_name(dev->pin));
+                       /* Interrupt Line values above 0xF are forbidden */
+                       if (dev->irq > 0 && (dev->irq <= 0xF)) {
+                               pr_cont(" - using IRQ %d\n", dev->irq);
+                               /* Same encoding as above: level, active low. */
+                               setup_gsi.gsi = dev->irq;
+                               setup_gsi.triggering = 1;
+                               setup_gsi.polarity = 1;
+                               VOID(HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi,
+                                                          &setup_gsi));
+                       } else
+                               pr_cont("\n");
+               }
+       }
+
+       return 0;
+}
+subsys_initcall(xen_setup_gsi);
+#endif
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c

index 7aff631..508ad3e 100644 (file)
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -449,6 +449,41 @@ out:
  }
  EXPORT_SYMBOL(acpi_pci_osc_control_set);
  
+#ifdef CONFIG_PCI_GUESTDEV
+#include <linux/sysfs.h>
+
+/*
+ * Show handler for the "seg" device attribute.
+ *
+ * Searches acpi_pci_roots for the root whose ACPI device's struct device
+ * is @dev and prints its segment as four hex digits plus a newline.
+ * Returns sprintf()'s result, or 0 (empty output) when no root matches.
+ */
+static ssize_t seg_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
+{
+       struct list_head *entry;
+
+       list_for_each(entry, &acpi_pci_roots) {
+               struct acpi_pci_root *root;
+               root = list_entry(entry, struct acpi_pci_root, node);
+               if (&root->device->dev == dev)
+                       return sprintf(buf, "%04x\n", root->segment);
+       }
+       return 0;
+}
+static DEVICE_ATTR(seg, 0444, seg_show, NULL);
+
+/*
+ * Show handler for the "bbn" device attribute.
+ *
+ * Searches acpi_pci_roots for the root whose ACPI device's struct device
+ * is @dev and prints root->secondary.start (truncated to unsigned int)
+ * as two hex digits plus a newline.
+ * Returns sprintf()'s result, or 0 (empty output) when no root matches.
+ */
+static ssize_t bbn_show(struct device *dev,
+                       struct device_attribute *attr, char *buf)
+{
+       struct list_head *entry;
+
+       list_for_each(entry, &acpi_pci_roots) {
+               struct acpi_pci_root *root;
+               root = list_entry(entry, struct acpi_pci_root, node);
+               if (&root->device->dev == dev)
+                       return sprintf(buf, "%02x\n",
+                                      (unsigned int)root->secondary.start);
+       }
+       return 0;
+}
+}
+static DEVICE_ATTR(bbn, 0444, bbn_show, NULL);
+#endif
+
  static int __devinit acpi_pci_root_add(struct acpi_device *device)
  {
         unsigned long long segment, bus;
@@ -618,6 +653,13 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
                          "(_OSC support mask: 0x%02x)\n", flags);
         }
  
+#ifdef CONFIG_PCI_GUESTDEV
+       if (device_create_file(&device->dev, &dev_attr_seg))
+               dev_warn(&device->dev, "could not create seg attr\n");
+       if (device_create_file(&device->dev, &dev_attr_bbn))
+               dev_warn(&device->dev, "could not create bbn attr\n");
+#endif
+
         pci_acpi_add_bus_pm_notifier(device, root->bus);
         if (device->wakeup.flags.run_wake)
                 device_set_run_wake(root->bus->bridge, true);
@@ -665,3 +707,31 @@ static int __init acpi_pci_root_init(void)
  }
  
  subsys_initcall(acpi_pci_root_init);
+
+#ifdef CONFIG_PCI_GUESTDEV
+/*
+ * acpi_pci_get_root_seg_bbn - look up an ACPI PCI root by HID and UID.
+ * @hid: hardware ID; must compare equal to acpi_device_hid() of the root
+ * @uid: unique ID; must compare equal to the root's pnp.unique_id.  A root
+ *       without a unique ID only matches an empty @uid.
+ * @seg: receives root->segment on success
+ * @bbn: receives root->secondary.start on success
+ *
+ * Returns TRUE after writing *seg and *bbn for the first matching entry of
+ * acpi_pci_roots, FALSE (outputs untouched) when nothing matches.
+ * @hid and @uid are handed straight to strcmp()/strlen(), so neither may
+ * be NULL.
+ */
+int acpi_pci_get_root_seg_bbn(char *hid, char *uid, int *seg, int *bbn)
+{
+       struct list_head *entry;
+
+       list_for_each(entry, &acpi_pci_roots) {
+               struct acpi_pci_root *root;
+
+               root = list_entry(entry, struct acpi_pci_root, node);
+               if (strcmp(acpi_device_hid(root->device), hid))
+                       continue;
+
+               if (!root->device->pnp.unique_id) {
+                       /* Root has no UID: accept only an empty @uid. */
+                       if (strlen(uid))
+                               continue;
+               } else {
+                       if (strcmp(root->device->pnp.unique_id, uid))
+                               continue;
+               }
+
+               *seg = (int)root->segment;
+               *bbn = (int)root->secondary.start;
+               return TRUE;
+       }
+       return FALSE;
+}
+#endif
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c

index c850de4..bc347b0 100644 (file)
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -20,6 +20,15 @@
  #define _COMPONENT             ACPI_PROCESSOR_COMPONENT
  ACPI_MODULE_NAME("processor_core");
  
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+/*
+ * External processor control logic may register its own set of ops
+ * here to receive ACPI-related notifications.  One example is a VMM.
+ */
+const struct processor_extcntl_ops *processor_extcntl_ops;
+EXPORT_SYMBOL(processor_extcntl_ops);
+#endif
+
  static int __init set_no_mwait(const struct dmi_system_id *id)
  {
         printk(KERN_NOTICE PREFIX "%s detected - "
@@ -165,15 +174,19 @@ exit:
  
  int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
  {
-#ifdef CONFIG_SMP
-       int i;
-#endif
-       int apic_id = -1;
+       int i = 0, apic_id = -1;
+
+       if (type < 0) {
+               if (!processor_cntl_external())
+                       return -1;
+               type = ~type;
+               i = 1;
+       }
  
         apic_id = map_mat_entry(handle, type, acpi_id);
         if (apic_id == -1)
                 apic_id = map_madt_entry(type, acpi_id);
-       if (apic_id == -1) {
+       if (apic_id == -1 || i) {
                 /*
                  * On UP processor, there is no _MAT or MADT table.
                  * So above apic_id is always set to -1.
@@ -192,18 +205,28 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id)
                  * Ignores apic_id and always return 0 for CPU0's handle.
                  * Return -1 for other CPU's handle.
                  */
-               if (acpi_id == 0)
+               if (acpi_id == 0 && !i)
                         return acpi_id;
                 else
                         return apic_id;
         }
  
  #ifdef CONFIG_SMP
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         for_each_possible_cpu(i) {
                 if (cpu_physical_id(i) == apic_id)
                         return i;
         }
  #else
+       /*
+        * Use of cpu_physical_id() is bogus here. Rather than defining a
+        * stub enforcing a 1:1 mapping, we keep it undefined to catch bad
+        * uses. Return as if there was a 1:1 mapping.
+        */
+       if (apic_id < nr_cpu_ids && cpu_possible(apic_id))
+               return apic_id;
+#endif
+#else
         /* In UP kernel, only processor 0 is valid */
         if (apic_id == 0)
                 return apic_id;
@@ -244,6 +267,8 @@ static bool __init processor_physically_present(acpi_handle handle)
         }
  
         type = (acpi_type == ACPI_TYPE_DEVICE) ? 1 : 0;
+       if (processor_cntl_external())
+               type = ~type;
         cpuid = acpi_get_cpuid(handle, type, acpi_id);
  
         if (cpuid == -1)
@@ -312,19 +337,31 @@ acpi_processor_eval_pdc(acpi_handle handle, struct acpi_object_list *pdc_in)
  {
         acpi_status status = AE_OK;
  
+#ifndef CONFIG_XEN
         if (boot_option_idle_override == IDLE_NOMWAIT) {
                 /*
                  * If mwait is disabled for CPU C-states, the C2C3_FFH access
                  * mode will be disabled in the parameter of _PDC object.
                  * Of course C1_FFH access mode will also be disabled.
                  */
+#else
+       {
+               struct xen_platform_op op;
+#endif
                 union acpi_object *obj;
                 u32 *buffer = NULL;
  
                 obj = pdc_in->pointer;
                 buffer = (u32 *)(obj->buffer.pointer);
+#ifndef CONFIG_XEN
                 buffer[2] &= ~(ACPI_PDC_C_C2C3_FFH | ACPI_PDC_C_C1_FFH);
-
+#else
+               op.cmd = XENPF_set_processor_pminfo;
+               op.u.set_pminfo.id = -1;
+               op.u.set_pminfo.type = XEN_PM_PDC;
+               set_xen_guest_handle(op.u.set_pminfo.u.pdc, buffer);
+               VOID(HYPERVISOR_platform_op(&op));
+#endif
         }
         status = acpi_evaluate_object(handle, "_PDC", pdc_in, NULL);
  
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c

index 0734086..5d0d9b3 100644 (file)
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -110,7 +110,9 @@ static struct acpi_driver acpi_processor_driver = {
  #define UNINSTALL_NOTIFY_HANDLER       2
  
  DEFINE_PER_CPU(struct acpi_processor *, processors);
+#ifndef CONFIG_XEN
  EXPORT_PER_CPU_SYMBOL(processors);
+#endif
  
  struct acpi_processor_errata errata __read_mostly;
  
@@ -324,9 +326,16 @@ static int acpi_processor_get_info(struct acpi_device *device)
          *  they are physically not present.
          */
         if (pr->id == -1) {
-               if (ACPI_FAILURE(acpi_processor_hotadd_init(pr)))
+               if (ACPI_FAILURE(acpi_processor_hotadd_init(pr)) &&
+                   acpi_get_cpuid(pr->handle, ~device_declaration,
+                                  pr->acpi_id) < 0)
                         return -ENODEV;
         }
+#if defined(CONFIG_SMP) && defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
+       if (pr->id >= setup_max_cpus && pr->id > 0)
+               pr->id = -1;
+#endif
+
         /*
          * On some boxes several processors use the same processor bus id.
          * But they are located in different scope. For example:
@@ -336,7 +345,14 @@ static int acpi_processor_get_info(struct acpi_device *device)
          * generated as the following format:
          * CPU+CPU ID.
          */
-       sprintf(acpi_device_bid(device), "CPU%X", pr->id);
+       if (pr->id != -1)
+               sprintf(acpi_device_bid(device), "CPU%X", pr->id);
+       else
+               snprintf(acpi_device_bid(device),
+                        ARRAY_SIZE(acpi_device_bid(device)),
+                        "#%0*X",
+                        (int)ARRAY_SIZE(acpi_device_bid(device)) - 2,
+                        pr->acpi_id);
         ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Processor [%d:%d]\n", pr->id,
                           pr->acpi_id));
  
@@ -368,13 +384,20 @@ static int acpi_processor_get_info(struct acpi_device *device)
          * of /proc/cpuinfo
          */
         status = acpi_evaluate_object(pr->handle, "_SUN", NULL, &buffer);
-       if (ACPI_SUCCESS(status))
+       if (ACPI_SUCCESS(status) && pr->id != -1)
                 arch_fix_phys_package_id(pr->id, object.integer.value);
  
         return 0;
  }
  
+#ifndef CONFIG_XEN
  static DEFINE_PER_CPU(void *, processor_device_array);
+#else
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+static DEFINE_MUTEX(processor_device_mutex);
+static RADIX_TREE(processor_device_tree, GFP_KERNEL);
+#endif
  
  static void acpi_processor_notify(struct acpi_device *device, u32 event)
  {
@@ -469,19 +492,38 @@ static struct notifier_block acpi_cpu_notifier =
   */
  static __ref int acpi_processor_start(struct acpi_processor *pr)
  {
+#ifndef CONFIG_XEN
         struct acpi_device *device = per_cpu(processor_device_array, pr->id);
+#else
+       struct acpi_device *device = radix_tree_lookup(&processor_device_tree, pr->acpi_id);
+#endif
         int result = 0;
  
-#ifdef CONFIG_CPU_FREQ
+#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
         acpi_processor_ppc_has_changed(pr, 0);
+#endif
+#ifdef CONFIG_CPU_FREQ
         acpi_processor_load_module(pr);
  #endif
-       acpi_processor_get_throttling_info(pr);
-       acpi_processor_get_limit_info(pr);
+       /*
+        * pr->id may equal to -1 while processor_cntl_external enabled.
+        * throttle and thermal module don't support this case.
+        * Tx only works when dom0 vcpu == pcpu num by far, as we give
+        * control to dom0.
+        */
+       if (pr->id != -1) {
+               acpi_processor_get_throttling_info(pr);
+               acpi_processor_get_limit_info(pr);
+       }
  
-       if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver)
+       if (!cpuidle_get_driver() || cpuidle_get_driver() == &acpi_idle_driver
+           || processor_pm_external())
                 acpi_processor_power_init(pr, device);
  
+       result = processor_extcntl_prepare(pr);
+       if (result)
+               goto err_power_exit;
+
         pr->cdev = thermal_cooling_device_register("Processor", device,
                                                    &processor_cooling_ops);
         if (IS_ERR(pr->cdev)) {
@@ -546,32 +588,58 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
         device->driver_data = pr;
  
         result = acpi_processor_get_info(device);
-       if (result) {
+       if (result ||
+           ((pr->id == -1) && !processor_cntl_external())) {
                 /* Processor is physically not present */
                 return 0;
         }
  
  #ifdef CONFIG_SMP
-       if (pr->id >= setup_max_cpus && pr->id != 0)
-               return 0;
+       if (pr->id >= setup_max_cpus && pr->id != 0) {
+               if (!processor_cntl_external())
+                       return 0;
+               WARN_ON(pr->id != -1);
+       }
  #endif
  
-       BUG_ON((pr->id >= nr_cpu_ids) || (pr->id < 0));
+       BUG_ON(!processor_cntl_external() &&
+              ((pr->id >= nr_cpu_ids) || (pr->id < 0)));
  
         /*
          * Buggy BIOS check
          * ACPI id of processors can be reported wrongly by the BIOS.
          * Don't trust it blindly
          */
+#ifndef CONFIG_XEN
         if (per_cpu(processor_device_array, pr->id) != NULL &&
             per_cpu(processor_device_array, pr->id) != device) {
+#else
+       mutex_lock(&processor_device_mutex);
+       result = radix_tree_insert(&processor_device_tree,
+                                  pr->acpi_id, device);
+       switch (result) {
+       default:
+               mutex_unlock(&processor_device_mutex);
+               goto err_free_cpumask;
+       case -EEXIST:
+               if (radix_tree_lookup(&processor_device_tree,
+                                     pr->acpi_id) == device) {
+       case 0:
+                       mutex_unlock(&processor_device_mutex);
+                       break;
+               }
+               mutex_unlock(&processor_device_mutex);
+#endif
                 printk(KERN_WARNING "BIOS reported wrong ACPI id "
                         "for the processor\n");
                 result = -ENODEV;
                 goto err_free_cpumask;
         }
+#ifndef CONFIG_XEN
         per_cpu(processor_device_array, pr->id) = device;
-
+#else
+       if (pr->id != -1) {
+#endif
         per_cpu(processors, pr->id) = pr;
  
         dev = get_cpu_device(pr->id);
@@ -579,6 +647,9 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
                 result = -EFAULT;
                 goto err_clear_processor;
         }
+#ifdef CONFIG_XEN
+       }
+#endif
  
         /*
          * Do not start hotplugged CPUs now, but when they
@@ -594,12 +665,18 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
         return 0;
  
  err_remove_sysfs:
+#ifdef CONFIG_XEN
+       if (pr->id != -1) {
+#endif
         sysfs_remove_link(&device->dev.kobj, "sysdev");
  err_clear_processor:
         /*
          * processor_device_array is not cleared to allow checks for buggy BIOS
          */ 
         per_cpu(processors, pr->id) = NULL;
+#ifdef CONFIG_XEN
+       }
+#endif
  err_free_cpumask:
         free_cpumask_var(pr->throttling.shared_cpu_map);
  err_free_pr:
@@ -617,7 +694,7 @@ static int acpi_processor_remove(struct acpi_device *device, int type)
  
         pr = acpi_driver_data(device);
  
-       if (pr->id >= nr_cpu_ids)
+       if (!processor_cntl_external() && pr->id >= nr_cpu_ids)
                 goto free;
  
         if (type == ACPI_BUS_REMOVAL_EJECT) {
@@ -627,7 +704,8 @@ static int acpi_processor_remove(struct acpi_device *device, int type)
  
         acpi_processor_power_exit(pr, device);
  
-       sysfs_remove_link(&device->dev.kobj, "sysdev");
+       if (pr->id != -1)
+               sysfs_remove_link(&device->dev.kobj, "sysdev");
  
         if (pr->cdev) {
                 sysfs_remove_link(&device->dev.kobj, "thermal_cooling");
@@ -636,8 +714,16 @@ static int acpi_processor_remove(struct acpi_device *device, int type)
                 pr->cdev = NULL;
         }
  
+#ifndef CONFIG_XEN
         per_cpu(processors, pr->id) = NULL;
         per_cpu(processor_device_array, pr->id) = NULL;
+#else
+       if (pr->id != -1)
+               per_cpu(processors, pr->id) = NULL;
+       mutex_lock(&processor_device_mutex);
+       radix_tree_delete(&processor_device_tree, pr->acpi_id);
+       mutex_unlock(&processor_device_mutex);
+#endif
  
  free:
         free_cpumask_var(pr->throttling.shared_cpu_map);
@@ -693,6 +779,10 @@ int acpi_processor_device_add(acpi_handle handle, struct acpi_device **device)
                 return -ENODEV;
         }
  
+       if (processor_cntl_external() && acpi_driver_data(*device))
+               processor_notify_external(acpi_driver_data(*device),
+                       PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD);
+
         return 0;
  }
  
@@ -722,6 +812,10 @@ static void acpi_processor_hotplug_notify(acpi_handle handle,
                                             "Unable to add the device\n");
                         break;
                 }
+               pr = acpi_driver_data(device);
+               if (processor_cntl_external() && pr)
+                       processor_notify_external(pr,
+                                       PROCESSOR_HOTPLUG, HOTPLUG_TYPE_ADD);
                 break;
         case ACPI_NOTIFY_EJECT_REQUEST:
                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
@@ -738,6 +832,9 @@ static void acpi_processor_hotplug_notify(acpi_handle handle,
                                     "Driver data is NULL, dropping EJECT\n");
                         return;
                 }
+               if (processor_cntl_external())
+                       processor_notify_external(pr, PROCESSOR_HOTPLUG,
+                                               HOTPLUG_TYPE_REMOVE);
                 break;
         default:
                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
@@ -813,10 +910,21 @@ static acpi_status acpi_processor_hotadd_init(struct acpi_processor *pr)
  {
         acpi_handle handle = pr->handle;
  
+#ifdef CONFIG_XEN
+       if (xen_pcpu_index(pr->acpi_id, 1) != -1)
+               return AE_OK;
+#endif
+
         if (!is_processor_present(handle)) {
                 return AE_ERROR;
         }
  
+       if (processor_cntl_external()) {
+               processor_notify_external(pr, PROCESSOR_HOTPLUG,
+                                         HOTPLUG_TYPE_ADD);
+               return AE_OK;
+       }
+
         if (acpi_map_lsapic(handle, &pr->id))
                 return AE_ERROR;
  
@@ -841,6 +949,12 @@ static acpi_status acpi_processor_hotadd_init(struct acpi_processor *pr)
  
  static int acpi_processor_handle_eject(struct acpi_processor *pr)
  {
+       if (processor_cntl_external()) {
+               processor_notify_external(pr, PROCESSOR_HOTPLUG,
+                                         HOTPLUG_TYPE_REMOVE);
+               return (0);
+       }
+
         if (cpu_online(pr->id))
                 cpu_down(pr->id);
  
@@ -928,6 +1042,30 @@ static void __exit acpi_processor_exit(void)
  
         acpi_bus_unregister_driver(&acpi_processor_driver);
  
+#ifdef CONFIG_XEN
+       {
+               struct acpi_device *dev;
+               unsigned int idx = 0;
+
+               while (radix_tree_gang_lookup(&processor_device_tree,
+                                             (void **)&dev, idx, 1)) {
+                       struct acpi_processor *pr = acpi_driver_data(dev);
+
+                       /* prevent live lock */
+                       if (pr->acpi_id < idx) {
+                               printk(KERN_WARNING PREFIX "ID %u unexpected"
+                                      " (less than %u); leaking memory\n",
+                                      pr->acpi_id, idx);
+                               break;
+                       }
+                       idx = pr->acpi_id;
+                       radix_tree_delete(&processor_device_tree, idx);
+                       if (!++idx)
+                               break;
+               }
+       }
+#endif
+
         return;
  }
  
diff --git a/drivers/acpi/processor_extcntl.c b/drivers/acpi/processor_extcntl.c

new file mode 100644 (file)

index 0000000..e71db44
--- /dev/null
+++ b/drivers/acpi/processor_extcntl.c
@@ -0,0 +1,214 @@
+/*
+ * processor_extcntl.c - channel to external control logic
+ *
+ *  Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/acpi.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+
+#include <acpi/processor.h>
+
+#define ACPI_PROCESSOR_CLASS            "processor"
+#define _COMPONENT              ACPI_PROCESSOR_COMPONENT
+ACPI_MODULE_NAME("processor_extcntl")
+
+static int processor_extcntl_parse_csd(struct acpi_processor *pr);
+static int processor_extcntl_get_performance(struct acpi_processor *pr);
+
+/*
+ * Notify the BIOS by writing the FADT pstate_control value to the FADT
+ * smi_command port.
+ *
+ * The write is skipped when it already succeeded once, or when either FADT
+ * field is zero.  Returns 0 in those cases and on success; otherwise the
+ * failing acpi_status of the port write is returned.
+ */
+static int processor_notify_smm(void)
+{
+       static int notified;
+       acpi_status rc;
+
+       /*
+        * One successful notification is sufficient; repeating it may lead
+        * to unexpected results.
+        */
+       if (notified)
+               return 0;
+
+       /* Both the SMI port and the pstate_cnt value must be non-zero */
+       if (!(acpi_gbl_FADT.smi_command && acpi_gbl_FADT.pstate_control)) {
+               ACPI_DEBUG_PRINT((ACPI_DB_INFO,"No SMI port or pstate_cnt\n"));
+               return 0;
+       }
+
+       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+               "Writing pstate_cnt [0x%x] to smi_cmd [0x%x]\n",
+               acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+
+       rc = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+                               acpi_gbl_FADT.pstate_control, 8);
+       if (ACPI_FAILURE(rc))
+               return rc;
+
+       /* Remember the success so later calls become no-ops */
+       notified = 1;
+       return 0;
+}
+
+/*
+ * processor_notify_external - forward a processor event to the external
+ * control logic.
+ * @pr:    ACPI processor the event refers to
+ * @event: PROCESSOR_PM_INIT, PROCESSOR_PM_CHANGE or, when
+ *         CONFIG_ACPI_HOTPLUG_CPU is set, PROCESSOR_HOTPLUG
+ * @type:  for the PM events, the index of the handler in pm_ops[];
+ *         for PROCESSOR_HOTPLUG, the value handed to the hotplug handler
+ *
+ * Returns the invoked handler's result.  -EINVAL is returned when external
+ * control is not active, the event is not supported, @type is outside
+ * [0, PM_TYPE_MAX), or no handler is installed for the request.
+ */
+int processor_notify_external(struct acpi_processor *pr, int event, int type)
+{
+       int ret = -EINVAL;
+
+       if (!processor_cntl_external())
+               return -EINVAL;
+
+       switch (event) {
+       case PROCESSOR_PM_INIT:
+       case PROCESSOR_PM_CHANGE:
+               /* @type is signed, so guard both ends before indexing */
+               if ((type < 0) || (type >= PM_TYPE_MAX) ||
+                       !processor_extcntl_ops->pm_ops[type])
+                       break;
+
+               ret = processor_extcntl_ops->pm_ops[type](pr, event);
+               break;
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+       case PROCESSOR_HOTPLUG:
+               if (processor_extcntl_ops->hotplug)
+                       ret = processor_extcntl_ops->hotplug(pr, type);
+               /* called whether or not a hotplug handler is installed */
+               xen_pcpu_hotplug(type);
+               break;
+#endif
+       default:
+               pr_err("Unsupported processor event %d.\n", event);
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * Hook called from ACPI processor init.  It collects the housekeeping the
+ * external control model needs, e.g. the dependency parsing stubs for idle
+ * and performance states.  That information may otherwise be unavailable
+ * once dom0 control logic such as the cpufreq driver is split off.
+ *
+ * Always returns 0; the results of the helpers are not propagated.
+ */
+int processor_extcntl_prepare(struct acpi_processor *pr)
+{
+       /* Idle states are controlled externally: set up C-state dependencies */
+       if (processor_pm_external())
+               processor_extcntl_parse_csd(pr);
+
+       /* Performance states are controlled externally: gather them now */
+       if (processor_pmperf_external())
+               processor_extcntl_get_performance(pr);
+
+       return 0;
+}
+
+/*
+ * Placeholder for _CSD handling.  No _CSD is implemented at present, which
+ * is why the existing ACPI code does not parse it at all; this stub only
+ * keeps the interface towards the external control logic complete for
+ * future compatibility.
+ *
+ * Every valid power state is marked as having no dependency.  Returns 0.
+ */
+static int processor_extcntl_parse_csd(struct acpi_processor *pr)
+{
+       int idx;
+
+       for (idx = 0; idx < pr->power.count; idx++) {
+               struct acpi_processor_cx *cx = &pr->power.states[idx];
+
+               if (cx->valid) {
+                       /* Default: no dependency information */
+                       cx->domain_info = NULL;
+                       cx->csd_count = 0;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * The existing ACPI module parses performance states only once the
+ * acpi-cpufreq driver is loaded, and that driver is something we want
+ * disabled to avoid conflicts with the external control logic.  The raw
+ * performance information is therefore collected here, when the ACPI
+ * processor object is found and started.
+ *
+ * Returns 0 on success, -EBUSY if @pr already has performance data,
+ * -ENOMEM on allocation failure, a negative error from
+ * acpi_processor_get_performance_info(), or -EINVAL if the _PSD data fails
+ * the sanity check.  On failure pr->performance is reset to NULL.
+ */
+static int processor_extcntl_get_performance(struct acpi_processor *pr)
+{
+       struct acpi_processor_performance *perf;
+       struct acpi_psd_package *psd;
+       int err;
+
+       if (pr->performance)
+               return -EBUSY;
+
+       perf = kzalloc(sizeof(*perf), GFP_KERNEL);
+       if (!perf)
+               return -ENOMEM;
+
+       pr->performance = perf;
+
+       /* Basic performance state information first */
+       err = acpi_processor_get_performance_info(pr);
+       if (err < 0)
+               goto fail;
+
+       /*
+        * Next, the performance dependency information from the _PSD
+        * object.  The existing interface is not used because it builds
+        * bitmaps keyed by Linux cpu id, whereas ACPI processor objects are
+        * meant to be decoupled from Linux cpu ids here.  For instance, a
+        * kernel configured as UP should still hand all ACPI processor
+        * objects to the external logic, so the ACPI ID is preferred.
+        */
+       psd = &pr->performance->domain_info;
+       psd->num_processors = 0;
+       err = acpi_processor_get_psd(pr);
+       if (err < 0) {
+               /*
+                * _PSD is optional: when it is absent (or broken) assume
+                * no coordination, as native kernels do.
+                */
+               psd->num_entries = ACPI_PSD_REV0_ENTRIES;
+               psd->revision = ACPI_PSD_REV0_REVISION;
+               psd->domain = pr->acpi_id;
+               psd->coord_type = DOMAIN_COORD_TYPE_SW_ALL;
+               psd->num_processors = 1;
+       }
+
+       /* Sanity check: revision and entry count must be the REV0 ones ... */
+       if (psd->revision != ACPI_PSD_REV0_REVISION ||
+           psd->num_entries != ACPI_PSD_REV0_ENTRIES) {
+               err = -EINVAL;
+               goto fail;
+       }
+
+       /* ... and the coordination type must be one of the known three */
+       switch (psd->coord_type) {
+       case DOMAIN_COORD_TYPE_SW_ALL:
+       case DOMAIN_COORD_TYPE_SW_ANY:
+       case DOMAIN_COORD_TYPE_HW_ALL:
+               break;
+       default:
+               err = -EINVAL;
+               goto fail;
+       }
+
+       /* Finally let the BIOS know that external logic exists */
+       processor_notify_smm();
+
+       processor_notify_external(pr, PROCESSOR_PM_INIT, PM_TYPE_PERF);
+
+       return 0;
+
+fail:
+       pr->performance = NULL;
+       kfree(perf);
+       return err;
+}
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c

index f3decb3..9e3886c 100644 (file)
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -125,6 +125,7 @@ static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
  };
  
  
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
  /*
   * Callers should disable interrupts before the call and enable
   * interrupts after return.
@@ -143,6 +144,7 @@ static void acpi_safe_halt(void)
         }
         current_thread_info()->status |= TS_POLLING;
  }
+#endif
  
  #ifdef ARCH_APICTIMER_STOPS_ON_C3
  
@@ -213,7 +215,7 @@ static void lapic_timer_state_broadcast(struct acpi_processor *pr,
  static void lapic_timer_check_state(int state, struct acpi_processor *pr,
                                    struct acpi_processor_cx *cstate) { }
  static void lapic_timer_propagate_broadcast(struct acpi_processor *pr) { }
-static void lapic_timer_state_broadcast(struct acpi_processor *pr,
+static inline void lapic_timer_state_broadcast(struct acpi_processor *pr,
                                        struct acpi_processor_cx *cx,
                                        int broadcast)
  {
@@ -252,7 +254,7 @@ int acpi_processor_resume(struct acpi_device * device)
         return 0;
  }
  
-#if defined(CONFIG_X86)
+#if defined(CONFIG_X86) && !defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
  static void tsc_check_state(int state)
  {
         switch (boot_cpu_data.x86_vendor) {
@@ -449,7 +451,8 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
                                  */
                                 cx.entry_method = ACPI_CSTATE_HALT;
                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
-                       } else {
+                       /* This doesn't apply to external control case */
+                       } else if (!processor_pm_external()) {
                                 continue;
                         }
                         if (cx.type == ACPI_STATE_C1 &&
@@ -488,6 +491,12 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
  
                 cx.power = obj->integer.value;
  
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+               /* cache control methods to notify external logic */
+               if (processor_pm_external())
+                       memcpy(&cx.reg, reg, sizeof(*reg));
+#endif
+
                 current_count++;
                 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
  
@@ -509,7 +518,7 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
                           current_count));
  
         /* Validate number of power states discovered */
-       if (current_count < 2)
+       if (current_count < (processor_pm_external() ? 1 : 2))
                 status = -EFAULT;
  
        end:
@@ -605,7 +614,9 @@ static int acpi_processor_power_verify(struct acpi_processor *pr)
         unsigned int i;
         unsigned int working = 0;
  
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         pr->power.timer_broadcast_on_state = INT_MAX;
+#endif
  
         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
                 struct acpi_processor_cx *cx = &pr->power.states[i];
@@ -677,6 +688,7 @@ static int acpi_processor_get_power_info(struct acpi_processor *pr)
         return 0;
  }
  
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
  /**
   * acpi_idle_bm_check - checks if bus master activity was detected
   */
@@ -1137,6 +1149,10 @@ static int acpi_processor_setup_cpuidle_states(struct acpi_processor *pr)
  
         return 0;
  }
+#else
+/*
+ * Empty stand-ins used when CONFIG_PROCESSOR_EXTERNAL_CONTROL is defined,
+ * i.e. when the cpuidle setup code in the #ifndef branch is compiled out.
+ */
+static void acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr) {}
+static void acpi_processor_setup_cpuidle_states(struct acpi_processor *pr) {}
+#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
  
  int acpi_processor_hotplug(struct acpi_processor *pr)
  {
@@ -1155,6 +1171,14 @@ int acpi_processor_hotplug(struct acpi_processor *pr)
         if (!pr->flags.power_setup_done)
                 return -ENODEV;
  
+       if (processor_pm_external()) {
+               pr->flags.power = 0;
+               ret = acpi_processor_get_power_info(pr);
+               processor_notify_external(pr,
+                       PROCESSOR_PM_CHANGE, PM_TYPE_IDLE);
+               return ret;
+       }
+
         cpuidle_pause_and_lock();
         cpuidle_disable_device(&pr->power.dev);
         acpi_processor_get_power_info(pr);
@@ -1231,7 +1255,6 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
                               struct acpi_device *device)
  {
         acpi_status status = 0;
-       int retval;
         static int first_run;
  
         if (disabled_by_idle_boot_param())
@@ -1262,12 +1285,15 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
         acpi_processor_get_power_info(pr);
         pr->flags.power_setup_done = 1;
  
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         /*
          * Install the idle handler if processor power management is supported.
          * Note that we use previously set idle handler will be used on
          * platforms that only support C1.
          */
         if (pr->flags.power) {
+               int retval;
+
                 /* Register acpi_idle_driver if not already registered */
                 if (!acpi_processor_registered) {
                         acpi_processor_setup_cpuidle_states(pr);
@@ -1289,6 +1315,12 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
                 }
                 acpi_processor_registered++;
         }
+#endif
+
+       if (processor_pm_external())
+               processor_notify_external(pr,
+                       PROCESSOR_PM_INIT, PM_TYPE_IDLE);
+
         return 0;
  }
  
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c

index 0af48a8..d199db7 100644 (file)
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -75,6 +75,7 @@ MODULE_PARM_DESC(ignore_ppc, "If the frequency of your machine gets wrongly" \
  
  static int acpi_processor_ppc_status;
  
+#ifdef CONFIG_CPU_FREQ
  static int acpi_processor_ppc_notifier(struct notifier_block *nb,
                                        unsigned long event, void *data)
  {
@@ -117,6 +118,7 @@ static int acpi_processor_ppc_notifier(struct notifier_block *nb,
  static struct notifier_block acpi_ppc_notifier_block = {
         .notifier_call = acpi_processor_ppc_notifier,
  };
+#endif /* CONFIG_CPU_FREQ */
  
  static int acpi_processor_get_platform_limit(struct acpi_processor *pr)
  {
@@ -181,6 +183,12 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
  {
         int ret;
  
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+       /* Xen hypervisor can handle cpufreq _PPC event */
+       if (ignore_ppc < 0 && processor_pmperf_external())
+               ignore_ppc = 0;
+#endif
+
         if (ignore_ppc) {
                 /*
                  * Only when it is notification event, the _OST object
@@ -205,7 +213,12 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
         if (ret < 0)
                 return (ret);
         else
+#ifdef CONFIG_CPU_FREQ
                 return cpufreq_update_policy(pr->id);
+#elif defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
+               return processor_notify_external(pr,
+                               PROCESSOR_PM_CHANGE, PM_TYPE_PERF);
+#endif
  }
  
  int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
@@ -221,6 +234,7 @@ int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
  }
  EXPORT_SYMBOL(acpi_processor_get_bios_limit);
  
+#ifdef CONFIG_CPU_FREQ
  void acpi_processor_ppc_init(void)
  {
         if (!cpufreq_register_notifier
@@ -261,6 +275,7 @@ void acpi_processor_load_module(struct acpi_processor *pr)
         }
         kfree(buffer.pointer);
  }
+#endif /* CONFIG_CPU_FREQ */
  
  static int acpi_processor_get_performance_control(struct acpi_processor *pr)
  {
@@ -408,7 +423,10 @@ static int acpi_processor_get_performance_states(struct acpi_processor *pr)
         return result;
  }
  
-static int acpi_processor_get_performance_info(struct acpi_processor *pr)
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+static
+#endif
+int acpi_processor_get_performance_info(struct acpi_processor *pr)
  {
         int result = 0;
         acpi_status status = AE_OK;
@@ -453,6 +471,7 @@ static int acpi_processor_get_performance_info(struct acpi_processor *pr)
         return result;
  }
  
+#ifdef CONFIG_CPU_FREQ
  int acpi_processor_notify_smm(struct module *calling_module)
  {
         acpi_status status;
@@ -513,8 +532,12 @@ int acpi_processor_notify_smm(struct module *calling_module)
  }
  
  EXPORT_SYMBOL(acpi_processor_notify_smm);
+#endif /* CONFIG_CPU_FREQ */
  
-static int acpi_processor_get_psd(struct acpi_processor        *pr)
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+static
+#endif
+int acpi_processor_get_psd(struct acpi_processor *pr)
  {
         int result = 0;
         acpi_status status = AE_OK;
@@ -579,6 +602,8 @@ end:
         return result;
  }
  
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+
  int acpi_processor_preregister_performance(
                 struct acpi_processor_performance __percpu *performance)
  {
@@ -794,3 +819,5 @@ acpi_processor_unregister_performance(struct acpi_processor_performance
  }
  
  EXPORT_SYMBOL(acpi_processor_unregister_performance);
+
+#endif /* !CONFIG_PROCESSOR_EXTERNAL_CONTROL */
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c

index 85cbfdc..6ad4656 100644 (file)
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -175,6 +175,16 @@ acpi_device_hid_show(struct device *dev, struct device_attribute *attr, char *bu
  }
  static DEVICE_ATTR(hid, 0444, acpi_device_hid_show, NULL);
  
+#ifdef CONFIG_PCI_GUESTDEV
+/* sysfs "uid" attribute (mode 0444): prints pnp.unique_id and a newline */
+static ssize_t
+acpi_device_uid_show(struct device *dev, struct device_attribute *attr, char *buf) {
+       const char *uid = to_acpi_device(dev)->pnp.unique_id;
+
+       return sprintf(buf, "%s\n", uid);
+}
+static DEVICE_ATTR(uid, 0444, acpi_device_uid_show, NULL);
+#endif
+
  static ssize_t
  acpi_device_path_show(struct device *dev, struct device_attribute *attr, char *buf) {
         struct acpi_device *acpi_dev = to_acpi_device(dev);
@@ -217,6 +227,13 @@ static int acpi_device_setup_files(struct acpi_device *dev)
                         goto end;
         }
  
+#ifdef CONFIG_PCI_GUESTDEV
+       if(dev->pnp.unique_id) {
+               result = device_create_file(&dev->dev, &dev_attr_uid);
+               if(result)
+                       goto end;
+       }
+#endif
          /*
           * If device has _EJ0, 'eject' file is created that is used to trigger
           * hot-removal function from userland.
@@ -280,6 +297,9 @@ static void acpi_free_ids(struct acpi_device *device)
                 kfree(id->id);
                 kfree(id);
         }
+#ifdef CONFIG_PCI_GUESTDEV
+       kfree(device->pnp.unique_id);
+#endif
  }
  
  static void acpi_device_release(struct device *dev)
@@ -1138,6 +1158,11 @@ static void acpi_device_set_id(struct acpi_device *device)
                         for (i = 0; i < cid_list->count; i++)
                                 acpi_add_id(device, cid_list->ids[i].string);
                 }
+#ifdef CONFIG_PCI_GUESTDEV
+               if (info->valid & ACPI_VALID_UID)
+                       device->pnp.unique_id = kstrdup(info->unique_id.string,
+                                                       GFP_KERNEL);
+#endif
                 if (info->valid & ACPI_VALID_ADR) {
                         device->pnp.bus_address = info->address;
                         device->flags.bus_address = 1;
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c

index eb6fd23..0eb85f8 100644 (file)
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -91,6 +91,7 @@ static struct notifier_block tts_notifier = {
  static int acpi_sleep_prepare(u32 acpi_state)
  {
  #ifdef CONFIG_ACPI_SLEEP
+#ifndef CONFIG_ACPI_PV_SLEEP
         /* do we have a wakeup address for S2 and S3? */
         if (acpi_state == ACPI_STATE_S3) {
                 if (!acpi_wakeup_address) {
@@ -100,6 +101,7 @@ static int acpi_sleep_prepare(u32 acpi_state)
                                 (acpi_physical_address)acpi_wakeup_address);
  
         }
+#endif
         ACPI_FLUSH_CPU_CACHE();
  #endif
         printk(KERN_INFO PREFIX "Preparing to enter system sleep state S%d\n",
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c

index f336bca..9534b66 100644 (file)
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -339,6 +339,9 @@ int __init acpi_table_init(void)
  {
         acpi_status status;
  
+       if (acpi_rsdt_forced)
+               printk(KERN_INFO "Using RSDT as ACPI root table\n");
+
         status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0);
         if (ACPI_FAILURE(status))
                 return 1;
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c

index 7dbebea..9f23658 100644 (file)
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -41,6 +41,7 @@
  #include <linux/kmod.h>
  #include <linux/reboot.h>
  #include <linux/device.h>
+#include <linux/dmi.h>
  #include <asm/uaccess.h>
  #include <linux/thermal.h>
  #include <acpi/acpi_bus.h>
@@ -984,6 +985,86 @@ static void acpi_thermal_guess_offset(struct acpi_thermal *tz)
                 tz->kelvin_offset = 2732;
  }
  
+/*
+ * Systems for which acpi_thermal_add() lowers a valid passive trip point
+ * above 85 C by 150 (in the units of trips.passive.temperature) and
+ * switches the zone to a 5 second polling interval.
+ *
+ * The table is only ever read, hence const.
+ *
+ * NOTE(review): if DMI_MATCH() does substring matching, the "...p" entries
+ * are already covered by their base-model entries - confirm before pruning.
+ */
+static const struct dmi_system_id thermal_psv_dmi_table[] = {
+       {
+               .ident = "IBM ThinkPad T41",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T41"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad T42",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T42"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad T43",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T43"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad T41p",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T41p"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad T42p",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T42p"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad T43p",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad T43p"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad R40",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad R40"),
+               },
+       },
+       {
+               .ident = "IBM ThinkPad R50p",
+               .matches = {
+                       DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION,"ThinkPad R50p"),
+               },
+       },
+       {},     /* terminate list */
+};
+
+/*
+ * Set the polling interval of a thermal zone.
+ * @tz:      zone to update
+ * @seconds: new interval in seconds
+ *
+ * The interval is stored as deci-seconds in tz->polling_frequency and as
+ * seconds * 1000 in the thermal zone device's polling_delay.  An enabled
+ * zone is updated right away.
+ *
+ * Returns 0 on success, -EINVAL if @tz is NULL.
+ */
+static int acpi_thermal_set_polling(struct acpi_thermal *tz, int seconds)
+{
+       if (tz == NULL)
+               return -EINVAL;
+
+       /* The ACPI side keeps the value in deci-seconds */
+       tz->polling_frequency = seconds * 10;
+
+       tz->thermal_zone->polling_delay = seconds * 1000;
+
+       if (tz->tz_enabled)
+               thermal_zone_device_update(tz->thermal_zone);
+
+       ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                        "Polling frequency set to %lu seconds\n",
+                        tz->polling_frequency/10));
+
+       return 0;
+}
+
  static int acpi_thermal_add(struct acpi_device *device)
  {
         int result = 0;
@@ -1015,6 +1096,18 @@ static int acpi_thermal_add(struct acpi_device *device)
         if (result)
                 goto free_memory;
  
+       if (dmi_check_system(thermal_psv_dmi_table)) {
+               if (tz->trips.passive.flags.valid &&
+                   tz->trips.passive.temperature > CELSIUS_TO_KELVIN(85)) {
+                       printk (KERN_INFO "Adjust passive trip point from %lu"
+                               " to %lu\n",
+                               KELVIN_TO_CELSIUS(tz->trips.passive.temperature),
+                               KELVIN_TO_CELSIUS(tz->trips.passive.temperature - 150));
+                       tz->trips.passive.temperature -= 150;
+                       acpi_thermal_set_polling(tz, 5);
+               }
+       }
+
         printk(KERN_INFO PREFIX "%s [%s] (%ld C)\n",
                acpi_device_name(device), acpi_device_bid(device),
                KELVIN_TO_CELSIUS(tz->temperature));
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c

index 7857e8f..6efe855 100644 (file)
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -167,6 +167,7 @@ struct piix_host_priv {
  static int piix_init_one(struct pci_dev *pdev,
                          const struct pci_device_id *ent);
  static void piix_remove_one(struct pci_dev *pdev);
+static unsigned int piix_pata_read_id(struct ata_device *adev, struct ata_taskfile *tf, u16 *id);
  static int piix_pata_prereset(struct ata_link *link, unsigned long deadline);
  static void piix_set_piomode(struct ata_port *ap, struct ata_device *adev);
  static void piix_set_dmamode(struct ata_port *ap, struct ata_device *adev);
@@ -361,6 +362,7 @@ static struct ata_port_operations piix_pata_ops = {
         .set_piomode            = piix_set_piomode,
         .set_dmamode            = piix_set_dmamode,
         .prereset               = piix_pata_prereset,
+       .read_id                = piix_pata_read_id,
  };
  
  static struct ata_port_operations piix_vmw_ops = {
@@ -648,6 +650,26 @@ MODULE_LICENSE("GPL");
  MODULE_DEVICE_TABLE(pci, piix_pci_tbl);
  MODULE_VERSION(DRV_VERSION);
  
+/*
+ * Report whether the DMI data identifies a Microsoft Hyper-V guest.
+ *
+ * Returns 1 on a match and 0 otherwise.  Without CONFIG_HYPERV_STORAGE
+ * (built-in or modular) no check is made and the result is always 0.
+ */
+static int piix_msft_hyperv(void)
+{
+#if defined(CONFIG_HYPERV_STORAGE) || defined(CONFIG_HYPERV_STORAGE_MODULE)
+       static const struct dmi_system_id hv_dmi_ident[]  = {
+               {
+                       .ident = "Hyper-V",
+                       .matches = {
+                               DMI_MATCH(DMI_SYS_VENDOR, "Microsoft Corporation"),
+                               DMI_MATCH(DMI_PRODUCT_NAME, "Virtual Machine"),
+                               DMI_MATCH(DMI_BOARD_NAME, "Virtual Machine"),
+                       },
+               },
+               { }     /* terminate list */
+       };
+
+       return dmi_check_system(hv_dmi_ident) ? 1 : 0;
+#else
+       return 0;
+#endif
+}
+
  struct ich_laptop {
         u16 device;
         u16 subvendor;
@@ -739,6 +761,26 @@ static int piix_pata_prereset(struct ata_link *link, unsigned long deadline)
         return ata_sff_prereset(link, deadline);
  }
  
+/*
+ * piix_pata_read_id - read IDENTIFY data, hiding ATA disks in Hyper-V guests
+ * @adev: device being identified
+ * @tf:   taskfile passed through to ata_do_dev_read_id()
+ * @id:   buffer receiving the IDENTIFY data
+ *
+ * Returns the error mask of ata_do_dev_read_id().  The identify data is
+ * examined and modified only when that read succeeded, because @id holds
+ * no valid data for this device after a failed read.
+ */
+static unsigned int piix_pata_read_id(struct ata_device *adev, struct ata_taskfile *tf, u16 *id)
+{
+       unsigned int err_mask = ata_do_dev_read_id(adev, tf, id);
+
+       /* Nothing trustworthy in @id after a failed read */
+       if (err_mask)
+               return err_mask;
+
+       /*
+        * Ignore disks in a hyper-v guest.
+        * There is no unplug protocol like it is done with xen_emul_unplug= option.
+        * Emulate the unplug by ignoring disks when the hv_storvsc driver is enabled.
+        * If the disks are not ignored, they will appear twice: once through
+        * piix and once through hv_storvsc.
+        * hv_storvsc can not handle ATAPI devices because they can only be
+        * accessed through the emulated code path (not through the vm_bus
+        * channel), the piix driver is still required.
+        */
+       if (ata_id_is_ata(id) && piix_msft_hyperv()) {
+               ata_dev_printk(adev, KERN_WARNING, "ATA device ignored in Hyper-V guest\n");
+               /*
+                * Flag the config word; presumably this makes the data fail
+                * ata_id_is_ata() for the caller - confirm against libata.
+                */
+               id[ATA_ID_CONFIG] |= (1 << 15);
+       }
+       return err_mask;
+}
+
  static DEFINE_SPINLOCK(piix_lock);
  
  static void piix_set_timings(struct ata_port *ap, struct ata_device *adev,
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c

index adf937b..ddcccce 100644 (file)
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -108,7 +108,7 @@ static inline void register_cpu_control(struct cpu *cpu)
  }
  #endif /* CONFIG_HOTPLUG_CPU */
  
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
  #include <linux/kexec.h>
  
  static ssize_t show_crash_notes(struct device *dev, struct device_attribute *attr,
@@ -256,7 +256,7 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
         if (!error)
                 register_cpu_under_node(num, cpu_to_node(num));
  
-#ifdef CONFIG_KEXEC
+#if defined(CONFIG_KEXEC) && !defined(CONFIG_XEN)
         if (!error)
                 error = device_create_file(&cpu->dev, &dev_attr_crash_notes);
  #endif
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig

index a796407..98d6d60 100644 (file)
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,9 +488,9 @@ config XILINX_SYSACE
         help
           Include support for the Xilinx SystemACE CompactFlash interface
  
-config XEN_BLKDEV_FRONTEND
+config PARAVIRT_XEN_BLKDEV_FRONTEND
         tristate "Xen virtual block device support"
-       depends on XEN
+       depends on PARAVIRT_XEN
         default y
         select XEN_XENBUS_FRONTEND
         help
@@ -498,16 +498,16 @@ config XEN_BLKDEV_FRONTEND
           block device driver.  It communicates with a back-end driver
           in another domain which drives the actual block device.
  
-config XEN_BLKDEV_BACKEND
+config PARAVIRT_XEN_BLKDEV_BACKEND
         tristate "Xen block-device backend driver"
-       depends on XEN_BACKEND
+       depends on PARAVIRT_XEN_BACKEND
         help
           The block-device backend driver allows the kernel to export its
           block devices to other guests via a high-performance shared-memory
           interface.
  
           The corresponding Linux frontend driver is enabled by the
-         CONFIG_XEN_BLKDEV_FRONTEND configuration option.
+         CONFIG_PARAVIRT_XEN_BLKDEV_FRONTEND configuration option.
  
           The backend driver attaches itself to a any block device specified
           in the XenBus configuration. There are no limits to what the block
diff --git a/drivers/block/Makefile b/drivers/block/Makefile

index 5b79505..34cad2a 100644 (file)
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,8 +36,8 @@ obj-$(CONFIG_BLK_DEV_SX8)     += sx8.o
  obj-$(CONFIG_BLK_DEV_UB)       += ub.o
  obj-$(CONFIG_BLK_DEV_HD)       += hd.o
  
-obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += xen-blkfront.o
-obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += xen-blkback/
+obj-$(CONFIG_PARAVIRT_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_PARAVIRT_XEN_BLKDEV_BACKEND) += xen-blkback/
  obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
  obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
  obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c

index b0b00d7..887aa31 100644 (file)
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -146,7 +146,9 @@
  
  #undef  FLOPPY_SILENT_DCL_CLEAR
  
+#ifndef CONFIG_XEN
  #define REALLY_SLOW_IO
+#endif
  
  #define DEBUGT 2
  
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile

index e491c1b..4513281 100644 (file)
--- a/drivers/block/xen-blkback/Makefile
+++ b/drivers/block/xen-blkback/Makefile
@@ -1,3 +1,3 @@
-obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+obj-$(CONFIG_PARAVIRT_XEN_BLKDEV_BACKEND) := xen-blkback.o
  
  xen-blkback-y  := blkback.o xenbus.o
diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile

index ecf85fd..99757fb 100644 (file)
--- a/drivers/cdrom/Makefile
+++ b/drivers/cdrom/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_BLK_DEV_IDECD)     +=              cdrom.o
  obj-$(CONFIG_BLK_DEV_SR)       +=              cdrom.o
  obj-$(CONFIG_PARIDE_PCD)       +=              cdrom.o
  obj-$(CONFIG_CDROM_PKTCDVD)    +=              cdrom.o
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      +=              cdrom.o
  
  obj-$(CONFIG_VIOCD)            += viocd.o      cdrom.o
  obj-$(CONFIG_GDROM)            += gdrom.o      cdrom.o
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig

index ee94686..b616a74 100644 (file)
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -519,7 +519,7 @@ config MAX_RAW_DEVS
  config HPET
         bool "HPET - High Precision Event Timer" if (X86 || IA64)
         default n
-       depends on ACPI
+       depends on ACPI && !XEN
         help
           If you say Y here, you will have a miscdevice named "/dev/hpet/".  Each
           open selects one of the timers supported by the HPET.  The timers are
@@ -593,6 +593,11 @@ config RAMOOPS
           This enables panic and oops messages to be logged to a circular
           buffer in RAM where it can be read back at some later point.
  
+config CRASHER
+       tristate "Crasher Module"
+       help
+         Slab cache memory tester.  Only use this as a module
+
  config MSM_SMD_PKT
         bool "Enable device interface for some SMD packet ports"
         default n
diff --git a/drivers/char/Makefile b/drivers/char/Makefile

index 0dc5d7c..946469f 100644 (file)
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_IPMI_HANDLER)    += ipmi/
  
  obj-$(CONFIG_HANGCHECK_TIMER)  += hangcheck-timer.o
  obj-$(CONFIG_TCG_TPM)          += tpm/
+obj-$(CONFIG_CRASHER)          += crasher.o
  
  obj-$(CONFIG_PS3_FLASH)                += ps3flash.o
  obj-$(CONFIG_RAMOOPS)          += ramoops.o
diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h

index 923f99d..e3491f5 100644 (file)
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -31,6 +31,10 @@
  
  #include <asm/agp.h>   /* for flush_agp_cache() */
  
+#ifndef virt_to_gart
+#define virt_to_gart virt_to_phys
+#endif
+
  #define PFX "agpgart: "
  
  //#define AGP_DEBUG 1
diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c

index f7e8878..7e630bd 100644 (file)
--- a/drivers/char/agp/amd-k7-agp.c
+++ b/drivers/char/agp/amd-k7-agp.c
@@ -142,7 +142,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge)
  
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
-       agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
+       agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
  
         /* Get the address for the gart region.
          * This is a bus address even on the alpha, b/c its
@@ -155,7 +155,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge)
  
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
-               writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1,
+               writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1,
                         page_dir.remapped+GET_PAGE_DIR_OFF(addr));
                 readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));        /* PCI Posting. */
         }
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c

index 444f8b6..d2356c2 100644 (file)
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] =
  
  static int amd_8151_configure(void)
  {
-       unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
+       unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real);
         int i;
  
         if (!amd_nb_has_feature(AMD_NB_GART))
@@ -583,7 +583,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev)
  {
         struct agp_bridge_data *bridge = pci_get_drvdata(pdev);
  
-       release_mem_region(virt_to_phys(bridge->gatt_table_real),
+       release_mem_region(virt_to_gart(bridge->gatt_table_real),
                            amd64_aperture_sizes[bridge->aperture_size_idx].size);
         agp_remove_bridge(bridge);
         agp_put_bridge(bridge);
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c

index dc30e22..5c16d72 100644 (file)
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -361,7 +361,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge)
  
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped;
-       agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
+       agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
  
         /* Write out the size register */
         current_size = A_SIZE_LVL2(agp_bridge->current_size);
@@ -391,7 +391,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge)
  
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) {
-               writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1,
+               writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1,
                         page_dir.remapped+GET_PAGE_DIR_OFF(addr));
                 readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr));        /* PCI Posting. */
         }
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c

index d607f53..cb4fceb 100644 (file)
--- a/drivers/char/agp/efficeon-agp.c
+++ b/drivers/char/agp/efficeon-agp.c
@@ -227,7 +227,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge)
  
                 efficeon_private.l1_table[index] = page;
  
-               value = virt_to_phys((unsigned long *)page) | pati | present | index;
+               value = virt_to_gart((unsigned long *)page) | pati | present | index;
  
                 pci_write_config_dword(agp_bridge->dev,
                         EFFICEON_ATTPAGE, value);
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c

index 17e05d1..a8dd706 100644 (file)
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -960,7 +960,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
  
         bridge->gatt_table = (void *)table;
  #else
-       bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
+       bridge->gatt_table = ioremap_nocache(virt_to_gart(table),
                                         (PAGE_SIZE * (1 << page_order)));
         bridge->driver->cache_flush();
  #endif
@@ -973,7 +973,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
  
                 return -ENOMEM;
         }
-       bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real);
+       bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real);
  
         /* AK: bogus, should encode addresses > 4GB */
         for (i = 0; i < num_entries; i++) {
@@ -1228,7 +1228,7 @@ int agp_generic_alloc_pages(struct agp_bridge_data *bridge, struct agp_memory *m
         }
  
  #ifdef CONFIG_X86
-       set_pages_array_uc(mem->pages, num_pages);
+       map_pages_into_agp(mem->pages, num_pages);
  #endif
         ret = 0;
  out:
@@ -1261,7 +1261,7 @@ void agp_generic_destroy_pages(struct agp_memory *mem)
                 return;
  
  #ifdef CONFIG_X86
-       set_pages_array_wb(mem->pages, mem->page_count);
+       unmap_pages_from_agp(mem->pages, mem->page_count);
  #endif
  
         for (i = 0; i < mem->page_count; i++) {
diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c

index 7f025fb..26409e9 100644 (file)
--- a/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@ -146,8 +146,19 @@ static struct page *i8xx_alloc_pages(void)
         if (page == NULL)
                 return NULL;
  
+#ifdef CONFIG_XEN
+       if (xen_create_contiguous_region((unsigned long)page_address(page), 2, 32)) {
+               __free_pages(page, 2);
+               return NULL;
+       }
+#endif
+
         if (set_pages_uc(page, 4) < 0) {
                 set_pages_wb(page, 4);
+#ifdef CONFIG_XEN
+               xen_destroy_contiguous_region((unsigned long)page_address(page),
+                                             2);
+#endif
                 __free_pages(page, 2);
                 return NULL;
         }
@@ -162,6 +173,9 @@ static void i8xx_destroy_pages(struct page *page)
                 return;
  
         set_pages_wb(page, 4);
+#ifdef CONFIG_XEN
+       xen_destroy_contiguous_region((unsigned long)page_address(page), 2);
+#endif
         put_page(page);
         __free_pages(page, 2);
         atomic_dec(&agp_bridge->current_memory_agp);
@@ -267,7 +281,11 @@ static struct agp_memory *alloc_agpphysmem_i8xx(size_t pg_count, int type)
         new->page_count = pg_count;
         new->num_scratch_pages = pg_count;
         new->type = AGP_PHYS_MEMORY;
+#ifndef CONFIG_XEN
         new->physical = page_to_phys(new->pages[0]);
+#else
+       new->physical = page_to_pseudophys(new->pages[0]);
+#endif
         return new;
  }
  
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c

index f02f9b0..ff98d27 100644 (file)
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
         /* Create a fake scratch directory */
         for (i = 0; i < 1024; i++) {
                 writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i);
-               writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
+               writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i);
         }
  
         retval = serverworks_create_gatt_pages(value->num_entries / 1024);
@@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
  
         agp_bridge->gatt_table_real = (u32 *)page_dir.real;
         agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped;
-       agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real);
+       agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real);
  
         /* Get the address for the gart region.
          * This is a bus address even on the alpha, b/c its
@@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge)
  
         /* Calculate the agp offset */
         for (i = 0; i < value->num_entries / 1024; i++)
-               writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
+               writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i);
  
         return 0;
  }
diff --git a/drivers/char/crasher.c b/drivers/char/crasher.c

new file mode 100644 (file)

index 0000000..ec35de2
--- /dev/null
+++ b/drivers/char/crasher.c
@@ -0,0 +1,228 @@
+/*
+ * crasher.c, it breaks things
+ */
+
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+
+/* Set to 1 by crasher_exit(); mem_verify() loops until it becomes non-zero. */
+static int module_exiting;
+/*
+ * Completed twice by every tester thread: once on start-up (waited for in
+ * crasher_init()) and once after mem_verify() returns (waited for in
+ * crasher_exit()).
+ */
+static struct completion startup = COMPLETION_INITIALIZER(startup);
+/* Generator state used by crasher_random(). */
+static unsigned long rand_seed = 152L;
+/* "seed" module parameter; mixed into rand_seed by crasher_init(). */
+static unsigned long seed = 152L;
+/* "threads" module parameter: number of tester threads to start. */
+static int threads = 1;
+/* Crash-test selectors, all module parameters evaluated in crasher_init(). */
+static bool call_panic, call_bug, call_warn;
+static bool trap_null, call_null, jump_null;
+/* Non-zero values are used as the address to read, write, call or jump to. */
+static long trap_read, trap_write, call_bad, jump_bad;
+
+/*
+ * Load-time parameters.  Every parameter is registered with a permission
+ * argument of 0.  crasher_init() evaluates the crash-test options in the
+ * order call_panic, call_bug, call_warn, trap_null, trap_read, trap_write,
+ * call_null, call_bad, jump_null, jump_bad.
+ */
+module_param(seed, ulong, 0);
+module_param(call_panic, bool, 0);
+module_param(call_bug, bool, 0);
+module_param(call_warn, bool, 0);
+module_param(trap_null, bool, 0);
+module_param(trap_read, long, 0);
+module_param(trap_write, long, 0);
+module_param(call_null, bool, 0);
+module_param(call_bad, long, 0);
+module_param(jump_null, bool, 0);
+module_param(jump_bad, long, 0);
+module_param(threads, int, 0);
+MODULE_PARM_DESC(seed, "random seed for memory tests");
+MODULE_PARM_DESC(call_panic, "test option. call panic() and render the system unusable.");
+MODULE_PARM_DESC(call_bug, "test option. call BUG() and render the system unusable.");
+MODULE_PARM_DESC(call_warn, "test option. call WARN() and leave the system usable.");
+MODULE_PARM_DESC(trap_null, "test option. dereference a NULL pointer to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(trap_read, "test option. read from an invalid address to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(trap_write, "test option. write to an invalid address to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(call_null, "test option. call a NULL pointer to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(call_bad, "test option. call an invalid address to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(jump_null, "test option. jump to a NULL pointer to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(jump_bad, "test option. jump to an invalid address to simulate a crash and render the system unusable.");
+MODULE_PARM_DESC(threads, "number of threads to run");
+MODULE_LICENSE("GPL");
+
+/* Number of buffer slots each tester thread juggles in mem_verify(). */
+#define NUM_ALLOC 24
+/* Number of entries in sizes[]; must be kept in step with the table. */
+#define NUM_SIZES 8
+/* Allocation sizes, in bytes, that mem_verify() picks from at random. */
+static int sizes[]  = { 32, 64, 128, 192, 256, 1024, 2048, 4096 };
+
+/*
+ * One slot of a tester thread's table.  A slot with size == 0 is empty.
+ * buf may be NULL while size is non-zero if the allocation failed.
+ */
+struct mem_buf {
+       char *buf;
+       int size;
+};
+
+/*
+ * Step the linear congruential state (multiplier 69069, increment 1) and
+ * return the new state XORed with the current jiffies counter.  No locking
+ * is done on rand_seed here.
+ */
+static unsigned long crasher_random(void)
+{
+       rand_seed = 1 + 69069L * rand_seed;
+
+       return jiffies ^ rand_seed;
+}
+
+/*
+ * Fold @entropy into the generator state and then step the generator
+ * once; the value produced by that step is thrown away.
+ */
+void crasher_srandom(unsigned long entropy)
+{
+       rand_seed = rand_seed ^ entropy;
+       (void)crasher_random();
+}
+
+/*
+ * Allocate @size bytes with kmalloc(GFP_KERNEL) and fill them with the
+ * byte pattern (offset % 119) + 8, which mem_check() verifies later.
+ * Returns the buffer, or NULL if the allocation failed.
+ */
+static char *mem_alloc(int size)
+{
+       char *buf = kmalloc(size, GFP_KERNEL);
+       int off = 0;
+
+       if (buf) {
+               while (off < size) {
+                       buf[off] = (off % 119) + 8;
+                       off++;
+               }
+       }
+       return buf;
+}
+
+/*
+ * Check that the @size bytes at @p still carry the pattern written by
+ * mem_alloc() and log every byte that does not at KERN_CRIT.  The buffer
+ * is zeroed afterwards.  A NULL @p is silently ignored.
+ */
+static void mem_check(char *p, int size)
+{
+       int off;
+
+       if (p == NULL)
+               return;
+
+       for (off = 0; off < size; off++) {
+               int want = (off % 119) + 8;
+
+               if (p[off] == want)
+                       continue;
+               printk(KERN_CRIT "verify error at %lX offset %d "
+                      " wanted %d found %d size %d\n",
+                      (unsigned long)(p + off), off, want,
+                      p[off], size);
+       }
+       /*
+        * Wipe the pattern: try and trigger slab poisoning for people
+        * using this buffer wrong.
+        */
+       memset(p, 0, size);
+}
+
+/*
+ * Work loop of one tester thread.  Until module_exiting is set, pick a
+ * random slot: an empty slot gets a freshly allocated, pattern-filled
+ * buffer of a random size from sizes[]; an occupied slot is verified and
+ * freed.  The thread then sleeps for a random time below HZ / 10 jiffies.
+ * On the way out every slot that is still occupied is verified and freed.
+ */
+static void mem_verify(void)
+{
+       struct mem_buf bufs[NUM_ALLOC];
+       struct mem_buf *slot;
+       unsigned long delay;
+       int i;
+
+       memset(bufs, 0, sizeof(bufs));
+
+       while (!module_exiting) {
+               slot = &bufs[crasher_random() % NUM_ALLOC];
+               if (!slot->size) {
+                       /*
+                        * The size is recorded even when mem_alloc()
+                        * fails, so the slot counts as occupied with a
+                        * NULL buffer.
+                        */
+                       slot->size = sizes[crasher_random() % NUM_SIZES];
+                       slot->buf = mem_alloc(slot->size);
+               } else {
+                       mem_check(slot->buf, slot->size);
+                       kfree(slot->buf);
+                       slot->buf = NULL;
+                       slot->size = 0;
+               }
+
+               delay = crasher_random() % (HZ / 10);
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(delay);
+               set_current_state(TASK_RUNNING);
+       }
+
+       for (i = 0; i < NUM_ALLOC; i++) {
+               slot = &bufs[i];
+               if (!slot->size)
+                       continue;
+               mem_check(slot->buf, slot->size);
+               kfree(slot->buf);
+       }
+}
+
+/*
+ * Entry point of each tester thread started by crasher_init().  The
+ * argument is not used.  Always returns 0.
+ */
+static int crasher_thread(void *unused)
+{
+       daemonize("crasher");
+       /* First completion: tells crasher_init() this thread is running. */
+       complete(&startup);
+       /* Returns only after crasher_exit() has set module_exiting. */
+       mem_verify();
+       /* Second completion: tells crasher_exit() this thread is done. */
+       complete(&startup);
+       return 0;
+}
+
+/*
+ * Module entry point.
+ *
+ * Seeds the generator from the "seed" parameter, then runs the first
+ * crash test that was requested, checked in this order: call_panic,
+ * call_bug, call_warn, trap_null, trap_read, trap_write, call_null,
+ * call_bad, jump_null, jump_bad.  The return statement behind each test
+ * is only reached if the test did not stop execution.
+ *
+ * If no crash test was requested, "threads" slab tester threads are
+ * started and the function waits until each of them has signalled that it
+ * is running.  When a thread cannot be created, no further threads are
+ * started and "threads" is lowered to the number actually running, so
+ * that neither this function nor crasher_exit() waits for a completion
+ * that will never arrive.
+ *
+ * Returns 0 once the tester threads are running, the error reported by
+ * kernel_thread() if not a single thread could be started, and otherwise
+ * whatever the selected crash test returns.
+ */
+static int __init crasher_init(void)
+{
+       int i;
+       int started = 0;
+       int err = 0;
+
+       init_completion(&startup);
+       crasher_srandom(seed);
+
+       if (call_panic) {
+               panic("test panic from crasher module. Good Luck.\n");
+               return -EFAULT;
+       }
+       if (call_bug) {
+               printk("triggering BUG\n");
+               BUG_ON(1);
+               return -EFAULT;
+       }
+       if (WARN(call_warn, "triggering WARN\n"))
+               return -EFAULT;
+
+       if (trap_null) {
+               volatile char *p = NULL;
+               printk("dereferencing NULL pointer.\n");
+               p[0] = '\n';
+               return -EFAULT;
+       }
+       if (trap_read) {
+               const volatile char *p = (char *)trap_read;
+               printk("reading from invalid(?) address %p.\n", p);
+               /* The result depends on the byte read, to force the load. */
+               return p[0] ? -EFAULT : -EACCES;
+       }
+       if (trap_write) {
+               volatile char *p = (char *)trap_write;
+               printk("writing to invalid(?) address %p.\n", p);
+               p[0] = ' ';
+               return -EFAULT;
+       }
+
+       if (call_null) {
+               void(*f)(void) = NULL;
+               printk("calling NULL pointer.\n");
+               f();
+               return -EFAULT;
+       }
+       if (call_bad) {
+               void(*f)(void) = (void(*)(void))call_bad;
+               printk("calling invalid(?) address %p.\n", f);
+               f();
+               return -EFAULT;
+       }
+
+       /* These two depend on the compiler doing tail call optimization. */
+       if (jump_null) {
+               int(*f)(void) = NULL;
+               printk("jumping to NULL.\n");
+               return f();
+       }
+       if (jump_bad) {
+               int(*f)(void) = (int(*)(void))jump_bad;
+               printk("jumping to invalid(?) address %p.\n", f);
+               return f();
+       }
+
+       printk("crasher module (%d threads).  Testing sizes: ", threads);
+       for (i = 0 ; i < NUM_SIZES ; i++)
+               printk("%d ", sizes[i]);
+       printk("\n");
+
+       for (i = 0 ; i < threads ; i++) {
+               int pid = kernel_thread(crasher_thread, crasher_thread,
+                                       CLONE_FS | CLONE_FILES);
+
+               if (pid < 0) {
+                       printk(KERN_ERR "crasher: could not start thread %d (error %d)\n",
+                              i, pid);
+                       err = pid;
+                       break;
+               }
+               started++;
+       }
+
+       /*
+        * Every wait below and in crasher_exit() needs a live thread to
+        * complete it; never wait for more threads than were created.
+        */
+       if (threads > started)
+               threads = started;
+
+       for (i = 0 ; i < threads ; i++)
+               wait_for_completion(&startup);
+
+       /* Nothing is running, so there is no reason to stay loaded. */
+       if (err < 0 && !started)
+               return err;
+       return 0;
+}
+
+/*
+ * Module exit: ask the tester threads to stop by setting module_exiting,
+ * then wait for one completion per thread before logging that all of
+ * them are done.
+ */
+static void __exit crasher_exit(void)
+{
+       int pending;
+
+       module_exiting = 1;
+
+       for (pending = threads; pending > 0; pending--)
+               wait_for_completion(&startup);
+
+       printk("all crasher threads done\n");
+}
+
+module_init(crasher_init);
+module_exit(crasher_exit);
diff --git a/drivers/char/lp.c b/drivers/char/lp.c

index a741e41..70ac62c 100644 (file)
--- a/drivers/char/lp.c
+++ b/drivers/char/lp.c
@@ -622,9 +622,12 @@ static int lp_do_ioctl(unsigned int minor, unsigned int cmd,
                                 return -EFAULT;
                         break;
                 case LPGETSTATUS:
+                       if (mutex_lock_interruptible(&lp_table[minor].port_mutex))
+                               return -EINTR;
                         lp_claim_parport_or_block (&lp_table[minor]);
                         status = r_str(minor);
                         lp_release_parport (&lp_table[minor]);
+                       mutex_unlock(&lp_table[minor].port_mutex);
  
                         if (copy_to_user(argp, &status, sizeof(int)))
                                 return -EFAULT;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c

index d6e9d08..764cfb1 100644 (file)
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -87,6 +87,7 @@ void __weak unxlate_dev_mem_ptr(unsigned long phys, void *addr)
  {
  }
  
+#ifndef ARCH_HAS_DEV_MEM
  /*
   * This funcion reads the *physical* memory. The f_pos points directly to the
   * memory location.
@@ -209,6 +210,7 @@ static ssize_t write_mem(struct file *file, const char __user *buf,
         *ppos += written;
         return written;
  }
+#endif
  
  int __weak phys_mem_access_prot_allowed(struct file *file,
         unsigned long pfn, unsigned long size, pgprot_t *vma_prot)
@@ -335,6 +337,9 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma)
  static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
  {
         unsigned long pfn;
+#ifdef CONFIG_XEN
+       unsigned long i, count;
+#endif
  
         /* Turn a kernel-virtual address into a physical page frame */
         pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;
@@ -349,6 +354,13 @@ static int mmap_kmem(struct file *file, struct vm_area_struct *vma)
         if (!pfn_valid(pfn))
                 return -EIO;
  
+#ifdef CONFIG_XEN
+       count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       for (i = 0; i < count; i++)
+               if ((pfn + i) != mfn_to_local_pfn(pfn_to_mfn(pfn + i)))
+                       return -EIO;
+#endif
+
         vma->vm_pgoff = pfn;
         return mmap_mem(file, vma);
  }
@@ -740,6 +752,7 @@ static int open_port(struct inode * inode, struct file * filp)
  #define open_kmem      open_mem
  #define open_oldmem    open_mem
  
+#ifndef ARCH_HAS_DEV_MEM
  static const struct file_operations mem_fops = {
         .llseek         = memory_lseek,
         .read           = read_mem,
@@ -748,6 +761,9 @@ static const struct file_operations mem_fops = {
         .open           = open_mem,
         .get_unmapped_area = get_unmapped_area_mem,
  };
+#else
+extern const struct file_operations mem_fops;
+#endif
  
  #ifdef CONFIG_DEVKMEM
  static const struct file_operations kmem_fops = {
diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig

index a048199..28bfb7f 100644 (file)
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -62,4 +62,13 @@ config TCG_INFINEON
           Further information on this driver and the supported hardware
           can be found at http://www.trust.rub.de/projects/linux-device-driver-infineon-tpm/ 
  
+config TCG_XEN
+       tristate "XEN TPM Interface"
+       depends on XEN
+       ---help---
+         If you want to make TPM support available to a Xen user domain,
+         say Yes and it will be accessible from within Linux.
+         To compile this driver as a module, choose M here; the module
+         will be called tpm_xenu.
+
  endif # TCG_TPM
diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile

index ea3a1e0..b5cea0a 100644 (file)
--- a/drivers/char/tpm/Makefile
+++ b/drivers/char/tpm/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_TCG_TIS) += tpm_tis.o
  obj-$(CONFIG_TCG_NSC) += tpm_nsc.o
  obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o
  obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o
+obj-$(CONFIG_TCG_XEN) += tpm_xenu.o
+tpm_xenu-y = tpm_xen.o tpm_vtpm.o
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h

index b1c5280..35312ba 100644 (file)
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -122,6 +122,9 @@ struct tpm_chip {
         struct dentry **bios_dir;
  
         struct list_head list;
+#ifdef CONFIG_XEN
+       void *priv;
+#endif
         void (*release) (struct device *);
  };
  
@@ -286,6 +289,18 @@ struct tpm_cmd_t {
  
  ssize_t        tpm_getcap(struct device *, __be32, cap_t *, const char *);
  
+#ifdef CONFIG_XEN
+static inline void *chip_get_private(const struct tpm_chip *chip)
+{
+       return chip->priv;
+}
+
+static inline void chip_set_private(struct tpm_chip *chip, void *priv)
+{
+       chip->priv = priv;
+}
+#endif
+
  extern int tpm_get_timeouts(struct tpm_chip *);
  extern void tpm_gen_interrupt(struct tpm_chip *);
  extern int tpm_do_selftest(struct tpm_chip *);
diff --git a/drivers/char/tpm/tpm_vtpm.c b/drivers/char/tpm/tpm_vtpm.c

new file mode 100644 (file)

index 0000000..4b865f4
--- /dev/null
+++ b/drivers/char/tpm/tpm_vtpm.c
@@ -0,0 +1,543 @@
+/*
+ * Copyright (C) 2006 IBM Corporation
+ *
+ * Authors:
+ * Stefan Berger <stefanb@us.ibm.com>
+ *
+ * Generic device driver part for device drivers in a virtualized
+ * environment.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include "tpm.h"
+#include "tpm_vtpm.h"
+
+/*
+ * read status bits
+ * NOTE(review): the users of these values are not in this part of the
+ * file; presumably they are reported through the chip's status callback --
+ * confirm.
+ */
+enum {
+       STATUS_BUSY = 0x01,
+       STATUS_DATA_AVAIL = 0x02,
+       STATUS_READY = 0x04
+};
+
+/*
+ * One request/response exchange with the lower layer driver.  Both
+ * buffers are owned by the transmission and released by
+ * transmission_free().
+ */
+struct transmission {
+       /* Linkage in the vtpm_state queued_requests list. */
+       struct list_head next;
+
+       /* Request payload: bytes in use and allocated capacity. */
+       unsigned char *request;
+       size_t  request_len;
+       size_t  request_buflen;
+
+       /* Response payload: bytes in use and allocated capacity. */
+       unsigned char *response;
+       size_t  response_len;
+       size_t  response_buflen;
+
+       /* TRANSMISSION_FLAG_* bits. */
+       unsigned int flags;
+};
+
+/*
+ * Bits for struct transmission flags.
+ * NOTE(review): presumably marks a transmission that went through the
+ * request queue; the code setting it is not in this part of the file --
+ * confirm.
+ */
+enum {
+       TRANSMISSION_FLAG_WAS_QUEUED = 0x1
+};
+
+
+/*
+ * Bits for struct vtpm_state flags.  QUEUED_ONLY is set by vtpm_send()
+ * when a request was queued instead of sent; vtpm_recv() clears it and
+ * returns a zero-filled dummy response in that case.
+ */
+enum {
+       DATAEX_FLAG_QUEUED_ONLY = 0x1
+};
+
+
+/* local variables */
+
+/* local function prototypes */
+static int _vtpm_send_queued(struct tpm_chip *chip);
+
+
+/* =============================================================
+ * Some utility functions
+ * =============================================================
+ */
+/*
+ * Bring a vtpm_state into its idle state: locks, wait queues and the
+ * request queue are initialised, no request or response is outstanding,
+ * and the disconnect time is set to the current jiffies value.
+ */
+static void vtpm_state_init(struct vtpm_state *vtpms)
+{
+       /* Request side. */
+       spin_lock_init(&vtpms->req_list_lock);
+       init_waitqueue_head(&vtpms->req_wait_queue);
+       INIT_LIST_HEAD(&vtpms->queued_requests);
+       vtpms->current_request = NULL;
+
+       /* Response side. */
+       spin_lock_init(&vtpms->resp_list_lock);
+       init_waitqueue_head(&vtpms->resp_wait_queue);
+       vtpms->current_response = NULL;
+
+       vtpms->disconnect_time = jiffies;
+}
+
+
+/*
+ * Allocate a zero-filled transmission with GFP_ATOMIC.
+ * Returns NULL if the allocation fails.
+ */
+static inline struct transmission *transmission_alloc(void)
+{
+       struct transmission *t = kzalloc(sizeof(*t), GFP_ATOMIC);
+
+       return t;
+}
+
+/*
+ * Copy @len bytes from @buffer into the request buffer of @t.  The
+ * existing buffer is reused when its capacity suffices; otherwise it is
+ * freed and a new one of exactly @len bytes is allocated with GFP_KERNEL.
+ * Returns the request buffer, or NULL if the allocation failed, in which
+ * case @t is left without a request buffer and with a capacity of 0.
+ */
+static unsigned char *
+transmission_set_req_buffer(struct transmission *t,
+                            unsigned char *buffer, size_t len)
+{
+       if (len > t->request_buflen) {
+               /* Too small: drop the old buffer and get a bigger one. */
+               kfree(t->request);
+               t->request = kmalloc(len, GFP_KERNEL);
+               t->request_buflen = t->request ? len : 0;
+               if (t->request == NULL)
+                       return NULL;
+       }
+
+       t->request_len = len;
+       memcpy(t->request, buffer, len);
+
+       return t->request;
+}
+
+/*
+ * Copy @len bytes from @buffer into the response buffer of @t.  The
+ * existing buffer is reused when its capacity suffices; otherwise it is
+ * freed and a new one of exactly @len bytes is allocated with GFP_ATOMIC.
+ * Returns the response buffer, or NULL if the allocation failed, in which
+ * case @t is left without a response buffer and with a capacity of 0.
+ */
+static unsigned char *
+transmission_set_res_buffer(struct transmission *t,
+                            const unsigned char *buffer, size_t len)
+{
+       if (len > t->response_buflen) {
+               /* Too small: drop the old buffer and get a bigger one. */
+               kfree(t->response);
+               t->response = kmalloc(len, GFP_ATOMIC);
+               t->response_buflen = t->response ? len : 0;
+               if (t->response == NULL)
+                       return NULL;
+       }
+
+       t->response_len = len;
+       memcpy(t->response, buffer, len);
+
+       return t->response;
+}
+
+/*
+ * Release a transmission together with its request and response
+ * buffers.  @t itself must not be NULL, it is dereferenced.
+ */
+static inline void transmission_free(struct transmission *t)
+{
+       kfree(t->response);
+       kfree(t->request);
+       kfree(t);
+}
+
+/* =============================================================
+ * Interface with the lower layer driver
+ * =============================================================
+ */
+/*
+ * Lower layer uses this function to make a response available.
+ *
+ * @chip:   chip whose private data is the vtpm_state
+ * @buffer: response data; it is copied, the caller keeps ownership
+ * @count:  number of bytes in @buffer
+ * @ptr:    token identifying the request this response belongs to; it
+ *          must equal the outstanding current_request
+ *
+ * On a match the outstanding request is freed, the response is stored as
+ * current_response and sleepers on resp_wait_queue are woken.
+ *
+ * Returns @count if the response was stored, 0 if @ptr does not match the
+ * outstanding request, and -ENOMEM if no memory was available for the
+ * response (the outstanding request has been dropped by then).
+ */
+int vtpm_vd_recv(const struct tpm_chip *chip,
+                 const unsigned char *buffer, size_t count,
+                 void *ptr)
+{
+       unsigned long flags;
+       int ret_size = 0;
+       struct transmission *t;
+       struct vtpm_state *vtpms = chip_get_private(chip);
+
+       /*
+        * The list with requests must contain one request
+        * only and the element there must be the one that
+        * was passed to me from the front-end.
+        */
+       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+       if (vtpms->current_request != ptr)
+               goto out;
+
+       if (vtpms->current_request) {
+               transmission_free(vtpms->current_request);
+               vtpms->current_request = NULL;
+       }
+
+       t = transmission_alloc();
+       if (!t) {
+               /* Report the failure like the buffer allocation below. */
+               ret_size = -ENOMEM;
+               goto out;
+       }
+       if (!transmission_set_res_buffer(t, buffer, count)) {
+               transmission_free(t);
+               ret_size = -ENOMEM;
+               goto out;
+       }
+
+       /* Do not leak a response that nobody picked up. */
+       if (vtpms->current_response)
+               transmission_free(vtpms->current_response);
+
+       ret_size = count;
+       vtpms->current_response = t;
+       wake_up_interruptible(&vtpms->resp_wait_queue);
+out:
+       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+
+       return ret_size;
+}
+
+
+/*
+ * Lower layer indicates its status (connected/disconnected).
+ * The status is stored; whenever it lacks TPM_VD_STATUS_CONNECTED the
+ * current jiffies value is recorded as the disconnect time.
+ */
+void vtpm_vd_status(const struct tpm_chip *chip, u8 vd_status)
+{
+       struct vtpm_state *vtpms = chip_get_private(chip);
+
+       vtpms->vd_status = vd_status;
+       if (!(vtpms->vd_status & TPM_VD_STATUS_CONNECTED))
+               vtpms->disconnect_time = jiffies;
+}
+
+/* =============================================================
+ * Interface with the generic TPM driver
+ * =============================================================
+ */
+/*
+ * Hand the response to the last request to the generic TPM driver.
+ *
+ * @chip:  chip whose private data is the vtpm_state
+ * @buf:   destination buffer
+ * @count: capacity of @buf in bytes
+ *
+ * If the preceding send only queued the request, the queued-only flag is
+ * cleared and a dummy response of min(@count, 30) zero bytes is returned.
+ * Otherwise the function waits up to 1000 jiffies for the lower layer to
+ * deliver a response through vtpm_vd_recv().
+ *
+ * Returns the number of bytes stored in @buf, or 0 if no response was
+ * available when the wait ended.
+ */
+static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+       int rc = 0;
+       unsigned long flags;
+       struct vtpm_state *vtpms;
+
+       vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+       /*
+        * Check if the previous operation only queued the command
+        * In this case there won't be a response, so I just
+        * return from here and reset that flag. In any other
+        * case I should receive a response from the back-end.
+        */
+       spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+       if ((vtpms->flags & DATAEX_FLAG_QUEUED_ONLY) != 0) {
+               vtpms->flags &= ~DATAEX_FLAG_QUEUED_ONLY;
+               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+               /*
+                * The first few commands (measurements) must be
+                * queued since it might not be possible to talk to the
+                * TPM, yet.
+                * Return a response of up to 30 '0's.
+                */
+
+               count = min_t(size_t, count, 30);
+               memset(buf, 0x0, count);
+               return count;
+       }
+       /*
+        * Check whether something is in the responselist and if
+        * there's nothing in the list wait for something to appear.
+        */
+
+       if (!vtpms->current_response) {
+               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+               /*
+                * The condition is re-tested after this task is on the
+                * wait queue, so a response delivered right after the
+                * unlock above is not missed.  The result of the wait is
+                * not needed: the response pointer is checked again
+                * under the lock below.
+                */
+               wait_event_interruptible_timeout(vtpms->resp_wait_queue,
+                                                vtpms->current_response != NULL,
+                                                1000);
+               spin_lock_irqsave(&vtpms->resp_list_lock, flags);
+       }
+
+       if (vtpms->current_response) {
+               struct transmission *t = vtpms->current_response;
+               vtpms->current_response = NULL;
+               /* Never copy more than the caller's buffer can hold. */
+               rc = min(count, t->response_len);
+               memcpy(buf, t->response, rc);
+               transmission_free(t);
+       }
+
+       spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+       return rc;
+}
+
+/*
+ * Send a request to the back-end, or queue it while the back-end is
+ * not (yet) able to take it.
+ *
+ * A transmission is allocated for the request and any stale current
+ * request is dropped. Then, depending on the connection state:
+ *  - disconnected for more than 10 seconds: fail with -ENOENT;
+ *  - disconnected for less than that: copy the request and queue it;
+ *  - connected: flush the queue with _vtpm_send_queued(), then pass the
+ *    request to vtpm_vd_send(); if either step fails the request is
+ *    queued instead.
+ *
+ * Returns the value of vtpm_vd_send() if that was called (note that a
+ * negative value is returned even though the request was then queued),
+ * 0 if the request was only queued, -ENOENT or -ENOMEM.
+ */
+static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count)
+{
+       int rc = 0;
+       unsigned long flags;
+       struct transmission *t = transmission_alloc();
+       struct vtpm_state *vtpms;
+
+       vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+       if (!t)
+               return -ENOMEM;
+       /*
+        * If there's a current request, it must be the
+        * previous request that has timed out.
+        */
+       spin_lock_irqsave(&vtpms->req_list_lock, flags);
+       if (vtpms->current_request != NULL) {
+               printk("WARNING: Sending although there is a request outstanding.\n"
+                      "         Previous request must have timed out.\n");
+               transmission_free(vtpms->current_request);
+               vtpms->current_request = NULL;
+       }
+       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+       /*
+        * Queue the packet if the driver below is not
+        * ready, yet, or there is any packet already
+        * in the queue.
+        * If the driver below is ready, unqueue all
+        * packets first before sending our current
+        * packet.
+        * For each unqueued packet, except for the
+        * last (=current) packet, call the function
+        * tpm_xen_recv to wait for the response to come
+        * back.
+        */
+       if ((vtpms->vd_status & TPM_VD_STATUS_CONNECTED) == 0) {
+               if (time_after(jiffies,
+                              vtpms->disconnect_time + HZ * 10)) {
+                       /*
+                        * The request is neither sent nor queued on
+                        * this path, so nothing else will ever free
+                        * the transmission: release it here.
+                        */
+                       transmission_free(t);
+                       rc = -ENOENT;
+               } else {
+                       goto queue_it;
+               }
+       } else {
+               /*
+                * Send all queued packets.
+                */
+               if (_vtpm_send_queued(chip) == 0) {
+
+                       vtpms->current_request = t;
+
+                       rc = vtpm_vd_send(vtpms->tpm_private,
+                                         buf,
+                                         count,
+                                         t);
+                       /*
+                        * The generic TPM driver will call
+                        * the function to receive the response.
+                        */
+                       if (rc < 0) {
+                               vtpms->current_request = NULL;
+                               goto queue_it;
+                       }
+               } else {
+queue_it:
+                       if (!transmission_set_req_buffer(t, buf, count)) {
+                               transmission_free(t);
+                               rc = -ENOMEM;
+                               goto exit;
+                       }
+                       /*
+                        * An error occurred. Don't even try
+                        * to send the current request. Just
+                        * queue it.
+                        */
+                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
+                       vtpms->flags |= DATAEX_FLAG_QUEUED_ONLY;
+                       list_add_tail(&t->next, &vtpms->queued_requests);
+                       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+               }
+       }
+
+exit:
+       return rc;
+}
+
+
+/*
+ * Send all queued requests.
+ *
+ * Each queued transmission is taken off queued_requests, made the
+ * current request and passed to vtpm_vd_send(); its response is then
+ * read into a throw-away one-byte buffer with vtpm_recv().
+ *
+ * Returns 0 when the queue was drained and 1 when a send failed. On
+ * failure the transmission is put back at the head of the queue,
+ * provided it is still recorded as the current request.
+ */
+static int _vtpm_send_queued(struct tpm_chip *chip)
+{
+       int rc;
+       int error = 0;
+       unsigned long flags;
+       unsigned char buffer[1];
+       struct vtpm_state *vtpms;
+       vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+       spin_lock_irqsave(&vtpms->req_list_lock, flags);
+
+       while (!list_empty(&vtpms->queued_requests)) {
+               /*
+                * Need to dequeue them.
+                * Read the result into a dummy buffer.
+                * NOTE(review): the cast treats the list node as the
+                * transmission itself, so it presumably relies on 'next'
+                * being the first member of struct transmission - confirm.
+                */
+               struct transmission *qt = (struct transmission *)
+                                         vtpms->queued_requests.next;
+               list_del(&qt->next);
+               vtpms->current_request = qt;
+               /* The lock is not held across the send and the receive. */
+               spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+               rc = vtpm_vd_send(vtpms->tpm_private,
+                                 qt->request,
+                                 qt->request_len,
+                                 qt);
+
+               if (rc < 0) {
+                       /*
+                        * Re-take the lock before leaving the loop so
+                        * that the unlock below stays balanced.
+                        */
+                       spin_lock_irqsave(&vtpms->req_list_lock, flags);
+                       if ((qt = vtpms->current_request) != NULL) {
+                               /*
+                                * requeue it at the beginning
+                                * of the list
+                                */
+                               list_add(&qt->next,
+                                        &vtpms->queued_requests);
+                       }
+                       vtpms->current_request = NULL;
+                       error = 1;
+                       break;
+               }
+               /*
+                * After this point qt is not valid anymore!
+                * It is freed when the front-end is delivering
+                * the data by calling tpm_recv
+                */
+               /*
+                * Receive response into provided dummy buffer.
+                * The return value is stored in rc but not examined.
+                */
+               rc = vtpm_recv(chip, buffer, sizeof(buffer));
+               spin_lock_irqsave(&vtpms->req_list_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&vtpms->req_list_lock, flags);
+
+       return error;
+}
+
+/*
+ * Cancel handler installed in tpm_vtpm.cancel.
+ *
+ * If a request is outstanding and no response has arrived yet, sleep on
+ * resp_wait_queue (without a timeout) until woken or interrupted by a
+ * signal. Afterwards any pending response is discarded and freed.
+ */
+static void vtpm_cancel(struct tpm_chip *chip)
+{
+       unsigned long flags;
+       struct vtpm_state *vtpms = (struct vtpm_state *)chip_get_private(chip);
+
+       spin_lock_irqsave(&vtpms->resp_list_lock,flags);
+
+       if (!vtpms->current_response && vtpms->current_request) {
+               /* The spinlock must not be held while sleeping. */
+               spin_unlock_irqrestore(&vtpms->resp_list_lock, flags);
+               interruptible_sleep_on(&vtpms->resp_wait_queue);
+               spin_lock_irqsave(&vtpms->resp_list_lock,flags);
+       }
+
+       /* Throw away the response, if one is there by now. */
+       if (vtpms->current_response) {
+               struct transmission *t = vtpms->current_response;
+               vtpms->current_response = NULL;
+               transmission_free(t);
+       }
+
+       spin_unlock_irqrestore(&vtpms->resp_list_lock,flags);
+}
+
+/*
+ * Status handler installed in tpm_vtpm.status.
+ *
+ * Reports, under resp_list_lock:
+ *  - STATUS_DATA_AVAIL if a response is pending, or if the last packet
+ *    was only queued (a fake "data available" that gets the generic TPM
+ *    layer to call the receive function);
+ *  - STATUS_READY if there is neither a response nor a request;
+ *  - 0 while a request is outstanding without a response.
+ */
+static u8 vtpm_status(struct tpm_chip *chip)
+{
+       struct vtpm_state *state = (struct vtpm_state *)chip_get_private(chip);
+       unsigned long irqflags;
+       u8 status;
+
+       spin_lock_irqsave(&state->resp_list_lock, irqflags);
+
+       if (state->current_response)
+               status = STATUS_DATA_AVAIL;
+       else if (state->flags & DATAEX_FLAG_QUEUED_ONLY)
+               status = STATUS_DATA_AVAIL;
+       else if (state->current_request)
+               status = 0;
+       else
+               status = STATUS_READY;
+
+       spin_unlock_irqrestore(&state->resp_list_lock, irqflags);
+
+       return status;
+}
+
+/*
+ * File operations of the vTPM character device. Every handler is one of
+ * the tpm_* functions; nothing is overridden here. This table is
+ * referenced from tpm_vtpm.miscdev.fops.
+ */
+static struct file_operations vtpm_ops = {
+       .owner = THIS_MODULE,
+       .llseek = no_llseek,
+       .open = tpm_open,
+       .read = tpm_read,
+       .write = tpm_write,
+       .release = tpm_release,
+};
+
+/*
+ * sysfs attributes of the vTPM device. All are read-only and backed by
+ * the tpm_show_* helpers, except "cancel", which is write-only for
+ * owner and group and backed by tpm_store_cancel.
+ */
+static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL);
+static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL);
+static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL);
+static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL);
+static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL);
+static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated,
+                  NULL);
+static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL);
+static DEVICE_ATTR(cancel, S_IWUSR |S_IWGRP, NULL, tpm_store_cancel);
+
+/* NULL-terminated list of the attributes declared above. */
+static struct attribute *vtpm_attrs[] = {
+       &dev_attr_pubek.attr,
+       &dev_attr_pcrs.attr,
+       &dev_attr_enabled.attr,
+       &dev_attr_active.attr,
+       &dev_attr_owned.attr,
+       &dev_attr_temp_deactivated.attr,
+       &dev_attr_caps.attr,
+       &dev_attr_cancel.attr,
+       NULL,
+};
+
+/* Attribute group referenced from tpm_vtpm.attr_group. */
+static struct attribute_group vtpm_attr_grp = { .attrs = vtpm_attrs };
+
+/* Ten minutes, expressed in jiffies. */
+#define TPM_LONG_TIMEOUT   (10 * 60 * HZ)
+
+/*
+ * Vendor-specific description passed to tpm_register_hardware() by
+ * init_vtpm(): the recv/send/cancel/status handlers of this file, the
+ * sysfs attribute group and the character device operations.
+ * All three duration entries are set to TPM_LONG_TIMEOUT.
+ * NOTE(review): the exact meaning of the req_* masks and of the
+ * duration slots is defined by the generic TPM layer - confirm there.
+ */
+static struct tpm_vendor_specific tpm_vtpm = {
+       .recv = vtpm_recv,
+       .send = vtpm_send,
+       .cancel = vtpm_cancel,
+       .status = vtpm_status,
+       .req_complete_mask = STATUS_BUSY | STATUS_DATA_AVAIL,
+       .req_complete_val  = STATUS_DATA_AVAIL,
+       .req_canceled = STATUS_READY,
+       .attr_group = &vtpm_attr_grp,
+       .miscdev = {
+               .fops = &vtpm_ops,
+       },
+       .duration = {
+               TPM_LONG_TIMEOUT,
+               TPM_LONG_TIMEOUT,
+               TPM_LONG_TIMEOUT,
+       },
+};
+
+/*
+ * Create the vTPM state for a device and register it with the generic
+ * TPM layer.
+ *
+ * @dev: device the TPM chip is registered for
+ * @tp:  private data of the underlying driver, stored in the new state
+ *
+ * Returns the registered chip, ERR_PTR(-ENOMEM) if the state could not
+ * be allocated, or ERR_PTR(-ENODEV) if tpm_register_hardware() returned
+ * NULL (the state is freed again in that case).
+ */
+struct tpm_chip *init_vtpm(struct device *dev,
+                           struct tpm_private *tp)
+{
+       struct vtpm_state *state;
+       struct tpm_chip *chip;
+
+       state = kzalloc(sizeof(*state), GFP_KERNEL);
+       if (state == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       vtpm_state_init(state);
+       state->tpm_private = tp;
+
+       chip = tpm_register_hardware(dev, &tpm_vtpm);
+       if (chip == NULL) {
+               kfree(state);
+               return ERR_PTR(-ENODEV);
+       }
+
+       chip_set_private(chip, state);
+
+       return chip;
+}
+
+/*
+ * Counterpart of init_vtpm(): unregister the chip attached to @dev and
+ * free its vtpm_state.
+ */
+void cleanup_vtpm(struct device *dev)
+{
+       struct tpm_chip *chip = dev_get_drvdata(dev);
+       /*
+        * Fetch the state pointer before the chip is removed;
+        * presumably the chip must not be touched afterwards.
+        */
+       struct vtpm_state *vtpms = (struct vtpm_state*)chip_get_private(chip);
+       tpm_remove_hardware(dev);
+       kfree(vtpms);
+}
diff --git a/drivers/char/tpm/tpm_vtpm.h b/drivers/char/tpm/tpm_vtpm.h

new file mode 100644 (file)

index 0000000..77aa342
--- /dev/null
+++ b/drivers/char/tpm/tpm_vtpm.h
@@ -0,0 +1,55 @@
+#ifndef TPM_VTPM_H
+#define TPM_VTPM_H
+
+struct tpm_chip;
+struct tpm_private;
+
+/*
+ * Per-chip state of the vTPM layer (tpm_vtpm.c). It is allocated by
+ * init_vtpm() and stored as the chip's private data.
+ */
+struct vtpm_state {
+       /* request handed to the back-end and not yet answered */
+       struct transmission *current_request;
+       /* taken around accesses to current_request and queued_requests */
+       spinlock_t           req_list_lock;
+       wait_queue_head_t    req_wait_queue;
+
+       /* requests waiting to be sent once the back-end is usable */
+       struct list_head     queued_requests;
+
+       /* response waiting to be picked up, or NULL */
+       struct transmission *current_response;
+       /* taken around accesses to current_response */
+       spinlock_t           resp_list_lock;
+       wait_queue_head_t    resp_wait_queue;     /* processes waiting for responses */
+
+       /* connection state; tested against TPM_VD_STATUS_CONNECTED */
+       u8                   vd_status;
+       /* DATAEX_FLAG_* bits, e.g. DATAEX_FLAG_QUEUED_ONLY */
+       u8                   flags;
+
+       /* jiffies value that vtpm_send() uses for its 10 second grace period */
+       unsigned long        disconnect_time;
+
+       /*
+        * The following is a private structure of the underlying
+        * driver. It is passed as parameter in the send function.
+        */
+       struct tpm_private *tpm_private;
+};
+
+
+/*
+ * Connection state reported by the underlying driver through
+ * vtpm_vd_status().
+ */
+enum vdev_status {
+       TPM_VD_STATUS_DISCONNECTED = 0x0,
+       TPM_VD_STATUS_CONNECTED = 0x1
+};
+
+/* this function is called from tpm_vtpm.c */
+int vtpm_vd_send(struct tpm_private * tp,
+                 const u8 * buf, size_t count, void *ptr);
+
+/* these functions are offered by tpm_vtpm.c */
+struct tpm_chip *init_vtpm(struct device *,
+                           struct tpm_private *);
+void cleanup_vtpm(struct device *);
+int vtpm_vd_recv(const struct tpm_chip* chip,
+                 const unsigned char *buffer, size_t count, void *ptr);
+void vtpm_vd_status(const struct tpm_chip *, u8 status);
+
+/*
+ * Look up the underlying driver's private data for a device: the chip
+ * comes from the device's driver data, the vtpm_state from the chip's
+ * private pointer. Neither pointer is checked for NULL.
+ */
+static inline struct tpm_private *tpm_private_from_dev(struct device *dev)
+{
+       struct tpm_chip *tpm = dev_get_drvdata(dev);
+       struct vtpm_state *state = chip_get_private(tpm);
+
+       return state->tpm_private;
+}
+
+#endif
diff --git a/drivers/char/tpm/tpm_xen.c b/drivers/char/tpm/tpm_xen.c

new file mode 100644 (file)

index 0000000..2548f78
--- /dev/null
+++ b/drivers/char/tpm/tpm_xen.c
@@ -0,0 +1,718 @@
+/*
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from drivers/xen/netfront/netfront.c
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <xen/evtchn.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/tpmif.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+#include "tpm.h"
+#include "tpm_vtpm.h"
+
+#undef DEBUG
+
+/* local structures */
+
+/*
+ * State of the Xen vTPM front-end. A single instance (my_priv) is
+ * shared through tpm_private_get()/tpm_private_put().
+ */
+struct tpm_private {
+       /* chip returned by init_vtpm() */
+       struct tpm_chip *chip;
+
+       /* ring page shared with the back-end */
+       tpmif_tx_interface_t *tx;
+       /* reference count; the structure is freed when it drops to zero */
+       atomic_t refcnt;
+       /* irq bound to the event channel, 0 when unbound */
+       unsigned int irq;
+       u8 is_connected;
+       /* set by tpmfront_suspend(), cleared by tpmfront_suspend_finish() */
+       u8 is_suspended;
+
+       /* taken by tpm_xmit() and tpmif_int() */
+       spinlock_t tx_lock;
+
+       /* one page-sized buffer per ring slot */
+       struct tx_buffer *tx_buffers[TPMIF_TX_RING_SIZE];
+
+       /* set by tpm_xmit(), cleared by tpmif_rx_action() */
+       atomic_t tx_busy;
+       /* opaque pointer from the sender, handed back to vtpm_vd_recv() */
+       void *tx_remember;
+
+       domid_t backend_id;
+       /* woken by tpmif_rx_action(); tpmfront_suspend() sleeps on it */
+       wait_queue_head_t wait_q;
+
+       struct xenbus_device *dev;
+       /* grant reference of the ring page, or GRANT_INVALID_REF */
+       int ring_ref;
+};
+
+/* One transmit buffer; 'data' points to a single page. */
+struct tx_buffer {
+       unsigned int size;      /* available space in data */
+       unsigned int len;       /* length recorded by tx_buffer_copy() */
+       unsigned char *data;    /* pointer to a page */
+};
+
+
+/* locally visible variables */
+
+/* head of the grant references reserved in tpmif_init() */
+static grant_ref_t gref_head;
+/* the single shared front-end instance, see tpm_private_get() */
+static struct tpm_private *my_priv;
+
+/* local function prototypes */
+static irqreturn_t tpmif_int(int irq,
+                             void *tpm_priv);
+static void tpmif_rx_action(unsigned long unused);
+static int tpmif_connect(struct xenbus_device *dev,
+                         struct tpm_private *tp,
+                         domid_t domid);
+/*
+ * The tasklet data is declared as 0 here; tpmif_int() stores the
+ * tpm_private pointer in it before scheduling the tasklet.
+ */
+static DECLARE_TASKLET(tpmif_rx_tasklet, tpmif_rx_action, 0);
+static int tpmif_allocate_tx_buffers(struct tpm_private *tp);
+static void tpmif_free_tx_buffers(struct tpm_private *tp);
+static void tpmif_set_connected_state(struct tpm_private *tp,
+                                      u8 newstate);
+static int tpm_xmit(struct tpm_private *tp,
+                    const u8 * buf, size_t count, int userbuffer,
+                    void *remember);
+static void destroy_tpmring(struct tpm_private *tp);
+/* NOTE(review): declared here, but no definition appears in this file. */
+void __exit tpmif_exit(void);
+
+/* Logging helpers; each prefixes the message with "xen_tpm_fr". */
+#define DPRINTK(fmt, args...) \
+    pr_debug("xen_tpm_fr (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...) \
+    pr_info("xen_tpm_fr: " fmt, ##args)
+#define WPRINTK(fmt, args...) \
+    pr_warning("xen_tpm_fr: " fmt, ##args)
+
+
+/*
+ * Copy at most txb->size bytes from @src into the buffer's page.
+ *
+ * @isuserbuffer selects copy_from_user() over memcpy().
+ *
+ * Returns the number of bytes copied, or -EFAULT if the user copy
+ * failed. txb->len records the requested @len, not the copied amount.
+ * NOTE(review): presumably intentional, so that the ring entry carries
+ * the remaining length - confirm against the back-end.
+ */
+static inline int
+tx_buffer_copy(struct tx_buffer *txb, const u8 *src, int len,
+               int isuserbuffer)
+{
+       int chunk = len;
+
+       /* never write past the end of the page */
+       if (len > txb->size)
+               chunk = txb->size;
+
+       if (!isuserbuffer)
+               memcpy(txb->data, src, chunk);
+       else if (copy_from_user(txb->data, src, chunk))
+               return -EFAULT;
+
+       txb->len = len;
+
+       return chunk;
+}
+
+/*
+ * Allocate a tx_buffer together with its data page.
+ *
+ * Returns the new buffer (len 0, size PAGE_SIZE), or NULL if either
+ * allocation failed.
+ */
+static inline struct tx_buffer *tx_buffer_alloc(void)
+{
+       struct tx_buffer *txb = kzalloc(sizeof(*txb), GFP_KERNEL);
+
+       if (txb == NULL)
+               return NULL;
+
+       txb->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+       if (txb->data == NULL) {
+               kfree(txb);
+               return NULL;
+       }
+
+       txb->size = PAGE_SIZE;
+       txb->len = 0;
+
+       return txb;
+}
+
+
+/* Free a tx_buffer and its data page; a NULL buffer is ignored. */
+static inline void tx_buffer_free(struct tx_buffer *txb)
+{
+       if (txb == NULL)
+               return;
+
+       free_page((unsigned long)txb->data);
+       kfree(txb);
+}
+
+/**************************************************************
+ Utility function for the tpm_private structure
+**************************************************************/
+/*
+ * Initialise a freshly allocated tpm_private: the reference count
+ * starts at 1 and the lock and the wait queue are set up.
+ */
+static void tpm_private_init(struct tpm_private *tp)
+{
+       atomic_set(&tp->refcnt, 1);
+       init_waitqueue_head(&tp->wait_q);
+       spin_lock_init(&tp->tx_lock);
+}
+
+/*
+ * Drop one reference on my_priv. When the last reference goes away the
+ * tx buffers and the structure are freed and my_priv is reset to NULL.
+ * my_priv must not be NULL when this is called.
+ */
+static void tpm_private_put(void)
+{
+       if (atomic_dec_and_test(&my_priv->refcnt)) {
+               tpmif_free_tx_buffers(my_priv);
+               kfree(my_priv);
+               my_priv = NULL;
+       }
+}
+
+/*
+ * Get a reference on the shared tpm_private, creating it on first use.
+ *
+ * Returns my_priv, or NULL if it could not be allocated. If the tx
+ * buffers cannot be allocated the new instance is dropped again through
+ * tpm_private_put(), which resets my_priv, so NULL is returned then
+ * too.
+ */
+static struct tpm_private *tpm_private_get(void)
+{
+       if (!my_priv) {
+               /* first user: create the instance */
+               my_priv = kzalloc(sizeof(*my_priv), GFP_KERNEL);
+               if (!my_priv)
+                       return NULL;
+
+               tpm_private_init(my_priv);
+               if (tpmif_allocate_tx_buffers(my_priv) < 0)
+                       tpm_private_put();
+
+               return my_priv;
+       }
+
+       atomic_inc(&my_priv->refcnt);
+
+       return my_priv;
+}
+
+/**************************************************************
+
+ The interface to let the tpm plugin register its callback
+ function and send data to another partition using this module
+
+**************************************************************/
+
+static DEFINE_MUTEX(suspend_lock);
+/*
+ * Entry point for the upper layer: transmit @count bytes from the
+ * kernel buffer @buf. @ptr is remembered and handed back with the
+ * response.
+ *
+ * suspend_lock is held for the duration of the call, so a sender blocks
+ * while the device is suspended. Returns the result of tpm_xmit().
+ */
+int vtpm_vd_send(struct tpm_private *tp,
+                 const u8 * buf, size_t count, void *ptr)
+{
+       int rc;
+
+       mutex_lock(&suspend_lock);
+       rc = tpm_xmit(tp, buf, count, 0, ptr);
+       mutex_unlock(&suspend_lock);
+
+       return rc;
+}
+
+/**************************************************************
+ XENBUS support code
+**************************************************************/
+
+/*
+ * Allocate the shared ring page, grant the back-end access to it and
+ * bind the event channel through tpmif_connect().
+ *
+ * Returns 0 on success or a negative errno. On failure the ring is torn
+ * down again with destroy_tpmring(), except when the page allocation
+ * itself fails.
+ */
+static int setup_tpmring(struct xenbus_device *dev,
+                         struct tpm_private *tp)
+{
+       tpmif_tx_interface_t *sring;
+       int err;
+
+       tp->ring_ref = GRANT_INVALID_REF;
+
+       sring = (void *)__get_free_page(GFP_KERNEL);
+       if (!sring) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+               return -ENOMEM;
+       }
+       tp->tx = sring;
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(tp->tx));
+       if (err < 0) {
+               /*
+                * No grant exists yet, so the page is freed here;
+                * destroy_tpmring() skips the ring because ring_ref
+                * is still GRANT_INVALID_REF.
+                */
+               free_page((unsigned long)sring);
+               tp->tx = NULL;
+               xenbus_dev_fatal(dev, err, "allocating grant reference");
+               goto fail;
+       }
+       /* a non-negative return value is the grant reference */
+       tp->ring_ref = err;
+
+       err = tpmif_connect(dev, tp, dev->otherend_id);
+       if (err)
+               goto fail;
+
+       return 0;
+fail:
+       destroy_tpmring(tp);
+       return err;
+}
+
+
+/*
+ * Tear down the connection to the back-end: mark the device as
+ * disconnected, end the foreign access to the ring page and unbind the
+ * irq. Each step is skipped if the resource is not set up.
+ */
+static void destroy_tpmring(struct tpm_private *tp)
+{
+       tpmif_set_connected_state(tp, 0);
+
+       if (tp->ring_ref != GRANT_INVALID_REF) {
+               gnttab_end_foreign_access(tp->ring_ref, (unsigned long)tp->tx);
+               tp->tx = NULL;
+               tp->ring_ref = GRANT_INVALID_REF;
+       }
+
+       if (tp->irq) {
+               unbind_from_irqhandler(tp->irq, tp);
+               tp->irq = 0;
+       }
+}
+
+
+/*
+ * Set up the ring and publish "ring-ref" and "event-channel" under the
+ * device's xenstore node, then switch the device to
+ * XenbusStateConnected.
+ *
+ * The xenstore writes happen in one transaction, which is restarted
+ * when ending it returns -EAGAIN. Returns 0 on success or a negative
+ * errno; the ring is destroyed again if anything after its setup fails.
+ */
+static int talk_to_backend(struct xenbus_device *dev,
+                           struct tpm_private *tp)
+{
+       const char *message = NULL;
+       int err;
+       struct xenbus_transaction xbt;
+
+       err = setup_tpmring(dev, tp);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "setting up ring");
+               goto out;
+       }
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               goto destroy_tpmring;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename,
+                           "ring-ref","%u", tp->ring_ref);
+       if (err) {
+               message = "writing ring-ref";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                           irq_to_evtchn_port(tp->irq));
+       if (err) {
+               message = "writing event-channel";
+               goto abort_transaction;
+       }
+
+       /* commit; on -EAGAIN the whole transaction is redone */
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err) {
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto destroy_tpmring;
+       }
+
+       xenbus_switch_state(dev, XenbusStateConnected);
+
+       return 0;
+
+abort_transaction:
+       /* second argument 1: abort instead of commit */
+       xenbus_transaction_end(xbt, 1);
+       if (message)
+               xenbus_dev_error(dev, err, "%s", message);
+destroy_tpmring:
+       destroy_tpmring(tp);
+out:
+       return err;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ *
+ * Connected marks the front-end as connected; Closing and Closed mark
+ * it as disconnected and close the front-end. On Closed the device is
+ * also unregistered unless a suspend is in progress. All other states
+ * are ignored.
+ */
+static void backend_changed(struct xenbus_device *dev,
+                           enum xenbus_state backend_state)
+{
+       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+       DPRINTK("\n");
+
+       switch (backend_state) {
+       case XenbusStateInitialising:
+       case XenbusStateInitWait:
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+       case XenbusStateUnknown:
+               break;
+
+       case XenbusStateConnected:
+               tpmif_set_connected_state(tp, 1);
+               break;
+
+       case XenbusStateClosing:
+               tpmif_set_connected_state(tp, 0);
+               xenbus_frontend_closed(dev);
+               break;
+
+       case XenbusStateClosed:
+               tpmif_set_connected_state(tp, 0);
+               /*
+                * NOTE(review): dev is still used by the call below
+                * after device_unregister() - confirm that this is safe.
+                */
+               if (tp->is_suspended == 0)
+                       device_unregister(&dev->dev);
+               xenbus_frontend_closed(dev);
+               break;
+       }
+}
+
+/*
+ * Probe callback: take a reference on the shared tpm_private, register
+ * the vTPM chip for the device, read the "handle" node and connect to
+ * the back-end.
+ *
+ * Returns 0 on success or a negative errno. Every failure path undoes
+ * what was set up before it: the chip registered by init_vtpm() is
+ * removed with cleanup_vtpm() and the reference taken above is dropped.
+ */
+static int tpmfront_probe(struct xenbus_device *dev,
+                          const struct xenbus_device_id *id)
+{
+       int err;
+       int handle;
+       struct tpm_private *tp = tpm_private_get();
+
+       if (!tp)
+               return -ENOMEM;
+
+       tp->chip = init_vtpm(&dev->dev, tp);
+       if (IS_ERR(tp->chip)) {
+               err = PTR_ERR(tp->chip);
+               goto err_put_private;
+       }
+
+       /* the value itself is not used; only its presence is checked */
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                          "handle", "%i", &handle);
+       if (XENBUS_EXIST_ERR(err))
+               goto err_cleanup_vtpm;
+
+       if (err < 0) {
+               xenbus_dev_fatal(dev,err,"reading virtual-device");
+               goto err_cleanup_vtpm;
+       }
+
+       tp->dev = dev;
+
+       err = talk_to_backend(dev, tp);
+       if (err)
+               goto err_cleanup_vtpm;
+
+       return 0;
+
+err_cleanup_vtpm:
+       /* unregister the chip and free its vtpm_state */
+       cleanup_vtpm(&dev->dev);
+err_put_private:
+       tpm_private_put();
+       return err;
+}
+
+
+/*
+ * Remove callback: disconnect from the back-end, then unregister the
+ * vTPM chip. Always returns 0.
+ */
+static int tpmfront_remove(struct xenbus_device *dev)
+{
+       /* must be looked up before cleanup_vtpm() frees the vtpm_state */
+       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+       destroy_tpmring(tp);
+       cleanup_vtpm(&dev->dev);
+       return 0;
+}
+
+/*
+ * Suspend callback: block new senders and wait until the outstanding
+ * request, if any, has been answered.
+ *
+ * Returns 0 with suspend_lock still held; the mutex is released by
+ * tpmfront_suspend_finish().
+ */
+static int tpmfront_suspend(struct xenbus_device *dev)
+{
+       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+       u32 ctr;
+
+       /* Take the lock, preventing any application from sending. */
+       mutex_lock(&suspend_lock);
+       tp->is_suspended = 1;
+
+       /* tx_busy is cleared by tpmif_rx_action(), which also wakes wait_q */
+       for (ctr = 0; atomic_read(&tp->tx_busy); ctr++) {
+               /* log on every tenth pass only */
+               if ((ctr % 10) == 0)
+                       printk("TPM-FE [INFO]: Waiting for outstanding "
+                              "request.\n");
+               /* Wait for a request to be responded to. */
+               interruptible_sleep_on_timeout(&tp->wait_q, 100);
+       }
+
+       return 0;
+}
+
+/*
+ * Leave the suspended state: clear is_suspended and release the
+ * suspend_lock taken in tpmfront_suspend(). Always returns 0.
+ */
+static int tpmfront_suspend_finish(struct tpm_private *tp)
+{
+       tp->is_suspended = 0;
+       /* Allow applications to send again. */
+       mutex_unlock(&suspend_lock);
+       return 0;
+}
+
+/* Suspend-cancel callback: undo tpmfront_suspend() for this device. */
+static int tpmfront_suspend_cancel(struct xenbus_device *dev)
+{
+       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+       return tpmfront_suspend_finish(tp);
+}
+
+/*
+ * Resume callback: drop the old ring and event channel, then set up a
+ * new connection. Returns the result of talk_to_backend().
+ */
+static int tpmfront_resume(struct xenbus_device *dev)
+{
+       struct tpm_private *tp = tpm_private_from_dev(&dev->dev);
+       destroy_tpmring(tp);
+       return talk_to_backend(dev, tp);
+}
+
+/*
+ * Record the back-end domain and bind a listening event channel for it
+ * to the tpmif_int() interrupt handler.
+ *
+ * Returns 0 on success (tp->irq is set) or the non-positive value
+ * returned by bind_listening_port_to_irqhandler().
+ * NOTE(review): if the bind call ever returned 0, this function would
+ * return 0 as well, and the caller would treat that as success with
+ * tp->irq left unset - confirm that 0 cannot be returned.
+ */
+static int tpmif_connect(struct xenbus_device *dev,
+                         struct tpm_private *tp,
+                         domid_t domid)
+{
+       int err;
+
+       tp->backend_id = domid;
+
+       err = bind_listening_port_to_irqhandler(
+               domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
+       if (err <= 0) {
+               WPRINTK("bind_listening_port_to_irqhandler failed "
+                       "(err=%d)\n", err);
+               return err;
+       }
+       /* a positive return value is the irq number */
+       tp->irq = err;
+
+       return 0;
+}
+
+/* Device types handled by this driver; the empty entry ends the list. */
+static const struct xenbus_device_id tpmfront_ids[] = {
+       { "vtpm" },
+       { "" }
+};
+MODULE_ALIAS("xen:vtpm");
+
+/*
+ * Xenbus driver definition wiring up the callbacks of this file; it is
+ * registered as tpmfront_driver by init_tpm_xenbus().
+ */
+static DEFINE_XENBUS_DRIVER(tpmfront, ,
+       .probe = tpmfront_probe,
+       .remove =  tpmfront_remove,
+       .resume = tpmfront_resume,
+       .otherend_changed = backend_changed,
+       .suspend = tpmfront_suspend,
+       .suspend_cancel = tpmfront_suspend_cancel,
+);
+
+/*
+ * Register the front-end driver with xenbus. Returns the result of
+ * xenbus_register_frontend().
+ */
+static int __init init_tpm_xenbus(void)
+{
+       return xenbus_register_frontend(&tpmfront_driver);
+}
+
+/*
+ * Allocate one tx buffer for every slot of the ring.
+ *
+ * Returns 0 on success. If an allocation fails, all buffers are freed
+ * through tpmif_free_tx_buffers() and -ENOMEM is returned.
+ */
+static int tpmif_allocate_tx_buffers(struct tpm_private *tp)
+{
+       unsigned int slot = 0;
+
+       while (slot < TPMIF_TX_RING_SIZE) {
+               struct tx_buffer *txb = tx_buffer_alloc();
+
+               tp->tx_buffers[slot] = txb;
+               if (txb == NULL) {
+                       tpmif_free_tx_buffers(tp);
+                       return -ENOMEM;
+               }
+               slot++;
+       }
+
+       return 0;
+}
+
+/*
+ * Free the tx buffers of all ring slots.
+ *
+ * Each slot is reset to NULL after it has been freed so that calling
+ * this function a second time is harmless. That happens when
+ * tpmif_allocate_tx_buffers() fails: it frees the buffers itself, and
+ * tpm_private_get() then drops the instance through tpm_private_put(),
+ * which frees them again.
+ */
+static void tpmif_free_tx_buffers(struct tpm_private *tp)
+{
+       unsigned int i;
+
+       for (i = 0; i < TPMIF_TX_RING_SIZE; i++) {
+               /* tx_buffer_free() ignores NULL slots */
+               tx_buffer_free(tp->tx_buffers[i]);
+               tp->tx_buffers[i] = NULL;
+       }
+}
+
+/*
+ * Tasklet handler: collect the back-end's response from the ring and
+ * hand it to the upper layer through vtpm_vd_recv().
+ *
+ * @priv is the tpm_private pointer stored in the tasklet by tpmif_int().
+ *
+ * The size field of the first ring entry is taken as the total length
+ * of the response. Data is gathered page by page from the tx buffers
+ * and the grant reference of every entry visited is released. All sizes
+ * come from the shared ring, so each copy is limited to the space left
+ * in the temporary buffer.
+ *
+ * If the temporary buffer cannot be allocated, the grant references are
+ * still released but the upper layer is not notified.
+ */
+static void tpmif_rx_action(unsigned long priv)
+{
+       struct tpm_private *tp = (struct tpm_private *)priv;
+       unsigned int i;
+       unsigned int received;
+       unsigned int offset = 0;
+       u8 *buffer;
+
+       /* the request is answered: allow the next transmission */
+       atomic_set(&tp->tx_busy, 0);
+       wake_up_interruptible(&tp->wait_q);
+
+       received = tp->tx->ring[0].req.size;
+
+       /* tasklet context: the allocation must not sleep */
+       buffer = kmalloc(received, GFP_ATOMIC);
+
+       for (i = 0; i < TPMIF_TX_RING_SIZE && offset < received; i++) {
+               struct tx_buffer *txb = tp->tx_buffers[i];
+               tpmif_tx_request_t *tx = &tp->tx->ring[i].req;
+               unsigned int tocopy = tx->size;
+
+               if (tocopy > PAGE_SIZE)
+                       tocopy = PAGE_SIZE;
+               /* never write beyond the 'received' bytes allocated */
+               if (tocopy > received - offset)
+                       tocopy = received - offset;
+
+               if (buffer)
+                       memcpy(&buffer[offset], txb->data, tocopy);
+
+               /* released even without a buffer, so no reference leaks */
+               gnttab_release_grant_reference(&gref_head, tx->ref);
+
+               offset += tocopy;
+       }
+
+       if (!buffer)
+               return;
+
+       /*
+        * Pass on only what was actually copied. This equals 'received'
+        * unless the ring entries held less data than announced.
+        */
+       vtpm_vd_recv(tp->chip, buffer, offset, tp->tx_remember);
+       kfree(buffer);
+}
+
+
+/*
+ * Interrupt handler for the event channel: stores the tpm_private
+ * pointer as the tasklet's data and schedules tpmif_rx_action(), all
+ * under tx_lock. Always returns IRQ_HANDLED.
+ */
+static irqreturn_t tpmif_int(int irq, void *tpm_priv)
+{
+       struct tpm_private *tp = tpm_priv;
+       unsigned long flags;
+
+       spin_lock_irqsave(&tp->tx_lock, flags);
+       tpmif_rx_tasklet.data = (unsigned long)tp;
+       tasklet_schedule(&tpmif_rx_tasklet);
+       spin_unlock_irqrestore(&tp->tx_lock, flags);
+
+       return IRQ_HANDLED;
+}
+
+
+/*
+ * Place a request on the ring and notify the back-end.
+ *
+ * @buf/@count:   request data, split over the tx buffers page by page
+ * @isuserbuffer: non-zero if @buf is a user-space pointer
+ * @remember:     opaque pointer stored in tx_remember; tpmif_rx_action()
+ *                hands it back with the response
+ *
+ * Runs entirely under tx_lock. Returns the number of bytes placed on
+ * the ring, or -EBUSY (a request is still outstanding), -EIO (not
+ * connected), -EFAULT (missing tx buffer or failed copy) or -ENOSPC (no
+ * grant reference available).
+ * NOTE(review): with a user buffer, copy_from_user() would be called
+ * with the spinlock held; the only caller in this file passes 0.
+ */
+static int tpm_xmit(struct tpm_private *tp,
+                    const u8 * buf, size_t count, int isuserbuffer,
+                    void *remember)
+{
+       tpmif_tx_request_t *tx;
+       TPMIF_RING_IDX i;
+       unsigned int offset = 0;
+
+       spin_lock_irq(&tp->tx_lock);
+
+       /* only one request may be in flight at a time */
+       if (unlikely(atomic_read(&tp->tx_busy))) {
+               printk("tpm_xmit: There's an outstanding request/response "
+                      "on the way!\n");
+               spin_unlock_irq(&tp->tx_lock);
+               return -EBUSY;
+       }
+
+       if (tp->is_connected != 1) {
+               spin_unlock_irq(&tp->tx_lock);
+               return -EIO;
+       }
+
+       for (i = 0; count > 0 && i < TPMIF_TX_RING_SIZE; i++) {
+               struct tx_buffer *txb = tp->tx_buffers[i];
+               int copied;
+
+               if (!txb) {
+                       DPRINTK("txb (i=%d) is NULL. buffers initilized?\n"
+                               "Not transmitting anything!\n", i);
+                       spin_unlock_irq(&tp->tx_lock);
+                       return -EFAULT;
+               }
+
+               copied = tx_buffer_copy(txb, &buf[offset], count,
+                                       isuserbuffer);
+               if (copied < 0) {
+                       /* An error occurred */
+                       spin_unlock_irq(&tp->tx_lock);
+                       return copied;
+               }
+               count -= copied;
+               offset += copied;
+
+               /* describe the page in the matching ring entry */
+               tx = &tp->tx->ring[i].req;
+               tx->addr = virt_to_machine(txb->data);
+               tx->size = txb->len;
+               tx->unused = 0;
+
+               DPRINTK("First 4 characters sent by TPM-FE are "
+                       "0x%02x 0x%02x 0x%02x 0x%02x\n",
+                       txb->data[0],txb->data[1],txb->data[2],txb->data[3]);
+
+               /*
+                * Get the granttable reference for this page.
+                * NOTE(review): references claimed in earlier passes of
+                * this loop are not released on the -ENOSPC path.
+                */
+               tx->ref = gnttab_claim_grant_reference(&gref_head);
+               if (tx->ref == -ENOSPC) {
+                       spin_unlock_irq(&tp->tx_lock);
+                       DPRINTK("Grant table claim reference failed in "
+                               "func:%s line:%d file:%s\n",
+                               __FUNCTION__, __LINE__, __FILE__);
+                       return -ENOSPC;
+               }
+               gnttab_grant_foreign_access_ref(tx->ref,
+                                               tp->backend_id,
+                                               virt_to_mfn(txb->data),
+                                               0 /*RW*/);
+               wmb();
+       }
+
+       /* cleared again by tpmif_rx_action() when the response arrives */
+       atomic_set(&tp->tx_busy, 1);
+       tp->tx_remember = remember;
+
+       /* full barrier before the back-end is notified */
+       mb();
+
+       notify_remote_via_irq(tp->irq);
+
+       spin_unlock_irq(&tp->tx_lock);
+       return offset;
+}
+
+
+/*
+ * Report the current connection state to the upper layer: a non-zero
+ * is_connected is passed on as TPM_VD_STATUS_CONNECTED, zero as
+ * TPM_VD_STATUS_DISCONNECTED.
+ */
+static void tpmif_notify_upperlayer(struct tpm_private *tp)
+{
+       u8 status = TPM_VD_STATUS_DISCONNECTED;
+
+       if (tp->is_connected)
+               status = TPM_VD_STATUS_CONNECTED;
+
+       vtpm_vd_status(tp->chip, status);
+}
+
+
+/*
+ * Record a new connection state and tell the upper layer if it changed.
+ *
+ * While suspended, a disconnect is ignored altogether (a resume is
+ * expected and the mutex keeps applications from sending), and a
+ * connect ends the suspended state first, which also releases the
+ * mutex.
+ */
+static void tpmif_set_connected_state(struct tpm_private *tp, u8 is_connected)
+{
+       if (tp->is_suspended == 1) {
+               /* disconnect during suspend: nothing to report */
+               if (is_connected == 0)
+                       return;
+
+               /* connected again after a suspend: we are resuming */
+               if (is_connected == 1)
+                       tpmfront_suspend_finish(tp);
+       }
+
+       if (tp->is_connected == is_connected)
+               return;
+
+       tp->is_connected = is_connected;
+       tpmif_notify_upperlayer(tp);
+}
+
+
+
+/* =================================================================
+ * Initialization function.
+ * =================================================================
+ */
+
+
+/*
+ * Module initialisation: create the shared tpm_private, reserve one
+ * grant reference per ring slot and register the xenbus front-end
+ * driver.
+ *
+ * Returns 0 on success, -EPERM in the initial domain, -ENOMEM if the
+ * private data cannot be allocated, -EFAULT if the grant references
+ * cannot be reserved, or the error returned by the driver registration.
+ * Everything acquired before a failing step is released again.
+ */
+static int __init tpmif_init(void)
+{
+       struct tpm_private *tp;
+       int err;
+
+       if (is_initial_xendomain())
+               return -EPERM;
+
+       tp = tpm_private_get();
+       if (!tp)
+               return -ENOMEM;
+
+       IPRINTK("Initialising the vTPM driver.\n");
+       if (gnttab_alloc_grant_references(TPMIF_TX_RING_SIZE,
+                                         &gref_head) < 0) {
+               tpm_private_put();
+               return -EFAULT;
+       }
+
+       err = init_tpm_xenbus();
+       if (err) {
+               /* do not report success without a registered driver */
+               gnttab_free_grant_references(gref_head);
+               tpm_private_put();
+               return err;
+       }
+
+       return 0;
+}
+
+
+module_init(tpmif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c

index 77e1e6c..68d3f3b 100644 (file)
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -37,7 +37,7 @@
  
  #define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event))
  
-static atomic_t proc_event_num_listeners = ATOMIC_INIT(0);
+static atomic_t proc_event_num_listeners __read_mostly = ATOMIC_INIT(0);
  static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
  
  /* proc_event_counts is used as the sequence number of the netlink message */
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig

index e24a2a1..530f9b6 100644 (file)
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -2,6 +2,7 @@ menu "CPU Frequency scaling"
  
  config CPU_FREQ
         bool "CPU Frequency scaling"
+       depends on !PROCESSOR_EXTERNAL_CONTROL
         help
           CPU Frequency scaling allows you to change the clock speed of 
           CPUs on the fly. This is a nice method to save power, because 
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c

index 836e9b0..bfb49f3 100644 (file)
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -37,6 +37,7 @@
  #define MICRO_FREQUENCY_MIN_SAMPLE_RATE                (10000)
  #define MIN_FREQUENCY_UP_THRESHOLD             (11)
  #define MAX_FREQUENCY_UP_THRESHOLD             (100)
+#define MAX_DEFAULT_SAMPLING_RATE              (300 * 1000U)
  
  /*
   * The polling frequency of this governor depends on the capability of
@@ -732,6 +733,29 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
                         dbs_tuners_ins.sampling_rate =
                                 max(min_sampling_rate,
                                     latency * LATENCY_MULTIPLIER);
+                       /*
+                        * Cap the default sampling rate at 300ms if it was
+                        * above that, but never let it drop below
+                        * min_sampling_rate.
+                        */
+                       if (dbs_tuners_ins.sampling_rate > MAX_DEFAULT_SAMPLING_RATE) {
+                               dbs_tuners_ins.sampling_rate =
+                                       max(min_sampling_rate, MAX_DEFAULT_SAMPLING_RATE);
+                               printk(KERN_INFO "CPUFREQ: ondemand sampling "
+                                      "rate set to %u ms\n",
+                                      dbs_tuners_ins.sampling_rate / 1000);
+                       }
+                       /*
+                        * Be conservative with respect to performance.
+                        * If an application computes using two threads
+                        * that depend on each other, they will be run on
+                        * several CPU cores, resulting in 50% load on both.
+                        * SLED might still want to prefer 80% up_threshold
+                        * by default, but we cannot differentiate that here.
+                        */
+                       if (num_online_cpus() > 1)
+                               dbs_tuners_ins.up_threshold =
+                                       DEF_FREQUENCY_UP_THRESHOLD / 2;
                         dbs_tuners_ins.io_is_busy = should_io_be_busy();
                 }
                 mutex_unlock(&dbs_mutex);
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig

index 78a666d..d530ccb 100644 (file)
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -1,6 +1,7 @@
  
  config CPU_IDLE
         bool "CPU idle PM support"
+       depends on !PROCESSOR_EXTERNAL_CONTROL
         default y if ACPI || PPC_PSERIES
         help
           CPU idle is a generic framework for supporting software-controlled
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig

index ef378b5..5ccc566 100644 (file)
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -61,7 +61,7 @@ config INTEL_IOATDMA
         tristate "Intel I/OAT DMA support"
         depends on PCI && X86
         select DMA_ENGINE
-       select DCA
+       select DCA if !XEN
         select ASYNC_TX_DISABLE_PQ_VAL_DMA
         select ASYNC_TX_DISABLE_XOR_VAL_DMA
         help
diff --git a/drivers/dma/ioat/Makefile b/drivers/dma/ioat/Makefile

index 0ff7270..495983a 100644 (file)
--- a/drivers/dma/ioat/Makefile
+++ b/drivers/dma/ioat/Makefile
@@ -1,2 +1,3 @@
  obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
-ioatdma-y := pci.o dma.o dma_v2.o dma_v3.o dca.o
+dca-$(CONFIG_DCA) := dca.o
+ioatdma-y := pci.o dma.o dma_v2.o dma_v3.o $(dca-y) $(dca-m)
diff --git a/drivers/dma/ioat/dca.c b/drivers/dma/ioat/dca.c

index abd9038..fb188d1 100644 (file)
--- a/drivers/dma/ioat/dca.c
+++ b/drivers/dma/ioat/dca.c
@@ -682,3 +682,15 @@ ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase)
  
         return dca;
  }
+
+void ioat_remove_dca_provider(struct pci_dev *pdev)
+{
+       struct ioatdma_device *device = pci_get_drvdata(pdev);
+
+       /* Nothing to undo unless device->dca is set. */
+       if (device->dca) {
+               unregister_dca_provider(device->dca, &pdev->dev);
+               free_dca_provider(device->dca);
+               device->dca = NULL;
+       }
+}
diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h

index 5e8fe01..0b8c947 100644 (file)
--- a/drivers/dma/ioat/dma.h
+++ b/drivers/dma/ioat/dma.h
@@ -324,4 +324,21 @@ void ioat_kobject_del(struct ioatdma_device *device);
  extern const struct sysfs_ops ioat_sysfs_ops;
  extern struct ioat_sysfs_entry ioat_version_attr;
  extern struct ioat_sysfs_entry ioat_cap_attr;
+
+#ifndef CONFIG_XEN
+void ioat_remove_dca_provider(struct pci_dev *);
+#else /* CONFIG_XEN: stubs below, __ioat_dca_init() always returns NULL */
+static inline void ioat_remove_dca_provider(struct pci_dev *pdev)
+{
+       struct ioatdma_device *device = pci_get_drvdata(pdev);
+       BUG_ON(device->dca);    /* no provider is expected to be set here */
+}
+static inline struct dca_provider *__devinit
+__ioat_dca_init(struct pci_dev *pdev, void __iomem *iobase)
+{
+       return NULL;
+}
+#define ioat_dca_init __ioat_dca_init
+#endif /* CONFIG_XEN */
+
  #endif /* IOATDMA_H */
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h

index be2a55b..68268c0 100644 (file)
--- a/drivers/dma/ioat/dma_v2.h
+++ b/drivers/dma/ioat/dma_v2.h
@@ -176,4 +176,10 @@ int ioat2_quiesce(struct ioat_chan_common *chan, unsigned long tmo);
  int ioat2_reset_sync(struct ioat_chan_common *chan, unsigned long tmo);
  extern struct kobj_type ioat2_ktype;
  extern struct kmem_cache *ioat2_cache;
+
+#ifdef CONFIG_XEN
+#define ioat2_dca_init __ioat_dca_init
+#define ioat3_dca_init __ioat_dca_init
+#endif
+
  #endif /* IOATDMA_V2_H */
diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h

index 60e6754..3d0dca6 100644 (file)
--- a/drivers/dma/ioat/hw.h
+++ b/drivers/dma/ioat/hw.h
@@ -39,7 +39,11 @@
  #define IOAT_VER_3_0            0x30    /* Version 3.0 */
  #define IOAT_VER_3_2            0x32    /* Version 3.2 */
  
+#ifndef CONFIG_XEN
  int system_has_dca_enabled(struct pci_dev *pdev);
+#else
+static inline int system_has_dca_enabled(struct pci_dev *pdev) { return 0; }
+#endif
  
  struct ioat_dma_descriptor {
         uint32_t        size;
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c

index 5e3a40f..efec888 100644 (file)
--- a/drivers/dma/ioat/pci.c
+++ b/drivers/dma/ioat/pci.c
@@ -29,7 +29,6 @@
  #include <linux/module.h>
  #include <linux/pci.h>
  #include <linux/interrupt.h>
-#include <linux/dca.h>
  #include <linux/slab.h>
  #include "dma.h"
  #include "dma_v2.h"
@@ -181,11 +180,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev)
                 return;
  
         dev_err(&pdev->dev, "Removing dma and dca services\n");
-       if (device->dca) {
-               unregister_dca_provider(device->dca, &pdev->dev);
-               free_dca_provider(device->dca);
-               device->dca = NULL;
-       }
+       ioat_remove_dca_provider(pdev);
         ioat_dma_remove(device);
  }
  
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig

index fdffa1b..c7743d8 100644 (file)
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -41,7 +41,7 @@ config EDAC_DEBUG
  
  config EDAC_DECODE_MCE
         tristate "Decode MCEs in human-readable form (only on AMD for now)"
-       depends on CPU_SUP_AMD && X86_MCE_AMD
+       depends on CPU_SUP_AMD && (X86_MCE_AMD || X86_XEN_MCE)
         default y
         ---help---
           Enable this option if you want to decode Machine Check Exceptions
@@ -74,6 +74,7 @@ config EDAC_MM_EDAC
  config EDAC_AMD64
         tristate "AMD64 (Opteron, Athlon64) K8, F10h"
         depends on EDAC_MM_EDAC && AMD_NB && X86_64 && EDAC_DECODE_MCE
+       depends on !XEN
         help
           Support for error detection and correction of DRAM ECC errors on
           the AMD64 families of memory controllers (K8 and F10h)
@@ -170,7 +171,7 @@ config EDAC_I5400
  
  config EDAC_I7CORE
         tristate "Intel i7 Core (Nehalem) processors"
-       depends on EDAC_MM_EDAC && PCI && X86 && X86_MCE_INTEL
+       depends on EDAC_MM_EDAC && PCI && X86 && (X86_MCE_INTEL || X86_XEN_MCE)
         help
           Support for error detection and correction the Intel
           i7 Core (Nehalem) Integrated Memory Controller that exists on
@@ -214,7 +215,7 @@ config EDAC_I7300
  
  config EDAC_SBRIDGE
         tristate "Intel Sandy-Bridge Integrated MC"
-       depends on EDAC_MM_EDAC && PCI && X86_64 && X86_MCE_INTEL
+       depends on EDAC_MM_EDAC && PCI && X86_64 && (X86_MCE_INTEL || X86_XEN_MCE)
         depends on PCI_MMCONFIG && EXPERIMENTAL
         help
           Support for error detection and correction the Intel
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c

index feef773..859012b 100644 (file)
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -610,6 +610,10 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset,
  
         debugf3("%s()\n", __func__);
  
+#ifdef CONFIG_XEN
+       page = mfn_to_local_pfn(page);
+#endif
+
         /* ECC error page was not in our memory. Ignore it. */
         if (!pfn_valid(page))
                 return;
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c

index 85226cc..3aa1ec7 100644 (file)
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -1932,7 +1932,7 @@ static int i7core_mce_check_error(struct notifier_block *nb, unsigned long val,
         if (mce->bank != 8)
                 return NOTIFY_DONE;
  
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_XEN)
         /* Only handle if it is the right mc controller */
         if (mce->socketid != pvt->i7core_dev->socket)
                 return NOTIFY_DONE;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c

index a203536..472045e 100644 (file)
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -1620,7 +1620,11 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
                 mce->socketid, mce->apicid);
  
         /* Only handle if it is the right mc controller */
+#ifdef CONFIG_XEN /* Could easily be used for non-Xen too. */
+       if (mce->socketid != pvt->sbridge_dev->mc)
+#else
         if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc)
+#endif
                 return NOTIFY_DONE;
  
         smp_rmb();
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig

index 9b00072..1d24c4d 100644 (file)
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -91,6 +91,7 @@ config DELL_RBU
  config DCDBAS
         tristate "Dell Systems Management Base Driver"
         depends on X86
+       select XEN_DOMCTL if XEN
         help
           The Dell Systems Management Base Driver provides a sysfs interface
           for systems management software to perform System Management
diff --git a/drivers/firmware/dcdbas.c b/drivers/firmware/dcdbas.c

index ea5ac2d..156a75d 100644 (file)
--- a/drivers/firmware/dcdbas.c
+++ b/drivers/firmware/dcdbas.c
@@ -37,6 +37,10 @@
  #include <linux/mutex.h>
  #include <asm/io.h>
  
+#ifdef CONFIG_XEN
+#include "../xen/core/domctl.h"
+#endif
+
  #include "dcdbas.h"
  
  #define DRIVER_NAME            "dcdbas"
@@ -107,7 +111,7 @@ static int smi_data_buf_realloc(unsigned long size)
         /* set up new buffer for use */
         smi_data_buf = buf;
         smi_data_buf_handle = handle;
-       smi_data_buf_phys_addr = (u32) virt_to_phys(buf);
+       smi_data_buf_phys_addr = (u32) handle;
         smi_data_buf_size = size;
  
         dev_dbg(&dcdbas_pdev->dev, "%s: phys: %x size: %lu\n",
@@ -245,7 +249,9 @@ static ssize_t host_control_on_shutdown_store(struct device *dev,
   */
  int dcdbas_smi_request(struct smi_cmd *smi_cmd)
  {
+#ifndef CONFIG_XEN
         cpumask_var_t old_mask;
+#endif
         int ret = 0;
  
         if (smi_cmd->magic != SMI_CMD_MAGIC) {
@@ -255,6 +261,7 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
         }
  
         /* SMI requires CPU 0 */
+#ifndef CONFIG_XEN
         if (!alloc_cpumask_var(&old_mask, GFP_KERNEL))
                 return -ENOMEM;
  
@@ -266,6 +273,14 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
                 ret = -EBUSY;
                 goto out;
         }
+#else
+       ret = xen_set_physical_cpu_affinity(0);
+       if (ret) {
+               dev_dbg(&dcdbas_pdev->dev, "%s: failed (%d) to get CPU 0\n",
+                       __func__, ret);
+               return ret;
+       }
+#endif
  
         /* generate SMI */
         /* inb to force posted write through and make SMI happen now */
@@ -280,9 +295,13 @@ int dcdbas_smi_request(struct smi_cmd *smi_cmd)
                 : "memory"
         );
  
+#ifndef CONFIG_XEN
  out:
         set_cpus_allowed_ptr(current, old_mask);
         free_cpumask_var(old_mask);
+#else
+       xen_set_physical_cpu_affinity(-1);
+#endif
         return ret;
  }
  
@@ -322,7 +341,7 @@ static ssize_t smi_request_store(struct device *dev,
                 break;
         case 1:
                 /* Calling Interface SMI */
-               smi_cmd->ebx = (u32) virt_to_phys(smi_cmd->command_buffer);
+               smi_cmd->ebx = (u32) virt_to_bus(smi_cmd->command_buffer);
                 ret = dcdbas_smi_request(smi_cmd);
                 if (!ret)
                         ret = count;
@@ -603,6 +622,11 @@ static int __init dcdbas_init(void)
  {
         int error;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return -ENODEV;
+#endif
+
         error = platform_driver_register(&dcdbas_driver);
         if (error)
                 return error;
diff --git a/drivers/firmware/dell_rbu.c b/drivers/firmware/dell_rbu.c

index 2f452f1..fb60177 100644 (file)
--- a/drivers/firmware/dell_rbu.c
+++ b/drivers/firmware/dell_rbu.c
@@ -170,9 +170,27 @@ static int create_packet(void *data, size_t length)
                         spin_lock(&rbu_data.lock);
                         goto out_alloc_packet_array;
                 }
+#ifdef CONFIG_XEN
+               if (ordernum && xen_create_contiguous_region(
+                       (unsigned long)packet_data_temp_buf, ordernum, 0)) {
+                       free_pages((unsigned long)packet_data_temp_buf,
+                                  ordernum);
+                       pr_warning("dell_rbu:%s: failed to adjust new "
+                                  "packet\n", __func__);
+                       retval = -ENOMEM;
+                       spin_lock(&rbu_data.lock);
+                       goto out_alloc_packet_array;
+               }
+#endif
  
-               if ((unsigned long)virt_to_phys(packet_data_temp_buf)
+               if ((unsigned long)virt_to_bus(packet_data_temp_buf)
                                 < allocation_floor) {
+#ifdef CONFIG_XEN
+                       if (ordernum)
+                               xen_destroy_contiguous_region(
+                                       (unsigned long)packet_data_temp_buf,
+                                       ordernum);
+#endif
                         pr_debug("packet 0x%lx below floor at 0x%lx.\n",
                                         (unsigned long)virt_to_phys(
                                                 packet_data_temp_buf),
@@ -186,7 +204,7 @@ static int create_packet(void *data, size_t length)
         newpacket->data = packet_data_temp_buf;
  
         pr_debug("create_packet: newpacket at physical addr %lx\n",
-               (unsigned long)virt_to_phys(newpacket->data));
+               (unsigned long)virt_to_bus(newpacket->data));
  
         /* packets may not have fixed size */
         newpacket->length = length;
@@ -205,7 +223,7 @@ out_alloc_packet_array:
         /* always free packet array */
         for (;idx>0;idx--) {
                 pr_debug("freeing unused packet below floor 0x%lx.\n",
-                       (unsigned long)virt_to_phys(
+                       (unsigned long)virt_to_bus(
                                 invalid_addr_packet_array[idx-1]));
                 free_pages((unsigned long)invalid_addr_packet_array[idx-1],
                         ordernum);
@@ -349,6 +367,13 @@ static void packet_empty_list(void)
                  * to make sure there are no stale RBU packets left in memory
                  */
                 memset(newpacket->data, 0, rbu_data.packetsize);
+#ifdef CONFIG_XEN
+               if (newpacket->ordernum)
+                       xen_destroy_contiguous_region(
+                               (unsigned long)newpacket->data,
+                               newpacket->ordernum);
+#endif
+
                 free_pages((unsigned long) newpacket->data,
                         newpacket->ordernum);
                 kfree(newpacket);
@@ -403,7 +428,9 @@ static int img_update_realloc(unsigned long size)
  {
         unsigned char *image_update_buffer = NULL;
         unsigned long rc;
+#ifndef CONFIG_XEN
         unsigned long img_buf_phys_addr;
+#endif
         int ordernum;
         int dma_alloc = 0;
  
@@ -434,15 +461,19 @@ static int img_update_realloc(unsigned long size)
  
         spin_unlock(&rbu_data.lock);
  
+#ifndef CONFIG_XEN
         ordernum = get_order(size);
         image_update_buffer =
                 (unsigned char *) __get_free_pages(GFP_KERNEL, ordernum);
  
         img_buf_phys_addr =
-               (unsigned long) virt_to_phys(image_update_buffer);
+               (unsigned long) virt_to_bus(image_update_buffer);
  
         if (img_buf_phys_addr > BIOS_SCAN_LIMIT) {
                 free_pages((unsigned long) image_update_buffer, ordernum);
+#else
+       {
+#endif
                 ordernum = -1;
                 image_update_buffer = dma_alloc_coherent(NULL, size,
                         &dell_rbu_dmaaddr, GFP_KERNEL);
@@ -695,6 +726,12 @@ static struct bin_attribute rbu_packet_size_attr = {
  static int __init dcdrbu_init(void)
  {
         int rc;
+
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return -ENODEV;
+#endif
+
         spin_lock_init(&rbu_data.lock);
  
         init_packet_head();
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c

index 153980b..ad46ec8 100644 (file)
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -482,6 +482,11 @@ static bool dmi_matches(const struct dmi_system_id *dmi)
  {
         int i;
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return false;
+#endif
+
         WARN(!dmi_initialized, KERN_ERR "dmi check: not initialized yet.\n");
  
         for (i = 0; i < ARRAY_SIZE(dmi->matches); i++) {
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c

index ae8a64f..a92f38a 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -885,7 +885,7 @@ static const struct file_operations i915_driver_fops = {
         .open = drm_open,
         .release = drm_release,
         .unlocked_ioctl = drm_ioctl,
-       .mmap = drm_gem_mmap,
+       .mmap = i915_gem_mmap,
         .poll = drm_poll,
         .fasync = drm_fasync,
         .read = drm_read,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h

index 5fabc6c..ec15915 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1276,6 +1276,11 @@ int __must_check i915_add_request(struct intel_ring_buffer *ring,
  int __must_check i915_wait_request(struct intel_ring_buffer *ring,
                                    uint32_t seqno,
                                    bool do_retire);
+#ifdef CONFIG_XEN
+int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma);
+#else
+#define i915_gem_mmap drm_gem_mmap
+#endif
  int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
  int __must_check
  i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c

index 0d1e4b7..a4e2028 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1099,6 +1099,17 @@ i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
         return 0;
  }
  
+#ifdef CONFIG_XEN
+int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       const int err = drm_gem_mmap(filp, vma);
+
+       /* _PAGE_IOMAP is OR-ed in whether or not drm_gem_mmap() failed. */
+       pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
+       return err;
+}
+#endif
+
  /**
   * i915_gem_fault - fault a page into the GTT
   * vma: VMA in question
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c

index 1b1cf3b..2d7e7b0 100644 (file)
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -8440,7 +8440,11 @@ void gen6_update_ring_freq(struct drm_i915_private *dev_priv)
          * over
          */
         if (!max_ia_freq)
+#ifndef CONFIG_XEN
                 max_ia_freq = tsc_khz;
+#else
+               max_ia_freq = cpu_khz;
+#endif
  
         /* Convert from kHz to MHz */
         max_ia_freq /= 1000;
diff --git a/drivers/gpu/drm/nouveau/nouveau_acpi.c b/drivers/gpu/drm/nouveau/nouveau_acpi.c

index 284bd25..85ee139 100644 (file)
--- a/drivers/gpu/drm/nouveau/nouveau_acpi.c
+++ b/drivers/gpu/drm/nouveau/nouveau_acpi.c
@@ -42,6 +42,7 @@ static struct nouveau_dsm_priv {
  #define NOUVEAU_DSM_HAS_MUX 0x1
  #define NOUVEAU_DSM_HAS_OPT 0x2
  
+#ifdef CONFIG_VGA_SWITCHEROO
  static const char nouveau_dsm_muid[] = {
         0xA0, 0xA0, 0x95, 0x9D, 0x60, 0x00, 0x48, 0x4D,
         0xB3, 0x4D, 0x7E, 0x5F, 0xEA, 0x12, 0x9F, 0xD4,
@@ -340,6 +341,10 @@ void nouveau_unregister_dsm_handler(void)
  {
         vga_switcheroo_unregister_handler();
  }
+#else
+void nouveau_register_dsm_handler(void) {}
+void nouveau_unregister_dsm_handler(void) {}
+#endif
  
  /* retrieve the ROM in 4k blocks */
  static int nouveau_rom_call(acpi_handle rom_handle, uint8_t *bios,
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c

index 5992502..07ff3bd 100644 (file)
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -448,6 +448,18 @@ int radeon_dummy_page_init(struct radeon_device *rdev)
         rdev->dummy_page.page = alloc_page(GFP_DMA32 | GFP_KERNEL | __GFP_ZERO);
         if (rdev->dummy_page.page == NULL)
                 return -ENOMEM;
+#ifdef CONFIG_XEN
+       {
+               int ret = xen_limit_pages_to_max_mfn(rdev->dummy_page.page,
+                                                    0, 32);
+
+               if (!ret)
+                       clear_page(page_address(rdev->dummy_page.page));
+               else
+                       dev_warn(rdev->dev,
+                                "Error restricting dummy page: %d\n", ret);
+       }
+#endif
         rdev->dummy_page.addr = pci_map_page(rdev->pdev, rdev->dummy_page.page,
                                         0, PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
         if (pci_dma_mapping_error(rdev->pdev, rdev->dummy_page.addr)) {
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c

index 1f5c67c..84d4372 100644 (file)
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1467,6 +1467,14 @@ int ttm_bo_global_init(struct drm_global_reference *ref)
                 ret = -ENOMEM;
                 goto out_no_drp;
         }
+#ifdef CONFIG_XEN
+       ret = xen_limit_pages_to_max_mfn(glob->dummy_read_page, 0, 32);
+       if (!ret)
+               clear_page(page_address(glob->dummy_read_page));
+       else
+               printk(KERN_WARNING
+                      "Error restricting dummy read page: %d\n", ret);
+#endif
  
         INIT_LIST_HEAD(&glob->swap_lru);
         INIT_LIST_HEAD(&glob->device_list);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c

index a877813..be41ebd 100644 (file)
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -171,7 +171,13 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
         if (bo->mem.bus.is_iomem) {
                 vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
                                                 vma->vm_page_prot);
+#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP)
+               pgprot_val(vma->vm_page_prot) |= _PAGE_IOMAP;
+#endif
         } else {
+#if defined(CONFIG_XEN) && defined(_PAGE_IOMAP)
+               pgprot_val(vma->vm_page_prot) &= ~_PAGE_IOMAP;
+#endif
                 ttm = bo->ttm;
                 vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
                     vm_get_page_prot(vma->vm_flags) :
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c

index ebc6fac..86e3efc 100644 (file)
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -491,6 +491,18 @@ static int ttm_alloc_new_pages(struct list_head *pages, gfp_t gfp_flags,
         for (i = 0, cpages = 0; i < count; ++i) {
                 p = alloc_page(gfp_flags);
  
+#ifdef CONFIG_XEN
+               if (p && (gfp_flags & __GFP_DMA32)) {
+                       r = xen_limit_pages_to_max_mfn(p, 0, 32);
+                       if (r) {
+                               __free_page(p);
+                               pr_err("Cannot restrict page (%d)\n", r);
+                               p = NULL; /* handled by the !p check below */
+                       } else if (gfp_flags & __GFP_ZERO)
+                               clear_page(page_address(p));
+               }
+#endif /* CONFIG_XEN */
+
                 if (!p) {
                         pr_err("Unable to get page %u\n", i);
  
@@ -730,6 +742,21 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
                                 return -ENOMEM;
                         }
  
+#ifdef CONFIG_XEN
+                       if (flags & TTM_PAGE_FLAG_DMA32) {
+                               int rc = xen_limit_pages_to_max_mfn(p, 0, 32);
+
+                               if (rc) {
+                                       __free_page(p);
+                                       pr_err("Unable to restrict page (%d)\n",
+                                              rc);
+                                       return rc;
+                               }
+                               if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
+                                       clear_page(page_address(p));
+                       }
+#endif /* CONFIG_XEN */
+
                         pages[r] = p;
                 }
                 return 0;
diff --git a/drivers/gpu/drm/vmwgfx/Kconfig b/drivers/gpu/drm/vmwgfx/Kconfig

index 794ff67..6b153a3 100644 (file)
--- a/drivers/gpu/drm/vmwgfx/Kconfig
+++ b/drivers/gpu/drm/vmwgfx/Kconfig
@@ -1,6 +1,6 @@
  config DRM_VMWGFX
         tristate "DRM driver for VMware Virtual GPU"
-       depends on DRM && PCI && FB
+       depends on DRM && PCI && FB && !XEN
         select FB_DEFERRED_IO
         select FB_CFB_FILLRECT
         select FB_CFB_COPYAREA
diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c

index 299d238..ab6d013 100644 (file)
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -38,7 +38,7 @@
  
  #define APPLE_FLAG_FKEY                0x01
  
-static unsigned int fnmode = 1;
+static unsigned int fnmode = 2;
  module_param(fnmode, uint, 0644);
  MODULE_PARM_DESC(fnmode, "Mode of fn key on Apple keyboards (0 = disabled, "
-                "[1] = fkeyslast, 2 = fkeysfirst)");
+                "1 = fkeyslast, [2] = fkeysfirst)");
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c

index 4da66b4..8ce4ba6 100644 (file)
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1811,6 +1811,8 @@ static const struct hid_device_id hid_ignore_list[] = {
         { HID_USB_DEVICE(USB_VENDOR_ID_DELORME, USB_DEVICE_ID_DELORME_EM_LT20) },
         { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, 0x0004) },
         { HID_USB_DEVICE(USB_VENDOR_ID_DREAM_CHEEKY, 0x000a) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_4000U) },
+       { HID_USB_DEVICE(USB_VENDOR_ID_ELO, USB_DEVICE_ID_ELO_4500U) },
         { HID_USB_DEVICE(USB_VENDOR_ID_ESSENTIAL_REALITY, USB_DEVICE_ID_ESSENTIAL_REALITY_P5) },
         { HID_USB_DEVICE(USB_VENDOR_ID_ETT, USB_DEVICE_ID_TC5UH) },
         { HID_USB_DEVICE(USB_VENDOR_ID_ETT, USB_DEVICE_ID_TC4UM) },
diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h

index e39aecb..b681318 100644 (file)
--- a/drivers/hid/hid-ids.h
+++ b/drivers/hid/hid-ids.h
@@ -263,8 +263,10 @@
  #define USB_VENDOR_ID_DREAM_CHEEKY     0x1d34
  
  #define USB_VENDOR_ID_ELO              0x04E7
+#define USB_DEVICE_ID_ELO_4000U                0x0009
  #define USB_DEVICE_ID_ELO_TS2515       0x0022
  #define USB_DEVICE_ID_ELO_TS2700       0x0020
+#define USB_DEVICE_ID_ELO_4500U                0x0030
  
  #define USB_VENDOR_ID_EMS              0x2006
  #define USB_DEVICE_ID_EMS_TRIO_LINKER_PLUS_II 0x0118
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig

index 70f5dde..347479f 100644 (file)
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -2,7 +2,7 @@ menu "Microsoft Hyper-V guest support"
  
  config HYPERV
         tristate "Microsoft Hyper-V client drivers"
-       depends on X86 && ACPI && PCI
+       depends on X86 && ACPI && PCI && !XEN
         help
           Select this option to run Linux as a Hyper-V client operating
           system.
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig

index 8deedc1..a19f66b 100644 (file)
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -435,7 +435,8 @@ config SENSORS_GPIO_FAN
  
  config SENSORS_CORETEMP
         tristate "Intel Core/Core2/Atom temperature sensor"
-       depends on X86 && PCI && EXPERIMENTAL
+       depends on X86 && PCI && !XEN_UNPRIVILEGED_GUEST && EXPERIMENTAL
+       select XEN_DOMCTL if XEN
         help
           If you say yes here you get support for the temperature
           sensor inside your CPU. Most of the family 6 CPUs
@@ -1154,8 +1155,9 @@ config SENSORS_TWL4030_MADC
  
  config SENSORS_VIA_CPUTEMP
         tristate "VIA CPU temperature sensor"
-       depends on X86
+       depends on X86 && !XEN_UNPRIVILEGED_GUEST
         select HWMON_VID
+       select XEN_DOMCTL if XEN
         help
           If you say yes here you get support for the temperature
           sensor inside your CPU. Supported are all known variants of
diff --git a/drivers/hwmon/coretemp-xen.c b/drivers/hwmon/coretemp-xen.c

new file mode 100644 (file)

index 0000000..1c9b454
--- /dev/null
+++ b/drivers/hwmon/coretemp-xen.c
@@ -0,0 +1,892 @@
+/*
+ * coretemp.c - Linux kernel module for hardware monitoring
+ *
+ * Copyright (C) 2007 Rudolf Marek <r.marek@assembler.cz>
+ *
+ * Inspired from many hwmon drivers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/hwmon.h>
+#include <linux/sysfs.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/cpu.h>
+#include <linux/pci.h>
+#include <linux/smp.h>
+#include <linux/moduleparam.h>
+#include <asm/msr.h>
+#include <asm/cpu_device_id.h>
+#include <xen/pcpu.h>
+#include "../xen/core/domctl.h"
+
+#define DRVNAME        "coretemp"
+
+/*
+ * force_tjmax only matters when TjMax can't be read from the CPU itself.
+ * When set, it replaces the driver's suboptimal heuristic.
+ *
+ * It is exposed as the read-only (0444) module parameter "tjmax", given in
+ * degrees Celsius.  Zero (the default) means "not forced": get_tjmax()
+ * only honours a non-zero value, and only after the
+ * MSR_IA32_TEMPERATURE_TARGET read failed or yielded zero.
+ */
+static int force_tjmax;
+module_param_named(tjmax, force_tjmax, int, 0444);
+MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius");
+
+/*
+ * Sizing of the per-package sensor table.  Index 1 of core_data[] is used
+ * for the package sensor and core N lives at index N + BASE_SYSFS_ATTR_NO
+ * (see create_core_data()).
+ */
+#define BASE_SYSFS_ATTR_NO     2       /* Sysfs Base attr no for coretemp */
+#define NUM_REAL_CORES         32      /* Number of Real cores per cpu */
+#define CORETEMP_NAME_LENGTH   17      /* String Length of attrs */
+#define MAX_CORE_ATTRS         4       /* Maximum no of basic attrs */
+#define TOTAL_ATTRS            (MAX_CORE_ATTRS + 1)
+#define MAX_CORE_DATA          (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO)
+
+/*
+ * Evaluates to the package id that xen_get_topology_info() reports for
+ * @cpu, or to ~0 when that call returns non-zero.
+ */
+#define TO_PHYS_ID(cpu)                ({ \
+       u32 ppid; \
+       !xen_get_topology_info(cpu, NULL, &ppid, NULL) ? ppid : ~0; \
+})
+/* Map a core id to its core_data[] index / sysfs tempN number. */
+#define CORE_ATTR_NO(ccid)     ((ccid) + BASE_SYSFS_ATTR_NO)
+
+/*
+ * Per-Core Temperature Data
+ * @temp: Most recent temperature computed by show_temp(), expressed in the
+ *             same unit as @tjmax.
+ * @ttarget: Target temperature derived from bits 8:15 of
+ *             IA32_TEMPERATURE_TARGET; only filled in when that MSR could
+ *             be read in create_core_data().
+ * @tjmax: Critical temperature as returned by get_tjmax(), i.e. degrees
+ *             Celsius multiplied by 1000.
+ * @last_updated: The value of jiffies when @temp was last refreshed.
+ * @cpu: The CPU number handed to rdmsr_safe_on_pcpu() when reading this
+ *             sensor's MSRs.
+ * @cpu_core_id: The core id within the package.  It is printed in the
+ *             sensor label and determines the sysfs attribute number.
+ * @status_reg: One of IA32_THERM_STATUS or IA32_PACKAGE_THERM_STATUS,
+ *             from where the temperature values should be read.
+ * @attr_size:  Total number of per-core attrs displayed in the sysfs.
+ * @is_pkg_data: If this is 1, the temp_data holds pkgtemp data.
+ *             Otherwise, temp_data holds coretemp data.
+ * @valid: If this is 1, the current temperature is valid.
+ * @sd_attrs: Backing storage for the sysfs attributes of this sensor.
+ * @attr_name: Backing storage for the names of those attributes.
+ * @update_lock: Taken by show_temp() around the refresh of @temp, @valid
+ *             and @last_updated.
+ */
+struct temp_data {
+       int temp;
+       int ttarget;
+       int tjmax;
+       unsigned long last_updated;
+       unsigned int cpu;
+       u32 cpu_core_id;
+       u32 status_reg;
+       int attr_size;
+       bool is_pkg_data;
+       bool valid;
+       struct sensor_device_attribute sd_attrs[TOTAL_ATTRS];
+       char attr_name[TOTAL_ATTRS][CORETEMP_NAME_LENGTH];
+       struct mutex update_lock;
+};
+
+/*
+ * Platform Data per Physical CPU
+ * @hwmon_dev: Device returned by hwmon_device_register() in probe.
+ * @phys_proc_id: Package id; shown in the label of the package sensor.
+ * @x86_model, @x86_mask: CPU model and stepping, copied from struct
+ *             cpu_info when the device is added; used by the TjMax
+ *             heuristics.
+ * @core_data: Sensor table; slot 1 holds the package sensor, slot
+ *             CORE_ATTR_NO(core id) holds a core sensor, NULL if unused.
+ * @name_attr: The sysfs "name" attribute.
+ */
+struct platform_data {
+       struct device *hwmon_dev;
+       u16 phys_proc_id;
+       u8 x86_model, x86_mask;
+       struct temp_data *core_data[MAX_CORE_DATA];
+       struct device_attribute name_attr;
+};
+
+/*
+ * One element of pdev_list: associates a package id with the platform
+ * device that was registered for that package.
+ */
+struct pdev_entry {
+       struct list_head list;          /* link in pdev_list */
+       struct platform_device *pdev;
+       u16 phys_proc_id;
+};
+
+/*
+ * Scratch description of one CPU, assembled in get_core_online():
+ * the CPUID/microcode fields are filled in by get_cpuid_info(), the
+ * topology fields by xen_get_topology_info(), and @pdev is either the
+ * already existing package device or set by coretemp_device_add().
+ */
+struct cpu_info {
+       struct platform_device *pdev;
+       u32 cpuid_6_eax, microcode;
+       u32 phys_proc_id, cpu_core_id;
+       u8 x86_model, x86_mask;
+};
+
+/* All registered package devices (struct pdev_entry), and the lock held
+ * while the list is walked or modified. */
+static LIST_HEAD(pdev_list);
+static DEFINE_MUTEX(pdev_list_mutex);
+
+/* sysfs "name" attribute: emits the driver name followed by a newline. */
+static ssize_t show_name(struct device *dev, struct device_attribute *devattr,
+                        char *buf)
+{
+       return sprintf(buf, "%s\n", DRVNAME);
+}
+
+/*
+ * sysfs tempN_label: "Physical id <n>" for the package sensor and
+ * "Core <n>" for a per-core sensor.
+ */
+static ssize_t show_label(struct device *dev,
+                         struct device_attribute *devattr, char *buf)
+{
+       int slot = to_sensor_dev_attr(devattr)->index;
+       struct platform_data *pkg = dev_get_drvdata(dev);
+       struct temp_data *sensor = pkg->core_data[slot];
+
+       if (!sensor->is_pkg_data)
+               return sprintf(buf, "Core %u\n", sensor->cpu_core_id);
+
+       return sprintf(buf, "Physical id %u\n", pkg->phys_proc_id);
+}
+
+/*
+ * sysfs tempN_crit_alarm: prints bit 5 of the sensor's status MSR, or an
+ * empty line when the MSR cannot be read.
+ */
+static ssize_t show_crit_alarm(struct device *dev,
+                              struct device_attribute *devattr, char *buf)
+{
+       int slot = to_sensor_dev_attr(devattr)->index;
+       struct platform_data *pkg = dev_get_drvdata(dev);
+       struct temp_data *sensor = pkg->core_data[slot];
+       u32 lo, hi;
+       int rc;
+
+       rc = rdmsr_safe_on_pcpu(sensor->cpu, sensor->status_reg, &lo, &hi);
+       if (rc < 0)
+               return sprintf(buf, "\n");
+
+       return sprintf(buf, "%d\n", (lo >> 5) & 1);
+}
+
+/* sysfs tempN_crit: prints the TjMax value stored for the sensor. */
+static ssize_t show_tjmax(struct device *dev,
+                         struct device_attribute *devattr, char *buf)
+{
+       int slot = to_sensor_dev_attr(devattr)->index;
+       struct platform_data *pkg = dev_get_drvdata(dev);
+       struct temp_data *sensor = pkg->core_data[slot];
+
+       return sprintf(buf, "%d\n", sensor->tjmax);
+}
+
+/* sysfs tempN_max: prints the target temperature stored for the sensor. */
+static ssize_t show_ttarget(struct device *dev,
+                           struct device_attribute *devattr, char *buf)
+{
+       int slot = to_sensor_dev_attr(devattr)->index;
+       struct platform_data *pkg = dev_get_drvdata(dev);
+       struct temp_data *sensor = pkg->core_data[slot];
+
+       return sprintf(buf, "%d\n", sensor->ttarget);
+}
+
+/*
+ * sysfs tempN_input: prints the current temperature of the sensor.
+ *
+ * The status MSR is re-read at most once per second (HZ jiffies); in
+ * between, the cached value is returned.  The reading is relative to
+ * TjMax: bits 16..22 of the status register are scaled by 1000 and
+ * subtracted from tdata->tjmax.
+ *
+ * Returns the number of bytes written to @buf, or -EAGAIN when no valid
+ * reading is available (MSR unreadable, or its "reading valid" bit 31
+ * clear).
+ */
+static ssize_t show_temp(struct device *dev,
+                       struct device_attribute *devattr, char *buf)
+{
+       u32 eax, edx;
+       ssize_t ret;
+       struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr);
+       struct platform_data *pdata = dev_get_drvdata(dev);
+       struct temp_data *tdata = pdata->core_data[attr->index];
+
+       mutex_lock(&tdata->update_lock);
+
+       /* Check whether the time interval has elapsed */
+       if (!tdata->valid || time_after(jiffies, tdata->last_updated + HZ)) {
+               /*
+                * A failed read must leave the "reading valid" bit clear;
+                * an all-ones value would be taken for a valid sample and
+                * reported as tjmax - 127000.
+                */
+               if (rdmsr_safe_on_pcpu(tdata->cpu, tdata->status_reg,
+                                        &eax, &edx) < 0)
+                       eax = 0;
+               tdata->valid = 0;
+               /* Check whether the data is valid */
+               if (eax & 0x80000000) {
+                       tdata->temp = tdata->tjmax -
+                                       ((eax >> 16) & 0x7f) * 1000;
+                       tdata->valid = 1;
+               }
+               tdata->last_updated = jiffies;
+       }
+
+       /* Format under the lock so valid/temp form a consistent pair. */
+       ret = tdata->valid ? sprintf(buf, "%d\n", tdata->temp) : -EAGAIN;
+
+       mutex_unlock(&tdata->update_lock);
+       return ret;
+}
+
+/*
+ * Heuristic TjMax estimate, used by get_tjmax() when the value can neither
+ * be read from the CPU nor was forced by the user.
+ *
+ * @c:   package data; only x86_model and x86_mask are consulted
+ * @id:  CPU number passed to rdmsr_safe_on_pcpu()
+ * @dev: device used for diagnostics
+ *
+ * Returns the estimate in degrees Celsius multiplied by 1000.
+ */
+static int adjust_tjmax(struct platform_data *c, u32 id, struct device *dev)
+{
+       /* The 100C is default for both mobile and non mobile CPUs */
+
+       int tjmax = 100000;
+       int tjmax_ee = 85000;
+       int usemsr_ee = 1;
+       int err;
+       u32 eax, edx;
+       struct pci_dev *host_bridge;
+
+       /* Early chips have no MSR for TjMax */
+
+       if (c->x86_model == 0xf && c->x86_mask < 4)
+               usemsr_ee = 0;
+
+       /* Atom CPUs */
+
+       if (c->x86_model == 0x1c) {
+               usemsr_ee = 0;
+
+               /* The device at 00:00.0 decides between 100C and 90C. */
+               host_bridge = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
+
+               if (host_bridge && host_bridge->vendor == PCI_VENDOR_ID_INTEL
+                   && (host_bridge->device == 0xa000   /* NM10 based nettop */
+                   || host_bridge->device == 0xa010))  /* NM10 based netbook */
+                       tjmax = 100000;
+               else
+                       tjmax = 90000;
+
+               pci_dev_put(host_bridge);
+       }
+
+       if (c->x86_model > 0xe && usemsr_ee) {
+               u8 platform_id;
+
+               /*
+                * Now we can detect the mobile CPU using Intel provided table
+                * http://softwarecommunity.intel.com/Wiki/Mobility/720.htm
+                * For Core2 cores, check MSR 0x17, bit 28 1 = Mobile CPU
+                */
+               err = rdmsr_safe_on_pcpu(id, 0x17, &eax, &edx);
+               if (err < 0) {
+                       dev_warn(dev,
+                                "Unable to access MSR 0x17, assuming desktop"
+                                " CPU\n");
+                       usemsr_ee = 0;
+               } else if (c->x86_model < 0x17 && !(eax & 0x10000000)) {
+                       /*
+                        * Trust bit 28 up to Penryn, I could not find any
+                        * documentation on that; if you happen to know
+                        * someone at Intel please ask
+                        */
+                       usemsr_ee = 0;
+               } else {
+                       /* Platform ID bits 52:50 (EDX starts at bit 32) */
+                       platform_id = (edx >> 18) & 0x7;
+
+                       /*
+                        * Mobile Penryn CPU seems to be platform ID 7 or 5
+                        * (guesswork)
+                        */
+                       if (c->x86_model == 0x17 &&
+                           (platform_id == 5 || platform_id == 7)) {
+                               /*
+                                * If MSR EE bit is set, set it to 90 degrees C,
+                                * otherwise 105 degrees C
+                                */
+                               tjmax_ee = 90000;
+                               tjmax = 105000;
+                       }
+               }
+       }
+
+       if (usemsr_ee) {
+               /* Bit 30 of MSR 0xEE selects the lower tjmax_ee value. */
+               err = rdmsr_safe_on_pcpu(id, 0xee, &eax, &edx);
+               if (err < 0) {
+                       dev_warn(dev,
+                                "Unable to access MSR 0xEE, for Tjmax, left"
+                                " at default\n");
+               } else if (eax & 0x40000000) {
+                       tjmax = tjmax_ee;
+               }
+       } else if (tjmax == 100000) {
+               /*
+                * If we don't use msr EE it means we are desktop CPU
+                * (with exception of Atom)
+                */
+               dev_warn(dev, "Using relative temperature scale!\n");
+       }
+
+       return tjmax;
+}
+
+/*
+ * Determine TjMax for a CPU, in degrees Celsius multiplied by 1000.
+ *
+ * Sources, in order of preference: bits 16..23 of
+ * IA32_TEMPERATURE_TARGET (if readable and non-zero), the "tjmax" module
+ * parameter, and finally the adjust_tjmax() heuristic.
+ */
+static int get_tjmax(struct platform_data *c, u32 id, struct device *dev)
+{
+       u32 lo, hi;
+       u32 degrees;
+       int rc;
+
+       /*
+        * A new feature of current Intel(R) processors, the
+        * IA32_TEMPERATURE_TARGET contains the TjMax value
+        */
+       rc = rdmsr_safe_on_pcpu(id, MSR_IA32_TEMPERATURE_TARGET, &lo, &hi);
+       if (rc >= 0) {
+               degrees = (lo >> 16) & 0xff;
+               /* A zero TjMax is not plausible; fall through if so. */
+               if (degrees) {
+                       dev_dbg(dev, "TjMax is %d degrees C\n", degrees);
+                       return degrees * 1000;
+               }
+       } else if (c->x86_model > 0xe && c->x86_model != 0x1c) {
+               dev_warn(dev, "Unable to read TjMax from CPU %u\n", id);
+       }
+
+       if (force_tjmax) {
+               dev_notice(dev, "TjMax forced to %d degrees C by user\n",
+                          force_tjmax);
+               return force_tjmax * 1000;
+       }
+
+       /*
+        * Early CPUs and unreadable MSRs end up here; the guessed value
+        * may not be correct.
+        */
+       return adjust_tjmax(c, id, dev);
+}
+
+/* Set up the read-only "name" attribute and create its sysfs file on @dev. */
+static int create_name_attr(struct platform_data *pdata, struct device *dev)
+{
+       struct device_attribute *name = &pdata->name_attr;
+
+       sysfs_attr_init(&name->attr);
+       name->show = show_name;
+       name->attr.mode = S_IRUGO;
+       name->attr.name = "name";
+
+       return device_create_file(dev, name);
+}
+
+/*
+ * Create the first tdata->attr_size sysfs files (label, crit_alarm,
+ * input, crit, max - in that order) for sensor number @attr_no.
+ * On failure every file created so far is removed again and the error
+ * from device_create_file() is returned.
+ */
+static int create_core_attrs(struct temp_data *tdata, struct device *dev,
+                               int attr_no)
+{
+       static ssize_t (*const show_fn[TOTAL_ATTRS]) (struct device *dev,
+                       struct device_attribute *devattr, char *buf) = {
+                       show_label, show_crit_alarm, show_temp, show_tjmax,
+                       show_ttarget };
+       static const char *const name_fmt[TOTAL_ATTRS] = {
+                                       "temp%d_label", "temp%d_crit_alarm",
+                                       "temp%d_input", "temp%d_crit",
+                                       "temp%d_max" };
+       int rc, n;
+
+       for (n = 0; n < tdata->attr_size; n++) {
+               struct sensor_device_attribute *sda = &tdata->sd_attrs[n];
+
+               snprintf(tdata->attr_name[n], CORETEMP_NAME_LENGTH,
+                        name_fmt[n], attr_no);
+               sysfs_attr_init(&sda->dev_attr.attr);
+               sda->dev_attr.attr.name = tdata->attr_name[n];
+               sda->dev_attr.attr.mode = S_IRUGO;
+               sda->dev_attr.show = show_fn[n];
+               sda->index = attr_no;
+
+               rc = device_create_file(dev, &sda->dev_attr);
+               if (rc) {
+                       /* Undo the files created before this one. */
+                       while (--n >= 0)
+                               device_remove_file(dev,
+                                       &tdata->sd_attrs[n].dev_attr);
+                       return rc;
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+ * Errata AE18 of Core processors: readings might stop updating after the
+ * processor visited a too deep sleep state; fixed for stepping D0 (6EC).
+ * Returns -ENODEV for an affected model/stepping/microcode combination,
+ * 0 otherwise.
+ */
+static int chk_ucode_version(unsigned int cpu, const struct cpu_info *c)
+{
+       bool affected = c->x86_model == 0xe && c->x86_mask < 0xc &&
+                       c->microcode < 0x39;
+
+       if (!affected)
+               return 0;
+
+       pr_err("Errata AE18 not fixed, update BIOS or "
+              "microcode of the CPU!\n");
+       return -ENODEV;
+}
+
+/*
+ * Look up the platform device registered for the package @cpu belongs
+ * to.  Returns NULL when no such device is on pdev_list.
+ */
+static struct platform_device *coretemp_get_pdev(unsigned int cpu)
+{
+       struct platform_device *found = NULL;
+       struct pdev_entry *entry;
+       u16 pkg_id = TO_PHYS_ID(cpu);
+
+       mutex_lock(&pdev_list_mutex);
+
+       list_for_each_entry(entry, &pdev_list, list) {
+               if (entry->phys_proc_id == pkg_id) {
+                       found = entry->pdev;
+                       break;
+               }
+       }
+
+       mutex_unlock(&pdev_list_mutex);
+       return found;
+}
+
+/*
+ * Allocate and pre-fill a sensor record for @cpu.  @pkg_flag selects the
+ * package status MSR instead of the per-core one.  Returns NULL when the
+ * allocation fails.
+ */
+static struct temp_data *init_temp_data(unsigned int cpu,
+                                       const struct cpu_info *c,
+                                       int pkg_flag)
+{
+       struct temp_data *sensor = kzalloc(sizeof(*sensor), GFP_KERNEL);
+
+       if (!sensor)
+               return NULL;
+
+       if (pkg_flag)
+               sensor->status_reg = MSR_IA32_PACKAGE_THERM_STATUS;
+       else
+               sensor->status_reg = MSR_IA32_THERM_STATUS;
+
+       sensor->is_pkg_data = pkg_flag;
+       sensor->cpu = cpu;
+       sensor->cpu_core_id = c->cpu_core_id;
+       sensor->attr_size = MAX_CORE_ATTRS;
+       mutex_init(&sensor->update_lock);
+
+       return sensor;
+}
+
+/*
+ * Register one sensor (package sensor if @pkg_flag, core sensor
+ * otherwise) for @cpu on the package device @pdev.
+ *
+ * Returns 0 on success or when the slot is already occupied, -ERANGE if
+ * the core id does not fit into core_data[], -ENOMEM on allocation
+ * failure, or the error from the status MSR probe / sysfs file creation.
+ */
+static int create_core_data(struct platform_device *pdev,
+                           unsigned int cpu,
+                           const struct cpu_info *c, int pkg_flag)
+{
+       struct temp_data *tdata;
+       struct platform_data *pdata = platform_get_drvdata(pdev);
+       u32 eax, edx;
+       int err, attr_no;
+
+       /*
+        * Find attr number for sysfs:
+        * We map the attr number to core id of the CPU
+        * The attr number is always core id + 2
+        * The Pkgtemp will always show up as temp1_*, if available
+        */
+       attr_no = pkg_flag ? 1 : CORE_ATTR_NO(c->cpu_core_id);
+
+       if (attr_no > MAX_CORE_DATA - 1)
+               return -ERANGE;
+
+       /*
+        * Provide a single set of attributes for all HT siblings of a core
+        * to avoid duplicate sensors (the processor ID and core ID of all
+        * HT siblings of a core are the same).
+        * Skip if a HT sibling of this core is already registered.
+        * This is not an error.
+        */
+       if (pdata->core_data[attr_no] != NULL)
+               return 0;
+
+       tdata = init_temp_data(cpu, c, pkg_flag);
+       if (!tdata)
+               return -ENOMEM;
+
+       /* Test if we can access the status register */
+       err = rdmsr_safe_on_pcpu(cpu, tdata->status_reg, &eax, &edx);
+       if (err < 0)
+               goto exit_free;
+
+       /* We can access status register. Get Critical Temperature */
+       tdata->tjmax = get_tjmax(pdata, cpu, &pdev->dev);
+
+       /*
+        * Read the still undocumented bits 8:15 of IA32_TEMPERATURE_TARGET.
+        * The target temperature is available on older CPUs but not in this
+        * register. Atoms don't have the register at all.
+        */
+       if (c->x86_model > 0xe && c->x86_model != 0x1c) {
+               err = rdmsr_safe_on_pcpu(cpu, MSR_IA32_TEMPERATURE_TARGET,
+                                        &eax, &edx);
+               if (err >= 0) {
+                       tdata->ttarget
+                         = tdata->tjmax - ((eax >> 8) & 0xff) * 1000;
+                       /* Expose the fifth attribute (tempN_max) as well. */
+                       tdata->attr_size++;
+               }
+       }
+
+       /*
+        * Publish the sensor before its files are created: the show
+        * callbacks look it up through core_data[attr_no].
+        */
+       pdata->core_data[attr_no] = tdata;
+
+       /* Create sysfs interfaces */
+       err = create_core_attrs(tdata, &pdev->dev, attr_no);
+       if (err)
+               goto exit_free;
+
+       return 0;
+exit_free:
+       pdata->core_data[attr_no] = NULL;
+       kfree(tdata);
+       return err;
+}
+
+/*
+ * Add a core (or, with @pkg_flag, package) sensor for @cpu to the device
+ * recorded in @c; a failure is only logged.
+ */
+static void coretemp_add_core(unsigned int cpu,
+                             const struct cpu_info *c, int pkg_flag)
+{
+       if (create_core_data(c->pdev, cpu, c, pkg_flag))
+               dev_err(&c->pdev->dev, "Adding Core %u failed\n", cpu);
+}
+
+/*
+ * Tear down the sensor in slot @indx: remove its sysfs files, free the
+ * record and clear the slot.  The slot must be populated.
+ */
+static void coretemp_remove_core(struct platform_data *pdata,
+                               struct device *dev, int indx)
+{
+       struct temp_data *sensor = pdata->core_data[indx];
+       int n = 0;
+
+       /* Remove the sysfs attributes */
+       while (n < sensor->attr_size) {
+               device_remove_file(dev, &sensor->sd_attrs[n].dev_attr);
+               n++;
+       }
+
+       kfree(sensor);
+       pdata->core_data[indx] = NULL;
+}
+
+/*
+ * Platform driver probe: create the "name" attribute and register the
+ * hwmon class device.  The name attribute is removed again if the class
+ * registration fails.
+ */
+static int coretemp_probe(struct platform_device *pdev)
+{
+       struct platform_data *pkg = platform_get_drvdata(pdev);
+       int rc;
+
+       /* Initialize the per-package data structures */
+       rc = create_name_attr(pkg, &pdev->dev);
+       if (rc)
+               return rc;
+
+       pkg->hwmon_dev = hwmon_device_register(&pdev->dev);
+       if (!IS_ERR(pkg->hwmon_dev))
+               return 0;
+
+       rc = PTR_ERR(pkg->hwmon_dev);
+       dev_err(&pdev->dev, "Class registration failed (%d)\n", rc);
+       device_remove_file(&pdev->dev, &pkg->name_attr);
+       return rc;
+}
+
+/*
+ * Platform driver remove: drop every remaining sensor (highest slot
+ * first), the "name" attribute and the hwmon device, then free the
+ * package data.
+ */
+static int coretemp_remove(struct platform_device *pdev)
+{
+       struct platform_data *pkg = platform_get_drvdata(pdev);
+       int slot = MAX_CORE_DATA;
+
+       while (slot-- > 0) {
+               if (!pkg->core_data[slot])
+                       continue;
+               coretemp_remove_core(pkg, &pdev->dev, slot);
+       }
+
+       device_remove_file(&pdev->dev, &pkg->name_attr);
+       hwmon_device_unregister(pkg->hwmon_dev);
+       platform_set_drvdata(pdev, NULL);
+       kfree(pkg);
+
+       return 0;
+}
+
+/*
+ * Platform driver bound by name (DRVNAME) to the per-package devices
+ * allocated in coretemp_device_add().
+ */
+static struct platform_driver coretemp_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = DRVNAME,
+       },
+       .probe = coretemp_probe,
+       .remove = coretemp_remove,
+};
+
+/*
+ * Create and register the platform device for the package described by
+ * @c, attach freshly allocated platform_data to it and record it on
+ * pdev_list.  On success c->pdev points to the new device.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * platform_device_add().  pdev_list_mutex is held for the whole
+ * operation.
+ */
+static int coretemp_device_add(unsigned int cpu, struct cpu_info *c)
+{
+       int err;
+       struct platform_device *pdev;
+       struct pdev_entry *pdev_entry;
+       struct platform_data *pdata = NULL;
+
+       mutex_lock(&pdev_list_mutex);
+
+       /* The package id doubles as the platform device id. */
+       pdev = platform_device_alloc(DRVNAME, c->phys_proc_id);
+       if (!pdev) {
+               err = -ENOMEM;
+               pr_err("Device allocation failed\n");
+               goto exit;
+       }
+
+       pdata = kzalloc(sizeof(struct platform_data), GFP_KERNEL);
+       if (!pdata) {
+               err = -ENOMEM;
+               goto exit_device_put;
+       }
+
+       pdata->phys_proc_id = c->phys_proc_id;
+       pdata->x86_model = c->x86_model;
+       pdata->x86_mask = c->x86_mask;
+       /* Must be set before platform_device_add(): probe reads it. */
+       platform_set_drvdata(pdev, pdata);
+
+       pdev_entry = kzalloc(sizeof(struct pdev_entry), GFP_KERNEL);
+       if (!pdev_entry) {
+               err = -ENOMEM;
+               goto exit_device_put;
+       }
+
+       err = platform_device_add(pdev);
+       if (err) {
+               pr_err("Device addition failed (%d)\n", err);
+               goto exit_device_free;
+       }
+
+       pdev_entry->pdev = pdev;
+       pdev_entry->phys_proc_id = c->phys_proc_id;
+       c->pdev = pdev;
+
+       list_add_tail(&pdev_entry->list, &pdev_list);
+       mutex_unlock(&pdev_list_mutex);
+
+       return 0;
+
+exit_device_free:
+       kfree(pdev_entry);
+exit_device_put:
+       platform_device_put(pdev);
+       /* pdata is still NULL here if its own allocation failed. */
+       kfree(pdata);
+exit:
+       mutex_unlock(&pdev_list_mutex);
+       return err;
+}
+
+/*
+ * Unregister and forget every platform device whose package id matches
+ * the package of @cpu.
+ */
+static void coretemp_device_remove(unsigned int cpu)
+{
+       u16 pkg_id = TO_PHYS_ID(cpu);
+       struct pdev_entry *entry, *next;
+
+       mutex_lock(&pdev_list_mutex);
+
+       list_for_each_entry_safe(entry, next, &pdev_list, list) {
+               if (entry->phys_proc_id == pkg_id) {
+                       platform_device_unregister(entry->pdev);
+                       list_del(&entry->list);
+                       kfree(entry);
+               }
+       }
+
+       mutex_unlock(&pdev_list_mutex);
+}
+
+/*
+ * Report whether the package still has at least one core sensor
+ * registered; the package sensor itself does not count.
+ */
+static bool is_any_core_online(struct platform_data *pdata)
+{
+       int slot = MAX_CORE_DATA;
+
+       while (slot-- > 0) {
+               const struct temp_data *sensor = pdata->core_data[slot];
+
+               if (sensor && !sensor->is_pkg_data)
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * Collect CPUID and microcode information of the CPU this function is
+ * executing on into the struct cpu_info passed as @arg.  The void *
+ * signature lets it be used with smp_call_function_single().
+ */
+static void get_cpuid_info(void *arg)
+{
+       struct cpu_info *info = arg;
+       u32 val = cpuid_eax(1);
+
+       /* Model: bits 4..7, extended by bits 16..19 as the high nibble. */
+       info->x86_model = ((val >> 4) & 0xf) | ((val >> 12) & 0xf0);
+       /* Stepping: bits 0..3. */
+       info->x86_mask = val & 0xf;
+
+       /*
+        * The microcode revision is only read for family 6 with a zero
+        * extended family field (bits 20..27) and a non-zero model; in all
+        * other cases, and when either MSR access fails, it is reported
+        * as 0.  The revision MSR is written with zero first, then
+        * sync_core() runs before it is read back; rdmsr_safe() presumably
+        * returns the high 32 bits in its last argument - confirm against
+        * asm/msr.h.
+        */
+       if (((val >> 8) & 0xf) != 6 || ((val >> 20) & 0xff)
+           || !info->x86_model
+           || wrmsr_safe(MSR_IA32_UCODE_REV, 0, 0) < 0
+           || (sync_core(), rdmsr_safe(MSR_IA32_UCODE_REV,
+                                       &val, &info->microcode)) < 0)
+               info->microcode = 0;
+
+       /* CPUID leaf 6 is only queried if the maximum leaf allows it. */
+       info->cpuid_6_eax = cpuid_eax(0) >= 6 ? cpuid_eax(6) : 0;
+}
+
+/*
+ * Bring the sensor(s) of physical CPU @cpu online.
+ *
+ * The CPUID/microcode data is gathered on the target CPU, either by
+ * temporarily setting the physical CPU affinity or, when that reports a
+ * positive error, via a cross-CPU function call.  If the CPU advertises a
+ * thermal sensor, the package device is created on first use (together
+ * with the package sensor when CPUID.06H.EAX bit 6 is set) and the core
+ * sensor is added.  All failures make the function return silently.
+ */
+static void get_core_online(unsigned int cpu)
+{
+       struct cpu_info info;
+       struct platform_device *pdev = coretemp_get_pdev(cpu);
+       int err;
+
+       info.pdev = pdev;
+
+       err = xen_set_physical_cpu_affinity(cpu);
+       if (!err) {
+               get_cpuid_info(&info);
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (err > 0) {
+               static bool warned;
+
+               if (!warned) {
+                       warned = true;
+                       /*
+                        * pr_fmt() already prefixes the module name, so
+                        * the message itself carries no driver name.
+                        */
+                       pr_warn("Cannot set physical CPU affinity"
+                               " (assuming use of dom0_vcpus_pin)\n");
+               }
+               err = smp_call_function_single(cpu, get_cpuid_info, &info, 1);
+       }
+       if (err)
+               return;
+
+       /*
+        * CPUID.06H.EAX[0] indicates whether the CPU has thermal
+        * sensors. We check this bit only, all the early CPUs
+        * without thermal sensors will be filtered out.
+        */
+       if (!(info.cpuid_6_eax & 0x1))
+               return;
+
+       err = xen_get_topology_info(cpu, &info.cpu_core_id,
+                                   &info.phys_proc_id, NULL);
+       if (err)
+               return;
+
+       if (!pdev) {
+               /* Check the microcode version of the CPU */
+               if (chk_ucode_version(cpu, &info))
+                       return;
+
+               /*
+                * Alright, we have DTS support.
+                * We are bringing the _first_ core in this pkg
+                * online. So, initialize per-pkg data structures and
+                * then bring this core online.
+                */
+               err = coretemp_device_add(cpu, &info);
+               if (err)
+                       return;
+               /*
+                * Check whether pkgtemp support is available.
+                * If so, add interfaces for pkgtemp.
+                */
+               if (info.cpuid_6_eax & 0x40)
+                       coretemp_add_core(cpu, &info, 1);
+       }
+       /*
+        * The physical CPU device exists at this point (either found
+        * above or just created), so add the interfaces for this core.
+        */
+       coretemp_add_core(cpu, &info, 0);
+}
+
+/*
+ * Handle physical CPU @cpu going away: remove its core sensor if this CPU
+ * was the one backing it, try to hand the sensor over to another CPU with
+ * the same package and core id, and remove the whole package device once
+ * no core sensor is left.
+ */
+static void put_core_offline(unsigned int cpu)
+{
+       int i, indx;
+       struct platform_data *pdata;
+       struct platform_device *pdev = coretemp_get_pdev(cpu);
+       u32 cpu_core_id, phys_proc_id;
+
+       /* If the physical CPU device does not exist, just return */
+       if (!pdev)
+               return;
+
+       pdata = platform_get_drvdata(pdev);
+
+       if (xen_get_topology_info(cpu, &cpu_core_id, &phys_proc_id, NULL))
+               return;
+       indx = CORE_ATTR_NO(cpu_core_id);
+
+       /* The core id is too big, just return */
+       if (indx > MAX_CORE_DATA - 1)
+               return;
+
+       /* Only tear down the sensor if this very CPU registered it. */
+       if (pdata->core_data[indx] && pdata->core_data[indx]->cpu == cpu)
+               coretemp_remove_core(pdata, &pdev->dev, indx);
+
+       /*
+        * If a HT sibling of a core is taken offline, but another HT sibling
+        * of the same core is still online, register the alternate sibling.
+        * This ensures that exactly one set of attributes is provided as long
+        * as at least one HT sibling of a core is online.
+        *
+        * The scan has no upper bound of its own: it ends at the first
+        * matching sibling, or when xen_get_topology_info() fails with
+        * anything other than -ENOENT (-ENOENT merely skips that number).
+        */
+       for (i = 0; ; ++i) {
+               u32 cid, pid;
+               int err;
+
+               if (i != cpu) {
+                       err = xen_get_topology_info(i, &cid, &pid, NULL);
+                       if (err == -ENOENT)
+                               continue;
+                       if (err)
+                               break;
+                       if (pid != phys_proc_id || cid != cpu_core_id)
+                               continue;
+                       get_core_online(i);
+                       /*
+                        * Display temperature sensor data for one HT sibling
+                        * per core only, so abort the loop after one such
+                        * sibling has been found.
+                        */
+                       break;
+               }
+       }
+       /*
+        * If all cores in this pkg are offline, remove the device.
+        * coretemp_device_remove calls unregister_platform_device,
+        * which in turn calls coretemp_remove. This removes the
+        * pkgtemp entry and does other clean ups.
+        */
+       if (!is_any_core_online(pdata))
+               coretemp_device_remove(cpu);
+}
+
+/*
+ * Notifier callback: CPU_ONLINE adds the sensors of the CPU, CPU_DEAD
+ * removes them; every other action is ignored.  Always returns NOTIFY_OK.
+ */
+static int coretemp_cpu_callback(struct notifier_block *nfb,
+                                unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long) hcpu;
+
+       if (action == CPU_ONLINE)
+               get_core_online(cpu);
+       else if (action == CPU_DEAD)
+               put_core_offline(cpu);
+
+       return NOTIFY_OK;
+}
+
+/* Registered with register_pcpu_notifier() in coretemp_init(). */
+static struct notifier_block coretemp_cpu_notifier = {
+       .notifier_call = coretemp_cpu_callback,
+};
+
+/*
+ * CPU match table: any Intel CPU, regardless of family and model, that
+ * has the X86_FEATURE_DTS feature flag.  Checked in coretemp_init() and
+ * exported as a module device table.
+ */
+static const struct x86_cpu_id coretemp_ids[] = {
+       { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_DTS },
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, coretemp_ids);
+
+static int __init coretemp_init(void)
+{
+       int err = -ENODEV;
+
+       if (!is_initial_xendomain())
+               goto exit;
+
+       /*
+        * CPUID.06H.EAX[0] indicates whether the CPU has thermal
+        * sensors. We check this bit only, all the early CPUs
+        * without thermal sensors will be filtered out.
+        */
+       if (!x86_match_cpu(coretemp_ids))
+               return -ENODEV;
+
+       err = platform_driver_register(&coretemp_driver);
+       if (err)
+               goto exit;
+
+       err = register_pcpu_notifier(&coretemp_cpu_notifier);
+       if (err)
+               goto exit_driver_unreg;
+
+#ifndef CONFIG_ACPI_HOTPLUG_CPU
+       if (list_empty(&pdev_list)) {
+               unregister_pcpu_notifier(&coretemp_cpu_notifier);
+               err = -ENODEV;
+               goto exit_driver_unreg;
+       }
+#endif
+
+       return 0;
+
+exit_driver_unreg:
+       platform_driver_unregister(&coretemp_driver);
+exit:
+       return err;
+}
+
+/*
+ * Module exit: stop receiving CPU notifications, unregister every
+ * package device still on pdev_list, then unregister the driver.
+ */
+static void __exit coretemp_exit(void)
+{
+       struct pdev_entry *entry, *next;
+
+       unregister_pcpu_notifier(&coretemp_cpu_notifier);
+
+       mutex_lock(&pdev_list_mutex);
+       list_for_each_entry_safe(entry, next, &pdev_list, list) {
+               struct platform_device *pdev = entry->pdev;
+
+               platform_device_unregister(pdev);
+               list_del(&entry->list);
+               kfree(entry);
+       }
+       mutex_unlock(&pdev_list_mutex);
+
+       platform_driver_unregister(&coretemp_driver);
+}
+
+/* Module metadata and entry points. */
+MODULE_AUTHOR("Rudolf Marek <r.marek@assembler.cz>");
+MODULE_DESCRIPTION("Intel Core temperature monitor");
+MODULE_LICENSE("GPL");
+
+module_init(coretemp_init)
+module_exit(coretemp_exit)
diff --git a/drivers/hwmon/via-cputemp-xen.c b/drivers/hwmon/via-cputemp-xen.c

new file mode 100644 (file)

index 0000000..96c17cc
--- /dev/null
+++ b/drivers/hwmon/via-cputemp-xen.c
@@ -0,0 +1,397 @@
+/*
+ * via-cputemp.c - Driver for VIA CPU core temperature monitoring
+ * Copyright (C) 2009 VIA Technologies, Inc.
+ *
+ * based on existing coretemp.c, which is
+ *
+ * Copyright (C) 2007 Rudolf Marek <r.marek@assembler.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301 USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-vid.h>
+#include <linux/sysfs.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/platform_device.h>
+#include <linux/cpu.h>
+#include <asm/msr.h>
+#include <asm/cpu_device_id.h>
+#include <xen/pcpu.h>
+#include "../xen/core/domctl.h"
+
+#define DRVNAME        "via_cputemp"
+
+enum { SHOW_TEMP, SHOW_LABEL, SHOW_NAME };
+
+/*
+ * Per-device state, one instance per registered platform device
+ */
+
+struct pdev_entry {
+       /* link in pdev_list */
+       struct list_head list;
+       /* platform device; its id is the CPU number passed to device_add */
+       struct platform_device *pdev;
+       /* handle returned by hwmon_device_register() */
+       struct device *hwmon_dev;
+       /* string reported by the sysfs "name" attribute */
+       const char *name;
+       /* CPU model decoded from CPUID leaf 1 */
+       u8 x86_model;
+       /* value from vid_which_vrm(); stays 0 when no VID MSR is known */
+       u8 vrm;
+       /* MSR read for temp1_input */
+       u32 msr_temp;
+       /* MSR read for cpu0_vid; 0 when the model has none */
+       u32 msr_vid;
+};
+/* The driver data and the list entry are one and the same structure. */
+#define via_cputemp_data pdev_entry
+
+/*
+ * Sysfs stuff
+ */
+
+/*
+ * sysfs show handler shared by "name" and "temp1_label": the attribute
+ * index selects between the driver name and the "Core <id>" label.
+ */
+static ssize_t show_name(struct device *dev, struct device_attribute
+                         *devattr, char *buf)
+{
+       struct via_cputemp_data *data = dev_get_drvdata(dev);
+
+       if (to_sensor_dev_attr(devattr)->index == SHOW_NAME)
+               return sprintf(buf, "%s\n", data->name);
+
+       /* any other index shows the label */
+       return sprintf(buf, "Core %d\n", data->pdev->id);
+}
+
+/*
+ * sysfs show handler for "temp1_input": read the temperature MSR on the
+ * device's CPU and print the low 24 bits of EAX multiplied by 1000.
+ * Returns -EAGAIN when the MSR cannot be read.
+ */
+static ssize_t show_temp(struct device *dev,
+                        struct device_attribute *devattr, char *buf)
+{
+       struct via_cputemp_data *data = dev_get_drvdata(dev);
+       u32 lo, hi;
+       unsigned long value;
+
+       if (rdmsr_safe_on_pcpu(data->pdev->id, data->msr_temp, &lo, &hi) < 0)
+               return -EAGAIN;
+
+       value = ((unsigned long)lo & 0xffffff) * 1000;
+       return sprintf(buf, "%lu\n", value);
+}
+
+/*
+ * sysfs show handler for "cpu0_vid": read the VID MSR on the device's
+ * CPU and convert the inverted low 7 bits of EDX with vid_from_reg().
+ * Returns -EAGAIN when the MSR cannot be read.
+ */
+static ssize_t show_cpu_vid(struct device *dev,
+                           struct device_attribute *devattr, char *buf)
+{
+       struct via_cputemp_data *data = dev_get_drvdata(dev);
+       u32 lo, hi;
+
+       if (rdmsr_safe_on_pcpu(data->pdev->id, data->msr_vid, &lo, &hi) < 0)
+               return -EAGAIN;
+
+       return sprintf(buf, "%d\n", vid_from_reg(~hi & 0x7f, data->vrm));
+}
+
+/*
+ * Read-only attributes.  The last macro argument becomes the attribute
+ * index that show_name() inspects to tell "name" from "temp1_label".
+ */
+static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, show_temp, NULL,
+                         SHOW_TEMP);
+static SENSOR_DEVICE_ATTR(temp1_label, S_IRUGO, show_name, NULL, SHOW_LABEL);
+static SENSOR_DEVICE_ATTR(name, S_IRUGO, show_name, NULL, SHOW_NAME);
+
+/* Attributes created for every device by via_cputemp_probe(). */
+static struct attribute *via_cputemp_attributes[] = {
+       &sensor_dev_attr_name.dev_attr.attr,
+       &sensor_dev_attr_temp1_label.dev_attr.attr,
+       &sensor_dev_attr_temp1_input.dev_attr.attr,
+       NULL
+};
+
+static const struct attribute_group via_cputemp_group = {
+       .attrs = via_cputemp_attributes,
+};
+
+/* Optional attributes */
+/* cpu0_vid is only created when probe obtained a non-zero vrm value. */
+static DEVICE_ATTR(cpu0_vid, S_IRUGO, show_cpu_vid, NULL);
+
+/*
+ * Platform driver probe: pick the MSR numbers for the CPU model stored
+ * in the driver data, verify the temperature MSR is readable, create
+ * the sysfs attributes and register the hwmon device.
+ *
+ * The driver data must already be attached to @pdev (it is set by
+ * via_cputemp_device_add() before the device is added).
+ *
+ * Returns 0 on success, -ENODEV for an unhandled model, or the error
+ * from the failing step.
+ */
+static int via_cputemp_probe(struct platform_device *pdev)
+{
+       struct via_cputemp_data *data = platform_get_drvdata(pdev);
+       int err;
+       u32 eax, edx;
+
+       data->name = "via_cputemp";
+
+       switch (data->x86_model) {
+       case 0xA:
+               /* C7 A */
+       case 0xD:
+               /* C7 D */
+               data->msr_temp = 0x1169;
+               data->msr_vid = 0x198;
+               break;
+       case 0xF:
+               /* Nano */
+               /* no VID MSR is set here, so msr_vid keeps its prior value */
+               data->msr_temp = 0x1423;
+               break;
+       default:
+               return -ENODEV;
+       }
+
+       /* test if we can access the TEMPERATURE MSR */
+       err = rdmsr_safe_on_pcpu(pdev->id, data->msr_temp, &eax, &edx);
+       if (err < 0) {
+               dev_err(&pdev->dev,
+                       "Unable to access TEMPERATURE MSR, giving up\n");
+               return err;
+       }
+
+       err = sysfs_create_group(&pdev->dev.kobj, &via_cputemp_group);
+       if (err)
+               return err;
+
+       /* the VID attribute needs both a VID MSR and a non-zero vrm */
+       if (data->msr_vid)
+               data->vrm = vid_which_vrm();
+
+       if (data->vrm) {
+               err = device_create_file(&pdev->dev, &dev_attr_cpu0_vid);
+               if (err)
+                       goto exit_remove;
+       }
+
+       data->hwmon_dev = hwmon_device_register(&pdev->dev);
+       if (IS_ERR(data->hwmon_dev)) {
+               err = PTR_ERR(data->hwmon_dev);
+               dev_err(&pdev->dev, "Class registration failed (%d)\n",
+                       err);
+               goto exit_remove;
+       }
+
+       return 0;
+
+exit_remove:
+       /* undo the attribute creation in reverse order */
+       if (data->vrm)
+               device_remove_file(&pdev->dev, &dev_attr_cpu0_vid);
+       sysfs_remove_group(&pdev->dev.kobj, &via_cputemp_group);
+       return err;
+}
+
+/*
+ * Platform driver remove: unregister the hwmon device and delete the
+ * sysfs attributes created by via_cputemp_probe().  Always returns 0.
+ */
+static int via_cputemp_remove(struct platform_device *pdev)
+{
+       struct via_cputemp_data *priv = platform_get_drvdata(pdev);
+       struct device *dev = &pdev->dev;
+
+       hwmon_device_unregister(priv->hwmon_dev);
+
+       /* cpu0_vid exists only when probe obtained a non-zero vrm */
+       if (priv->vrm)
+               device_remove_file(dev, &dev_attr_cpu0_vid);
+
+       sysfs_remove_group(&dev->kobj, &via_cputemp_group);
+
+       return 0;
+}
+
+/*
+ * Platform driver; its name is DRVNAME, the same name used when the
+ * per-CPU platform devices are allocated.
+ */
+static struct platform_driver via_cputemp_driver = {
+       .driver = {
+               .owner = THIS_MODULE,
+               .name = DRVNAME,
+       },
+       .probe = via_cputemp_probe,
+       .remove = via_cputemp_remove,
+};
+
+/* All registered devices; additions and removals take pdev_list_mutex. */
+static LIST_HEAD(pdev_list);
+static DEFINE_MUTEX(pdev_list_mutex);
+
+/*
+ * Argument block for get_cpuid_info(): the entry whose x86_model is to
+ * be filled in, plus the CPU family the callback reports back.
+ */
+struct cpu_info {
+       struct pdev_entry *pdev_entry;
+       u8 x86;
+};
+
+/*
+ * Decode family and model of the executing CPU from CPUID leaf 1 EAX.
+ *
+ * @arg points to a struct cpu_info: the family is stored in ->x86 and
+ * the model in ->pdev_entry->x86_model.  The void * signature lets the
+ * function double as an smp_call_function_single() callback.
+ */
+static void get_cpuid_info(void *arg)
+{
+       struct cpu_info *info = arg;
+       struct pdev_entry *pdev_entry = info->pdev_entry;
+       u32 val = cpuid_eax(1);
+       unsigned int family = (val >> 8) & 0xf;
+
+       /*
+        * The extended family field (bits 27:20) only contributes when the
+        * base family is 0xf.  Clamp the sum so it cannot wrap the u8
+        * result and masquerade as a lower family number.
+        */
+       if (family == 0xf)
+               family = min_t(unsigned int, family + ((val >> 20) & 0xff),
+                              0xff);
+
+       info->x86 = family;
+       /* model = base model (bits 7:4) | extended model (19:16) << 4 */
+       pdev_entry->x86_model = ((val >> 4) & 0xf) | ((val >> 12) & 0xf0);
+}
+
+/*
+ * Create and register the platform device for CPU @cpu.
+ *
+ * Family and model are read with CPUID on the target CPU, either after
+ * moving to it with xen_set_physical_cpu_affinity() or, when that call
+ * returns a positive value, through smp_call_function_single().
+ *
+ * Returns 0 when the device was added and also when the CPU is simply
+ * not one this driver handles (nothing is registered in that case);
+ * returns a negative errno on failure.
+ */
+static int via_cputemp_device_add(unsigned int cpu)
+{
+       int err;
+       struct cpu_info info;
+       struct platform_device *pdev;
+       struct pdev_entry *pdev_entry;
+
+       pdev_entry = kzalloc(sizeof(*pdev_entry), GFP_KERNEL);
+       if (!pdev_entry)
+               return -ENOMEM;
+
+       info.pdev_entry = pdev_entry;
+       err = xen_set_physical_cpu_affinity(cpu);
+       if (!err) {
+               get_cpuid_info(&info);
+               /* -1 undoes the affinity change made above */
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (err > 0) {
+               /*
+                * Use the pr_fmt() prefix like every other message in this
+                * function; the hand-built DRVNAME prefix lacked a
+                * separator and ran into the message text.
+                */
+               pr_warn_once("Cannot set physical CPU affinity"
+                            " (assuming use of dom0_vcpus_pin)\n");
+               err = smp_call_function_single(cpu, get_cpuid_info, &info, 1);
+       }
+       if (err)
+               goto exit_entry_free;
+
+       /* Unsupported CPUs are skipped silently; err is still 0 here. */
+       if (info.x86 != 6)
+               goto exit_entry_free;
+
+       if (pdev_entry->x86_model < 0x0a)
+               goto exit_entry_free;
+
+       if (pdev_entry->x86_model > 0x0f) {
+               pr_warn("Unknown CPU model 0x%x\n", pdev_entry->x86_model);
+               goto exit_entry_free;
+       }
+
+       pdev = platform_device_alloc(DRVNAME, cpu);
+       if (!pdev) {
+               err = -ENOMEM;
+               pr_err("Device allocation failed\n");
+               goto exit_entry_free;
+       }
+
+       /* probe reads the driver data, so attach it before adding */
+       platform_set_drvdata(pdev, pdev_entry);
+       pdev_entry->pdev = pdev;
+
+       err = platform_device_add(pdev);
+       if (err) {
+               pr_err("Device addition failed (%d)\n", err);
+               goto exit_device_put;
+       }
+
+       mutex_lock(&pdev_list_mutex);
+       list_add_tail(&pdev_entry->list, &pdev_list);
+       mutex_unlock(&pdev_list_mutex);
+
+       return 0;
+
+exit_device_put:
+       platform_device_put(pdev);
+exit_entry_free:
+       kfree(pdev_entry);
+       return err;
+}
+
+/*
+ * Unregister and free the device whose platform id equals @cpu, if one
+ * is on pdev_list.  Only the first match is removed.
+ */
+static void via_cputemp_device_remove(unsigned int cpu)
+{
+       struct pdev_entry *pos, *found = NULL;
+
+       mutex_lock(&pdev_list_mutex);
+       list_for_each_entry(pos, &pdev_list, list) {
+               if (pos->pdev->id == cpu) {
+                       found = pos;
+                       break;
+               }
+       }
+       if (found) {
+               platform_device_unregister(found->pdev);
+               list_del(&found->list);
+       }
+       mutex_unlock(&pdev_list_mutex);
+
+       /* freed outside the lock; kfree(NULL) is a no-op */
+       kfree(found);
+}
+
+/*
+ * CPU notifier callback: add a device on CPU_ONLINE, remove it on
+ * CPU_DEAD, ignore every other action.  The result of the add is not
+ * propagated; NOTIFY_OK is always returned.
+ */
+static int via_cputemp_cpu_callback(struct notifier_block *nfb,
+                                unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (unsigned long) hcpu;
+
+       if (action == CPU_ONLINE)
+               via_cputemp_device_add(cpu);
+       else if (action == CPU_DEAD)
+               via_cputemp_device_remove(cpu);
+
+       return NOTIFY_OK;
+}
+
+/* Registered with register_pcpu_notifier() in via_cputemp_init(). */
+static struct notifier_block via_cputemp_cpu_notifier = {
+       .notifier_call = via_cputemp_cpu_callback,
+};
+
+/*
+ * CPU match table: Centaur family 6, models 0xa, 0xd and 0xf.  The
+ * empty entry terminates the table.
+ */
+static const struct x86_cpu_id cputemp_ids[] = {
+       { X86_VENDOR_CENTAUR, 6, 0xa, }, /* C7 A */
+       { X86_VENDOR_CENTAUR, 6, 0xd, }, /* C7 D */
+       { X86_VENDOR_CENTAUR, 6, 0xf, }, /* Nano */
+       {}
+};
+MODULE_DEVICE_TABLE(x86cpu, cputemp_ids);
+
+static int __init via_cputemp_init(void)
+{
+       int err;
+
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       if (!x86_match_cpu(cputemp_ids))
+               return -ENODEV;
+
+       err = platform_driver_register(&via_cputemp_driver);
+       if (err)
+               goto exit;
+
+       err = register_pcpu_notifier(&via_cputemp_cpu_notifier);
+       if (err)
+               goto exit_driver_unreg;
+
+#ifndef CONFIG_ACPI_HOTPLUG_CPU
+       if (list_empty(&pdev_list)) {
+               unregister_pcpu_notifier(&via_cputemp_cpu_notifier);
+               err = -ENODEV;
+               goto exit_driver_unreg;
+       }
+#endif
+
+       return 0;
+
+exit_driver_unreg:
+       platform_driver_unregister(&via_cputemp_driver);
+exit:
+       return err;
+}
+
+/*
+ * Module exit: stop CPU notifications, unregister and free every device
+ * still on pdev_list, then unregister the platform driver.
+ */
+static void __exit via_cputemp_exit(void)
+{
+       struct pdev_entry *entry;
+
+       unregister_pcpu_notifier(&via_cputemp_cpu_notifier);
+
+       mutex_lock(&pdev_list_mutex);
+       while (!list_empty(&pdev_list)) {
+               entry = list_first_entry(&pdev_list, struct pdev_entry, list);
+               platform_device_unregister(entry->pdev);
+               list_del(&entry->list);
+               kfree(entry);
+       }
+       mutex_unlock(&pdev_list_mutex);
+
+       platform_driver_unregister(&via_cputemp_driver);
+}
+
+MODULE_AUTHOR("Harald Welte <HaraldWelte@viatech.com>");
+MODULE_DESCRIPTION("VIA CPU temperature monitor");
+MODULE_LICENSE("GPL");
+
+module_init(via_cputemp_init)
+module_exit(via_cputemp_exit)
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c

index d9c9829..6ca6423 100644 (file)
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -19,6 +19,7 @@ void ide_toggle_bounce(ide_drive_t *drive, int on)
  {
         u64 addr = BLK_BOUNCE_HIGH;     /* dma64_addr_t */
  
+#ifndef CONFIG_XEN
         if (!PCI_DMA_BUS_IS_PHYS) {
                 addr = BLK_BOUNCE_ANY;
         } else if (on && drive->media == ide_disk) {
@@ -27,6 +28,16 @@ void ide_toggle_bounce(ide_drive_t *drive, int on)
                 if (dev && dev->dma_mask)
                         addr = *dev->dma_mask;
         }
+#else
+       if (on && drive->media == ide_disk) {
+               struct device *dev = drive->hwif->dev;
+
+               if (!PCI_DMA_BUS_IS_PHYS)
+                       addr = BLK_BOUNCE_ANY;
+               else if (dev && dev->dma_mask)
+                       addr = *dev->dma_mask;
+       }
+#endif
  
         if (drive->queue)
                 blk_queue_bounce_limit(drive->queue, addr);
diff --git a/drivers/idle/Kconfig b/drivers/idle/Kconfig

index 8489eb5..9d643c1 100644 (file)
--- a/drivers/idle/Kconfig
+++ b/drivers/idle/Kconfig
@@ -10,7 +10,7 @@ config INTEL_IDLE
           processors intel_idle does not support.
  
  menu "Memory power savings"
-depends on X86_64
+depends on X86_64 && !XEN
  
  config I7300_IDLE_IOAT_CHANNEL
         bool
diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig

index 7faf4a7..04b635a 100644 (file)
--- a/drivers/input/misc/Kconfig
+++ b/drivers/input/misc/Kconfig
@@ -579,7 +579,7 @@ config INPUT_CMA3000_I2C
  
  config INPUT_XEN_KBDDEV_FRONTEND
         tristate "Xen virtual keyboard and mouse support"
-       depends on XEN
+       depends on PARAVIRT_XEN
         default y
         select XEN_XENBUS_FRONTEND
         help
diff --git a/drivers/input/mouse/Kconfig b/drivers/input/mouse/Kconfig

index 9b8db82..700fed3 100644 (file)
--- a/drivers/input/mouse/Kconfig
+++ b/drivers/input/mouse/Kconfig
@@ -19,6 +19,7 @@ config MOUSE_PS2
         select SERIO_LIBPS2
         select SERIO_I8042 if X86
         select SERIO_GSCPS2 if GSC
+       select LEDS_CLASS if MOUSE_PS2_SYNAPTICS_LED
         help
           Say Y here if you have a PS/2 mouse connected to your system. This
           includes the standard 2 or 3-button PS/2 mouse, as well as PS/2
@@ -68,6 +69,14 @@ config MOUSE_PS2_SYNAPTICS
  
           If unsure, say Y.
  
+config MOUSE_PS2_SYNAPTICS_LED
+       bool "Support embedded LED on Synaptics devices"
+       depends on MOUSE_PS2_SYNAPTICS
+       select NEW_LEDS
+       help
+         Say Y here if you have a Synaptics device with an embedded LED.
+         This will enable LED class driver to control the LED device.
+
  config MOUSE_PS2_LIFEBOOK
         bool "Fujitsu Lifebook PS/2 mouse protocol extension" if EXPERT
         default y
diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c

index a4b14a4..a98c967 100644 (file)
--- a/drivers/input/mouse/synaptics.c
+++ b/drivers/input/mouse/synaptics.c
@@ -29,6 +29,7 @@
  #include <linux/input/mt.h>
  #include <linux/serio.h>
  #include <linux/libps2.h>
+#include <linux/leds.h>
  #include <linux/slab.h>
  #include "psmouse.h"
  #include "synaptics.h"
@@ -431,6 +432,110 @@ static void synaptics_pt_create(struct psmouse *psmouse)
         serio_register_port(serio);
  }
  
+#ifdef CONFIG_MOUSE_PS2_SYNAPTICS_LED
+/*
+ * LED handling:
+ * Some Synaptics devices have an embedded LED at the top-left corner.
+ */
+
+/* State of the LED class device attached to one touchpad. */
+struct synaptics_led {
+       /* touchpad the LED commands are sent to */
+       struct psmouse *psmouse;
+       /* work item that runs synaptics_led_work() */
+       struct work_struct work;
+       /* LED class device; its brightness is what gets applied */
+       struct led_classdev cdev;
+};
+
+/*
+ * Send the command sequence that switches the LED on (@on non-zero) or
+ * off.  The sequence is abandoned at the first failing command; the
+ * failure is not reported to the caller.
+ */
+static void synaptics_set_led(struct psmouse *psmouse, int on)
+{
+       int i;
+       /* NOTE(review): 0x88/0x10 are presumably the LED on/off codes - confirm */
+       unsigned char cmd = on ? 0x88 : 0x10;
+
+       ps2_begin_command(&psmouse->ps2dev);
+       if (__ps2_command(&psmouse->ps2dev, NULL, PSMOUSE_CMD_SETSCALE11))
+               goto out;
+       /* send cmd two bits at a time, highest pair first */
+       for (i = 6; i >= 0; i -= 2) {
+               unsigned char d = (cmd >> i) & 3;
+               if (__ps2_command(&psmouse->ps2dev, &d, PSMOUSE_CMD_SETRES))
+                       goto out;
+       }
+       /* finish with a set-rate command carrying 0x0a; result ignored */
+       cmd = 0x0a;
+       __ps2_command(&psmouse->ps2dev, &cmd, PSMOUSE_CMD_SETRATE);
+ out:
+       ps2_end_command(&psmouse->ps2dev);
+}
+
+/* Work handler: apply the class device's current brightness to the LED. */
+static void synaptics_led_work(struct work_struct *work)
+{
+       struct synaptics_led *led =
+               container_of(work, struct synaptics_led, work);
+
+       synaptics_set_led(led->psmouse, led->cdev.brightness);
+}
+
+/*
+ * LED class brightness_set hook.  @value is not used here: the hook
+ * only queues the work item, which reads the brightness from the class
+ * device when it runs.
+ */
+static void synaptics_led_cdev_brightness_set(struct led_classdev *cdev,
+                                             enum led_brightness value)
+{
+       schedule_work(&container_of(cdev, struct synaptics_led, cdev)->work);
+}
+
+/* Re-apply the current brightness to the hardware; no-op without an LED. */
+static void synaptics_sync_led(struct psmouse *psmouse)
+{
+       struct synaptics_data *priv = psmouse->private;
+
+       if (!priv->led)
+               return;
+
+       synaptics_set_led(psmouse, priv->led->cdev.brightness);
+}
+
+/*
+ * Register an LED class device for touchpads that have an LED.
+ *
+ * Returns 0 when the device has no LED or registration succeeded, and a
+ * negative errno when allocation or registration failed.
+ */
+static int synaptics_init_led(struct psmouse *psmouse)
+{
+       struct synaptics_data *priv = psmouse->private;
+       struct synaptics_led *led;
+       int err;
+
+       /*
+        * FIXME: the LED should be detectable through cap0c[1] 0x20, but
+        * that does not seem to work on real machines, so the product id
+        * is checked instead.
+        */
+       if (!priv->ext_cap_0c)
+               return 0;
+       if (SYN_CAP_PRODUCT_ID(priv->ext_cap) != 0xe4)
+               return 0;
+
+       printk(KERN_INFO "synaptics: support LED control\n");
+
+       led = kzalloc(sizeof(*led), GFP_KERNEL);
+       if (!led)
+               return -ENOMEM;
+
+       INIT_WORK(&led->work, synaptics_led_work);
+       led->psmouse = psmouse;
+       led->cdev.name = "psmouse::synaptics";
+       led->cdev.brightness_set = synaptics_led_cdev_brightness_set;
+       led->cdev.flags = LED_CORE_SUSPENDRESUME;
+
+       err = led_classdev_register(NULL, &led->cdev);
+       if (err >= 0) {
+               priv->led = led;
+               return 0;
+       }
+
+       kfree(led);
+       return err;
+}
+
+/*
+ * Unregister the LED class device, switch the LED off and free the LED
+ * state.  Does nothing when no LED was registered.
+ */
+static void synaptics_free_led(struct psmouse *psmouse)
+{
+       struct synaptics_data *priv = psmouse->private;
+       struct synaptics_led *led = priv->led;
+
+       if (!led)
+               return;
+
+       /*
+        * Unregister first: while the class device exists,
+        * synaptics_led_cdev_brightness_set() can still be invoked (the
+        * LED core may also call it while unregistering) and would queue
+        * led->work again.  Only once nothing can queue the work any more
+        * is it safe to flush it and free the structure it lives in.
+        */
+       led_classdev_unregister(&led->cdev);
+       cancel_work_sync(&led->work);
+       synaptics_set_led(psmouse, 0);
+
+       /* do not leave a dangling pointer behind */
+       priv->led = NULL;
+       kfree(led);
+}
+#else
+/* Stubs used when LED support is compiled out: init reports success. */
+#define synaptics_init_led(ps) 0
+#define synaptics_free_led(ps) do {} while (0)
+#define synaptics_sync_led(ps) do {} while (0)
+#endif
+
  /*****************************************************************************
   *     Functions to interpret the absolute mode packets
   ****************************************************************************/
@@ -1276,6 +1381,7 @@ static void synaptics_disconnect(struct psmouse *psmouse)
                 device_remove_file(&psmouse->ps2dev.serio->dev,
                                    &psmouse_attr_disable_gesture.dattr);
  
+       synaptics_free_led(psmouse);
         synaptics_reset(psmouse);
         kfree(priv);
         psmouse->private = NULL;
@@ -1332,6 +1438,8 @@ static int synaptics_reconnect(struct psmouse *psmouse)
                 return -1;
         }
  
+       synaptics_sync_led(psmouse);
+
         return 0;
  }
  
@@ -1441,6 +1549,9 @@ static int __synaptics_init(struct psmouse *psmouse, bool absolute_mode)
                      priv->model_id,
                      priv->capabilities, priv->ext_cap, priv->ext_cap_0c);
  
+       if (synaptics_init_led(psmouse) < 0)
+               goto init_fail;
+
         set_input_params(psmouse->dev, priv);
  
         /*
diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h

index fd26ccc..c8582c7 100644 (file)
--- a/drivers/input/mouse/synaptics.h
+++ b/drivers/input/mouse/synaptics.h
@@ -145,6 +145,8 @@ struct synaptics_hw_state {
         struct synaptics_mt_state mt_state;
  };
  
+struct synaptics_led;
+
  struct synaptics_data {
         /* Data read from the touchpad */
         unsigned long int model_id;             /* Model-ID */
@@ -174,6 +176,7 @@ struct synaptics_data {
          */
         struct synaptics_hw_state agm;
         bool agm_pending;                       /* new AGM packet received */
+       struct synaptics_led *led;
  };
  
  void synaptics_module_init(void);
diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h

index 5ec774d..f769269 100644 (file)
--- a/drivers/input/serio/i8042-x86ia64io.h
+++ b/drivers/input/serio/i8042-x86ia64io.h
@@ -382,6 +382,13 @@ static const struct dmi_system_id __initconst i8042_dmi_nomux_table[] = {
                 },
         },
         {
+               /* Acer Aspire 5710 */
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 5710"),
+               },
+       },
+       {
                 /* Gericom Bellagio */
                 .matches = {
                         DMI_MATCH(DMI_SYS_VENDOR, "Gericom"),
diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig

index 2a21419..71acc12 100644 (file)
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -294,6 +294,18 @@ config TOUCHSCREEN_ELO
           To compile this driver as a module, choose M here: the
           module will be called elo.
  
+config TOUCHSCREEN_ELOUSB
+       tristate "Elo USB touchscreens"
+       select USB
+       help
+         Say Y here if you have an Elo USB touchscreen connected to
+         your system.
+
+         If unsure, say N.
+
+         To compile this driver as a module, choose M here: the
+         module will be called elousb.
+
  config TOUCHSCREEN_WACOM_W8001
         tristate "Wacom W8001 penabled serial touchscreen"
         select SERIO
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile

index 3d5cf8c..2f239fe 100644 (file)
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_TOUCHSCREEN_HAMPSHIRE)   += hampshire.o
  obj-$(CONFIG_TOUCHSCREEN_GUNZE)                += gunze.o
  obj-$(CONFIG_TOUCHSCREEN_EETI)         += eeti_ts.o
  obj-$(CONFIG_TOUCHSCREEN_ELO)          += elo.o
+obj-$(CONFIG_TOUCHSCREEN_ELOUSB)       += elousb.o
  obj-$(CONFIG_TOUCHSCREEN_EGALAX)       += egalax_ts.o
  obj-$(CONFIG_TOUCHSCREEN_FUJITSU)      += fujitsu_ts.o
  obj-$(CONFIG_TOUCHSCREEN_ILI210X)      += ili210x.o
diff --git a/drivers/input/touchscreen/elousb.c b/drivers/input/touchscreen/elousb.c

new file mode 100644 (file)

index 0000000..172bf79
--- /dev/null
+++ b/drivers/input/touchscreen/elousb.c
@@ -0,0 +1,307 @@
+/*
+ *  Copyright (c) 1999-2001 Vojtech Pavlik
+ *
+ *  Elo USB touchscreen support
+ */
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Should you need to contact me, the author, you can do so either by
+ * e-mail - mail your message to <vojtech@suse.cz>, or by paper mail:
+ * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/usb.h>
+#include <linux/usb/input.h>
+#include <linux/hid.h>
+#include <linux/input.h>
+
+/*
+ * Version Information
+ */
+#define DRIVER_VERSION "v1.1"
+#define DRIVER_AUTHOR "Vojtech Pavlik <vojtech@suse.cz>"
+#define DRIVER_DESC "Elo USB touchscreen driver"
+#define DRIVER_LICENSE "GPL"
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE(DRIVER_LICENSE);
+
+/* Per-device state of one Elo USB touchscreen. */
+struct elousb {
+       /* input device name, built from the USB manufacturer/product strings */
+       char name[128];
+       /* physical path: USB path with "/input0" appended */
+       char phys[64];
+       struct usb_device *usbdev;
+       struct input_dev *dev;
+       /* interrupt-in URB that delivers the touch packets */
+       struct urb *irq;
+
+       /* 8-byte coherent packet buffer and its DMA address */
+       unsigned char *data;
+       dma_addr_t data_dma;
+};
+
+/*
+ * Completion handler of the interrupt URB: decode one 8-byte packet,
+ * report it to the input layer and resubmit the URB.
+ *
+ * Packet layout as used below: byte 0 is the 'T' marker, byte 1 holds
+ * status bits, bytes 2-3 / 4-5 / 6-7 are little-endian X / Y /
+ * pressure values.
+ */
+static void elousb_irq(struct urb *urb)
+{
+       struct elousb *elo = urb->context;
+       unsigned char *data = elo->data;
+       struct input_dev *dev = elo->dev;
+       int status;
+
+       switch (urb->status) {
+       case 0:                 /* success */
+               break;
+       case -ECONNRESET:       /* unlink */
+       case -ENOENT:
+       case -ESHUTDOWN:
+               return;
+       /* -EPIPE:  should clear the halt */
+       default:                /* error */
+               goto resubmit;
+       }
+
+       /*
+        * Drop packets without the marker, but keep the URB going:
+        * returning without resubmitting would silence the device until
+        * it is closed and reopened.
+        */
+       if (data[0] != 'T')    /* Mandatory ELO packet marker */
+               goto resubmit;
+
+       input_report_abs(dev, ABS_X, ((u32)data[3] << 8) | data[2]);
+       input_report_abs(dev, ABS_Y, ((u32)data[5] << 8) | data[4]);
+
+       /* pressure is only reported when bit 7 of the status byte is set */
+       input_report_abs(dev, ABS_PRESSURE,
+                       (data[1] & 0x80) ? (((u32)data[7] << 8) | data[6]): 0);
+
+       if (data[1] & 0x03) {
+               input_report_key(dev, BTN_TOUCH, 1);
+               input_sync(dev);
+       }
+
+       if (data[1] & 0x04)
+               input_report_key(dev, BTN_TOUCH, 0);
+
+       input_sync(dev);
+
+resubmit:
+       status = usb_submit_urb (urb, GFP_ATOMIC);
+       if (status)
+               err ("can't resubmit intr, %s-%s/input0, status %d",
+                               elo->usbdev->bus->bus_name,
+                               elo->usbdev->devpath, status);
+}
+
+/*
+ * Input device open hook: submit the interrupt URB.  Any submission
+ * failure is reported as -EIO.
+ */
+static int elousb_open(struct input_dev *dev)
+{
+       struct elousb *ts = input_get_drvdata(dev);
+
+       ts->irq->dev = ts->usbdev;
+
+       return usb_submit_urb(ts->irq, GFP_KERNEL) ? -EIO : 0;
+}
+
+/* Input device close hook: kill the interrupt URB. */
+static void elousb_close(struct input_dev *dev)
+{
+       struct elousb *ts = input_get_drvdata(dev);
+
+       usb_kill_urb(ts->irq);
+}
+
+/*
+ * Bind to an interface with exactly one interrupt-in endpoint and a HID
+ * class descriptor, set up the URB and register the input device.
+ *
+ * Returns 0 on success, -ENODEV when the interface does not look like a
+ * supported device or a control request fails, -ENOMEM on allocation
+ * failure, or the error from input_register_device().
+ */
+static int elousb_probe(struct usb_interface *intf, const struct usb_device_id *id)
+{
+       struct usb_device *dev = interface_to_usbdev(intf);
+       struct usb_host_interface *interface;
+       struct usb_endpoint_descriptor *endpoint;
+       struct hid_descriptor *hdesc;
+       struct elousb *elo;
+       struct input_dev *input_dev;
+       int pipe, i, num_descriptors;
+       unsigned int rsize = 0;
+       int error = -ENOMEM;
+       char *rdesc;
+
+       interface = intf->cur_altsetting;
+
+       if (interface->desc.bNumEndpoints != 1)
+               return -ENODEV;
+
+       endpoint = &interface->endpoint[0].desc;
+       if (!(endpoint->bEndpointAddress & USB_DIR_IN))
+               return -ENODEV;
+       if ((endpoint->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) != USB_ENDPOINT_XFER_INT)
+               return -ENODEV;
+
+       /* the HID descriptor may hang off the interface or the endpoint */
+       if (usb_get_extra_descriptor(interface, HID_DT_HID, &hdesc) &&
+                       (!interface->desc.bNumEndpoints ||
+                        usb_get_extra_descriptor(&interface->endpoint[0], HID_DT_HID, &hdesc))) {
+               err("HID class descriptor not present");
+               return -ENODEV;
+       }
+
+       /*
+        * bNumDescriptors comes from the device; never walk further than
+        * the descriptor's own length allows.
+        */
+       if (hdesc->bLength < sizeof(struct hid_descriptor)) {
+               err("HID class descriptor is too short");
+               return -ENODEV;
+       }
+       num_descriptors = min_t(int, hdesc->bNumDescriptors,
+                       (hdesc->bLength - offsetof(struct hid_descriptor, desc)) /
+                       sizeof(struct hid_class_descriptor));
+
+       for (i = 0; i < num_descriptors; i++)
+               if (hdesc->desc[i].bDescriptorType == HID_DT_REPORT)
+                       rsize = le16_to_cpu(hdesc->desc[i].wDescriptorLength);
+
+       if (!rsize || rsize > HID_MAX_DESCRIPTOR_SIZE) {
+               err("weird size of report descriptor (%u)", rsize);
+               return -ENODEV;
+       }
+
+       pipe = usb_rcvintpipe(dev, endpoint->bEndpointAddress);
+
+       elo = kzalloc(sizeof(*elo), GFP_KERNEL);
+       input_dev = input_allocate_device();
+       if (!elo || !input_dev)
+               goto fail1;
+
+       /* probe may sleep, so GFP_KERNEL like the other allocations */
+       elo->data = usb_alloc_coherent(dev, 8, GFP_KERNEL, &elo->data_dma);
+       if (!elo->data)
+               goto fail1;
+
+       elo->irq = usb_alloc_urb(0, GFP_KERNEL);
+       if (!elo->irq)
+               goto fail2;
+
+       rdesc = kmalloc(rsize, GFP_KERNEL);
+       if (!rdesc)
+               goto fail3;
+
+       elo->usbdev = dev;
+       elo->dev = input_dev;
+
+       error = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
+                               HID_REQ_SET_IDLE, USB_TYPE_CLASS | USB_RECIP_INTERFACE, 0,
+                               interface->desc.bInterfaceNumber,
+                               NULL, 0, USB_CTRL_SET_TIMEOUT);
+       if (error < 0) {
+               err("setting HID idle timeout failed, error %d", error);
+               error = -ENODEV;
+               goto fail4;
+       }
+
+       error = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0),
+                               USB_REQ_GET_DESCRIPTOR, USB_RECIP_INTERFACE | USB_DIR_IN,
+                               HID_DT_REPORT << 8, interface->desc.bInterfaceNumber,
+                               rdesc, rsize, USB_CTRL_GET_TIMEOUT);
+       /*
+        * Test for a negative result before the length comparison: a bare
+        * "error < rsize" promotes error to unsigned and would take every
+        * failure for a sufficiently long read.
+        */
+       if (error < 0 || (unsigned int)error < rsize) {
+               err("reading HID report descriptor failed, error %d", error);
+               error = -ENODEV;
+               goto fail4;
+       }
+
+       /* the descriptor contents are not used any further; do not leak them */
+       kfree(rdesc);
+       rdesc = NULL;
+
+       if (dev->manufacturer)
+               strlcpy(elo->name, dev->manufacturer, sizeof(elo->name));
+
+       if (dev->product) {
+               if (dev->manufacturer)
+                       strlcat(elo->name, " ", sizeof(elo->name));
+               strlcat(elo->name, dev->product, sizeof(elo->name));
+       }
+
+       /* fall back to a name built from the USB ids */
+       if (!strlen(elo->name))
+               snprintf(elo->name, sizeof(elo->name),
+                               "Elo touchscreen %04x:%04x",
+                               le16_to_cpu(dev->descriptor.idVendor),
+                               le16_to_cpu(dev->descriptor.idProduct));
+
+       usb_make_path(dev, elo->phys, sizeof(elo->phys));
+       strlcat(elo->phys, "/input0", sizeof(elo->phys));
+
+       input_dev->name = elo->name;
+       input_dev->phys = elo->phys;
+       usb_to_input_id(dev, &input_dev->id);
+       input_dev->dev.parent = &intf->dev;
+
+       input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
+       set_bit(BTN_TOUCH, input_dev->keybit);
+       input_dev->absbit[0] = BIT(ABS_X) | BIT(ABS_Y);
+       set_bit(ABS_PRESSURE, input_dev->absbit);
+
+       input_set_abs_params(input_dev, ABS_X, 0, 4000, 0, 0);
+       input_set_abs_params(input_dev, ABS_Y, 0, 3840, 0, 0);
+       input_set_abs_params(input_dev, ABS_PRESSURE, 0, 256, 0, 0);
+
+       input_set_drvdata(input_dev, elo);
+
+       input_dev->open = elousb_open;
+       input_dev->close = elousb_close;
+
+       usb_fill_int_urb(elo->irq, dev, pipe, elo->data, 8,
+                       elousb_irq, elo, endpoint->bInterval);
+       elo->irq->transfer_dma = elo->data_dma;
+       elo->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+
+       error = input_register_device(elo->dev);
+       if (error)
+               goto fail4;
+
+       usb_set_intfdata(intf, elo);
+       return 0;
+
+fail4:
+       kfree(rdesc);
+fail3:
+       usb_free_urb(elo->irq);
+fail2:
+       usb_free_coherent(dev, 8, elo->data, elo->data_dma);
+fail1:
+       /* both calls accept NULL, which covers the combined allocation check */
+       input_free_device(input_dev);
+       kfree(elo);
+       return error;
+}
+
+/*
+ * Disconnect hook: clear the interface data and, if a device was bound,
+ * stop the URB, unregister the input device and free all resources.
+ */
+static void elousb_disconnect(struct usb_interface *intf)
+{
+       struct elousb *ts = usb_get_intfdata(intf);
+
+       usb_set_intfdata(intf, NULL);
+       if (!ts)
+               return;
+
+       usb_kill_urb(ts->irq);
+       input_unregister_device(ts->dev);
+       usb_free_urb(ts->irq);
+       usb_free_coherent(interface_to_usbdev(intf), 8, ts->data, ts->data_dma);
+       kfree(ts);
+}
+
+/* USB vendor/product ids this driver binds to. */
+static struct usb_device_id elousb_id_table [] = {
+       { USB_DEVICE(0x04e7, 0x0009) }, /* CarrollTouch 4000U */
+       { USB_DEVICE(0x04e7, 0x0030) }, /* CarrollTouch 4500U */
+       { }    /* Terminating entry */
+};
+
+MODULE_DEVICE_TABLE (usb, elousb_id_table);
+
+/* USB driver glue tying the id table to probe and disconnect. */
+static struct usb_driver elousb_driver = {
+       .name        = "elousb",
+       .probe        = elousb_probe,
+       .disconnect    = elousb_disconnect,
+       .id_table    = elousb_id_table,
+};
+
+/*
+ * Module entry point: register the USB driver and, on success, log the
+ * driver version and description.  Returns the usb_register() result.
+ */
+static int __init elousb_init(void)
+{
+       int retval = usb_register(&elousb_driver);
+
+       /* terminate the line so the next log message does not run into it */
+       if (retval == 0)
+               printk(KERN_INFO KBUILD_MODNAME ": " DRIVER_VERSION ":" DRIVER_DESC "\n");
+       return retval;
+}
+
+/* Module exit: deregister the USB driver. */
+static void __exit elousb_exit(void)
+{
+       usb_deregister(&elousb_driver);
+}
+
+module_init(elousb_init);
+module_exit(elousb_exit);
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig

index 3bd9fff..cb74376 100644 (file)
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -4,6 +4,7 @@ config IOMMU_API
  
  menuconfig IOMMU_SUPPORT
         bool "IOMMU Hardware Support"
+       depends on !XEN
         default y
         ---help---
           Say Y here if you want to compile device drivers for IO Memory
diff --git a/drivers/isdn/mISDN/core.c b/drivers/isdn/mISDN/core.c

index a24530f..5ca5468 100644 (file)
--- a/drivers/isdn/mISDN/core.c
+++ b/drivers/isdn/mISDN/core.c
@@ -21,10 +21,13 @@
  #include "core.h"
  
  static u_int debug;
+u_int misdn_permitted_gid;
  
  MODULE_AUTHOR("Karsten Keil");
  MODULE_LICENSE("GPL");
  module_param(debug, uint, S_IRUGO | S_IWUSR);
+module_param_named(gid, misdn_permitted_gid, uint, 0);
+MODULE_PARM_DESC(gid, "Unix group for accessing misdn socket (default 0)");
  
  static u64             device_ids;
  #define MAX_DEVICE_ID  63
diff --git a/drivers/isdn/mISDN/core.h b/drivers/isdn/mISDN/core.h

index 52695bb..6ea673f 100644 (file)
--- a/drivers/isdn/mISDN/core.h
+++ b/drivers/isdn/mISDN/core.h
@@ -17,6 +17,7 @@
  
  extern struct mISDNdevice      *get_mdevice(u_int);
  extern int                     get_mdevice_count(void);
+extern u_int misdn_permitted_gid;
  
  /* stack status flag */
  #define mISDN_STACK_ACTION_MASK                0x0000ffff
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c

index abe2d69..1c02f66 100644 (file)
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -612,6 +612,10 @@ data_sock_create(struct net *net, struct socket *sock, int protocol)
  {
         struct sock *sk;
  
+       if(!capable(CAP_SYS_ADMIN) && (misdn_permitted_gid != current_gid())
+               && (!in_group_p(misdn_permitted_gid)))
+                       return -EPERM;
+
         if (sock->type != SOCK_DGRAM)
                 return -ESOCKTNOSUPPORT;
  
@@ -694,6 +698,10 @@ base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
         case IMSETDEVNAME:
         {
                 struct mISDN_devrename dn;
+               if(!capable(CAP_SYS_ADMIN)
+                       && (misdn_permitted_gid != current_gid())
+                       && (!in_group_p(misdn_permitted_gid)))
+                               return -EPERM;
                 if (copy_from_user(&dn, (void __user *)arg,
                                    sizeof(dn))) {
                         err = -EFAULT;
diff --git a/drivers/leds/ledtrig-default-on.c b/drivers/leds/ledtrig-default-on.c

index a4ef54b..2efdd30 100644 (file)
--- a/drivers/leds/ledtrig-default-on.c
+++ b/drivers/leds/ledtrig-default-on.c
@@ -23,7 +23,7 @@ static void defon_trig_activate(struct led_classdev *led_cdev)
  }
  
  static struct led_trigger defon_led_trigger = {
-       .name     = "default-on",
+       .name     = "default::on",
         .activate = defon_trig_activate,
  };
  
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig

index fa51af1..8b228e4 100644 (file)
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -13,7 +13,7 @@ if MACINTOSH_DRIVERS
  
  config ADB
         bool "Apple Desktop Bus (ADB) support"
-       depends on MAC || (PPC_PMAC && PPC32)
+       depends on MAC || PPC_PMAC
         help
           Apple Desktop Bus (ADB) support is for support of devices which
           are connected to an ADB port.  ADB devices tend to have 4 pins.
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c

index b026896..18ba395 100644 (file)
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -298,6 +298,10 @@ static int __init adb_init(void)
         if (!machine_is(chrp) && !machine_is(powermac))
                 return 0;
  #endif
+#ifdef CONFIG_PPC64
+       if (!machine_is(powermac))
+               return 0;
+#endif
  #ifdef CONFIG_MAC
         if (!MACH_IS_MAC)
                 return 0;
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c

index 09d72bb..6a55cdb 100644 (file)
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -1264,10 +1264,14 @@ init_ms_a3(int id)
  
  static int __init adbhid_init(void)
  {
-#ifndef CONFIG_MAC
+#ifdef CONFIG_PPC32
         if (!machine_is(chrp) && !machine_is(powermac))
                 return 0;
  #endif
+#ifdef CONFIG_PPC64
+       if (!machine_is(powermac))
+               return 0;
+#endif
  
         led_request.complete = 1;
  
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig

index 10f122a..3ec8dee 100644 (file)
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -272,6 +272,7 @@ config DM_DEBUG_SPACE_MAPS
  config DM_MIRROR
         tristate "Mirror target"
         depends on BLK_DEV_DM
+       select DM_REGION_HASH_LOG
         ---help---
           Allow volume managers to mirror logical volumes, also
           needed for live data migration tools such as 'pvmove'.
@@ -358,6 +359,20 @@ config DM_DELAY
  
         If unsure, say N.
  
+config DM_REGION_HASH_LOG
+       tristate
+       default n
+
+config DM_RAID45
+       tristate "RAID 4/5 target (EXPERIMENTAL)"
+       depends on BLK_DEV_DM && EXPERIMENTAL
+       select ASYNC_XOR
+       select DM_REGION_HASH_LOG
+       ---help---
+       A target that supports RAID4 and RAID5 mappings.
+
+       If unsure, say N.
+
  config DM_UEVENT
         bool "DM uevents"
         depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile

index 8b2e0df..15ad50d 100644 (file)
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -32,17 +32,19 @@ obj-$(CONFIG_DM_BUFIO)              += dm-bufio.o
  obj-$(CONFIG_DM_CRYPT)         += dm-crypt.o
  obj-$(CONFIG_DM_DELAY)         += dm-delay.o
  obj-$(CONFIG_DM_FLAKEY)                += dm-flakey.o
-obj-$(CONFIG_DM_MULTIPATH)     += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH)     += dm-multipath.o dm-round-robin.o dm-least-pending.o
  obj-$(CONFIG_DM_MULTIPATH_QL)  += dm-queue-length.o
  obj-$(CONFIG_DM_MULTIPATH_ST)  += dm-service-time.o
  obj-$(CONFIG_DM_SNAPSHOT)      += dm-snapshot.o
  obj-$(CONFIG_DM_PERSISTENT_DATA)       += persistent-data/
-obj-$(CONFIG_DM_MIRROR)                += dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_MIRROR)                += dm-mirror.o
+obj-$(CONFIG_DM_REGION_HASH_LOG)       += dm-log.o dm-region-hash.o
  obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
  obj-$(CONFIG_DM_ZERO)          += dm-zero.o
  obj-$(CONFIG_DM_RAID)  += dm-raid.o
  obj-$(CONFIG_DM_THIN_PROVISIONING)     += dm-thin-pool.o
  obj-$(CONFIG_DM_VERITY)                += dm-verity.o
+obj-$(CONFIG_DM_RAID45)                += dm-raid45.o dm-memcache.o
  
  ifeq ($(CONFIG_DM_UEVENT),y)
  dm-mod-objs                    += dm-uevent.o
diff --git a/drivers/md/dm-least-pending.c b/drivers/md/dm-least-pending.c

new file mode 100644 (file)

index 0000000..f4e98b7
--- /dev/null
+++ b/drivers/md/dm-least-pending.c
@@ -0,0 +1,259 @@
+/*
+ * (C) Copyright 2008 Hewlett-Packard Development Company, L.P
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath least-pending"
+
+/*-----------------------------------------------------------------
+* Path-handling code, paths are held in lists
+*---------------------------------------------------------------*/
+/* Per-path bookkeeping, stored in dm_path->pscontext by lpp_add_path(). */
+struct path_info {
+       struct list_head list;          /* link in the selector's valid or invalid list */
+       struct dm_path *path;           /* the path this entry describes */
+       unsigned repeat_count;          /* handed back to the caller by lpp_select_path() */
+       atomic_t io_count;              /* incremented on select, decremented in lpp_end_io() */
+};
+
+/* Unlink and free every path_info entry on the given list. */
+static void free_paths(struct list_head *paths)
+{
+       while (!list_empty(paths)) {
+               struct path_info *pi;
+
+               pi = list_first_entry(paths, struct path_info, list);
+               list_del(&pi->list);
+               kfree(pi);
+       }
+}
+
+/*-----------------------------------------------------------------
+ * Least-pending selector
+ *---------------------------------------------------------------*/
+
+#define LPP_MIN_IO     1
+
+/* Selector context, stored in path_selector->context by lpp_create(). */
+struct selector {
+       struct list_head valid_paths;   /* paths scanned by lpp_select_path() */
+       struct list_head invalid_paths; /* paths moved here by lpp_fail_path() */
+};
+
+/* Allocate a selector with both path lists empty; returns NULL on failure. */
+static struct selector *alloc_selector(void)
+{
+       struct selector *sel;
+
+       sel = kmalloc(sizeof(*sel), GFP_KERNEL);
+       if (!sel)
+               return NULL;
+
+       INIT_LIST_HEAD(&sel->valid_paths);
+       INIT_LIST_HEAD(&sel->invalid_paths);
+       return sel;
+}
+
+/*
+ * Path-selector constructor: attach a freshly allocated selector to @ps.
+ * @argc and @argv are not used. Returns 0 on success or -ENOMEM.
+ */
+static int lpp_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+       struct selector *sel = alloc_selector();
+
+       if (sel == NULL)
+               return -ENOMEM;
+
+       ps->context = sel;
+       return 0;
+}
+
+/*
+ * Path-selector destructor: free the entries on both path lists, then the
+ * selector itself, and clear the context pointer in @ps.
+ */
+static void lpp_destroy(struct path_selector *ps)
+{
+       struct selector *s = ps->context;
+
+       free_paths(&s->valid_paths);
+       free_paths(&s->invalid_paths);
+       kfree(s);
+       ps->context = NULL;
+}
+
+/*
+ * Emit status text for the selector (@path == NULL) or for a single path.
+ *
+ * Selector level: "1 " for STATUSTYPE_INFO, "0 " for STATUSTYPE_TABLE.
+ * Path level:     "<repeat_count>:<io_count> " for STATUSTYPE_INFO and
+ *                 nothing for STATUSTYPE_TABLE.
+ *
+ * Returns sz, the output length counter.
+ */
+static int lpp_status(struct path_selector *ps, struct dm_path *path,
+                       status_type_t type, char *result, unsigned int maxlen)
+{
+       int sz = 0;     /* presumably advanced by the DMEMIT() macro */
+
+       if (path) {
+               struct path_info *pi = path->pscontext;
+
+               if (type == STATUSTYPE_INFO) {
+                       DMEMIT("%u:%u ", pi->repeat_count,
+                                        atomic_read(&pi->io_count));
+               }
+       } else if (type == STATUSTYPE_INFO) {
+               DMEMIT("1 ");
+       } else if (type == STATUSTYPE_TABLE) {
+               DMEMIT("0 ");
+       }
+
+       return sz;
+}
+
+/*
+ * Called during initialisation to register one path with the selector.
+ *
+ * At most one argument is accepted: the repeat count that lpp_select_path()
+ * reports for this path (LPP_MIN_IO when omitted). The new entry starts
+ * with an I/O count of zero and is put on the valid list.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments or -ENOMEM if the
+ * per-path context cannot be allocated; *error is set on failure.
+ */
+static int lpp_add_path(struct path_selector *ps, struct dm_path *path,
+                       int argc, char **argv, char **error)
+{
+       struct selector *s = ps->context;
+       struct path_info *info;
+       unsigned count = LPP_MIN_IO;
+
+       if (argc > 1) {
+               *error = "least-pending ps: incorrect number of arguments";
+               return -EINVAL;
+       }
+
+       /* The only path argument is the number of I/Os before switching */
+       if (argc == 1 && sscanf(argv[0], "%u", &count) != 1) {
+               *error = "least-pending ps: invalid repeat count";
+               return -EINVAL;
+       }
+
+       info = kmalloc(sizeof(*info), GFP_KERNEL);
+       if (info == NULL) {
+               *error = "least-pending ps: Error allocating path context";
+               return -ENOMEM;
+       }
+
+       atomic_set(&info->io_count, 0);
+       info->repeat_count = count;
+       info->path = path;
+
+       /* Link the context both ways: path -> info and selector -> info. */
+       path->pscontext = info;
+       list_add(&info->list, &s->valid_paths);
+
+       return 0;
+}
+
+/*
+ * Mark path @p as failed: reset its I/O count to zero and move its entry
+ * to the selector's invalid list. Paths without a context are ignored.
+ */
+static void lpp_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct selector *sel = ps->context;
+       struct path_info *info = p->pscontext;
+
+       if (info == NULL)
+               return;
+
+       atomic_set(&info->io_count, 0);
+       list_move(&info->list, &sel->invalid_paths);
+}
+
+/*
+ * Put path @p back on the valid list.
+ * Returns 0 on success, or 1 if the path has no selector context.
+ */
+static int lpp_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+       struct selector *sel = ps->context;
+       struct path_info *info = p->pscontext;
+
+       if (info == NULL)
+               return 1;
+
+       list_move(&info->list, &sel->valid_paths);
+       return 0;
+}
+
+/*
+ * Pick the valid path with the fewest I/Os currently accounted to it.
+ *
+ * The scan over the valid list stops as soon as the current candidate has
+ * an I/O count of zero. The chosen path's count is incremented here and
+ * decremented again in lpp_end_io(); its repeat count is returned through
+ * @repeat_count. @nr_bytes is not used.
+ *
+ * Returns the selected path, or NULL if the valid list is empty.
+ */
+static struct dm_path *lpp_select_path(struct path_selector *ps,
+                                       unsigned *repeat_count,
+                                       size_t nr_bytes)
+{
+       struct selector *s = ps->context;
+       struct path_info *pi, *least_io_path = NULL;
+
+       /* Nothing is unlinked while scanning, so the plain iterator suffices. */
+       list_for_each_entry(pi, &s->valid_paths, list) {
+               /* Keep whichever candidate has the smaller pending count. */
+               if (!least_io_path ||
+                   atomic_read(&pi->io_count) <
+                   atomic_read(&least_io_path->io_count))
+                       least_io_path = pi;
+               if (!atomic_read(&least_io_path->io_count))
+                       break;
+       }
+
+       /* Still NULL only when the valid list was empty. */
+       if (!least_io_path)
+               return NULL;
+
+       atomic_inc(&least_io_path->io_count);
+       *repeat_count = least_io_path->repeat_count;
+
+       return least_io_path->path;
+}
+
+/*
+ * I/O completion hook: drop the pending count taken in lpp_select_path().
+ * Returns 0 on success, or 1 if @path has no selector context.
+ * @ps and @nr_bytes are not used.
+ */
+static int lpp_end_io(struct path_selector *ps, struct dm_path *path,
+                     size_t nr_bytes)
+{
+       struct path_info *info = path->pscontext;
+
+       if (info == NULL)
+               return 1;
+
+       atomic_dec(&info->io_count);
+       return 0;
+}
+
+/*
+ * Path selector type descriptor registered by dm_lpp_init(); it names the
+ * selector "least-pending" and lists the callbacks implemented above.
+ */
+static struct path_selector_type lpp_ps = {
+       .name = "least-pending",
+       .module = THIS_MODULE,
+       .table_args = 1,
+       .info_args = 0,
+       .create = lpp_create,
+       .destroy = lpp_destroy,
+       .status = lpp_status,
+       .add_path = lpp_add_path,
+       .fail_path = lpp_fail_path,
+       .reinstate_path = lpp_reinstate_path,
+       .select_path = lpp_select_path,
+       .end_io = lpp_end_io,
+};
+
+/*
+ * Module init: register the least-pending path selector type.
+ * Returns the result of dm_register_path_selector().
+ */
+static int __init dm_lpp_init(void)
+{
+       int r = dm_register_path_selector(&lpp_ps);
+
+       if (r < 0) {
+               DMERR("register failed %d", r);
+               return r;
+       }
+
+       /* Only announce the module once registration has succeeded. */
+       DMINFO("version 1.0.0 loaded");
+
+       return r;
+}
+
+/* Module exit: unregister the path selector type and log any failure. */
+static void __exit dm_lpp_exit(void)
+{
+       int r = dm_unregister_path_selector(&lpp_ps);
+
+       if (r < 0)
+               DMERR("unregister failed %d", r);
+}
+
+module_init(dm_lpp_init);
+module_exit(dm_lpp_exit);
+
+MODULE_DESCRIPTION(DM_NAME " least-pending multipath path selector");
+MODULE_AUTHOR("Sakshi Chaitanya Veni <vsakshi@hp.com>");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/md/dm-memcache.c b/drivers/md/dm-memcache.c

new file mode 100644 (file)

index 0000000..2d7d914
--- /dev/null
+++ b/drivers/md/dm-memcache.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * Device-mapper memory object handling:
+ *
+ * o allocate/free total_pages in a per client page pool.
+ *
+ * o allocate/free memory objects with chunks (1..n) of
+ *   pages_per_chunk pages hanging off.
+ *
+ * This file is released under the GPL.
+ */
+
+#define        DM_MEM_CACHE_VERSION    "0.2"
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+/* Per-client state of the page cache. */
+struct dm_mem_cache_client {
+       spinlock_t lock;                /* taken around free_list and counter updates */
+       mempool_t *objs_pool;           /* pool of object arrays (chunks entries each) */
+       struct page_list *free_list;    /* singly linked list of unused pages */
+       unsigned objects;               /* number of objects the cache is sized for */
+       unsigned chunks;                /* chunks per object */
+       unsigned pages_per_chunk;       /* pages per chunk */
+       unsigned free_pages;            /* pages not currently reserved by an object */
+       unsigned total_pages;           /* all pages owned by this client */
+};
+
+/*
+ * Release every page on @list together with its page_list element.
+ * An element without a page triggers BUG_ON().
+ */
+static void free_cache_pages(struct page_list *list)
+{
+       struct page_list *pl, *next;
+
+       for (pl = list; pl; pl = next) {
+               /* Fetch the successor before the element is freed. */
+               next = pl->next;
+               BUG_ON(!pl->page);
+               __free_page(pl->page);
+               kfree(pl);
+       }
+}
+
+/*
+ * Allocate @pages pages, each wrapped in a page_list element, and return
+ * them as a singly linked list. On any allocation failure everything
+ * allocated so far is freed again and NULL is returned.
+ */
+static struct page_list *alloc_cache_pages(unsigned pages)
+{
+       struct page_list *head = NULL;
+       unsigned i;
+
+       for (i = 0; i < pages; i++) {
+               struct page_list *pl;
+               struct page *page = alloc_page(GFP_NOIO);
+
+               if (!page)
+                       goto err;
+
+               pl = kmalloc(sizeof(*pl), GFP_NOIO);
+               if (!pl) {
+                       /* The page is not on the list yet; free it by hand. */
+                       __free_page(page);
+                       goto err;
+               }
+
+               /* Push the new element onto the front of the list. */
+               pl->page = page;
+               pl->next = head;
+               head = pl;
+       }
+
+       return head;
+
+err:
+       free_cache_pages(head);
+       return NULL;
+}
+
+/*
+ * Move page_list elements from the client's free list to the chunks of a
+ * memory object.
+ *
+ * @obj is indexed as an array of cl->chunks entries; each entry receives
+ * cl->pages_per_chunk elements. The free list running empty triggers
+ * BUG_ON(). cl->free_pages is not adjusted here (dm_mem_cache_alloc() does
+ * that before calling this function).
+ */
+static void alloc_chunks(struct dm_mem_cache_client *cl,
+                        struct dm_mem_cache_object *obj)
+{
+       unsigned chunks = cl->chunks;
+       unsigned long flags;
+
+       /* local_irq_save() already disables interrupts on this CPU. */
+       local_irq_save(flags);
+       while (chunks--) {
+               unsigned p = cl->pages_per_chunk;
+
+               obj[chunks].pl = NULL;
+
+               while (p--) {
+                       struct page_list *pl;
+
+                       /* Take next element from free list */
+                       spin_lock(&cl->lock);
+                       pl = cl->free_list;
+                       BUG_ON(!pl);
+                       cl->free_list = pl->next;
+                       spin_unlock(&cl->lock);
+
+                       /* Push it onto the front of this chunk's list. */
+                       pl->next = obj[chunks].pl;
+                       obj[chunks].pl = pl;
+               }
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Return the page_list elements of all chunks of @obj to the client's
+ * free list, incrementing cl->free_pages once per element.
+ */
+static void free_chunks(struct dm_mem_cache_client *cl,
+                       struct dm_mem_cache_object *obj)
+{
+       unsigned chunks = cl->chunks;
+       unsigned long flags;
+       struct page_list *next, *pl;
+
+       /* local_irq_save() already disables interrupts on this CPU. */
+       local_irq_save(flags);
+       while (chunks--) {
+               for (pl = obj[chunks].pl; pl; pl = next) {
+                       /* Remember the successor before relinking pl. */
+                       next = pl->next;
+
+                       spin_lock(&cl->lock);
+                       pl->next = cl->free_list;
+                       cl->free_list = pl;
+                       cl->free_pages++;
+                       spin_unlock(&cl->lock);
+               }
+       }
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Create a dm memory cache client.
+ *
+ * Sets up a mempool of @objects object arrays (@chunks entries each) and
+ * preallocates objects * chunks * pages_per_chunk pages onto the free list.
+ * A page total of zero triggers BUG_ON().
+ *
+ * Returns the client or ERR_PTR(-ENOMEM). Tear it down with
+ * dm_mem_cache_client_destroy().
+ */
+struct dm_mem_cache_client *
+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
+                          unsigned pages_per_chunk)
+{
+       struct dm_mem_cache_client *cl;
+       unsigned total_pages = objects * chunks * pages_per_chunk;
+
+       BUG_ON(!total_pages);
+
+       cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+       if (!cl)
+               return ERR_PTR(-ENOMEM);
+
+       cl->objs_pool = mempool_create_kmalloc_pool(objects,
+                               chunks * sizeof(struct dm_mem_cache_object));
+       if (!cl->objs_pool)
+               goto err_client;
+
+       cl->free_list = alloc_cache_pages(total_pages);
+       if (!cl->free_list)
+               goto err_pool;
+
+       spin_lock_init(&cl->lock);
+       cl->objects = objects;
+       cl->chunks = chunks;
+       cl->pages_per_chunk = pages_per_chunk;
+       cl->total_pages = total_pages;
+       cl->free_pages = total_pages;
+       return cl;
+
+err_pool:
+       mempool_destroy(cl->objs_pool);
+err_client:
+       kfree(cl);
+       return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_create);
+
+/*
+ * Destroy a client: free the pages on its free list, its object mempool
+ * and the client structure. Triggers BUG_ON() if free_pages differs from
+ * total_pages.
+ */
+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
+{
+       BUG_ON(cl->free_pages != cl->total_pages);
+       free_cache_pages(cl->free_list);
+       mempool_destroy(cl->objs_pool);
+       kfree(cl);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_destroy);
+
+/*
+ * Grow a client's cache by @objects objects worth of pages.
+ *
+ * The new pages are allocated first, then spliced onto the front of the
+ * free list under the lock while the counters are updated; finally the
+ * object mempool is resized to the new object count.
+ *
+ * Returns 0 on success or -ENOMEM if the pages cannot be allocated.
+ *
+ * Don't call from interrupt context!
+ */
+int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
+{
+       unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
+       struct page_list *head, *tail;
+
+       BUG_ON(!pages);
+       head = alloc_cache_pages(pages);
+       if (!head)
+               return -ENOMEM;
+
+       /* Find the last element of the new list. */
+       for (tail = head; tail->next; tail = tail->next)
+               ;
+
+       spin_lock_irq(&cl->lock);
+       tail->next = cl->free_list;
+       cl->free_list = head;
+       cl->free_pages += pages;
+       cl->total_pages += pages;
+       cl->objects += objects;
+       spin_unlock_irq(&cl->lock);
+
+       mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+       return 0;
+}
+EXPORT_SYMBOL(dm_mem_cache_grow);
+
+/*
+ * Shrink a client's cache by @objects objects worth of pages.
+ *
+ * The first objects * chunks * pages_per_chunk elements of the free list
+ * are cut off under the lock and freed afterwards; the object mempool is
+ * then resized to the new object count.
+ *
+ * Returns 0 on success, or -ENOMEM when the walk along the free list ends
+ * before that many elements have been skipped.
+ *
+ * NOTE(review): cl->free_list is dereferenced without a NULL check; confirm
+ * callers never shrink a client whose free list is empty.
+ */
+int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
+{
+       int r;
+       unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
+       unsigned long flags;
+       struct page_list *last = NULL, *pl, *pos;
+
+       BUG_ON(!pages);
+
+       spin_lock_irqsave(&cl->lock, flags);
+       pl = pos = cl->free_list;
+       /* Skip up to @pages elements; 'last' trails one element behind 'pos'. */
+       while (p-- && pos->next) {
+               last = pos;
+               pos = pos->next;
+       }
+
+       /* ++p undoes the final post-decrement: non-zero means we stopped early. */
+       if (++p)
+               r = -ENOMEM;
+       else {
+               r = 0;
+               /* 'pos' becomes the new head; pl..last is detached below. */
+               cl->free_list = pos;
+               cl->free_pages -= pages;
+               cl->total_pages -= pages;
+               cl->objects -= objects;
+               last->next = NULL;
+       }
+       spin_unlock_irqrestore(&cl->lock, flags);
+
+       /* Free the detached pages outside the lock. */
+       if (!r) {
+               free_cache_pages(pl);
+               mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+       }
+
+       return r;
+}
+EXPORT_SYMBOL(dm_mem_cache_shrink);
+
+/*
+ * Allocate a memory object.
+ *
+ * An object array is taken from the client's mempool, the pages it needs
+ * (chunks * pages_per_chunk) are reserved against free_pages under the
+ * lock, and alloc_chunks() then fills each chunk from the free list.
+ *
+ * Returns the object, or ERR_PTR(-ENOMEM) if the mempool allocation fails
+ * or too few free pages are left. Release with dm_mem_cache_free().
+ *
+ * The original author notes this can be called from interrupt context.
+ * NOTE(review): mempool_alloc() is passed GFP_NOIO here; confirm that is
+ * acceptable for such callers.
+ */
+struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
+{
+       unsigned needed = cl->chunks * cl->pages_per_chunk;
+       struct dm_mem_cache_object *obj;
+       unsigned long flags;
+       int enough;
+
+       obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
+       if (!obj)
+               return ERR_PTR(-ENOMEM);
+
+       /* Reserve the pages before taking them off the free list. */
+       spin_lock_irqsave(&cl->lock, flags);
+       enough = needed <= cl->free_pages;
+       if (enough)
+               cl->free_pages -= needed;
+       spin_unlock_irqrestore(&cl->lock, flags);
+
+       if (!enough) {
+               mempool_free(obj, cl->objs_pool);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       alloc_chunks(cl, obj);
+       return obj;
+}
+EXPORT_SYMBOL(dm_mem_cache_alloc);
+
+/*
+ * Free a memory object: put the pages of all its chunks back on the
+ * client's free list, then return the object array to the mempool.
+ */
+void dm_mem_cache_free(struct dm_mem_cache_client *cl,
+                      struct dm_mem_cache_object *obj)
+{
+       free_chunks(cl, obj);
+       mempool_free(obj, cl->objs_pool);
+}
+EXPORT_SYMBOL(dm_mem_cache_free);
+
+MODULE_DESCRIPTION(DM_NAME " dm memory cache");
+MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-memcache.h b/drivers/md/dm-memcache.h

new file mode 100644 (file)

index 0000000..87e4256
--- /dev/null
+++ b/drivers/md/dm-memcache.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <Mauelshagen@RedHat.com>
+ *
+ * Device-mapper memory object handling:
+ *
+ * o allocate/free total_pages in a per client page pool.
+ *
+ * o allocate/free memory objects with chunks (1..n) of
+ *   pages_per_chunk pages hanging off.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _DM_MEM_CACHE_H
+#define _DM_MEM_CACHE_H
+
+#define        DM_MEM_CACHE_H_VERSION  "0.1"
+
+#include "dm.h"
+#include <linux/dm-io.h>
+
+/*
+ * Return the element @p steps after @pl in the list, or NULL if the list
+ * ends before that.
+ */
+static inline struct page_list *pl_elem(struct page_list *pl, unsigned p)
+{
+       for (; pl && p; p--)
+               pl = pl->next;
+
+       return pl;
+}
+
+/*
+ * One chunk slot of a memory object. dm_mem_cache_alloc() hands out an
+ * array of these, one entry per chunk of the client.
+ */
+struct dm_mem_cache_object {
+       struct page_list *pl; /* Dynamically allocated array */
+       void *private;        /* Caller context reference */
+};
+
+struct dm_mem_cache_client;
+
+/*
+ * Create/destroy dm memory cache client resources.
+ *
+ * On creation, a number of @objects with @chunks of
+ * @pages_per_chunk pages will be allocated.
+ */
+struct dm_mem_cache_client *
+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
+                          unsigned pages_per_chunk);
+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *client);
+
+/*
+ * Grow/shrink a dm memory cache client resources
+ * by @objects amount of objects.
+ */
+int dm_mem_cache_grow(struct dm_mem_cache_client *client, unsigned objects);
+int dm_mem_cache_shrink(struct dm_mem_cache_client *client, unsigned objects);
+
+/*
+ * Allocate/free a memory object
+ *
+ * On allocation one object with an amount of chunks and
+ * an amount of pages per chunk will be returned on success.
+ */
+struct dm_mem_cache_object *
+dm_mem_cache_alloc(struct dm_mem_cache_client *client);
+void dm_mem_cache_free(struct dm_mem_cache_client *client,
+                      struct dm_mem_cache_object *object);
+
+#endif
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c

index 754f38f..6195bff 100644 (file)
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -56,6 +56,8 @@ struct priority_group {
         struct list_head pgpaths;
  };
  
+#define FEATURE_NO_PARTITIONS 1
+
  /* Multipath context */
  struct multipath {
         struct list_head list;
@@ -87,6 +89,7 @@ struct multipath {
         unsigned pg_init_retries;       /* Number of times to retry pg_init */
         unsigned pg_init_count;         /* Number of times pg_init called */
         unsigned pg_init_delay_msecs;   /* Number of msecs before pg_init retry */
+       unsigned features;              /* Additional selected features */
  
         struct work_struct process_queued_ios;
         struct list_head queued_ios;
@@ -159,12 +162,9 @@ static struct priority_group *alloc_priority_group(void)
  static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
  {
         struct pgpath *pgpath, *tmp;
-       struct multipath *m = ti->private;
  
         list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
                 list_del(&pgpath->list);
-               if (m->hw_handler_name)
-                       scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
                 dm_put_device(ti, pgpath->path.dev);
                 free_pgpath(pgpath);
         }
@@ -299,6 +299,11 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
  
         m->current_pgpath = path_to_pgpath(path);
  
+       if (!m->current_pgpath->path.dev) {
+               m->current_pgpath = NULL;
+               return -ENODEV;
+       }
+
         if (m->current_pg != pg)
                 __switch_pg(m, m->current_pgpath);
  
@@ -565,6 +570,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
  {
         int r;
         struct pgpath *p;
+       char *path;
         struct multipath *m = ti->private;
  
         /* we need at least a path arg */
@@ -577,30 +583,57 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
         if (!p)
                 return ERR_PTR(-ENOMEM);
  
-       r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+       path = dm_shift_arg(as);
+       r = dm_get_device(ti, path, dm_table_get_mode(ti->table),
                           &p->path.dev);
         if (r) {
-               ti->error = "error getting device";
-               goto bad;
+               unsigned major, minor;
+
+               /* Try to add a failed device */
+               if (r == -ENXIO && sscanf(path, "%u:%u", &major, &minor) == 2) {
+                       dev_t dev;
+
+                       /* Extract the major/minor numbers */
+                       dev = MKDEV(major, minor);
+                       if (MAJOR(dev) != major || MINOR(dev) != minor) {
+                               /* Nice try, didn't work */
+                               DMWARN("Invalid device path %s", path);
+                               ti->error = "error converting devnum";
+                               goto bad;
+                       }
+                       DMWARN("adding disabled device %d:%d", major, minor);
+                       p->path.dev = NULL;
+                       format_dev_t(p->path.pdev, dev);
+                       p->is_active = 0;
+               } else {
+                       ti->error = "error getting device";
+                       goto bad;
+               }
+       } else {
+               memcpy(p->path.pdev, p->path.dev->name, 16);
         }
  
-       if (m->hw_handler_name) {
+       if (p->path.dev) {
                 struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
  
-               r = scsi_dh_attach(q, m->hw_handler_name);
-               if (r == -EBUSY) {
-                       /*
-                        * Already attached to different hw_handler,
-                        * try to reattach with correct one.
-                        */
-                       scsi_dh_detach(q);
+               if (m->hw_handler_name) {
                         r = scsi_dh_attach(q, m->hw_handler_name);
-               }
-
-               if (r < 0) {
-                       ti->error = "error attaching hardware handler";
-                       dm_put_device(ti, p->path.dev);
-                       goto bad;
+                       if (r == -EBUSY) {
+                               /*
+                                * Already attached to different hw_handler,
+                                * try to reattach with correct one.
+                                */
+                               scsi_dh_detach(q);
+                               r = scsi_dh_attach(q, m->hw_handler_name);
+                       }
+                       if (r < 0) {
+                               ti->error = "error attaching hardware handler";
+                               dm_put_device(ti, p->path.dev);
+                               goto bad;
+                       }
+               } else {
+                       /* Play safe and detach hardware handler */
+                       scsi_dh_detach(q);
                 }
  
                 if (m->hw_handler_params) {
@@ -621,6 +654,11 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
                 goto bad;
         }
  
+       if (!p->is_active) {
+               ps->type->fail_path(ps, &p->path);
+               p->fail_count++;
+               m->nr_valid_paths--;
+       }
         return p;
  
   bad:
@@ -779,6 +817,10 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
                         continue;
                 }
  
+               if (!strcasecmp(arg_name, "no_partitions")) {
+                       m->features |= FEATURE_NO_PARTITIONS;
+                       continue;
+               }
                 if (!strcasecmp(arg_name, "pg_init_retries") &&
                     (argc >= 1)) {
                         r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
@@ -955,7 +997,7 @@ static int fail_path(struct pgpath *pgpath)
         if (!pgpath->is_active)
                 goto out;
  
-       DMWARN("Failing path %s.", pgpath->path.dev->name);
+       DMWARN("Failing path %s.", pgpath->path.pdev);
  
         pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
         pgpath->is_active = 0;
@@ -967,7 +1009,7 @@ static int fail_path(struct pgpath *pgpath)
                 m->current_pgpath = NULL;
  
         dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
-                     pgpath->path.dev->name, m->nr_valid_paths);
+                      pgpath->path.pdev, m->nr_valid_paths);
  
         schedule_work(&m->trigger_event);
  
@@ -991,6 +1033,12 @@ static int reinstate_path(struct pgpath *pgpath)
         if (pgpath->is_active)
                 goto out;
  
+       if (!pgpath->path.dev) {
+               DMWARN("Cannot reinstate disabled path %s", pgpath->path.pdev);
+               r = -ENODEV;
+               goto out;
+       }
+
         if (!pgpath->pg->ps.type->reinstate_path) {
                 DMWARN("Reinstate path not supported by path selector %s",
                        pgpath->pg->ps.type->name);
@@ -1013,7 +1061,7 @@ static int reinstate_path(struct pgpath *pgpath)
         }
  
         dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
-                     pgpath->path.dev->name, m->nr_valid_paths);
+                      pgpath->path.pdev, m->nr_valid_paths);
  
         schedule_work(&m->trigger_event);
  
@@ -1033,6 +1081,9 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
         struct pgpath *pgpath;
         struct priority_group *pg;
  
+       if (!dev)
+               return 0;
+
         list_for_each_entry(pg, &m->priority_groups, list) {
                 list_for_each_entry(pgpath, &pg->pgpaths, list) {
                         if (pgpath->path.dev == dev)
@@ -1156,8 +1207,9 @@ static void pg_init_done(void *data, int errors)
                         errors = 0;
                         break;
                 }
-               DMERR("Could not failover the device: Handler scsi_dh_%s "
-                     "Error %d.", m->hw_handler_name, errors);
+               DMERR("Could not failover device %s: Handler scsi_dh_%s "
+                     "was not loaded.", pgpath->path.dev->name,
+                     m->hw_handler_name);
                 /*
                  * Fail path for now, so we do not ping pong
                  */
@@ -1170,6 +1222,10 @@ static void pg_init_done(void *data, int errors)
                  */
                 bypass_pg(m, pg, 1);
                 break;
+       case SCSI_DH_DEV_OFFLINED:
+               DMWARN("Device %s offlined.", pgpath->path.dev->name);
+               errors = 0;
+               break;
         case SCSI_DH_RETRY:
                 /* Wait before retrying. */
                 delay_retry = 1;
@@ -1191,7 +1247,8 @@ static void pg_init_done(void *data, int errors)
         spin_lock_irqsave(&m->lock, flags);
         if (errors) {
                 if (pgpath == m->current_pgpath) {
-                       DMERR("Could not failover device. Error %d.", errors);
+                       DMERR("Could not failover device %s, error %d.",
+                             pgpath->path.dev->name, errors);
                         m->current_pgpath = NULL;
                         m->current_pg = NULL;
                 }
@@ -1222,8 +1279,9 @@ static void activate_path(struct work_struct *work)
         struct pgpath *pgpath =
                 container_of(work, struct pgpath, activate_path.work);
  
-       scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-                               pg_init_done, pgpath);
+       if (pgpath->path.dev)
+               scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
+                                pg_init_done, pgpath);
  }
  
  /*
@@ -1362,11 +1420,14 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
         else {
                 DMEMIT("%u ", m->queue_if_no_path +
                               (m->pg_init_retries > 0) * 2 +
-                             (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2);
+                             (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
+                             !!(m->features & FEATURE_NO_PARTITIONS));
                 if (m->queue_if_no_path)
                         DMEMIT("queue_if_no_path ");
                 if (m->pg_init_retries)
                         DMEMIT("pg_init_retries %u ", m->pg_init_retries);
+               if (m->features & FEATURE_NO_PARTITIONS)
+                       DMEMIT("no_partitions ");
                 if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
                         DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
         }
@@ -1410,7 +1471,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
                                pg->ps.type->info_args);
  
                         list_for_each_entry(p, &pg->pgpaths, list) {
-                               DMEMIT("%s %s %u ", p->path.dev->name,
+                               DMEMIT("%s %s %u ", p->path.pdev,
                                        p->is_active ? "A" : "F",
                                        p->fail_count);
                                 if (pg->ps.type->status)
@@ -1436,7 +1497,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
                                pg->ps.type->table_args);
  
                         list_for_each_entry(p, &pg->pgpaths, list) {
-                               DMEMIT("%s ", p->path.dev->name);
+                               DMEMIT("%s ", p->path.pdev);
                                 if (pg->ps.type->status)
                                         sz += pg->ps.type->status(&pg->ps,
                                               &p->path, type, result + sz,
@@ -1528,7 +1589,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
         if (!m->current_pgpath)
                 __choose_pgpath(m, 0);
  
-       if (m->current_pgpath) {
+       if (m->current_pgpath && m->current_pgpath->path.dev) {
                 bdev = m->current_pgpath->path.dev->bdev;
                 mode = m->current_pgpath->path.dev->mode;
         }
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h

index e230f71..f97388d 100644 (file)
--- a/drivers/md/dm-mpath.h
+++ b/drivers/md/dm-mpath.h
@@ -12,6 +12,7 @@
  struct dm_dev;
  
  struct dm_path {
+       char pdev[16];          /* Requested physical device */
         struct dm_dev *dev;     /* Read-only */
         void *pscontext;        /* For path-selector use */
  };
diff --git a/drivers/md/dm-raid45.c b/drivers/md/dm-raid45.c

new file mode 100644 (file)

index 0000000..fecc9b7
--- /dev/null
+++ b/drivers/md/dm-raid45.c
@@ -0,0 +1,4692 @@
+/*
+ * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ *
+ * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
+ *
+ *
+ * Supports the following ATARAID vendor solutions (and SNIA DDF):
+ *
+ *     Adaptec HostRAID ASR
+ *     SNIA DDF1
+ *     Highpoint 37x
+ *     Highpoint 45x
+ *     Intel IMSM
+ *     Jmicron ATARAID
+ *     LSI Logic MegaRAID
+ *     NVidia RAID
+ *     Promise FastTrack
+ *     Silicon Image Medley
+ *     VIA Software RAID
+ *
+ * via the dmraid application.
+ *
+ *
+ * Features:
+ *
+ *     o RAID4 with dedicated and selectable parity device
+ *     o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ *     o recovery of out of sync device for initial
+ *       RAID set creation or after dead drive replacement
+ *     o run time optimization of xor algorithm used to calculate parity
+ *
+ *
+ * Thanks to MD for:
+ *    o the raid address calculation algorithm
+ *    o the base of the biovec <-> page list copier.
+ *
+ *
+ * Uses region hash to keep track of how many writes are in flight to
+ * regions in order to use dirty log to keep state of regions to recover:
+ *
+ *    o clean regions (those which are synchronized
+ *     and don't have write io in flight)
+ *    o dirty regions (those with write io in flight)
+ *
+ *
+ * On startup, any dirty regions are migrated to the
+ * 'nosync' state and are subject to recovery by the daemon.
+ *
+ * See raid_ctr() for table definition.
+ *
+ * ANALYZEME: recovery bandwidth
+ */
+
+static const char *version = "v0.2597k";
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include "dm-raid45.h"
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/raid/xor.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <linux/bio.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+
+/*
+ * Configurable parameters
+ */
+
+/* Minimum/maximum and default # of selectable stripes. */
+#define        STRIPES_MIN             8
+#define        STRIPES_MAX             16384
+#define        STRIPES_DEFAULT         80
+
+/* Min, max and default chunk size in sectors if not set in constructor. */
+#define        CHUNK_SIZE_MIN          8
+#define        CHUNK_SIZE_MAX          16384
+#define        CHUNK_SIZE_DEFAULT      64
+
+/* Minimum and default io size in sectors if not set in constructor. */
+#define        IO_SIZE_MIN             CHUNK_SIZE_MIN
+#define        IO_SIZE_DEFAULT         IO_SIZE_MIN
+
+/* Recovery io size minimum and default in sectors. */
+#define        RECOVER_IO_SIZE_MIN             64
+#define        RECOVER_IO_SIZE_DEFAULT         256
+
+/* Default, minimum and maximum percentage of recover io bandwidth. */
+#define        BANDWIDTH_DEFAULT       10
+#define        BANDWIDTH_MIN           1
+#define        BANDWIDTH_MAX           100
+
+/* # of parallel recovered regions */
+#define RECOVERY_STRIPES_MIN   1
+#define RECOVERY_STRIPES_MAX   64
+#define RECOVERY_STRIPES_DEFAULT       RECOVERY_STRIPES_MIN
+/*
+ * END Configurable parameters
+ */
+
+#define        TARGET  "dm-raid45"
+#define        DAEMON  "kraid45d"
+#define        DM_MSG_PREFIX   TARGET
+
+#define        SECTORS_PER_PAGE        (PAGE_SIZE >> SECTOR_SHIFT)
+
+/* Amount/size for __xor(). */
+#define        XOR_SIZE        PAGE_SIZE
+
+/* Ticks to run xor_speed() test for. */
+#define        XOR_SPEED_TICKS 5
+
+/* Check value in inclusive range [min, max]; note: i is evaluated twice. */
+#define        range_ok(i, min, max)   ((i) >= (min) && (i) <= (max))
+
+/* Structure access macros (arguments parenthesized against precedence bugs). */
+/* Derive raid_set from stripe_cache pointer. */
+#define        RS(x)   container_of(x, struct raid_set, sc)
+
+/* Page reference: page of page list of stripe chunk p. */
+#define PAGE(stripe, p)  ((stripe)->obj[p].pl->page)
+
+/* Stripe chunk reference: pointer to chunk p of stripe. */
+#define CHUNK(stripe, p) ((stripe)->chunk + (p))
+
+/* Bio list reference: pointer to bio list rw of a stripe chunk. */
+#define        BL(stripe, p, rw)       ((stripe)->chunk[p].bl + (rw))
+#define        BL_CHUNK(chunk, rw)     ((chunk)->bl + (rw))
+
+/* Page list reference: page list of stripe chunk p. */
+#define        PL(stripe, p)           ((stripe)->obj[p].pl)
+/* END: structure access macros. */
+
+/* Factor out to dm-bio-list.h */
+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+{
+       bio->bi_next = bl->head;
+       bl->head = bio;
+
+       if (!bl->tail)
+               bl->tail = bio;
+}
+
+/* Factor out to dm.h */
+#define TI_ERR_RET(str, ret) \
+       do { ti->error = str; return ret; } while (0);
+#define TI_ERR(str)     TI_ERR_RET(str, -EINVAL)
+
+/* Macro to define access IO flags access inline functions. */
+#define        BITOPS(name, what, var, flag) \
+static inline int TestClear ## name ## what(struct var *v) \
+{ return test_and_clear_bit(flag, &v->io.flags); } \
+static inline int TestSet ## name ## what(struct var *v) \
+{ return test_and_set_bit(flag, &v->io.flags); } \
+static inline void Clear ## name ## what(struct var *v) \
+{ clear_bit(flag, &v->io.flags); } \
+static inline void Set ## name ## what(struct var *v) \
+{ set_bit(flag, &v->io.flags); } \
+static inline int name ## what(struct var *v) \
+{ return test_bit(flag, &v->io.flags); }
+
+/*-----------------------------------------------------------------
+ * Stripe cache
+ *
+ * Cache for all reads and writes to raid sets (operational or degraded)
+ *
+ * We need to run all data to and from a RAID set through this cache,
+ * because parity chunks need to get calculated from data chunks
+ * or, in the degraded/resynchronization case, missing chunks need
+ * to be reconstructed using the other chunks of the stripe.
+ *---------------------------------------------------------------*/
+/* Unique kmem cache name suffix # counter. */
+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+
+/* A chunk within a stripe (holds bios hanging off). */
+/* IO status flags for chunks of a stripe. */
+enum chunk_flags {
+       CHUNK_DIRTY,            /* Pages of chunk dirty; need writing. */
+       CHUNK_ERROR,            /* IO error on any chunk page. */
+       CHUNK_IO,               /* Allow/prohibit IO on chunk pages. */
+       CHUNK_LOCKED,           /* Chunk pages locked during IO. */
+       CHUNK_MUST_IO,          /* Chunk must io. */
+       CHUNK_UNLOCK,           /* Enforce chunk unlock. */
+       CHUNK_UPTODATE,         /* Chunk pages are uptodate. */
+};
+
+enum bl_type {
+       WRITE_QUEUED = WRITE + 1,
+       WRITE_MERGED,
+       NR_BL_TYPES,    /* Must be last one! */
+};
+struct stripe_chunk {
+       atomic_t cnt;           /* Reference count. */
+       struct stripe *stripe;  /* Backpointer to stripe for endio(). */
+       /* Bio lists for reads, writes, and writes merged. */
+       struct bio_list bl[NR_BL_TYPES];
+       struct {
+               unsigned long flags; /* IO status flags. */
+       } io;
+};
+
+/* Define chunk bit operations. */
+BITOPS(Chunk, Dirty,    stripe_chunk, CHUNK_DIRTY)
+BITOPS(Chunk, Error,    stripe_chunk, CHUNK_ERROR)
+BITOPS(Chunk, Io,       stripe_chunk, CHUNK_IO)
+BITOPS(Chunk, Locked,   stripe_chunk, CHUNK_LOCKED)
+BITOPS(Chunk, MustIo,   stripe_chunk, CHUNK_MUST_IO)
+BITOPS(Chunk, Unlock,   stripe_chunk, CHUNK_UNLOCK)
+BITOPS(Chunk, Uptodate,         stripe_chunk, CHUNK_UPTODATE)
+
+/*
+ * Stripe linked list indexes. Keep order, because the stripe
+ * and the stripe cache rely on the first 3!
+ */
+enum list_types {
+       LIST_FLUSH,     /* Stripes to flush for io. */
+       LIST_ENDIO,     /* Stripes to endio. */
+       LIST_LRU,       /* Least recently used stripes. */
+       SC_NR_LISTS,    /* # of lists in stripe cache. */
+       LIST_HASH = SC_NR_LISTS,        /* Hashed stripes. */
+       LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+       STRIPE_NR_LISTS,/* To size array in struct stripe. */
+};
+
+/* Addressing region recovery. */
+struct recover_addr {
+       struct dm_region *reg;  /* Actual region to recover. */
+       sector_t pos;   /* Position within region to recover. */
+       sector_t end;   /* End of region to recover. */
+};
+
+/* A stripe: the io object to handle all reads and writes to a RAID set. */
+struct stripe {
+       atomic_t cnt;                   /* Reference count. */
+       struct stripe_cache *sc;        /* Backpointer to stripe cache. */
+
+       /*
+        * 4 linked lists:
+        *   o io list to flush io
+        *   o endio list
+        *   o LRU list to put stripes w/o reference count on
+        *   o stripe cache hash
+        */
+       struct list_head lists[STRIPE_NR_LISTS];
+
+       sector_t key;    /* Hash key. */
+       region_t region; /* Region stripe is mapped to. */
+
+       struct {
+               unsigned long flags;    /* Stripe state flags (see below). */
+
+               /*
+                * Pending ios in flight:
+                *
+                * used to control move of stripe to endio list
+                */
+               atomic_t pending;
+
+               /* Sectors to read and write for multi page stripe sets. */
+               unsigned size;
+       } io;
+
+       /* Address region recovery. */
+       struct recover_addr *recover;
+
+       /* Lock on stripe (Future: for clustering). */
+       void *lock;
+
+       struct {
+               unsigned short parity;  /* Parity chunk index. */
+               short recover;          /* Recovery chunk index. */
+       } idx;
+
+       /*
+        * This stripe's memory cache object (dm-mem-cache);
+        * i.e. the io chunk pages.
+        */
+       struct dm_mem_cache_object *obj;
+
+       /* Array of stripe sets (dynamically allocated). */
+       struct stripe_chunk chunk[0];
+};
+
+/* States stripes can be in (flags field). */
+enum stripe_states {
+       STRIPE_ERROR,           /* io error on stripe. */
+       STRIPE_MERGED,          /* Writes got merged to be written. */
+       STRIPE_RBW,             /* Read-before-write stripe. */
+       STRIPE_RECONSTRUCT,     /* Reconstruction of a missing chunk required. */
+       STRIPE_RECONSTRUCTED,   /* A missing chunk got reconstructed. */
+       STRIPE_RECOVER,         /* Stripe used for RAID set recovery. */
+};
+
+/* Define stripe bit operations. */
+BITOPS(Stripe, Error,        stripe, STRIPE_ERROR)
+BITOPS(Stripe, Merged,        stripe, STRIPE_MERGED)
+BITOPS(Stripe, RBW,          stripe, STRIPE_RBW)
+BITOPS(Stripe, Reconstruct,   stripe, STRIPE_RECONSTRUCT)
+BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
+BITOPS(Stripe, Recover,              stripe, STRIPE_RECOVER)
+
+/* A stripe hash. */
+struct stripe_hash {
+       struct list_head *hash;
+       unsigned buckets;
+       unsigned mask;
+       unsigned prime;
+       unsigned shift;
+};
+
+enum sc_lock_types {
+       LOCK_ENDIO,     /* Protect endio list. */
+       NR_LOCKS,       /* To size array in struct stripe_cache. */
+};
+
+/* A stripe cache. */
+struct stripe_cache {
+       /* Stripe hash. */
+       struct stripe_hash hash;
+
+       spinlock_t locks[NR_LOCKS];     /* Locks to protect lists. */
+
+       /* Stripes with io to flush, stripes to endio and LRU lists. */
+       struct list_head lists[SC_NR_LISTS];
+
+       /* Slab cache to allocate stripes from. */
+       struct {
+               struct kmem_cache *cache;       /* Cache itself. */
+               char name[32];  /* Unique name. */
+       } kc;
+
+       struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+
+       /* dm-mem-cache client resource context. */
+       struct dm_mem_cache_client *mem_cache_client;
+
+       int stripes_parm;           /* # stripes parameter from constructor. */
+       atomic_t stripes;           /* actual # of stripes in cache. */
+       atomic_t stripes_to_set;    /* # of stripes to resize cache to. */
+       atomic_t stripes_last;      /* last # of stripes in cache. */
+       atomic_t active_stripes;    /* actual # of active stripes in cache. */
+
+       /* REMOVEME: */
+       atomic_t active_stripes_max; /* max. # of active stripes in cache. */
+};
+
+/* Flag specs for raid_dev (bit numbers in raid_dev io.flags). */
+enum raid_dev_flags {
+       DEV_FAILED,     /* Device failed. */
+       DEV_IO_QUEUED,  /* Io got queued to device. */
+};
+
+/* The raid device in a set. */
+struct raid_dev {
+       struct dm_dev *dev;
+       sector_t start;         /* Offset to map to. */
+       struct {        /* Using struct to be able to BITOPS(). */
+               unsigned long flags;    /* raid_dev_flags. */
+       } io;
+};
+
+BITOPS(Dev, Failed,   raid_dev, DEV_FAILED)
+BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
+
+/* Flags spec for raid_set. */
+enum raid_set_flags {
+       RS_CHECK_OVERWRITE,     /* Check for chunk overwrites. */
+       RS_DEAD,                /* RAID set inoperational. */
+       RS_DEAD_ENDIO_MESSAGE,  /* RAID set dead endio one-off message. */
+       RS_DEGRADED,            /* Io errors on RAID device. */
+       RS_DEVEL_STATS,         /* REMOVEME: display status information. */
+       RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
+       RS_PROHIBIT_WRITES,     /* Prohibit writes on device failure. */
+       RS_RECOVER,             /* Do recovery. */
+       RS_RECOVERY_BANDWIDTH,  /* Allow recovery bandwidth (delayed bios). */
+       RS_SC_BUSY,             /* Stripe cache busy -> send an event. */
+       RS_SUSPEND,             /* Suspend RAID set. */
+};
+
+/* REMOVEME: devel stats counters. */
+enum stats_types {
+       S_BIOS_READ,
+       S_BIOS_ADDED_READ,
+       S_BIOS_ENDIO_READ,
+       S_BIOS_WRITE,
+       S_BIOS_ADDED_WRITE,
+       S_BIOS_ENDIO_WRITE,
+       S_CAN_MERGE,
+       S_CANT_MERGE,
+       S_CONGESTED,
+       S_DM_IO_READ,
+       S_DM_IO_WRITE,
+       S_BANDWIDTH,
+       S_BARRIER,
+       S_BIO_COPY_PL_NEXT,
+       S_DEGRADED,
+       S_DELAYED_BIOS,
+       S_FLUSHS,
+       S_HITS_1ST,
+       S_IOS_POST,
+       S_INSCACHE,
+       S_MAX_LOOKUP,
+       S_CHUNK_LOCKED,
+       S_NO_BANDWIDTH,
+       S_NOT_CONGESTED,
+       S_NO_RW,
+       S_NOSYNC,
+       S_OVERWRITE,
+       S_PROHIBITCHUNKIO,
+       S_RECONSTRUCT_EI,
+       S_RECONSTRUCT_DEV,
+       S_RECONSTRUCT_SET,
+       S_RECONSTRUCTED,
+       S_REQUEUE,
+       S_STRIPE_ERROR,
+       S_SUM_DELAYED_BIOS,
+       S_XORS,
+       S_NR_STATS,     /* # of stats counters. Must be last! */
+};
+
+/* Status type -> string mappings. */
+struct stats_map {
+       const enum stats_types type;
+       const char *str;
+};
+
+static struct stats_map stats_map[] = {
+       { S_BIOS_READ, "r=" },
+       { S_BIOS_ADDED_READ, "/" },
+       { S_BIOS_ENDIO_READ, "/" },
+       { S_BIOS_WRITE, " w=" },
+       { S_BIOS_ADDED_WRITE, "/" },
+       { S_BIOS_ENDIO_WRITE, "/" },
+       { S_DM_IO_READ, " rc=" },
+       { S_DM_IO_WRITE, " wc=" },
+       { S_BANDWIDTH, "\nbw=" },
+       { S_NO_BANDWIDTH, " no_bw=" },
+       { S_BARRIER, "\nbarrier=" },
+       { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
+       { S_CAN_MERGE, "\nmerge=" },
+       { S_CANT_MERGE, "/no_merge=" },
+       { S_CHUNK_LOCKED, "\nchunk_locked=" },
+       { S_CONGESTED, "\ncgst=" },
+       { S_NOT_CONGESTED, "/not_cgst=" },
+       { S_DEGRADED, "\ndegraded=" },
+       { S_DELAYED_BIOS, "\ndel_bios=" },
+       { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
+       { S_FLUSHS, "\nflushs=" },
+       { S_HITS_1ST, "\nhits_1st=" },
+       { S_IOS_POST, " ios_post=" },
+       { S_INSCACHE, " inscache=" },
+       { S_MAX_LOOKUP, " maxlookup=" },
+       { S_NO_RW, "\nno_rw=" },
+       { S_NOSYNC, " nosync=" },
+       { S_OVERWRITE, " ovr=" },
+       { S_PROHIBITCHUNKIO, " prhbt_io=" },
+       { S_RECONSTRUCT_EI, "\nrec_ei=" },
+       { S_RECONSTRUCT_DEV, " rec_dev=" },
+       { S_RECONSTRUCT_SET, " rec_set=" },
+       { S_RECONSTRUCTED, " rec=" },
+       { S_REQUEUE, " requeue=" },
+       { S_STRIPE_ERROR, " stripe_err=" },
+       { S_XORS, " xors=" },
+};
+
+/*
+ * A RAID set.
+ */
+#define        dm_rh_client    dm_region_hash
+enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+struct raid_set {
+       struct dm_target *ti;   /* Target pointer. */
+
+       struct {
+               unsigned long flags;    /* State flags. */
+               struct mutex in_lock;   /* Protects central input list below. */
+               struct mutex xor_lock;  /* Protects xor algorithm set. */
+               struct bio_list in;     /* Pending ios (central input list). */
+               struct bio_list work;   /* ios work set. */
+               wait_queue_head_t suspendq;     /* suspend synchronization. */
+               atomic_t in_process;    /* counter of queued bios (suspendq). */
+               atomic_t in_process_max;/* counter of queued bios max. */
+
+               /* io work. */
+               struct workqueue_struct *wq;
+               struct delayed_work dws_do_raid;        /* For main worker. */
+               struct work_struct ws_do_table_event;   /* For event worker. */
+       } io;
+
+       /* Stripe locking abstraction. */
+       struct dm_raid45_locking_type *locking;
+
+       struct stripe_cache sc; /* Stripe cache for this set. */
+
+       /* Xor optimization. */
+       struct {
+               struct xor_func *f;
+               unsigned chunks;
+               unsigned speed;
+       } xor;
+
+       /* Recovery parameters. */
+       struct recover {
+               struct dm_dirty_log *dl;        /* Dirty log. */
+               struct dm_rh_client *rh;        /* Region hash. */
+
+               struct dm_io_client *dm_io_client; /* recovery dm-io client. */
+               /* dm-mem-cache client resource context for recovery stripes. */
+               struct dm_mem_cache_client *mem_cache_client;
+
+               struct list_head stripes;       /* List of recovery stripes. */
+
+               region_t nr_regions;
+               region_t nr_regions_to_recover;
+               region_t nr_regions_recovered;
+               unsigned long start_jiffies;
+               unsigned long end_jiffies;
+
+               unsigned bandwidth;      /* Recovery bandwidth [%]. */
+               unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+               unsigned bandwidth_parm; /*  " constructor parm. */
+               unsigned io_size;        /* recovery io size <= region size. */
+               unsigned io_size_parm;   /* recovery io size ctr parameter. */
+               unsigned recovery;       /* Recovery allowed/prohibited. */
+               unsigned recovery_stripes; /* # of parallel recovery stripes. */
+
+               /* recovery io throttling. */
+               atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
+               unsigned long last_jiffies;
+       } recover;
+
+       /* RAID set parameters. */
+       struct {
+               struct raid_type *raid_type;    /* RAID type (eg, RAID4). */
+               unsigned raid_parms;    /* # variable raid parameters. */
+
+               unsigned chunk_size;    /* Sectors per chunk. */
+               unsigned chunk_size_parm;
+               unsigned chunk_shift;   /* rsector chunk size shift. */
+
+               unsigned io_size;       /* Sectors per io. */
+               unsigned io_size_parm;
+               unsigned io_mask;       /* Mask for bio_copy_page_list(). */
+               unsigned io_inv_mask;   /* Mask for raid_address(). */
+
+               sector_t sectors_per_dev;       /* Sectors per device. */
+
+               atomic_t failed_devs;           /* Amount of devices failed. */
+
+               /* Index of device to initialize. */
+               int dev_to_init;
+               int dev_to_init_parm;
+
+               /* Raid devices dynamically allocated. */
+               unsigned raid_devs;     /* # of RAID devices below. */
+               unsigned data_devs;     /* # of RAID data devices. */
+
+               int ei;         /* index of failed RAID device. */
+
+               /* Index of dedicated parity device (i.e. RAID4). */
+               int pi;
+               int pi_parm;    /* constructor parm for status output. */
+       } set;
+
+       /* REMOVEME: devel stats counters. */
+       atomic_t stats[S_NR_STATS];
+
+       /* Dynamically allocated temporary pointers for xor(). */
+       unsigned long **data;
+
+       /* Dynamically allocated RAID devices. Alignment? */
+       struct raid_dev dev[0];
+};
+
+/* Define RAID set bit operations. */
+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
+BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
+BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
+BITOPS(RS, Recover, raid_set, RS_RECOVER)
+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
+#undef BITOPS
+
+/*-----------------------------------------------------------------
+ * Raid-4/5 set structures.
+ *---------------------------------------------------------------*/
+/* RAID level definitions. */
+enum raid_level {
+       raid4,
+       raid5,
+};
+
+/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
+enum raid_algorithm {
+       none,
+       left_asym,
+       right_asym,
+       left_sym,
+       right_sym,
+};
+
+struct raid_type {
+       const char *name;               /* RAID algorithm. */
+       const char *descr;              /* Descriptor text for logging. */
+       const unsigned parity_devs;     /* # of parity devices. */
+       const unsigned minimal_devs;    /* minimal # of devices in set. */
+       const enum raid_level level;            /* RAID level. */
+       const enum raid_algorithm algorithm;    /* RAID algorithm. */
+};
+
+/* Supported raid types and properties. */
+static struct raid_type raid_types[] = {
+       {"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
+       {"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
+       {"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
+       {"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
+       {"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
+};
+
+/* Address as calculated by raid_address(). */
+struct raid_address {
+       sector_t key;           /* Hash key (address of stripe % chunk_size). */
+       unsigned di, pi;        /* Data and parity disks index. */
+};
+
+/* REMOVEME: reset statistics counters. */
+static void stats_reset(struct raid_set *rs)
+{
+       unsigned s = S_NR_STATS;
+
+       while (s--)
+               atomic_set(rs->stats + s, 0);
+}
+
+/*----------------------------------------------------------------
+ * RAID set management routines.
+ *--------------------------------------------------------------*/
+/*
+ * Begin small helper functions.
+ */
+/* No need to be called from region hash indirectly at dm_rh_dec(). */
+static void wake_dummy(void *context) {}
+
+/* Return # of io references. */
+static int io_ref(struct raid_set *rs)
+{
+       return atomic_read(&rs->io.in_process);
+}
+
+/* Get an io reference. */
+static void io_get(struct raid_set *rs)
+{
+       int p = atomic_inc_return(&rs->io.in_process);
+
+       if (p > atomic_read(&rs->io.in_process_max))
+               atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
+}
+
+/* Put the io reference and conditionally wake io waiters. */
+static void io_put(struct raid_set *rs)
+{
+       /* Intel: rebuild data corrupter? */
+       if (atomic_dec_and_test(&rs->io.in_process))
+               wake_up(&rs->io.suspendq);
+       else
+               BUG_ON(io_ref(rs) < 0);
+}
+
+/* Wait until all io has been processed. */
+static void wait_ios(struct raid_set *rs)
+{
+       wait_event(rs->io.suspendq, !io_ref(rs));
+}
+
+/* Queue (optionally delayed) io work. */
+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+{
+       queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
+}
+
+/* Queue io work immediately (called from region hash too). */
+static void wake_do_raid(void *context)
+{
+       struct raid_set *rs = context;
+
+       queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
+}
+
+/* Calculate device sector offset. */
+static sector_t _sector(struct raid_set *rs, struct bio *bio)
+{
+       sector_t sector = bio->bi_sector;
+
+       sector_div(sector, rs->set.data_devs);
+       return sector;
+}
+
+/* Return # of active stripes in stripe cache. */
+static int sc_active(struct stripe_cache *sc)
+{
+       return atomic_read(&sc->active_stripes);
+}
+
+/* Stripe cache busy indicator. */
+static int sc_busy(struct raid_set *rs)
+{
+       return sc_active(&rs->sc) >
+              atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
+}
+
+/* Set chunks states. */
+enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
+static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
+{
+       switch (type) {
+       case CLEAN:
+               ClearChunkDirty(chunk);
+               break;
+       case DIRTY:
+               SetChunkDirty(chunk);
+               break;
+       case ERROR:
+               SetChunkError(chunk);
+               SetStripeError(chunk->stripe);
+               return;
+       default:
+               BUG();
+       }
+
+       SetChunkUptodate(chunk);
+       SetChunkIo(chunk);
+       ClearChunkError(chunk);
+}
+
+/* Return region state for a sector. */
+static int region_state(struct raid_set *rs, sector_t sector,
+                       enum dm_rh_region_states state)
+{
+       struct dm_rh_client *rh = rs->recover.rh;
+       region_t region = dm_rh_sector_to_region(rh, sector);
+
+       return !!(dm_rh_get_state(rh, region, 1) & state);
+}
+
+/*
+ * Return true in case a chunk should be read/written
+ *
+ * Conditions to read/write:
+ *     o chunk not uptodate
+ *     o chunk dirty
+ *
+ * Conditions to avoid io:
+ *     o io already ongoing on chunk
+ *     o io explicitly prohibited
+ */
+static int chunk_io(struct stripe_chunk *chunk)
+{
+       /* 2nd run optimization (flag set below on first run). */
+       if (TestClearChunkMustIo(chunk))
+               return 1;
+
+       /* Avoid io if prohibited or a locked chunk. */
+       if (!ChunkIo(chunk) || ChunkLocked(chunk))
+               return 0;
+
+       if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
+               SetChunkMustIo(chunk); /* 2nd run optimization. */
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Call @f_io on each chunk of @stripe needing io unless its device failed.
+ *
+ * Returns the number of chunks @f_io got called for.
+ */
+static unsigned for_each_io_dev(struct stripe *stripe,
+                               void (*f_io)(struct stripe *stripe, unsigned p))
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned idx, called = 0;
+
+       for (idx = 0; idx < rs->set.raid_devs; idx++) {
+               /* chunk_io() goes first: it updates chunk flags. */
+               if (!chunk_io(CHUNK(stripe, idx)))
+                       continue;
+
+               if (DevFailed(rs->dev + idx))
+                       continue;
+
+               f_io(stripe, idx);
+               called++;
+       }
+
+       return called;
+}
+
+/*
+ * Index of device to calculate parity on.
+ *
+ * Either the parity device index *or* the selected
+ * device to init after a spare replacement.
+ *
+ * @sync is set to 0 if the stripe's region is in NOSYNC or
+ * RECOVERING state, else to 1.  Returns -1 for a recovery stripe
+ * on a non-raid4 set when no device is selected for init.
+ */
+static int dev_for_parity(struct stripe *stripe, int *sync)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       int nosync = region_state(rs, stripe->key,
+                                 DM_RH_NOSYNC | DM_RH_RECOVERING);
+
+       *sync = !nosync;
+
+       /* Reconstruct a particular device ? */
+       if (nosync && rs->set.dev_to_init > -1)
+               return rs->set.dev_to_init;
+
+       if (rs->set.raid_type->level == raid4)
+               return rs->set.pi;
+
+       if (StripeRecover(stripe))
+               return -1;
+
+       return stripe->idx.parity;
+}
+
+/*
+ * RAID set congested function.
+ *
+ * Reports congestion if the stripe cache is busy, the set is suspended
+ * or writes are prohibited; else if any component device is congested.
+ */
+static int rs_congested(void *congested_data, int bdi_bits)
+{
+       int r = 0;
+       unsigned p;
+       struct raid_set *rs = congested_data;
+
+       if (sc_busy(rs) || RSSuspend(rs) || RSProhibitWrites(rs)) {
+               r = 1;
+       } else {
+               /* Stop at the first overloaded component device. */
+               for (p = rs->set.raid_devs; !r && p--; ) {
+                       struct request_queue *q =
+                               bdev_get_queue(rs->dev[p].dev->bdev);
+
+                       r |= bdi_congested(&q->backing_dev_info, bdi_bits);
+               }
+       }
+
+       /* REMOVEME: statistics. */
+       atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+       return r;
+}
+
+/*
+ * RAID device degrade check.
+ *
+ * Marks device @p failed (no-op if it already was), logs the failure,
+ * prohibits further writes and schedules the table event work.
+ * @stripe may be NULL, in which case the error is logged as "FAKED".
+ */
+static void rs_check_degrade_dev(struct raid_set *rs,
+                                struct stripe *stripe, unsigned p)
+{
+       char buf[BDEVNAME_SIZE];
+
+       /* Device was flagged failed before -> nothing new to do. */
+       if (TestSetDevFailed(rs->dev + p))
+               return;
+
+       /* Throw an event in case of member device errors. */
+       if (atomic_inc_return(&rs->set.failed_devs) >
+           rs->set.raid_type->parity_devs &&
+           !TestSetRSDead(rs)) {
+               /* Display RAID set dead message once. */
+               unsigned i;
+
+               DMERR("FATAL: too many devices failed -> RAID set broken");
+               for (i = 0; i < rs->set.raid_devs; i++) {
+                       if (!DevFailed(rs->dev + i))
+                               continue;
+
+                       DMERR("device /dev/%s failed",
+                             bdevname(rs->dev[i].dev->bdev, buf));
+               }
+       }
+
+       /* Only log the first member error. */
+       if (!TestSetRSDegraded(rs)) {
+               /* Store index for recovery. */
+               rs->set.ei = p;
+               DMERR("CRITICAL: %sio error on device /dev/%s "
+                     "in region=%llu; DEGRADING RAID set\n",
+                     stripe ? "" : "FAKED ",
+                     bdevname(rs->dev[p].dev->bdev, buf),
+                     (unsigned long long) (stripe ? stripe->key : 0));
+               DMERR("further device error messages suppressed");
+       }
+
+       /* Prohibit further writes to allow for userspace to update metadata. */
+       SetRSProhibitWrites(rs);
+       schedule_work(&rs->io.ws_do_table_event);
+}
+
+/* RAID set degrade check: run the device check on each chunk in error. */
+static void rs_check_degrade(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned p;
+
+       for (p = rs->set.raid_devs; p--; ) {
+               if (!ChunkError(CHUNK(stripe, p)))
+                       continue;
+
+               rs_check_degrade_dev(rs, stripe, p);
+       }
+}
+
+/*
+ * Lookup a RAID device by the device number of its block device.
+ *
+ * Returns the index of the device in the set matching @dev_lookup
+ * or -ENODEV if none of the populated slots matches.
+ */
+static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
+{
+       unsigned p;
+
+       /*
+        * Must be an incremental loop, because the device array
+        * can have empty slots still on calls from raid_ctr()
+        */
+       for (p = 0; p < rs->set.raid_devs; p++) {
+               struct raid_dev *dev = rs->dev + p;
+
+               /*
+                * The index got bounds checked before the slot is
+                * touched; the first empty slot ends the search.
+                */
+               if (!dev->dev)
+                       break;
+
+               if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
+                       return p;
+       }
+
+       return -ENODEV;
+}
+/*
+ * End small helper functions.
+ */
+
+/*
+ * Stripe hash functions
+ */
+/*
+ * Initialize a stripe hash for @stripes stripes.
+ *
+ * Allocates a power of two number of buckets (based on half of
+ * @stripes), derives mask and shift from it and selects the
+ * multiplier used by hash_fn() from a table indexed by the shift.
+ *
+ * Returns 0 on success or -ENOMEM if the buckets can't be allocated.
+ */
+static int hash_init(struct stripe_hash *hash, unsigned stripes)
+{
+       unsigned buckets = roundup_pow_of_two(stripes >> 1);
+       static unsigned hash_primes[] = {
+               /* Table of multipliers for hash_fn/table size optimization. */
+               1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
+               1543, 3079, 6151, 12289, 24593, 49157, 98317,
+       };
+
+       /* Allocate stripe hash buckets. */
+       hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+       if (!hash->hash)
+               return -ENOMEM;
+
+       hash->buckets = buckets;
+       hash->mask = buckets - 1;
+       hash->shift = ffs(buckets);
+       /*
+        * Clamp to the last valid table index; a shift equal to the
+        * table size must be clamped as well or we'd read past the end.
+        */
+       if (hash->shift >= ARRAY_SIZE(hash_primes))
+               hash->shift = ARRAY_SIZE(hash_primes) - 1;
+
+       BUG_ON(hash->shift < 2);
+       hash->prime = hash_primes[hash->shift];
+
+       /* Initialize buckets. */
+       while (buckets--)
+               INIT_LIST_HEAD(hash->hash + buckets);
+       return 0;
+}
+
+/* Free the buckets of a stripe hash, if any, and forget about them. */
+static void hash_exit(struct stripe_hash *hash)
+{
+       if (!hash->hash)
+               return;
+
+       vfree(hash->hash);
+       hash->hash = NULL;
+}
+
+/* Map @key to a bucket index: multiply, shift down and mask. */
+static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+{
+       key *= hash->prime;
+       key >>= hash->shift;
+
+       return (unsigned) (key & hash->mask);
+}
+
+/* Return the head of the bucket list @key hashes to. */
+static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
+{
+       return &hash->hash[hash_fn(hash, key)];
+}
+
+/* Insert a stripe at the head of the hash bucket its key maps to. */
+static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
+{
+       struct list_head *bucket = hash_bucket(hash, stripe->key);
+
+       list_add(&stripe->lists[LIST_HASH], bucket);
+}
+
+/*
+ * Lookup an entry in the stripe hash.
+ *
+ * Returns the stripe with key @key or NULL if it isn't hashed.
+ */
+static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
+{
+       unsigned walked = 0;
+       struct stripe *stripe;
+       struct list_head *bucket = hash_bucket(&sc->hash, key);
+
+       list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
+               walked++;
+
+               if (stripe->key != key)
+                       continue;
+
+               /* REMOVEME: statistics (longest bucket walk so far). */
+               if (walked > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+                       atomic_set(RS(sc)->stats + S_MAX_LOOKUP, walked);
+
+               return stripe;
+       }
+
+       return NULL;
+}
+
+/*
+ * Resize the stripe cache hash on size changes.
+ *
+ * A new hash is set up whenever the number of stripes differs from
+ * the one recorded on the last resize; all hashed stripes are moved
+ * over before the old buckets are freed.
+ *
+ * Returns 0 on success (or if nothing had to be done), else the
+ * error returned by hash_init().
+ */
+static int sc_hash_resize(struct stripe_cache *sc)
+{
+       int r;
+       unsigned b;
+       struct stripe_hash new_hash;
+       struct list_head *pos, *tmp;
+
+       /* Resize indicated ? */
+       if (atomic_read(&sc->stripes) == atomic_read(&sc->stripes_last))
+               return 0;
+
+       r = hash_init(&new_hash, atomic_read(&sc->stripes));
+       if (r)
+               return r;
+
+       if (sc->hash.hash) {
+               /* Walk old buckets and insert into new. */
+               for (b = sc->hash.buckets; b--; ) {
+                       list_for_each_safe(pos, tmp, sc->hash.hash + b)
+                               stripe_insert(&new_hash,
+                                             list_entry(pos, struct stripe,
+                                                        lists[LIST_HASH]));
+               }
+       }
+
+       hash_exit(&sc->hash);
+       sc->hash = new_hash;
+       atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+       return 0;
+}
+/* End stripe hash functions. */
+
+/* List add, delete, push and pop functions. */
+/*
+ * Delete a list element from the list it is on (if any) and
+ * reinitialize it; a no-op for an element not on any list.
+ *
+ * Wrapped in do { } while (0) so it acts as a single statement.
+ */
+#define        DEL_LIST(lh) \
+       do { \
+               if (!list_empty(lh)) \
+                       list_del_init(lh); \
+       } while (0)
+
+/* Take a stripe off its hash bucket list, if it is on one. */
+static void stripe_hash_del(struct stripe *stripe)
+{
+       DEL_LIST(&stripe->lists[LIST_HASH]);
+}
+
+/* Return the current reference count of a stripe. */
+static inline int stripe_ref(struct stripe *stripe)
+{
+       int refs = atomic_read(&stripe->cnt);
+
+       return refs;
+}
+
+/*
+ * Add stripe to the tail of the flush list, unless it is
+ * flagged for reconstruction or already on that list.
+ */
+static void stripe_flush_add(struct stripe *stripe)
+{
+       struct list_head *entry = &stripe->lists[LIST_FLUSH];
+
+       if (StripeReconstruct(stripe))
+               return;
+
+       if (list_empty(entry))
+               list_add_tail(entry, &stripe->sc->lists[LIST_FLUSH]);
+}
+
+/*
+ * Add stripe to the tail of the LRU (inactive) list.
+ *
+ * Recovery stripes are never added; a stripe already on the list
+ * is left where it is.
+ *
+ * NOTE(review): an earlier comment stated a lock is needed because of
+ * concurrent access from the message interface; none is taken here, so
+ * presumably callers serialize - confirm.
+ */
+static void stripe_lru_add(struct stripe *stripe)
+{
+       struct list_head *entry = &stripe->lists[LIST_LRU];
+
+       if (StripeRecover(stripe))
+               return;
+
+       if (list_empty(entry))
+               list_add_tail(entry, &stripe->sc->lists[LIST_LRU]);
+}
+
+/*
+ * Pop the first stripe off stripe cache list @list.
+ *
+ * Expects a 'struct stripe_cache *sc' and a 'struct stripe *stripe'
+ * in the scope of the invoking function; 'stripe' receives the popped
+ * stripe or NULL if the list is empty.
+ *
+ * NOTE: the definition itself ends with a semicolon, so an invocation
+ * followed by ';' expands to an additional empty statement.
+ */
+#define POP_LIST(list) \
+       do { \
+               if (list_empty(sc->lists + (list))) \
+                       stripe = NULL; \
+               else { \
+                       stripe = list_first_entry(sc->lists + (list), \
+                                                 struct stripe, \
+                                                 lists[(list)]); \
+                       list_del_init(stripe->lists + (list)); \
+               } \
+       } while (0);
+
+/*
+ * Pop an available stripe off the LRU list.
+ *
+ * Returns the first stripe on the list or NULL if the list is empty.
+ */
+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+{
+       struct list_head *lru = sc->lists + LIST_LRU;
+       struct stripe *stripe;
+
+       if (list_empty(lru))
+               return NULL;
+
+       stripe = list_first_entry(lru, struct stripe, lists[LIST_LRU]);
+       list_del_init(stripe->lists + LIST_LRU);
+       return stripe;
+}
+
+/*
+ * Pop an available stripe off the io (flush) list.
+ *
+ * Returns the first stripe on the list or NULL if the list is empty.
+ */
+static struct stripe *stripe_io_pop(struct stripe_cache *sc)
+{
+       struct list_head *flush = sc->lists + LIST_FLUSH;
+       struct stripe *stripe;
+
+       if (list_empty(flush))
+               return NULL;
+
+       stripe = list_first_entry(flush, struct stripe, lists[LIST_FLUSH]);
+       list_del_init(stripe->lists + LIST_FLUSH);
+       return stripe;
+}
+
+/*
+ * Push a stripe safely onto the endio list to be handled by do_endios().
+ *
+ * The stripe is only added if it is not on the list yet;
+ * the worker is woken up in either case.
+ */
+static void stripe_endio_push(struct stripe *stripe)
+{
+       unsigned long flags;
+       struct stripe_cache *sc = stripe->sc;
+       struct list_head *entry = &stripe->lists[LIST_ENDIO];
+       spinlock_t *lock = &sc->locks[LOCK_ENDIO];
+
+       /* This runs in parallel with do_endios(). */
+       spin_lock_irqsave(lock, flags);
+       if (list_empty(entry))
+               list_add_tail(entry, &sc->lists[LIST_ENDIO]);
+       spin_unlock_irqrestore(lock, flags);
+
+       wake_do_raid(RS(sc)); /* Wake myself. */
+}
+
+/*
+ * Pop a stripe safely off the endio list.
+ *
+ * Returns the first stripe on the list or NULL if the list is empty.
+ */
+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+{
+       struct stripe *stripe;
+       spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+       /* This runs in parallel with endio(). */
+       spin_lock_irq(lock);
+       /* Terminated like a statement; don't rely on the macro's own ';'. */
+       POP_LIST(LIST_ENDIO);
+       spin_unlock_irq(lock);
+       return stripe;
+}
+#undef POP_LIST
+
+/*
+ * Stripe cache locking functions
+ */
+/*
+ * Dummy lock function for single host RAID4+5.
+ *
+ * Ignores its arguments and hands out a non-NULL dummy handle
+ * (its own address), which stripe_lock() takes as success.
+ */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+       void *handle = &no_lock;
+
+       return handle;
+}
+
+/*
+ * Dummy unlock function for single host RAID4+5.
+ *
+ * Nothing to release: the handle is ignored.
+ */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/*
+ * No locking (for single host RAID 4+5).
+ *
+ * Locking type whose operations are the dummy functions above.
+ */
+static struct dm_raid45_locking_type locking_none = {
+       .lock = no_lock,
+       .unlock = no_unlock,
+};
+
+/*
+ * Lock a stripe (for clustering).
+ *
+ * Requests a shared lock for READ and an exclusive one otherwise and
+ * stores the handle in the stripe.
+ *
+ * Returns 0 on success or -EPERM if no lock handle got returned.
+ */
+static int
+stripe_lock(struct stripe *stripe, int rw, sector_t key)
+{
+       enum dm_lock_type type = (rw == READ) ? DM_RAID45_SHARED :
+                                               DM_RAID45_EX;
+
+       stripe->lock = RS(stripe->sc)->locking->lock(key, type);
+       if (!stripe->lock)
+               return -EPERM;
+
+       return 0;
+}
+
+/* Unlock a stripe (for clustering) and drop its lock handle. */
+static void stripe_unlock(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+
+       rs->locking->unlock(stripe->lock);
+       stripe->lock = NULL;
+}
+
+/* Return the number of ios pending on a stripe. */
+static int stripe_io_ref(struct stripe *stripe)
+{
+       int pending = atomic_read(&stripe->io.pending);
+
+       return pending;
+}
+
+/*
+ * Take out an io reference on a stripe.
+ *
+ * The first io reference accounts the stripe as active.
+ */
+static void stripe_io_get(struct stripe *stripe)
+{
+       int pending = atomic_inc_return(&stripe->io.pending);
+
+       if (pending == 1)
+               /* REMOVEME: statistics */
+               atomic_inc(&stripe->sc->active_stripes);
+       else
+               BUG_ON(stripe_io_ref(stripe) < 0);
+}
+
+/*
+ * Drop an io reference on a stripe.
+ *
+ * With the last one gone, a regular stripe is pushed onto the endio
+ * list (which wakes the worker), whereas for a recovery stripe the
+ * worker is only woken; the stripe is no longer accounted as active.
+ */
+static void stripe_io_put(struct stripe *stripe)
+{
+       if (!atomic_dec_and_test(&stripe->io.pending)) {
+               BUG_ON(stripe_io_ref(stripe) < 0);
+               return;
+       }
+
+       if (unlikely(StripeRecover(stripe)))
+               /* Don't put recovery stripe on endio list. */
+               wake_do_raid(RS(stripe->sc));
+       else
+               /* Add regular stripe to endio list and wake daemon. */
+               stripe_endio_push(stripe);
+
+       /* REMOVEME: statistics */
+       atomic_dec(&stripe->sc->active_stripes);
+}
+
+/*
+ * Take stripe reference out.
+ *
+ * Removes the stripe from the LRU list if it is on it and locks it
+ * for WRITE on the first reference.
+ *
+ * Returns the result of stripe_lock() on the first reference, else 0.
+ */
+static int stripe_get(struct stripe *stripe)
+{
+       /* Delete stripe from LRU (inactive) list if on. */
+       DEL_LIST(&stripe->lists[LIST_LRU]);
+       BUG_ON(stripe_ref(stripe) < 0);
+
+       /* Only the first reference has to lock the stripe. */
+       if (atomic_inc_return(&stripe->cnt) != 1)
+               return 0;
+
+       return stripe_lock(stripe, WRITE, stripe->key);
+}
+#undef DEL_LIST
+
+/* Return the current reference count of a chunk. */
+static int chunk_ref(struct stripe_chunk *chunk)
+{
+       int refs = atomic_read(&chunk->cnt);
+
+       return refs;
+}
+
+/* Take out a reference on a chunk; returns the new reference count. */
+static int chunk_get(struct stripe_chunk *chunk)
+{
+       int refs = atomic_inc_return(&chunk->cnt);
+
+       return refs;
+}
+
+/* Drop a reference on a chunk; the count must never go negative. */
+static void chunk_put(struct stripe_chunk *chunk)
+{
+       int refs = atomic_dec_return(&chunk->cnt);
+
+       BUG_ON(refs < 0);
+}
+
+/*
+ * Drop reference on a stripe.
+ *
+ * Dropping the last reference unlocks the stripe, which must
+ * not have any io pending at that point.
+ */
+static void stripe_put(struct stripe *stripe)
+{
+       if (!atomic_dec_and_test(&stripe->cnt)) {
+               BUG_ON(stripe_ref(stripe) < 0);
+               return;
+       }
+
+       BUG_ON(stripe_io_ref(stripe));
+       stripe_unlock(stripe);
+}
+
+/*
+ * Helper needed by for_each_io_dev().
+ *
+ * Takes out the references for one chunk io; @p is unused.
+ */
+static void stripe_get_references(struct stripe *stripe, unsigned p)
+{
+       struct raid_set *rs = RS(stripe->sc);
+
+       /* Global io references. */
+       io_get(rs);
+
+       /*
+        * Another one to reference the stripe in
+        * order to protect vs. LRU list moves.
+        */
+       stripe_get(stripe);
+
+       /* One for each chunk io. */
+       stripe_io_get(stripe);
+}
+
+/*
+ * Helper for endio() to put all references taken
+ * out by stripe_get_references(), in reverse order.
+ */
+static void stripe_put_references(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+
+       /* One for each chunk io. */
+       stripe_io_put(stripe);
+       stripe_put(stripe);
+       /* Global io reference. */
+       io_put(rs);
+}
+
+/*
+ * Stripe cache functions.
+ */
+/*
+ * Invalidate a chunk (i.e. its pages) of a stripe
+ * by resetting all of its io flags.
+ *
+ * State is only kept for the chunk as a whole.
+ */
+static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
+{
+       chunk->io.flags = 0;
+}
+
+/* Invalidate all chunks of a stripe. */
+static void
+stripe_chunks_invalidate(struct stripe *stripe)
+{
+       unsigned p;
+
+       for (p = RS(stripe->sc)->set.raid_devs; p--; )
+               stripe_chunk_invalidate(CHUNK(stripe, p));
+}
+
+/*
+ * Prepare stripe for (re)use: invalidate all of its chunks,
+ * reset its parity/recover indexes to -1 and clear its io flags.
+ */
+static void stripe_invalidate(struct stripe *stripe)
+{
+       stripe_chunks_invalidate(stripe);
+       stripe->idx.parity = stripe->idx.recover = -1;
+       stripe->io.flags = 0;
+}
+
+/*
+ * Allow io on all chunks of a stripe.
+ * A chunk without the flag set here won't see any io; i.e. it's prohibited.
+ *
+ * Actual IO submission for allowed chunks depends
+ * on their !uptodate or dirty state.
+ */
+static void stripe_allow_io(struct stripe *stripe)
+{
+       unsigned p;
+
+       for (p = RS(stripe->sc)->set.raid_devs; p--; )
+               SetChunkIo(CHUNK(stripe, p));
+}
+
+/*
+ * Initialize a stripe.
+ *
+ * Sets up the chunks (reference counts, back pointers, bio lists),
+ * the list heads and the counters of the stripe, binds it to @sc and
+ * leaves it invalidated with the io size of the set.
+ */
+static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+{
+       unsigned i, p;
+
+       /* Work all io chunks. */
+       for (p = RS(sc)->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+               atomic_set(&chunk->cnt, 0);
+               chunk->stripe = stripe;
+
+               for (i = ARRAY_SIZE(chunk->bl); i--; )
+                       bio_list_init(chunk->bl + i);
+       }
+
+       stripe->sc = sc;
+
+       for (i = ARRAY_SIZE(stripe->lists); i--; )
+               INIT_LIST_HEAD(stripe->lists + i);
+
+       stripe->io.size = RS(sc)->set.io_size;
+       atomic_set(&stripe->cnt, 0);
+       atomic_set(&stripe->io.pending, 0);
+       stripe_invalidate(stripe);
+}
+
+/* Number of pages needed to hold @sectors sectors (rounded up). */
+static inline unsigned chunk_pages(unsigned sectors)
+{
+       unsigned pages = dm_div_up(sectors, SECTORS_PER_PAGE);
+
+       return pages;
+}
+
+/* Number of pages per stripe: pages per chunk times devices in the set. */
+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+{
+       unsigned per_chunk = chunk_pages(io_size);
+
+       return per_chunk * rs->set.raid_devs;
+}
+
+/*
+ * Initialize part of page_list (recovery).
+ *
+ * Zeroes whole pages of chunk @p, beginning with the page
+ * @start falls into and covering @count sectors worth of pages
+ * (or fewer, if the page list ends before).
+ */
+static void stripe_zero_pl_part(struct stripe *stripe, int p,
+                               unsigned start, unsigned count)
+{
+       unsigned first = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
+       /* Get offset into the page_list. */
+       struct page_list *pl = pl_elem(PL(stripe, p), first);
+
+       BUG_ON(!pl);
+       for (; pl && pages; pages--, pl = pl->next) {
+               BUG_ON(!pl->page);
+               memset(page_address(pl->page), 0, PAGE_SIZE);
+       }
+}
+
+/*
+ * Zero chunk @p (e.g. the parity chunk) of a stripe
+ * in its full io size; a negative @p is ignored.
+ */
+static void stripe_zero_chunk(struct stripe *stripe, int p)
+{
+       if (p < 0)
+               return;
+
+       stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+}
+
+/*
+ * Return dynamic stripe structure size: the stripe
+ * itself plus one chunk structure per device in the set.
+ */
+static size_t stripe_size(struct raid_set *rs)
+{
+       size_t chunks = rs->set.raid_devs * sizeof(struct stripe_chunk);
+
+       return sizeof(struct stripe) + chunks;
+}
+
+/*
+ * Allocate a stripe and its memory object.
+ *
+ * With SC_GROW the memory cache @mc is grown by one object first,
+ * with SC_KEEP it is used as is.
+ *
+ * Returns the initialized stripe or NULL on failure.
+ */
+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+enum grow { SC_GROW, SC_KEEP };
+static struct stripe *stripe_alloc(struct stripe_cache *sc,
+                                  struct dm_mem_cache_client *mc,
+                                  enum grow grow)
+{
+       struct stripe *stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+
+       if (!stripe)
+               return NULL;
+
+       /* Grow the dm-mem-cache by one object. */
+       if (grow == SC_GROW && dm_mem_cache_grow(mc, 1))
+               goto err_free;
+
+       stripe->obj = dm_mem_cache_alloc(mc);
+       if (IS_ERR(stripe->obj))
+               goto err_shrink;
+
+       stripe_init(sc, stripe);
+       return stripe;
+
+err_shrink:
+       /* Undo growing the cache, if we did so. */
+       if (grow == SC_GROW)
+               dm_mem_cache_shrink(mc, 1);
+err_free:
+       kmem_cache_free(sc->kc.cache, stripe);
+       return NULL;
+}
+
+/*
+ * Free a stripe's memory object, shrink the memory
+ * cache by one object and free the stripe itself.
+ */
+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+{
+       struct stripe_cache *sc = stripe->sc;
+
+       dm_mem_cache_free(mc, stripe->obj);
+       dm_mem_cache_shrink(mc, 1);
+       kmem_cache_free(sc->kc.cache, stripe);
+}
+
+/*
+ * Free the recovery stripe(s) and the recovery clients.
+ *
+ * Copes with a partially set up recovery context: sc_init() may bail
+ * out leaving an ERR_PTR in the memory cache client or an unset/ERR_PTR
+ * dm-io client behind, none of which may be handed to a destroy call.
+ */
+static void stripe_recover_free(struct raid_set *rs)
+{
+       struct recover *rec = &rs->recover;
+       struct dm_mem_cache_client *mc;
+
+       mc = rec->mem_cache_client;
+       rec->mem_cache_client = NULL;
+       /* No (valid) memory cache client -> no recovery stripes either. */
+       if (!mc || IS_ERR(mc))
+               return;
+
+       while (!list_empty(&rec->stripes)) {
+               struct stripe *stripe;
+
+               stripe = list_first_entry(&rec->stripes, struct stripe,
+                                         lists[LIST_RECOVER]);
+               list_del(stripe->lists + LIST_RECOVER);
+               kfree(stripe->recover);
+               stripe_free(stripe, mc);
+       }
+
+       dm_mem_cache_client_destroy(mc);
+
+       /* Only destroy a dm-io client which got created successfully. */
+       if (rec->dm_io_client && !IS_ERR(rec->dm_io_client))
+               dm_io_client_destroy(rec->dm_io_client);
+
+       rec->dm_io_client = NULL;
+}
+
+/*
+ * Grow stripe cache by @stripes stripes.
+ *
+ * New stripes go onto the LRU list.  Returns -ENOMEM as soon as a
+ * stripe can't be allocated (stripes added before stay in the cache),
+ * else the result of resizing the hash.
+ */
+static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+{
+       /* Try to allocate this many (additional) stripes. */
+       for (; stripes; stripes--) {
+               struct stripe *stripe =
+                       stripe_alloc(sc, sc->mem_cache_client, grow);
+
+               if (unlikely(!stripe))
+                       return -ENOMEM;
+
+               stripe_lru_add(stripe);
+               atomic_inc(&sc->stripes);
+       }
+
+       return sc_hash_resize(sc);
+}
+
+/*
+ * Shrink stripe cache by @stripes stripes.
+ *
+ * Only stripes on the LRU list can be freed.  Returns -ENOENT if the
+ * LRU list runs empty before @stripes stripes got freed, else the
+ * result of resizing the hash (0 if no stripes are left at all).
+ */
+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+{
+       int r = 0;
+       struct stripe *stripe;
+
+       /* Try to get unused stripe from LRU list. */
+       for (; stripes; stripes--) {
+               stripe = stripe_lru_pop(sc);
+               if (!stripe) {
+                       r = -ENOENT;
+                       break;
+               }
+
+               /* An LRU stripe may never have ios pending! */
+               BUG_ON(stripe_io_ref(stripe));
+               BUG_ON(stripe_ref(stripe));
+               atomic_dec(&sc->stripes);
+               /* Remove from hash if on before deletion. */
+               stripe_hash_del(stripe);
+               stripe_free(stripe, sc->mem_cache_client);
+       }
+
+       /* Check if stats are still sane (done on failure as well). */
+       if (atomic_read(&sc->active_stripes_max) >
+           atomic_read(&sc->stripes))
+               atomic_set(&sc->active_stripes_max, 0);
+
+       if (r)
+               return r;
+
+       if (!atomic_read(&sc->stripes))
+               return 0;
+
+       return sc_hash_resize(sc);
+}
+
+/*
+ * Create stripe cache and recovery.
+ *
+ * Sets up lists, locks and counters, the kmem cache for the stripe
+ * structures, the memory cache and dm-io clients for io and recovery
+ * stripes, the recovery stripes and finally @stripes io stripes.
+ *
+ * Returns 0 on success or a negative errno.  Nothing gets torn down
+ * in here on failure.
+ * NOTE(review): presumably the caller runs sc_exit() then - confirm.
+ */
+static int sc_init(struct raid_set *rs, unsigned stripes)
+{
+       int r;  /* Signed: carries negative errnos. */
+       unsigned i, rstripes;
+       struct stripe_cache *sc = &rs->sc;
+       struct stripe *stripe;
+       struct recover *rec = &rs->recover;
+       struct mapped_device *md;
+       struct gendisk *disk;
+
+
+       /* Initialize lists and locks. */
+       i = ARRAY_SIZE(sc->lists);
+       while (i--)
+               INIT_LIST_HEAD(sc->lists + i);
+
+       INIT_LIST_HEAD(&rec->stripes);
+
+       /* Initialize endio and LRU list locks. */
+       i = NR_LOCKS;
+       while (i--)
+               spin_lock_init(sc->locks + i);
+
+       /* Initialize atomic variables. */
+       atomic_set(&sc->stripes, 0);
+       atomic_set(&sc->stripes_to_set, 0);
+       atomic_set(&sc->active_stripes, 0);
+       atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
+
+       /*
+        * We need a runtime unique # to suffix the kmem cache name
+        * because we'll have one for each active RAID set.
+        */
+       md = dm_table_get_md(rs->ti->table);
+       disk = dm_disk(md);
+       snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
+                disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
+       sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+                                        0, 0, NULL);
+       if (!sc->kc.cache)
+               return -ENOMEM;
+
+       /* Create memory cache client context for RAID stripe cache. */
+       sc->mem_cache_client =
+               dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+                                          chunk_pages(rs->set.io_size));
+       if (IS_ERR(sc->mem_cache_client))
+               return PTR_ERR(sc->mem_cache_client);
+
+       /* Create memory cache client context for RAID recovery stripe(s). */
+       rstripes = rec->recovery_stripes;
+       rec->mem_cache_client =
+               dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
+                                          chunk_pages(rec->io_size));
+       if (IS_ERR(rec->mem_cache_client))
+               return PTR_ERR(rec->mem_cache_client);
+
+       /* Create dm-io client context for IO stripes. */
+       sc->dm_io_client = dm_io_client_create();
+       if (IS_ERR(sc->dm_io_client))
+               return PTR_ERR(sc->dm_io_client);
+
+       /* FIXME: intermingled with stripe cache initialization. */
+       /* Create dm-io client context for recovery stripes. */
+       rec->dm_io_client = dm_io_client_create();
+       if (IS_ERR(rec->dm_io_client))
+               return PTR_ERR(rec->dm_io_client);
+
+       /* Allocate stripes for set recovery. */
+       while (rstripes--) {
+               stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+               if (!stripe)
+                       return -ENOMEM;
+
+               stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
+               if (!stripe->recover) {
+                       stripe_free(stripe, rec->mem_cache_client);
+                       return -ENOMEM;
+               }
+
+               SetStripeRecover(stripe);
+               stripe->io.size = rec->io_size;
+               list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
+               /* Don't add recovery stripes to LRU list! */
+       }
+
+       /*
+        * Allocate the stripe objects from the
+        * cache and add them to the LRU list.
+        */
+       r = sc_grow(sc, stripes, SC_KEEP);
+       if (!r)
+               atomic_set(&sc->stripes_last, stripes);
+
+       return r;
+}
+
+/*
+ * Destroy the stripe cache.
+ *
+ * A no-op unless the kmem cache got created.  Frees the recovery
+ * stripes, all cached stripes (which have to be on the LRU list, or
+ * we BUG), the kmem cache, the io stripe clients and the hash.
+ */
+static void sc_exit(struct stripe_cache *sc)
+{
+       int r;
+       struct raid_set *rs = RS(sc);
+
+       if (!sc->kc.cache)
+               return;
+
+       stripe_recover_free(rs);
+
+       r = sc_shrink(sc, atomic_read(&sc->stripes));
+       BUG_ON(r);
+
+       kmem_cache_destroy(sc->kc.cache);
+       sc->kc.cache = NULL;
+
+       /* Clients may be unset or hold an error pointer after a failed init. */
+       if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
+               dm_mem_cache_client_destroy(sc->mem_cache_client);
+
+       if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
+               dm_io_client_destroy(sc->dm_io_client);
+
+       hash_exit(&sc->hash);
+}
+
+/*
+ * Calculate RAID address
+ *
+ * Delivers tuple with the index of the data disk holding the chunk
+ * in the set, the parity disks index and the start of the stripe
+ * within the address space of the set (used as the stripe cache hash key).
+ *
+ * Fills in @addr (di, pi and key) for @sector and returns @addr.
+ */
+/* thx MD. */
+static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
+                                        struct raid_address *addr)
+{
+       sector_t stripe, tmp;
+
+       /*
+        * chunk_number = sector / chunk_size
+        * stripe_number = chunk_number / data_devs
+        * di = chunk_number % data_devs
+        *
+        * sector_div() divides its first argument in
+        * place and returns the remainder.
+        */
+       stripe = sector >> rs->set.chunk_shift;
+       addr->di = sector_div(stripe, rs->set.data_devs);
+
+       switch (rs->set.raid_type->level) {
+       case raid4:
+               /* Fixed parity device; shares the index shift with raid5. */
+               addr->pi = rs->set.pi;
+               goto check_shift_di;
+       case raid5:
+               /* Parity rotates with the stripe number (tmp is scratch). */
+               tmp = stripe;
+               addr->pi = sector_div(tmp, rs->set.raid_devs);
+
+               switch (rs->set.raid_type->algorithm) {
+               case left_asym:         /* Left asymmetric. */
+                       addr->pi = rs->set.data_devs - addr->pi;
+                       /* Fall through. */
+               case right_asym:        /* Right asymmetric. */
+check_shift_di:
+                       /* Skip the parity device in the data index. */
+                       if (addr->di >= addr->pi)
+                               addr->di++;
+                       break;
+               case left_sym:          /* Left symmetric. */
+                       addr->pi = rs->set.data_devs - addr->pi;
+                       /* Fall through. */
+               case right_sym:         /* Right symmetric. */
+                       addr->di = (addr->pi + addr->di + 1) %
+                                  rs->set.raid_devs;
+                       break;
+               case none: /* Ain't happen: RAID4 algorithm placeholder. */
+                       BUG();
+               }
+       }
+
+       /*
+        * Start offset of the stripes chunk on any single device of the RAID
+        * set, adjusted in case io size differs from chunk size.
+        */
+       addr->key = (stripe << rs->set.chunk_shift) +
+                   (sector & rs->set.io_inv_mask);
+       return addr;
+}
+
+/*
+ * Copy data across between stripe pages and bio vectors.
+ *
+ * Pay attention to data alignment in stripe and bio pages.
+ *
+ * @rw: READ copies from the stripe pages into the bio,
+ *      anything else from the bio into the stripe pages.
+ * @pl: page list of the chunk; the start page therein
+ *      is derived from the start sector of @bio.
+ */
+static void bio_copy_page_list(int rw, struct stripe *stripe,
+                              struct page_list *pl, struct bio *bio)
+{
+       unsigned i, page_offset;
+       void *page_addr;
+       struct raid_set *rs = RS(stripe->sc);
+       struct bio_vec *bv;
+
+       /* Get start page in page list for this sector. */
+       i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+       pl = pl_elem(pl, i);
+       BUG_ON(!pl);
+       BUG_ON(!pl->page);
+
+       /* Byte offset of the bio's start sector within that page. */
+       page_addr = page_address(pl->page);
+       page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+
+       /* Walk all segments and copy data across between bio_vecs and pages. */
+       bio_for_each_segment(bv, bio, i) {
+               int len = bv->bv_len, size;
+               unsigned bio_offset = 0;
+               void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+redo:
+               /* Copy at most up to the end of the current chunk page. */
+               size = (page_offset + len > PAGE_SIZE) ?
+                      PAGE_SIZE - page_offset : len;
+
+               if (rw == READ)
+                       memcpy(bio_addr + bio_offset,
+                              page_addr + page_offset, size);
+               else
+                       memcpy(page_addr + page_offset,
+                              bio_addr + bio_offset, size);
+
+               page_offset += size;
+               if (page_offset == PAGE_SIZE) {
+                       /*
+                        * We reached the end of the chunk page ->
+                        * need to refer to the next one to copy more data.
+                        */
+                       len -= size;
+                       if (len) {
+                               /* Get next page. */
+                               pl = pl->next;
+                               BUG_ON(!pl);
+                               BUG_ON(!pl->page);
+                               page_addr = page_address(pl->page);
+                               page_offset = 0;
+                               bio_offset += size;
+                               /* REMOVEME: statistics. */
+                               atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+                               goto redo;
+                       }
+                       /*
+                        * Segment ended exactly on the page boundary:
+                        * page_offset stays at PAGE_SIZE, so a following
+                        * segment first copies 0 bytes and then moves
+                        * on to the next page via the branch above.
+                        */
+               }
+
+               __bio_kunmap_atomic(bio_addr, KM_USER0);
+       }
+}
+
+/*
+ * Xor optimization macros.
+ */
+/*
+ * Xor data pointer declaration and initialization macros.
+ *
+ * DECLARE_<n> declares d0..d<n-1> from the first <n> entries of an
+ * 'unsigned long **data' expected in the scope of the user.
+ */
+#define DECLARE_2      unsigned long *d0 = data[0], *d1 = data[1]
+#define DECLARE_3      DECLARE_2, *d2 = data[2]
+#define DECLARE_4      DECLARE_3, *d3 = data[3]
+#define DECLARE_5      DECLARE_4, *d4 = data[4]
+#define DECLARE_6      DECLARE_5, *d5 = data[5]
+#define DECLARE_7      DECLARE_6, *d6 = data[6]
+#define DECLARE_8      DECLARE_7, *d7 = data[7]
+
+/*
+ * Xor unroll macros.
+ *
+ * D<n>(i) xors word i of d1..d<n-1> into word i of d0; each
+ * macro extends the expression of its predecessor by one operand.
+ */
+#define D2(n)  d0[n] = d0[n] ^ d1[n]
+#define D3(n)  D2(n) ^ d2[n]
+#define D4(n)  D3(n) ^ d3[n]
+#define D5(n)  D4(n) ^ d4[n]
+#define D6(n)  D5(n) ^ d5[n]
+#define D7(n)  D6(n) ^ d6[n]
+#define D8(n)  D7(n) ^ d7[n]
+
+/*
+ * X_<k>(macro, offset) expands to <k> statements invoking
+ * 'macro' on offset ... offset + <k> - 1.
+ */
+#define        X_2(macro, offset)      macro(offset); macro(offset + 1);
+#define        X_4(macro, offset)      X_2(macro, offset); X_2(macro, offset + 2);
+#define        X_8(macro, offset)      X_4(macro, offset); X_4(macro, offset + 4);
+#define        X_16(macro, offset)     X_8(macro, offset); X_8(macro, offset + 8);
+#define        X_32(macro, offset)     X_16(macro, offset); X_16(macro, offset + 16);
+#define        X_64(macro, offset)     X_32(macro, offset); X_32(macro, offset + 32);
+
+/*
+ * Define a _xor<chunks>_<xors_per_run>() function.
+ *
+ * The generated function xors data[1] .. data[chunks - 1] into data[0],
+ * word by word over XOR_SIZE bytes, processing xors_per_run words per
+ * loop iteration.
+ */
+#define        _XOR(chunks, xors_per_run) \
+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+{ \
+       /* Count the words being xored (*data[0]), not pointers to them. */ \
+       unsigned end = XOR_SIZE / sizeof(*data[0]), i; \
+       DECLARE_ ## chunks; \
+\
+       for (i = 0; i < end; i += xors_per_run) { \
+               X_ ## xors_per_run(D ## chunks, i); \
+       } \
+}
+
+/*
+ * Define the xor functions for 2 - 8 chunks for a given number of xors
+ * per run, i.e. _xor2_<xors_per_run>() .. _xor8_<xors_per_run>().
+ */
+#define        MAKE_XOR_PER_RUN(xors_per_run) \
+       _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
+       _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
+       _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
+       _XOR(8, xors_per_run);
+
+MAKE_XOR_PER_RUN(8)    /* Define _xor*_8() functions. */
+MAKE_XOR_PER_RUN(16)   /* Define _xor*_16() functions. */
+MAKE_XOR_PER_RUN(32)   /* Define _xor*_32() functions. */
+MAKE_XOR_PER_RUN(64)   /* Define _xor*_64() functions. */
+
+#define MAKE_XOR(xors_per_run) \
+struct { \
+       void (*f)(unsigned long **); \
+} static xor_funcs ## xors_per_run[] = { \
+       { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
+       { NULL }, \
+       { _xor2_ ## xors_per_run }, \
+       { _xor3_ ## xors_per_run }, \
+       { _xor4_ ## xors_per_run }, \
+       { _xor5_ ## xors_per_run }, \
+       { _xor6_ ## xors_per_run }, \
+       { _xor7_ ## xors_per_run }, \
+       { _xor8_ ## xors_per_run }, \
+}; \
+\
+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+{ \
+       /* Call respective function for amount of chunks. */ \
+       xor_funcs ## xors_per_run[n].f(data); \
+}
+
+/* Define xor_8() - xor_64 functions. */
+MAKE_XOR(8)
+MAKE_XOR(16)
+MAKE_XOR(32)
+MAKE_XOR(64)
+/*
+ * END xor optimization macros.
+ */
+
+/*
+ * Maximum number of chunks, which can be xor'ed in one go
+ * (the last index of the xor_funcs8[] table).
+ */
+#define        XOR_CHUNKS_MAX  (ARRAY_SIZE(xor_funcs8) - 1)
+
+/*
+ * Adapter giving the xor_blocks() library function the calling
+ * convention of the xor_<n>() functions: data[0] is handed over as
+ * the destination, data[1] onwards as the n - 1 sources.
+ */
+static void xor_blocks_wrapper(unsigned n, unsigned long **data)
+{
+       void *dest;
+       void **srcs;
+
+       BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
+
+       dest = (void *) data[0];
+       srcs = (void **) data + 1;
+       xor_blocks(n - 1, XOR_SIZE, dest, srcs);
+}
+
+/* Table of the selectable xor functions together with their names. */
+static struct xor_func {
+       xor_function_t f;
+       const char *name;
+} xor_funcs[] = {
+       { xor_64,  "xor_64" },
+       { xor_32,  "xor_32" },
+       { xor_16,  "xor_16" },
+       { xor_8,   "xor_8"  },
+       { xor_blocks_wrapper, "xor_blocks" },
+};
+
+/*
+ * Tell whether a chunk has to be xored in/out.
+ *
+ * Only uptodate chunks qualify, and of those the ones which
+ *
+ * o have writes queued
+ * o have writes merged
+ * o belong to a stripe which is to be reconstructed
+ * o belong to a recovery stripe
+ *
+ * Having writes queued and merged at the same time is a bug.
+ */
+static inline int chunk_must_xor(struct stripe_chunk *chunk)
+{
+       int queued, merged;
+
+       if (!ChunkUptodate(chunk))
+               return 0;
+
+       queued = !bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED));
+       merged = !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED));
+       BUG_ON(queued && merged);
+
+       if (queued || merged)
+               return 1;
+
+       if (StripeReconstruct(chunk->stripe))
+               return 1;
+
+       return StripeRecover(chunk->stripe) ? 1 : 0;
+}
+
+/*
+ * Xor one page worth of a stripe's chunks into chunk @pi.
+ *
+ * This indexes into the chunks of a stripe and their pages:
+ * @sector is the sector offset into the stripe, which selects
+ * the page list element (@sector / SECTORS_PER_PAGE) to work on.
+ *
+ * All chunks but @pi for which chunk_must_xor() holds will be xored
+ * into the indexed (@pi) chunk in maximum groups of xor.chunks.
+ * Every call of the xor function is serialized by rs->io.xor_lock.
+ */
+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned max_chunks = rs->xor.chunks, n = 1,
+                o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
+                p = rs->set.raid_devs;
+       unsigned long **d = rs->data;
+       xor_function_t xor_f = rs->xor.f->f;
+
+       BUG_ON(sector > stripe->io.size);
+
+       /* Address of the destination page (chunk @pi) to xor into. */
+       d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+
+       while (p--) {
+               /* Preset pointers to data pages. */
+               if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
+                       d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+
+               /* If max chunks -> xor and restart collecting after d[0]. */
+               if (n == max_chunks) {
+                       mutex_lock(&rs->io.xor_lock);
+                       xor_f(n, d);
+                       mutex_unlock(&rs->io.xor_lock);
+                       n = 1;
+               }
+       }
+
+       /* If any chunks are left over -> xor. */
+       if (n > 1) {
+               mutex_lock(&rs->io.xor_lock);
+               xor_f(n, d);
+               mutex_unlock(&rs->io.xor_lock);
+       }
+}
+
+/*
+ * Common xor loop through all stripe page lists.
+ *
+ * Xors @count sectors, starting at sector offset @off into the stripe,
+ * page by page into chunk @pi and sets that chunk CLEAN afterwards.
+ */
+static void common_xor(struct stripe *stripe, sector_t count,
+                      unsigned off, unsigned pi)
+{
+       unsigned sector;
+       sector_t end = off + count;
+
+       BUG_ON(!count);
+
+       /*
+        * @count is relative to @off: parity_xor() hands in the size of
+        * one part together with an increasing offset, hence bounding
+        * the loop by @count alone skips any part with @off >= @count.
+        */
+       for (sector = off; sector < end; sector += SECTORS_PER_PAGE)
+               xor(stripe, pi, sector);
+
+       /* Set parity page uptodate and clean. */
+       chunk_set(CHUNK(stripe, pi), CLEAN);
+       atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+}
+
+/*
+ * Calculate parity sectors on intact stripes.
+ *
+ * Need to calculate raid address for recover stripe, because its
+ * chunk sizes differs and is typically larger than io chunk size.
+ *
+ * The stripe is processed in parts of min(chunk size, stripe io size)
+ * sectors; after each part the parity chunk is set DIRTY.
+ */
+static void parity_xor(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       int size_differs = stripe->io.size != rs->set.io_size;
+       unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
+                xor_size = chunk_size > io_size ? io_size : chunk_size;
+       sector_t off;
+
+       /* This can be the recover stripe with a larger io size. */
+       for (off = 0; off < io_size; off += xor_size) {
+               /*
+                * Recover stripe is likely bigger than regular io
+                * ones and has no precalculated parity disk index ->
+                * need to calculate RAID address and to zero the part
+                * of the parity chunk which is about to be xored into.
+                */
+               if (unlikely(size_differs)) {
+                       struct raid_address addr;
+
+                       raid_address(rs, (stripe->key + off) *
+                                        rs->set.data_devs, &addr);
+                       stripe->idx.parity = addr.pi;
+                       stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
+               }
+
+               /* common_xor() leaves the parity chunk CLEAN -> flag DIRTY. */
+               common_xor(stripe, xor_size, off, stripe->idx.parity);
+               chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
+       }
+}
+
+/*
+ * Reconstruct the missing chunk (stripe->idx.recover) by zeroing
+ * it and running common_xor() on it over the whole stripe io size.
+ */
+static void stripe_reconstruct(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       int dev, recover = stripe->idx.recover;
+
+       BUG_ON(recover < 0);
+
+       /* All chunks but the one to be reconstructed have to be uptodate. */
+       for (dev = rs->set.raid_devs - 1; dev >= 0; dev--)
+               BUG_ON(dev != recover && !ChunkUptodate(CHUNK(stripe, dev)));
+
+       /* REMOVEME: statistics. */
+       if (RSDegraded(rs))
+               atomic_inc(rs->stats + S_RECONSTRUCT_EI);
+       else
+               atomic_inc(rs->stats + S_RECONSTRUCT_DEV);
+
+       /* Start from an all-zero chunk, then xor into it. */
+       stripe_zero_chunk(stripe, recover);
+       common_xor(stripe, stripe->io.size, 0, recover);
+}
+
+/*
+ * Recovery io throttling
+ */
+/*
+ * Conditionally reset io counters.
+ *
+ * Zeroes the work and recovery io counters and restarts the interval,
+ * once more than HZ jiffies have passed since the last reset.
+ *
+ * Returns 1 if the counters got reset, 0 otherwise.
+ */
+static int recover_io_reset(struct raid_set *rs)
+{
+       unsigned long j = jiffies;
+
+       /*
+        * time_after() copes with jiffies overflows, whereas a plain '>'
+        * misfires while last_jiffies + HZ has wrapped around.
+        */
+       if (time_after(j, rs->recover.last_jiffies + HZ)) {
+               atomic_set(rs->recover.io_count + IO_WORK, 0);
+               atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+               rs->recover.last_jiffies = j;
+               return 1;
+       }
+
+       return 0;
+}
+
+/* Count an io as recovery io for recovery stripes, as work io otherwise. */
+static void recover_io_count(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+
+       if (StripeRecover(stripe))
+               atomic_inc(rs->recover.io_count + IO_RECOVER);
+       else
+               atomic_inc(rs->recover.io_count + IO_WORK);
+}
+
+/*
+ * Try getting a stripe either from the hash or from the LRU list.
+ *
+ * A stripe found in the hash by addr->key gets referenced via stripe_get().
+ * Otherwise an LRU stripe is popped, removed from the hash, invalidated,
+ * set up for @addr (key, region, parity index), referenced and inserted
+ * into the hash.
+ *
+ * Returns the stripe, or NULL if neither the hash nor the LRU list
+ * provided one or if stripe_get() failed.
+ */
+static struct stripe *stripe_find(struct raid_set *rs,
+                                 struct raid_address *addr)
+{
+       int r;
+       struct stripe_cache *sc = &rs->sc;
+       struct stripe *stripe;
+
+       /* Try stripe from hash. */
+       stripe = stripe_lookup(sc, addr->key);
+       if (stripe) {
+               r = stripe_get(stripe);
+               if (r)
+                       goto get_lock_failed;
+
+               atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+       } else {
+               /* Not in hash -> try to get an LRU stripe. */
+               stripe = stripe_lru_pop(sc);
+               if (stripe) {
+                       /*
+                        * An LRU stripe may not be referenced
+                        * and may never have ios pending!
+                        */
+                       BUG_ON(stripe_ref(stripe));
+                       BUG_ON(stripe_io_ref(stripe));
+
+                       /* Remove from hash if on before reuse. */
+                       stripe_hash_del(stripe);
+
+                       /* Invalidate before reinserting with changed key. */
+                       stripe_invalidate(stripe);
+
+                       stripe->key = addr->key;
+                       stripe->region = dm_rh_sector_to_region(rs->recover.rh,
+                                                               addr->key);
+                       stripe->idx.parity = addr->pi;
+                       r = stripe_get(stripe);
+                       if (r)
+                               goto get_lock_failed;
+
+                       /* Insert stripe into the stripe hash. */
+                       stripe_insert(&sc->hash, stripe);
+                       /* REMOVEME: statistics. */
+                       atomic_inc(rs->stats + S_INSCACHE);
+               }
+       }
+
+       /* NULL in case the LRU list didn't provide a stripe either. */
+       return stripe;
+
+get_lock_failed:
+       /* stripe_get() failed -> put the stripe and report no stripe. */
+       stripe_put(stripe);
+       return NULL;
+}
+
+/*
+ * Process end io
+ *
+ * I need to do it here because I can't in interrupt
+ */
+/*
+ * End io all bios on a bio list.
+ *
+ * @stripe: stripe the bios hang off
+ * @bl: bio list to empty
+ * @p: index of the chunk the bio list belongs to
+ * @error: completion status handed to bio_endio()
+ *
+ * For each bio popped off @bl: writes drop the pending count on the
+ * stripe's region, reads ending without error get the chunk's data
+ * copied across.  The bio is then ended and chunk_put(), stripe_put()
+ * and io_put() are called for it.
+ */
+static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
+                          int p, int error)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct bio *bio;
+       struct page_list *pl = PL(stripe, p);
+       struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+       /* Update region counters. */
+       while ((bio = bio_list_pop(bl))) {
+               /*
+                * Sample the direction up front, because the bio
+                * must not be touched any more after bio_endio().
+                */
+               int write = bio_data_dir(bio) == WRITE;
+
+               if (write)
+                       /* Drop io pending count for any writes. */
+                       dm_rh_dec(rs->recover.rh, stripe->region);
+               else if (!error)
+                       /* Copy data across. */
+                       bio_copy_page_list(READ, stripe, pl, bio);
+
+               bio_endio(bio, error);
+
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + (write ? S_BIOS_ENDIO_WRITE :
+                                               S_BIOS_ENDIO_READ));
+
+               chunk_put(chunk);
+               stripe_put(stripe);
+               io_put(rs);     /* Wake any suspend waiters on last bio. */
+       }
+}
+
+/*
+ * End io all reads/writes on a stripe copying
+ * read data across from stripe to bios and
+ * decrementing region counters for writes.
+ *
+ * Processing of ios depending on state:
+ * o no chunk error -> endio ok
+ * o degraded:
+ *   - chunk error and read -> ignore to be requeued
+ *   - chunk error and write -> endio ok
+ * o dead (more than parity_devs failed) and chunk_error-> endio failed
+ *
+ * A chunk which is not uptodate is treated like one with an error.
+ *
+ * @rw selects the bio list of each chunk (BL_CHUNK(chunk, rw));
+ * any value other than READ is handled as a write.
+ */
+static void stripe_endio(int rw, struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned p = rs->set.raid_devs;
+       int write = (rw != READ);
+
+       while (p--) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+               struct bio_list *bl;
+
+               /* Chunks with io still in flight may not be ended. */
+               BUG_ON(ChunkLocked(chunk));
+
+               bl = BL_CHUNK(chunk, rw);
+               if (bio_list_empty(bl))
+                       continue;
+
+               if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
+                       /* RAID set dead. */
+                       if (unlikely(RSDead(rs)))
+                               bio_list_endio(stripe, bl, p, -EIO);
+                       /* RAID set degraded. */
+                       else if (write)
+                               bio_list_endio(stripe, bl, p, 0);
+                       /* Reads stay on their list in this case. */
+               } else {
+                       BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
+                       bio_list_endio(stripe, bl, p, 0);
+               }
+       }
+}
+
+/* Fail all ios hanging off all bio lists of a stripe with -EIO. */
+static void stripe_fail_io(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned p;
+
+       for (p = rs->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+               int i;
+
+               /* Walk the bio lists of the chunk from last to first. */
+               for (i = ARRAY_SIZE(chunk->bl) - 1; i >= 0; i--) {
+                       struct bio_list *bl = chunk->bl + i;
+
+                       if (bio_list_empty(bl))
+                               continue;
+
+                       bio_list_endio(stripe, bl, p, -EIO);
+               }
+       }
+
+       /* With all bios ended, no reference may be left on the stripe. */
+       BUG_ON(stripe_io_ref(stripe));
+       BUG_ON(stripe_ref(stripe));
+}
+
+/* Clear the locked flag of every chunk which is flagged for unlock. */
+static void stripe_chunks_unlock(struct stripe *stripe)
+{
+       unsigned p;
+
+       for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+               /* The unlock request is consumed by the test itself. */
+               if (!TestClearChunkUnlock(chunk))
+                       continue;
+
+               ClearChunkLocked(chunk);
+       }
+}
+
+/*
+ * Queue a read or write to a stripe by hanging its bio
+ * off the read/write list of the chunk it addresses.
+ *
+ * If no stripe can be had for the bio, it is added to @reject instead.
+ *
+ * Returns 1 if a write got queued, 0 otherwise.
+ */
+static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+                           struct bio_list *reject)
+{
+       struct raid_address addr;
+       struct stripe *stripe;
+       int rw, write_queued = 0;
+
+       stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
+       if (!stripe) {
+               /* Got no stripe from cache or failed to lock it -> reject. */
+               bio_list_add(reject, bio);
+               atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
+               return 0;
+       }
+
+       /* Distinguish reads and writes. */
+       rw = bio_data_dir(bio);
+       bio_list_add(BL(stripe, addr.di, rw), bio);
+
+       if (rw == READ) {
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + S_BIOS_ADDED_READ);
+       } else {
+               /* Increment pending write count on region. */
+               dm_rh_inc(rs->recover.rh, stripe->region);
+               write_queued = 1;
+
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
+       }
+
+       /* Put on io (flush) list in case of initial bio queued to chunk. */
+       if (chunk_get(CHUNK(stripe, addr.di)) == 1)
+               stripe_flush_add(stripe);
+
+       return write_queued;
+}
+
+/*
+ * Handle all stripes by handing them to the daemon, because we can't
+ * map their chunk pages to copy the data in interrupt context.
+ *
+ * We don't want to handle them here either, while interrupts are disabled.
+ */
+
+/*
+ * Read/write endio function for dm-io (interrupt context).
+ *
+ * @error: non-zero sets the chunk to ERROR, zero sets it to CLEAN
+ * @context: the stripe chunk the io got submitted for
+ */
+static void endio(unsigned long error, void *context)
+{
+       struct stripe_chunk *chunk = context;
+
+       if (unlikely(error)) {
+               chunk_set(chunk, ERROR);
+               /* REMOVEME: statistics. */
+               atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
+       } else
+               chunk_set(chunk, CLEAN);
+
+       /*
+        * For recovery stripes, I need to reset the locked flag
+        * here, because those aren't processed in do_endios().
+        * Any other stripe just gets its chunk flagged for unlock.
+        */
+       if (unlikely(StripeRecover(chunk->stripe)))
+               ClearChunkLocked(chunk);
+       else
+               SetChunkUnlock(chunk);
+
+       /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
+       stripe_put_references(chunk->stripe);
+}
+
+/*
+ * Read/Write a chunk asynchronously.
+ *
+ * Submits one dm-io request for chunk @p of @stripe to device @p:
+ * a WRITE if the chunk is dirty, a READ otherwise.  The request covers
+ * stripe->io.size sectors from stripe->key plus the device's start
+ * offset, clipped at the end of the device; endio() is called with
+ * the chunk as context on completion.
+ */
+static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
+{
+       struct stripe_cache *sc = stripe->sc;
+       struct raid_set *rs = RS(sc);
+       struct dm_mem_cache_object *obj = stripe->obj + p;
+       struct page_list *pl = obj->pl;
+       struct stripe_chunk *chunk = CHUNK(stripe, p);
+       struct raid_dev *dev = rs->dev + p;
+       struct dm_io_region io = {
+               .bdev = dev->dev->bdev,
+               .sector = stripe->key,
+               .count = stripe->io.size,
+       };
+       struct dm_io_request control = {
+               .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
+               .mem = {
+                       .type = DM_IO_PAGE_LIST,
+                       .ptr.pl = pl,
+                       .offset = 0,
+               },
+               .notify = {
+                       .fn = endio,
+                       .context = chunk,
+               },
+               /* Recovery stripes use the dm-io client of their own. */
+               .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
+                                                 sc->dm_io_client,
+       };
+
+       /* Io only on unlocked chunks being uptodate and dirty or neither. */
+       BUG_ON(ChunkLocked(chunk));
+       BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
+       BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
+
+       /*
+        * Don't rw past end of device, which can happen, because
+        * typically sectors_per_dev isn't divisible by io_size.
+        */
+       if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
+               io.count = rs->set.sectors_per_dev - io.sector;
+
+       BUG_ON(!io.count);
+       io.sector += dev->start;        /* Add <offset>. */
+       if (RSRecover(rs))
+               recover_io_count(stripe);       /* Recovery io accounting. */
+
+       /* REMOVEME: statistics. */
+       atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
+                                                   S_DM_IO_READ));
+       /* Lock the chunk before the io is submitted. */
+       SetChunkLocked(chunk);
+       SetDevIoQueued(dev);
+       BUG_ON(dm_io(&control, 1, &io, NULL));
+}
+
+/*
+ * Write dirty or read not uptodate page lists of a stripe.
+ *
+ * Returns what taking the io references returned,
+ * i.e. non-zero if io got submitted.
+ */
+static int stripe_chunks_rw(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct stripe_cache *sc = &rs->sc;
+       int active, r;
+
+       /*
+        * Increment the pending count on the stripe
+        * first, so that we don't race in endio().
+        *
+        * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
+        *
+        * o not uptodate
+        * o dirtied by writes merged
+        * o dirtied by parity calculations
+        */
+       r = for_each_io_dev(stripe, stripe_get_references);
+       if (!r)
+               return r;       /* No io needed. */
+
+       /* Io needed: chunks are either not uptodate or dirty -> submit. */
+       for_each_io_dev(stripe, stripe_chunk_rw);
+
+       /* REMOVEME: statistics */
+       active = sc_active(sc);
+       if (atomic_read(&sc->active_stripes_max) < active)
+               atomic_set(&sc->active_stripes_max, active);
+
+       atomic_inc(rs->stats + S_FLUSHS);
+       /* END REMOVEME: statistics */
+
+       return r;
+}
+
+/*
+ * Merge in all queued writes: copy their data into the chunk's pages,
+ * move them to the merged writes list and set the chunk DIRTY.
+ */
+static void stripe_merge_writes(struct stripe *stripe)
+{
+       unsigned p;
+
+       for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+               struct bio_list *queued = BL_CHUNK(chunk, WRITE_QUEUED);
+               struct page_list *pl;
+               struct bio *bio;
+
+               if (bio_list_empty(queued))
+                       continue;
+
+               /*
+                * We can play with the lists without holding a lock,
+                * because it is just us accessing them anyway.
+                */
+               pl = stripe->obj[p].pl;
+               bio_list_for_each(bio, queued)
+                       bio_copy_page_list(WRITE, stripe, pl, bio);
+
+               bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), queued);
+               bio_list_init(queued);
+               chunk_set(chunk, DIRTY);
+       }
+}
+
+/*
+ * Queue all writes to get merged by moving them from the write list
+ * of each chunk to its queued writes list and allowing io on the chunk.
+ *
+ * Returns 1 if any write got queued, 0 otherwise.
+ */
+static int stripe_queue_writes(struct stripe *stripe)
+{
+       int queued = 0;
+       unsigned p;
+
+       for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk = CHUNK(stripe, p);
+               struct bio_list *writes = BL_CHUNK(chunk, WRITE);
+
+               if (bio_list_empty(writes))
+                       continue;
+
+               bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), writes);
+               bio_list_init(writes);
+               SetChunkIo(chunk);
+               queued = 1;
+       }
+
+       return queued;
+}
+
+
+/*
+ * Check, if a chunk gets completely overwritten, i.e. if the
+ * sectors of its queued writes add up to the io size of the set.
+ */
+static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct bio_list *queued = BL(stripe, p, WRITE_QUEUED);
+       struct bio *bio;
+       unsigned total = 0;
+
+       bio_list_for_each(bio, queued)
+               total += bio_sectors(bio);
+
+       /* Queued writes may never exceed the chunk. */
+       BUG_ON(total > rs->set.io_size);
+       return total == rs->set.io_size;
+}
+
+/*
+ * Avoid io on broken/reconstructed drive in order to
+ * reconstruct data on endio.
+ *
+ * (*1*) We set StripeReconstruct() in here, so that _do_endios()
+ *      will trigger a reconstruct call before resetting it.
+ *
+ * @pr is the index of the chunk to keep from io; it is remembered
+ * in stripe->idx.recover.
+ *
+ * Always returns -EPERM.
+ */
+static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
+{
+       struct stripe_chunk *chunk = CHUNK(stripe, pr);
+
+       /*
+        * Allow io on all chunks but the indexed one,
+        * because we're either degraded or prohibit it
+        * on the one for later reconstruction.
+        */
+       /* Includes ClearChunkIo(), ClearChunkUptodate(). */
+       stripe_chunk_invalidate(chunk);
+       stripe->idx.recover = pr;
+       SetStripeReconstruct(stripe);
+
+       /* REMOVEME: statistics. */
+       atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
+       return -EPERM;
+}
+
+/*
+ * Chunk locked/uptodate and device failed tests.
+ *
+ * Returns chunk @p of @stripe if it is neither locked, in error, on a
+ * failed device nor uptodate; NULL otherwise.  @chunks_uptodate gets
+ * incremented for an uptodate chunk passing the tests before.
+ */
+static struct stripe_chunk *
+stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+       /* Can't access active chunks. */
+       if (ChunkLocked(chunk)) {
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + S_CHUNK_LOCKED);
+               return NULL;
+       }
+
+       /* Can't access broken device. */
+       if (ChunkError(chunk) || DevFailed(rs->dev + p))
+               return NULL;
+
+       /* Not uptodate -> this is a chunk for the caller to deal with. */
+       if (!ChunkUptodate(chunk))
+               return chunk;
+
+       /* Can access uptodate chunks. */
+       ++*chunks_uptodate;
+       return NULL;
+}
+
+/*
+ * Degraded/reconstruction mode.
+ *
+ * Check stripe state to figure which chunks don't need IO.
+ *
+ * Returns 0 if no chunk has to be kept from io (set dead, stripe
+ * already reconstructed or fully operational), -EBUSY if reconstruction
+ * is already flagged on the stripe and -EPERM (handed through from
+ * stripe_chunk_set_io_flags()) for degraded/resynchronizing.
+ */
+static int stripe_check_reconstruct(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+
+       /* Dead set: drop any reconstruction state and allow io on all. */
+       if (RSDead(rs)) {
+               ClearStripeReconstruct(stripe);
+               ClearStripeReconstructed(stripe);
+               stripe_allow_io(stripe);
+               return 0;
+       }
+
+       /* Avoid further reconstruction setting, when already set. */
+       if (StripeReconstruct(stripe)) {
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + S_RECONSTRUCT_SET);
+               return -EBUSY;
+       }
+
+       /* Initially allow io on all chunks. */
+       stripe_allow_io(stripe);
+
+       /* Return if stripe is already reconstructed. */
+       if (StripeReconstructed(stripe)) {
+               atomic_inc(rs->stats + S_RECONSTRUCTED);
+               return 0;
+       }
+
+       /*
+        * Degraded/reconstruction mode (device failed) ->
+        * avoid io on the failed device.
+        */
+       if (unlikely(RSDegraded(rs))) {
+               /* REMOVEME: statistics. */
+               atomic_inc(rs->stats + S_DEGRADED);
+               /* Allow IO on all devices but the dead one. */
+               BUG_ON(rs->set.ei < 0);
+               return stripe_chunk_set_io_flags(stripe, rs->set.ei);
+       } else {
+               int sync, pi = dev_for_parity(stripe, &sync);
+
+               /*
+                * Reconstruction mode (ie. a particular (replaced) device or
+                * some (rotating) parity chunk is being resynchronized) ->
+                *   o make sure all needed chunks are read in
+                *   o cope with 3/4 disk array special case where it
+                *     doesn't make a difference to read in parity
+                *     to xor data in/out
+                */
+               if (RSEnforceParityCreation(rs) || !sync) {
+                       /* REMOVEME: statistics. */
+                       atomic_inc(rs->stats + S_NOSYNC);
+                       /* Allow IO on all devs but the one to reconstruct. */
+                       return stripe_chunk_set_io_flags(stripe, pi);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Check, if stripe is ready to merge writes.
+ * I.e. if all chunks present to allow to merge bios.
+ *
+ * We prohibit io on:
+ *
+ * o chunks on failed devices
+ * o chunks without bios
+ *
+ * Chunks which get completely written over are counted, but
+ * keep io allowed.
+ *
+ * @nosync: if non-zero, chunks which are not uptodate are left as they are
+ *
+ * Returns 0 if the writes can be merged, -EPERM if chunks are missing.
+ */
+static int stripe_merge_possible(struct stripe *stripe, int nosync)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned chunks_overwrite = 0, chunks_prohibited = 0,
+                chunks_uptodate = 0, p = rs->set.raid_devs;
+
+       /* Walk all chunks. */
+       while (p--) {
+               struct stripe_chunk *chunk;
+
+               /* Prohibit io on broken devices. */
+               if (DevFailed(rs->dev + p)) {
+                       chunk = CHUNK(stripe, p);
+                       goto prohibit_io;
+               }
+
+               /* We can't optimize any further if no chunk. */
+               chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
+               if (!chunk || nosync)
+                       continue;
+
+               /*
+                * We have a chunk, which is not uptodate.
+                *
+                * If this is not parity and we don't have
+                * reads queued, we can optimize further.
+                */
+               if (p != stripe->idx.parity &&
+                   bio_list_empty(BL_CHUNK(chunk, READ)) &&
+                   bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
+                       if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
+                               goto prohibit_io;
+                       else if (RSCheckOverwrite(rs) &&
+                                stripe_check_chunk_overwrite(stripe, p))
+                               /* Completely overwritten chunk. */
+                               chunks_overwrite++;
+               }
+
+               /* Allow io for chunks with bios and overwritten ones. */
+               SetChunkIo(chunk);
+               continue;
+
+prohibit_io:
+               /* No io for broken devices or for chunks w/o bios. */
+               ClearChunkIo(chunk);
+               chunks_prohibited++;
+               /* REMOVEME: statistics. */
+               atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
+       }
+
+       /* All data chunks will get written over. */
+       if (chunks_overwrite == rs->set.data_devs)
+               atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
+       else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
+               /* We don't have enough chunks to merge. */
+               atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
+               return -EPERM;
+       }
+
+       /*
+        * If we have all chunks up to date or overwrite them, we
+        * just zero the parity chunk and let stripe_rw() recreate it.
+        */
+       if (chunks_uptodate == rs->set.raid_devs ||
+           chunks_overwrite == rs->set.data_devs) {
+               stripe_zero_chunk(stripe, stripe->idx.parity);
+               BUG_ON(StripeReconstruct(stripe));
+               SetStripeReconstruct(stripe);   /* Enforce xor in caller. */
+       } else {
+               /*
+                * With less chunks, we xor parity out.
+                *
+                * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
+                *       so that only chunks with queued or merged writes
+                *       are being xored.
+                */
+               parity_xor(stripe);
+       }
+
+       /*
+        * We do have enough chunks to merge.
+        * All chunks are uptodate or get written over.
+        */
+       atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
+       return 0;
+}
+
+/*
+ * Avoid reading chunks in case we're fully operational.
+ *
+ * Of the chunks returned by stripe_chunk_check(), io is allowed on
+ * the parity chunk and on those with bios, and prohibited on the rest.
+ */
+static void stripe_avoid_reads(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       unsigned p, unused = 0;
+
+       /* Walk all chunks. */
+       for (p = rs->set.raid_devs; p--; ) {
+               struct stripe_chunk *chunk =
+                       stripe_chunk_check(stripe, p, &unused);
+
+               if (!chunk)
+                       continue;
+
+               /* If parity or any bios pending -> allow io. */
+               if (chunk_ref(chunk) || p == stripe->idx.parity) {
+                       SetChunkIo(chunk);
+                       continue;
+               }
+
+               ClearChunkIo(chunk);
+               /* REMOVEME: statistics. */
+               atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
+       }
+}
+
+/*
+ * Read/write a stripe.
+ *
+ * All stripe read/write activity goes through this function, except
+ * for recovery, which calls stripe_chunks_rw() directly.
+ *
+ * Make sure we don't try already merged stripes in order
+ * to avoid data corruption.
+ *
+ * Check the state of the RAID set and if degraded (or resynchronizing
+ * for reads), read in all other chunks but the one on the
+ * dead/resynchronizing device in order to be able to reconstruct
+ * the missing one in _do_endios().
+ *
+ * Can be called on active stripes to dispatch new io on inactive chunks.
+ *
+ * States to cover: stripe to read and/or write; error to reconstruct.
+ *
+ * Returns the result of stripe_chunks_rw(), or 0 while the stripe
+ * still waits for its reconstruction to finish.
+ */
+static int stripe_rw(struct stripe *stripe)
+{
+       int nosync, r;
+       struct raid_set *rs = RS(stripe->sc);
+
+       /*
+        * Check, if a chunk needs to be reconstructed
+        * because of a degraded set or a region out of sync.
+        */
+       nosync = stripe_check_reconstruct(stripe);
+       switch (nosync) {
+       case -EBUSY:
+               return 0; /* Wait for stripe reconstruction to finish. */
+       case -EPERM:
+               goto io; /* Skip write merging and read avoidance. */
+       }
+
+       /*
+        * If we don't have merged writes pending, we can schedule
+        * queued writes to be merged next without corrupting data.
+        */
+       if (!StripeMerged(stripe)) {
+               r = stripe_queue_writes(stripe);
+               if (r)
+                       /* Writes got queued -> flag RBW. */
+                       SetStripeRBW(stripe);
+       }
+
+       /*
+        * Merge all writes hanging off uptodate/overwritten
+        * chunks of the stripe.
+        */
+       if (StripeRBW(stripe)) {
+               r = stripe_merge_possible(stripe, nosync);
+               if (!r) { /* Merge possible. */
+                       struct stripe_chunk *chunk;
+
+                       /*
+                        * I rely on valid parity in order
+                        * to xor a fraction of chunks out
+                        * of parity and back in.
+                        */
+                       stripe_merge_writes(stripe);    /* Merge writes in. */
+                       parity_xor(stripe);             /* Update parity. */
+                       ClearStripeReconstruct(stripe); /* Reset xor enforce. */
+                       SetStripeMerged(stripe);        /* Writes merged. */
+                       ClearStripeRBW(stripe);         /* Disable RBW. */
+
+                       /*
+                        * REMOVEME: sanity check on parity chunk
+                        *           states after writes got merged.
+                        */
+                       chunk = CHUNK(stripe, stripe->idx.parity);
+                       BUG_ON(ChunkLocked(chunk));
+                       BUG_ON(!ChunkUptodate(chunk));
+                       BUG_ON(!ChunkDirty(chunk));
+                       BUG_ON(!ChunkIo(chunk));
+               }
+       } else if (!nosync && !StripeMerged(stripe))
+               /* Read avoidance if not degraded/resynchronizing/merged. */
+               stripe_avoid_reads(stripe);
+
+io:
+       /* Now submit any reads/writes for non-uptodate or dirty chunks. */
+       r = stripe_chunks_rw(stripe);
+       if (!r) {
+               /*
+                * No io submitted because of chunk io
+                * prohibited or locked chunks/failed devices
+                * -> push to end io list for processing.
+                */
+               stripe_endio_push(stripe);
+               atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+       }
+
+       return r;
+}
+
+/*
+ * Recovery functions
+ */
+/* Start the read phase of a recovery stripe (negative pi: read all chunks). */
+static int stripe_recover_read(struct stripe *stripe, int pi)
+{
+       BUG_ON(stripe_io_ref(stripe));
+
+       /* Mark every chunk invalid and open for io to force reading it. */
+       stripe_chunks_invalidate(stripe);
+       stripe_allow_io(stripe);
+
+       /*
+        * When a particular device gets reconstructed, its chunk
+        * need not be read, because it is going to be rebuilt
+        * from the other chunks anyway.
+        *
+        * That shortcut does not apply to resynchronization of
+        * rotating parity, because the recovery stripe chunk size
+        * is typically larger than the chunk size of the set.
+        */
+       if (pi >= 0)
+               ClearChunkIo(CHUNK(stripe, pi));
+
+       return stripe_chunks_rw(stripe);
+}
+
+/* Start the write phase of a recovery stripe. */
+static int stripe_recover_write(struct stripe *stripe, int pi)
+{
+       BUG_ON(stripe_io_ref(stripe));
+
+       /* A non-negative pi selects the device to reconstruct. */
+       if (pi < 0) {
+               /* No particular device: create the parity chunk. */
+               parity_xor(stripe);
+       } else {
+               /* Rebuild the chunk of that device and flag it dirty. */
+               stripe_zero_chunk(stripe, pi);
+               common_xor(stripe, stripe->io.size, 0, pi);
+               chunk_set(CHUNK(stripe, pi), DIRTY);
+       }
+
+       return stripe_chunks_rw(stripe);
+}
+
+/* Run the pending recovery phase; returns 0 if neither one was flagged. */
+static int stripe_recover_rw(struct stripe *stripe)
+{
+       int r = 0, sync = 0;
+
+       /* Flip-flop: RBW flags the read phase, Merged the write phase. */
+       if (TestClearStripeRBW(stripe)) {
+               SetStripeMerged(stripe);
+               stripe->key = stripe->recover->pos;
+               r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
+               BUG_ON(!r);
+       } else if (TestClearStripeMerged(stripe)) {
+               r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
+               BUG_ON(!r);
+       }
+
+       BUG_ON(sync);
+       return r;
+}
+
+/* Tell if recovery io is allowed bandwidth-wise: 1 = yes, 0 = no. */
+static int recover_bandwidth(struct raid_set *rs)
+{
+       int reset, work_ios, recover_ios;
+
+       /* A reset or delayed bios waiting -> let recovery proceed. */
+       reset = recover_io_reset(rs);
+       if (reset || RSBandwidth(rs))
+               goto allow;
+
+       work_ios = atomic_read(rs->recover.io_count + IO_WORK);
+       if (!work_ios)
+               goto allow;
+
+       /* Pay attention to the larger recover stripe size. */
+       recover_ios = atomic_read(rs->recover.io_count + IO_RECOVER) *
+                     rs->recover.io_size / rs->set.io_size;
+
+       /*
+        * Recovery must not take more than its
+        * given share of the work io.
+        */
+       if (recover_ios > work_ios / rs->recover.bandwidth_work) {
+               atomic_inc(rs->stats + S_NO_BANDWIDTH); /* REMOVEME */
+               return 0;
+       }
+
+allow:
+       atomic_inc(rs->stats + S_BANDWIDTH);    /* REMOVEME: statistics. */
+       return 1;
+}
+
+/* Try to get a region to recover; returns 0 (new), 1 (held) or -errno. */
+static int stripe_recover_get_region(struct stripe *stripe)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct recover *rec = &rs->recover;
+       struct recover_addr *addr = stripe->recover;
+       struct dm_dirty_log *dl = rec->dl;
+       struct dm_rh_client *rh = rec->rh;
+
+       BUG_ON(!dl);
+       BUG_ON(!rh);
+
+       /* Report a region already held first to finish it during suspension. */
+       if (addr->reg)
+               return 1;
+
+       if (RSSuspend(rs))
+               return -EPERM; /* Don't start a new region when suspending. */
+
+       if (dl->type->get_sync_count(dl) >= rec->nr_regions)
+               return -ENOENT; /* Sync count covers all regions. */
+
+       /* If we don't have enough bandwidth, we don't proceed recovering. */
+       if (!recover_bandwidth(rs))
+               return -EAGAIN;
+
+       /* Start quiescing a region. */
+       dm_rh_recovery_prepare(rh);
+       addr->reg = dm_rh_recovery_start(rh);
+       if (!addr->reg)
+               return -EAGAIN; /* No region handed out (yet). */
+
+       addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
+       addr->end = addr->pos + dm_rh_get_region_size(rh);
+
+       /*
+        * Take one global io reference out for the
+        * whole region, which is going to be released
+        * when the region is completely done with.
+        */
+       io_get(rs);
+       return 0;
+}
+
+/* Report the end of a region's recovery to the region hash. */
+enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
+static void recover_rh_update(struct stripe *stripe, enum recover_type success)
+{
+       struct raid_set *rs = RS(stripe->sc);
+       struct recover_addr *addr = stripe->recover;
+
+       if (!addr->reg) {
+               DMERR("%s- Called w/o region", __func__);
+               return;
+       }
+
+       /* Hand the region back and count it in case it got recovered. */
+       dm_rh_recovery_end(addr->reg, success);
+       if (success)
+               rs->recover.nr_regions_recovered++;
+
+       addr->reg = NULL;
+
+       /*
+        * The region is done with: drop the global io
+        * reference taken out when it got started.
+        */
+       io_put(rs);
+}
+
+/* Stamp the begin of a recovery run. */
+static void set_start_recovery(struct raid_set *rs)
+{
+       /* No end stamp yet; remember when recovery got started. */
+       rs->recover.end_jiffies = 0;
+       rs->recover.start_jiffies = jiffies;
+}
+
+/* Set end of recovery state: clear flag and dev_to_init, stamp end time. */
+static void set_end_recovery(struct raid_set *rs)
+{
+       ClearRSRecover(rs);
+       /* Caution: unless reset, 'i' stays in status; userspace may expect it gone. */
+       rs->set.dev_to_init = -1;
+
+       /* Stamp the end; store all ones in case end < start (jiffies overrun). */
+       rs->recover.end_jiffies = jiffies;
+       if (rs->recover.end_jiffies < rs->recover.start_jiffies)
+               rs->recover.end_jiffies = ~0;
+}
+
+/* Handle recovery on one recovery stripe; returns 0 once ended or failed. */
+static int _do_recovery(struct stripe *stripe)
+{
+       int r;
+       struct raid_set *rs = RS(stripe->sc);
+       struct recover_addr *addr = stripe->recover;
+
+       /* If recovery is active -> return. */
+       if (stripe_io_ref(stripe))
+               return 1;
+
+       /* IO error is fatal for recovery -> stop it. */
+       if (unlikely(StripeError(stripe)))
+               goto err;
+
+       /* Recovery end required. */
+       if (unlikely(RSDegraded(rs)))
+               goto err;
+
+       /* Get a region to recover. */
+       r = stripe_recover_get_region(stripe);
+       switch (r) {
+       case 0: /* Got a new region: flag initial read before write. */
+               SetStripeRBW(stripe);
+       case 1: /* Have a region in the works (case 0 falls through). */
+               break;
+       case -EAGAIN:
+               /* No bandwidth/quiesced region yet, try later. */
+               if (!io_ref(rs))
+                       wake_do_raid_delayed(rs, HZ / 4);
+       case -EPERM:
+               /* Suspend (-EAGAIN falls through to here as well). */
+               return 1;
+       case -ENOENT:   /* No more regions to recover. */
+               schedule_work(&rs->io.ws_do_table_event);
+               return 0;
+       default:
+               BUG();
+       }
+
+       /* Read/write a recover stripe. */
+       r = stripe_recover_rw(stripe);
+       if (r)
+               /* IO initiated. */
+               return 1;
+
+       /* Read and write finished-> update recovery position within region. */
+       addr->pos += stripe->io.size;
+
+       /* If we're at end of region, update region hash. */
+       if (addr->pos >= addr->end ||
+           addr->pos >= rs->set.sectors_per_dev)
+               recover_rh_update(stripe, REC_SUCCESS);
+       else
+               /* Prepare to read next region segment. */
+               SetStripeRBW(stripe);
+
+       /* Schedule myself for another round... */
+       wake_do_raid(rs);
+       return 1;
+
+err:
+       /* FIXME: rather try recovering other regions on error? */
+       rs_check_degrade(stripe);
+       recover_rh_update(stripe, REC_FAILURE);
+
+       /* Check state of partially recovered array. */
+       if (RSDegraded(rs) && !RSDead(rs) &&
+           rs->set.dev_to_init != -1 &&
+           rs->set.ei != rs->set.dev_to_init) {
+               /* Broken drive != drive to recover -> FATAL. */
+               SetRSDead(rs);
+               DMERR("FATAL: failed device != device to initialize -> "
+                     "RAID set broken");
+       }
+
+       if (StripeError(stripe) || RSDegraded(rs)) {
+               char buf[BDEVNAME_SIZE];
+
+               DMERR("stopping recovery due to "
+                     "ERROR on /dev/%s, stripe at offset %llu",
+                     bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+                     (unsigned long long) stripe->key);
+       }
+
+       /* Release quiesced regions as failed. NOTE(review): addr->reg is
+        * NULL after recover_rh_update() above, so this loop cannot run. */
+       while (addr->reg) {
+               dm_rh_recovery_end(addr->reg, REC_FAILURE);
+               addr->reg = dm_rh_recovery_start(rs->recover.rh);
+       }
+
+       return 0;
+}
+
+/* Run recovery on all recovery stripes (called by the main io daemon). */
+static int do_recovery(struct raid_set *rs)
+{
+       int active = 0;
+       struct stripe *stripe;
+
+       if (!RSRecover(rs))
+               return 0;
+
+       list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
+               active += _do_recovery(stripe);
+
+       /* No stripe reported work left -> wind recovery down. */
+       if (!active) {
+               set_end_recovery(rs);
+               stripe_recover_free(rs);
+       }
+
+       return active;
+}
+
+/*
+ * END recovery functions
+ */
+
+/* End io process one stripe; requeues it to flush_list if still referenced. */
+static void _do_endios(struct raid_set *rs, struct stripe *stripe,
+                      struct list_head *flush_list)
+{
+       /* First unlock all required chunks. */
+       stripe_chunks_unlock(stripe);
+
+       /*
+        * If an io error on a stripe occurred, degrade the RAID set
+        * and try to endio as many bios as possible. If any bios can't
+        * be endio processed, requeue the stripe (stripe_ref() != 0).
+        */
+       if (TestClearStripeError(stripe)) {
+               /*
+                * FIXME: if read, rewrite the failed chunk after reconstruction
+                *        in order to trigger disk bad sector relocation.
+                */
+               rs_check_degrade(stripe); /* Resets ChunkError(). */
+               ClearStripeReconstruct(stripe);
+               ClearStripeReconstructed(stripe);
+
+               /*
+                * FIXME: if write, don't endio writes in flight and don't
+                *        allow for new writes until userspace has updated
+                *        its metadata.
+                */
+       }
+
+       /* Got to reconstruct a missing chunk. */
+       if (StripeReconstruct(stripe)) {
+               /*
+                * (*2*) We use StripeReconstruct() to allow for
+                *       all chunks to be xored into the reconstructed
+                *       one (see chunk_must_xor()).
+                */
+               stripe_reconstruct(stripe);
+
+               /*
+                * (*3*) Now we reset StripeReconstruct() and flag
+                *       StripeReconstructed() to show to stripe_rw(),
+                *       that we have reconstructed a missing chunk.
+                */
+               ClearStripeReconstruct(stripe);
+               SetStripeReconstructed(stripe);
+
+               /* FIXME: reschedule to be written in case of read. */
+               /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
+                       chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
+                       stripe_chunks_rw(stripe);
+               } */
+
+               stripe->idx.recover = -1;
+       }
+
+       /*
+        * Now that we eventually got a complete stripe, we
+        * can process the rest of the end ios on reads.
+        */
+       stripe_endio(READ, stripe);
+
+       /* End io all merged writes if not prohibited. */
+       if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
+               ClearStripeMerged(stripe);
+               stripe_endio(WRITE_MERGED, stripe);
+       }
+
+       /* If RAID set is dead -> fail any ios to dead drives. */
+       if (RSDead(rs)) {
+               if (!TestSetRSDeadEndioMessage(rs))
+                       DMERR("RAID set dead: failing ios to dead devices");
+
+               stripe_fail_io(stripe);
+       }
+
+       /*
+        * We have stripe references still,
+        * because of read before writes or IO errors ->
+        * got to put on flush list for processing.
+        */
+       if (stripe_ref(stripe)) {
+               BUG_ON(!list_empty(stripe->lists + LIST_LRU));
+               list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
+               atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
+       } else
+               stripe_lru_add(stripe);
+}
+
+/* Drain the endio list and run end io processing on its stripes. */
+static void do_endios(struct raid_set *rs)
+{
+       /* Collects requeued stripes in order for the io (flush) list. */
+       struct list_head requeued;
+       struct stripe_cache *sc = &rs->sc;
+       struct stripe *stripe;
+
+       INIT_LIST_HEAD(&requeued);
+
+       for (stripe = stripe_endio_pop(sc); stripe;
+            stripe = stripe_endio_pop(sc)) {
+               /* Stripes with newly io'ed chunks are left alone. */
+               if (stripe_io_ref(stripe))
+                       continue;
+
+               _do_endios(rs, stripe, &requeued);
+       }
+
+       /* Requeued stripes go to the beginning of the io (flush) list. */
+       list_splice(&requeued, sc->lists + LIST_FLUSH);
+}
+
+/* Work off the io list; returns the summed up stripe_rw() results. */
+static int do_flush(struct raid_set *rs)
+{
+       struct stripe *s;
+       int submitted = 0;
+
+       for (s = stripe_io_pop(&rs->sc); s; s = stripe_io_pop(&rs->sc))
+               submitted += stripe_rw(s);
+
+       return submitted;
+}
+
+/* Resize the stripe cache if a new number of stripes got requested. */
+static void do_sc_resize(struct raid_set *rs)
+{
+       unsigned cur, want = atomic_read(&rs->sc.stripes_to_set);
+       int err;
+
+       if (!want)
+               return;
+
+       cur = atomic_read(&rs->sc.stripes);
+       err = (want > cur) ? sc_grow(&rs->sc, want - cur, SC_GROW) :
+                            sc_shrink(&rs->sc, cur - want);
+       if (!err)       /* Resized ok -> the request is done with. */
+               atomic_set(&rs->sc.stripes_to_set, 0);
+}
+
+/*
+ * Process all ios
+ *
+ * We do different things with the io depending
+ * on the state of the region that it is in:
+ *
+ * o reads: hang off stripe cache or postpone if full
+ *
+ * o writes:
+ *
+ *  CLEAN/DIRTY/NOSYNC:        increment pending and hang io off stripe's stripe set.
+ *                     In case stripe cache is full or busy, postpone the io.
+ *
+ *  RECOVERING:                delay the io until recovery of the region completes.
+ *
+ */
+static void do_ios(struct raid_set *rs, struct bio_list *ios)
+{
+       int r;
+       unsigned flush = 0, delay = 0;
+       sector_t sector;
+       struct dm_rh_client *rh = rs->recover.rh;
+       struct bio *bio;
+       struct bio_list reject;
+
+       bio_list_init(&reject);
+
+       /*
+        * Classify each io:
+        *    o delay ios to recovering regions via the region hash
+        *    o queue io to all other regions
+        */
+       while ((bio = bio_list_pop(ios))) {
+               /*
+                * In case we get a flush (barrier) bio, push it back
+                * onto the input queue unless all work queues are empty
+                * and the stripe cache is inactive.
+                */
+               if (bio->bi_rw & REQ_FLUSH) {
+                       /* REMOVEME: statistics. */
+                       atomic_inc(rs->stats + S_BARRIER);
+                       if (delay ||
+                           !list_empty(rs->sc.lists + LIST_FLUSH) ||
+                           !bio_list_empty(&reject) ||
+                           sc_active(&rs->sc)) {
+                               bio_list_push(ios, bio);
+                               break;
+                       }
+               }
+
+               /* If writes prohibited because of failures -> postpone. */
+               if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
+                       bio_list_add(&reject, bio);
+                       continue;
+               }
+
+               /* Check for recovering regions. */
+               sector = _sector(rs, bio);
+               r = region_state(rs, sector, DM_RH_RECOVERING);
+               if (unlikely(r)) {
+                       delay++;
+                       /* Delay the bio until its region got recovered. */
+                       dm_rh_delay_by_region(rh, bio,
+                                             dm_rh_sector_to_region(rh,
+                                                                    sector));
+                       /* REMOVEME: statistics.*/
+                       atomic_inc(rs->stats + S_DELAYED_BIOS);
+                       atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+
+                       /* Force bandwidth tests in recovery. */
+                       SetRSBandwidth(rs);
+               } else {
+                       /*
+                        * Process ios to non-recovering regions by queueing
+                        * them to stripes (does dm_rh_inc() for writes).
+                        */
+                       flush += stripe_queue_bio(rs, bio, &reject);
+               }
+       }
+
+       if (flush) {
+               /* FIXME: better error handling. */
+               r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
+               if (r)
+                       DMERR_LIMIT("dirty log flush");
+       }
+
+       /* Merge any rejected bios back to the head of the input list. */
+       bio_list_merge_head(ios, &reject);
+}
+
+/* Send a table event when the stripe cache becomes too busy. */
+static void do_busy_event(struct raid_set *rs)
+{
+       if (!sc_busy(rs)) {
+               ClearRSScBusy(rs);
+       } else if (!TestSetRSScBusy(rs))
+               /* Presumably not flagged busy before -> throw the event. */
+               schedule_work(&rs->io.ws_do_table_event);
+}
+
+/* Work function: throw a table event for the RAID set's table. */
+static void do_table_event(struct work_struct *ws)
+{
+       struct raid_set *rs;
+       rs = container_of(ws, struct raid_set, io.ws_do_table_event);
+       dm_table_event(rs->ti->table);
+}
+
+
+/*-----------------------------------------------------------------
+ * RAID daemon
+ *---------------------------------------------------------------*/
+/*
+ * o belabour all end ios
+ * o update the region hash states
+ * o optionally resize the stripe cache
+ * o optionally do recovery
+ * o grab the input queue
+ * o work in all requeued or new ios
+ * o flush any stripes on the io list
+ * o check, if the stripe cache gets too busy and throw an event if so
+ * NOTE(review): the results of do_recovery()/do_flush() end up unused.
+ */
+static void do_raid(struct work_struct *ws)
+{
+       int r;
+       struct raid_set *rs = container_of(ws, struct raid_set,
+                                          io.dws_do_raid.work);
+       struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+
+       /*
+        * We always need to end io, so that ios can get errored in
+        * case the set failed and the region counters get decremented
+        * before we update region hash states and go any further.
+        */
+       do_endios(rs);
+       dm_rh_update_states(rs->recover.rh, 1);
+
+       /*
+        * Now that we've end io'd, which may have put stripes on the LRU list
+        * to allow for shrinking, we resize the stripe cache if requested.
+        */
+       do_sc_resize(rs);
+
+       /* Try to recover regions. */
+       r = do_recovery(rs);
+
+       /* Quickly grab all new ios queued and add them to the work list. */
+       mutex_lock(&rs->io.in_lock);
+       bio_list_merge(ios, ios_in);
+       bio_list_init(ios_in);
+       mutex_unlock(&rs->io.in_lock);
+
+       if (!bio_list_empty(ios))
+               do_ios(rs, ios); /* Got ios to work into the cache. */
+
+       r = do_flush(rs);               /* Flush any stripes on io list. */
+
+       do_busy_event(rs);      /* Check if we got too busy. */
+}
+
+/*
+ * Region hash callback (gets called via dm_rh_update_states()):
+ * put the bios, which got delayed on regions that are
+ * recovered now, back to work.
+ */
+static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+{
+       struct bio *delayed;
+       struct raid_set *rs = context;
+
+       /* REMOVEME: statistics; one pending delayed bio less for each. */
+       for (delayed = bl->head; delayed; delayed = delayed->bi_next)
+               atomic_dec(rs->stats + S_DELAYED_BIOS);
+
+       /* Move the bios to the head of the work list, emptying bl. */
+       bio_list_merge_head(&rs->io.work, bl);
+       bio_list_init(bl);
+       ClearRSBandwidth(rs);
+}
+
+/*************************************************************
+ * Constructor helpers
+ *************************************************************/
+/* Calculate MB/sec from rs->xor.speed (xor runs per tick) and io_size. */
+static unsigned mbpers(struct raid_set *rs, unsigned io_size)
+{
+       return to_bytes((rs->xor.speed * rs->set.data_devs *
+                        io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
+}
+
+/*
+ * Discover fastest xor algorithm and # of chunks combination.
+ */
+/* Benchmark the current xor setup: max # of common_xor() runs per tick. */
+static unsigned xor_speed(struct stripe *stripe)
+{
+       int ticks = XOR_SPEED_TICKS;
+       unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
+       unsigned long j;
+
+       /* Set uptodate so that common_xor()->xor() will belabour chunks. */
+       while (p--)
+               SetChunkUptodate(CHUNK(stripe, p));
+
+       /* Busy-wait for the next tick in order to start on its boundary. */
+       for (j = jiffies; j == jiffies; );
+
+       /* Do xors for XOR_SPEED_TICKS ticks and keep the best tick's count. */
+       while (ticks--) {
+               unsigned xors = 0;
+
+               for (j = jiffies; j == jiffies; ) {
+                       mb();
+                       common_xor(stripe, stripe->io.size, 0, 0);
+                       mb();
+                       xors++;
+                       mb();
+               }
+
+               if (xors > r)
+                       r = xors;
+       }
+
+       return r;
+}
+
+/* Define to benchmark on all recovery stripes and log the results. */
+#define DMRAID45_XOR_TEST
+
+/* Pick fastest xor function/# of chunks for the set; returns its speed. */
+static unsigned xor_optimize(struct raid_set *rs)
+{
+       unsigned chunks_max = 2, speed_max = 0;
+       struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+       struct stripe *stripe;
+       unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
+
+       BUG_ON(list_empty(&rs->recover.stripes));
+#ifndef DMRAID45_XOR_TEST
+       stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+                                 lists[LIST_RECOVER]);
+#endif
+
+       /* Try all xor functions, last array entry first. */
+       while (f-- > xor_funcs) {
+               unsigned speed;
+
+#ifdef DMRAID45_XOR_TEST
+               list_for_each_entry(stripe, &rs->recover.stripes,
+                                   lists[LIST_RECOVER]) {
+                       io_size = stripe->io.size;
+#endif
+
+                       /* Set actual xor function for common_xor(). */
+                       rs->xor.f = f;
+                       rs->xor.chunks = (f->f == xor_blocks_wrapper ?
+                                         (MAX_XOR_BLOCKS + 1) :
+                                         XOR_CHUNKS_MAX);
+                       if (rs->xor.chunks > rs->set.raid_devs)
+                               rs->xor.chunks = rs->set.raid_devs;
+
+                       for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
+                               speed = xor_speed(stripe);
+
+#ifdef DMRAID45_XOR_TEST
+                               if (f->f == xor_blocks_wrapper) {
+                                       if (speed > speed_xor_blocks)
+                                               speed_xor_blocks = speed;
+                               } else if (speed > speed_hm)
+                                       speed_hm = speed;
+
+                               if (speed < speed_min)
+                                       speed_min = speed;
+#endif
+
+                               if (speed > speed_max) {
+                                       speed_max = speed;
+                                       chunks_max = rs->xor.chunks;
+                                       f_max = f;
+                               }
+                       }
+#ifdef DMRAID45_XOR_TEST
+               }
+#endif
+       }
+
+       /* Memorize optimal parameters (f_max stays NULL if no speed > 0). */
+       rs->xor.f = f_max;
+       rs->xor.chunks = chunks_max;
+#ifdef DMRAID45_XOR_TEST
+       DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
+              speed_max == speed_hm ? "HM" : "NB",
+              rs->recover.recovery_stripes, io_size, speed_min,
+              speed_xor_blocks, speed_hm, speed_max);
+#endif
+       return speed_max;
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+/* Variable RAID parameters as handed in to context_alloc(). */
+struct variable_parms {
+       int bandwidth;
+       int bandwidth_parm;             /* Stored in recover.bandwidth_parm. */
+       int chunk_size;                 /* Stored in set.chunk_size. */
+       int chunk_size_parm;            /* Stored in set.chunk_size_parm. */
+       int io_size;                    /* Stored in set.io_size. */
+       int io_size_parm;               /* Stored in set.io_size_parm. */
+       int stripes;                    /* Handed to sc_init(). */
+       int stripes_parm;               /* Stored in sc.stripes_parm. */
+       int recover_io_size;            /* Stored in recover.io_size. */
+       int recover_io_size_parm;       /* Stored in recover.io_size_parm. */
+       int raid_parms;                 /* Stored in set.raid_parms. */
+       int recovery;                   /* Stored in recover.recovery. */
+       int recovery_stripes;           /* Stored in recover.recovery_stripes. */
+       int recovery_stripes_parm;
+};
+
+static struct raid_set *
+context_alloc(struct raid_type *raid_type, struct variable_parms *p,
+             unsigned raid_devs, sector_t sectors_per_dev,
+             struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+       int r;
+       size_t len;
+       sector_t region_size, ti_len;
+       struct raid_set *rs = NULL;
+       struct dm_dirty_log *dl;
+       struct recover *rec;
+
+       /*
+        * Create the dirty log
+        *
+        * We need to change length for the dirty log constructor,
+        * because we want an amount of regions for all stripes derived
+        * from the single device size, so that we can keep region
+        * size = 2^^n independent of the number of devices
+        */
+       ti_len = ti->len;
+       ti->len = sectors_per_dev;
+       dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
+       ti->len = ti_len;
+       if (!dl)
+               goto bad_dirty_log;
+
+       /* Chunk size *must not* be larger than region size. */
+       region_size = dl->type->get_region_size(dl);
+       if (p->chunk_size > region_size)
+               goto bad_chunk_size;
+
+       /* Recover io size *must not* be larger than region size either. */
+       if (p->recover_io_size > region_size)
+               goto bad_recover_io_size;
+
+       /* Size and allocate the RAID set structure. */
+       len = sizeof(*rs->data) + sizeof(*rs->dev);
+       if (dm_array_too_big(sizeof(*rs), len, raid_devs))
+               goto bad_array;
+
+       len = sizeof(*rs) + raid_devs * len;
+       rs = kzalloc(len, GFP_KERNEL);
+       if (!rs)
+               goto bad_alloc;
+
+       rec = &rs->recover;
+       atomic_set(&rs->io.in_process, 0);
+       atomic_set(&rs->io.in_process_max, 0);
+       rec->io_size = p->recover_io_size;
+
+       /* Pointer to data array, which follows the raid_devs dev entries. */
+       rs->data = (unsigned long **)
+                  ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+       rec->dl = dl;
+       rs->set.raid_devs = raid_devs;
+       rs->set.data_devs = raid_devs - raid_type->parity_devs;
+       rs->set.raid_type = raid_type;
+
+       rs->set.raid_parms = p->raid_parms;
+       rs->set.chunk_size_parm = p->chunk_size_parm;
+       rs->set.io_size_parm = p->io_size_parm;
+       rs->sc.stripes_parm = p->stripes_parm;
+       rec->io_size_parm = p->recover_io_size_parm;
+       rec->bandwidth_parm = p->bandwidth_parm;
+       rec->recovery = p->recovery;
+       rec->recovery_stripes = p->recovery_stripes;
+
+       /*
+        * Set chunk and io size and respective shifts
+        * (used to avoid divisions)
+        */
+       rs->set.chunk_size = p->chunk_size;
+       rs->set.chunk_shift = ffs(p->chunk_size) - 1;
+
+       rs->set.io_size = p->io_size;
+       rs->set.io_mask = p->io_size - 1;
+       /* Mask to adjust address key in case io_size != chunk_size. */
+       rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
+
+       rs->set.sectors_per_dev = sectors_per_dev;
+
+       rs->set.ei = -1;        /* Indicate no failed device. */
+       atomic_set(&rs->set.failed_devs, 0);
+
+       rs->ti = ti;
+
+       atomic_set(rec->io_count + IO_WORK, 0);
+       atomic_set(rec->io_count + IO_RECOVER, 0);
+
+       /* Initialize io lock and queues. */
+       mutex_init(&rs->io.in_lock);
+       mutex_init(&rs->io.xor_lock);
+       bio_list_init(&rs->io.in);
+       bio_list_init(&rs->io.work);
+
+       init_waitqueue_head(&rs->io.suspendq);  /* Suspend waiters (dm-io). */
+
+       rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+       rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
+                       wake_dummy, wake_do_raid, 0, p->recovery_stripes,
+                       dl, region_size, rec->nr_regions);
+       if (IS_ERR(rec->rh))
+               goto bad_rh;
+
+       /* Initialize stripe cache. */
+       r = sc_init(rs, p->stripes);
+       if (r)
+               goto bad_sc;
+
+       /* REMOVEME: statistics. */
+       stats_reset(rs);
+       ClearRSDevelStats(rs);  /* Disable development status. */
+       return rs;
+
+bad_dirty_log:
+       TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
+
+bad_chunk_size:
+       dm_dirty_log_destroy(dl);
+       TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
+
+bad_recover_io_size:
+       dm_dirty_log_destroy(dl);
+       TI_ERR_RET("Recover stripe io size larger than region size",
+                       ERR_PTR(-EINVAL));
+
+bad_array:
+       dm_dirty_log_destroy(dl);
+       TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL));
+
+bad_alloc:
+       dm_dirty_log_destroy(dl);
+       TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
+
+bad_rh:
+       dm_dirty_log_destroy(dl);
+       ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+       goto free_rs;
+
+bad_sc:
+       dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
+       sc_exit(&rs->sc);
+       ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+free_rs:
+       kfree(rs);
+       return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Free a RAID context (a RAID set).
+ *
+ * Drops the references on the first @p component devices (highest
+ * index first), then tears down the stripe cache and the region
+ * hash before freeing the set itself.
+ */
+static void context_free(struct raid_set *rs, unsigned p)
+{
+       for (; p; p--)
+               dm_put_device(rs->ti, rs->dev[p - 1].dev);
+
+       sc_exit(&rs->sc);
+       dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
+       kfree(rs);
+}
+
+/*
+ * Create the single threaded work queue of a RAID set and
+ * initialize the (delayed) work items which get queued to it.
+ *
+ * Returns 0 on success; bails out via TI_ERR_RET() with -ENOMEM
+ * if the work queue can't be created.
+ */
+static int rs_workqueue_init(struct raid_set *rs)
+{
+       /* NOTE(review): presumably referenced by TI_ERR_RET() - confirm. */
+       struct dm_target *ti = rs->ti;
+
+       rs->io.wq = create_singlethread_workqueue(DAEMON);
+       if (rs->io.wq) {
+               INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
+               INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
+               return 0;
+       }
+
+       TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+}
+
+/*
+ * Return pointer to raid_type structure for raid name.
+ *
+ * raid_types[] is walked from its last entry down to its first;
+ * NULL is returned if no entry carries @name.
+ */
+static struct raid_type *get_raid_type(char *name)
+{
+       struct raid_type *type;
+
+       for (type = ARRAY_END(raid_types); type != raid_types; ) {
+               type--;
+
+               if (strcmp(type->name, name) == 0)
+                       return type;
+       }
+
+       return NULL;
+}
+
+/*
+ * Divide @a by @b, store the quotient in @n and return non-zero
+ * if @a is a whole multiple of @b (0 otherwise).
+ *
+ * FIXME: factor out to dm core.
+ */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+       sector_t quotient = a;
+
+       sector_div(quotient, b);        /* Leaves a / b in quotient. */
+       *n = quotient;
+
+       if (quotient * b != a)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Log RAID set information to kernel log.
+ *
+ * Emits one line per component device (flagging the parity device)
+ * followed by a summary of sizes, stripe cache, xor algorithm and
+ * RAID type. @io_size is only handed to mbpers() for the summary.
+ */
+static void rs_log(struct raid_set *rs, unsigned io_size)
+{
+       char name[BDEVNAME_SIZE];
+       unsigned idx = 0;
+
+       while (idx < rs->set.raid_devs) {
+               DMINFO("/dev/%s is raid disk %u%s",
+                      bdevname(rs->dev[idx].dev->bdev, name), idx,
+                      (idx == rs->set.pi) ? " (parity)" : "");
+               idx++;
+       }
+
+       DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
+              "algorithm \"%s\", %u chunks with %uMB/s\n"
+              "%s set with net %u/%u devices",
+              rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+              atomic_read(&rs->sc.stripes),
+              rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
+              rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
+}
+
+/*
+ * Get all devices and offsets.
+ *
+ * @argv points at the first <dev_path> <offset> pair; one pair is
+ * consumed for each of the rs->set.raid_devs component devices.
+ *
+ * @p doubles as the loop counter: whenever this function returns,
+ * *p is the number of devices dm_get_device() succeeded on, i.e. the
+ * number of references the caller has to drop again on error.
+ *
+ * Returns 0 on success or the error handed to TI_ERR()/TI_ERR_RET().
+ */
+static int dev_parms(struct raid_set *rs, char **argv, int *p)
+{
+       struct dm_target *ti = rs->ti;
+
+/* NOTE(review): unindented, looks like a development debug message. */
+DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
+       for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+               int r;
+               unsigned long long tmp;
+               struct raid_dev *dev = rs->dev + *p;
+
+               /*
+                * Get offset and device.
+                *
+                * The offset is validated before the device is taken,
+                * hence *p doesn't count this device on failure.
+                */
+               if (sscanf(argv[1], "%llu", &tmp) != 1 ||
+                   tmp > rs->set.sectors_per_dev)
+                       TI_ERR("Invalid RAID device offset parameter");
+
+               dev->start = tmp;
+               r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+                                 &dev->dev);
+               if (r)
+                       TI_ERR_RET("RAID device lookup failure", r);
+
+               /*
+                * NOTE(review): raid_dev_lookup() presumably returns the
+                * index of the first slot holding this device or -ENODEV;
+                * a hit below the current index is a duplicate - confirm.
+                */
+               r = raid_dev_lookup(rs, dev);
+               if (r != -ENODEV && r < *p) {
+                       (*p)++; /* Ensure dm_put_device() on actual device. */
+                       TI_ERR_RET("Duplicate RAID device", -ENXIO);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Set recovery bandwidth.
+ *
+ * Stores the @bandwidth percentage and derives bandwidth_work
+ * (100 / @bandwidth) from it. @bandwidth must not be 0; the callers
+ * range check it against BANDWIDTH_MIN/BANDWIDTH_MAX beforehand
+ * (assumes BANDWIDTH_MIN > 0 - TODO confirm).
+ */
+static void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+       struct recover *rec = &rs->recover;
+
+       rec->bandwidth = bandwidth;
+       rec->bandwidth_work = 100 / bandwidth;
+}
+
+/*
+ * Handle variable number of RAID parameters.
+ *
+ * @argv[0] holds the number of variable parameters (0-7) which follow
+ * in fixed order: chunk size, stripes, io size, recovery io size,
+ * recovery bandwidth, recovery switch ("sync"/"nosync") and recovery
+ * stripes. All members of @vp are preset to their defaults first, so
+ * parameters which are omitted or given as -1 keep the default.
+ *
+ * For each numeric parameter, the value as given (including -1) is
+ * stored in the respective *_parm member; a value other than -1 is
+ * stored in the effective member(s) as well. A given chunk size is
+ * copied to the io size too, which a later io size argument may
+ * override.
+ *
+ * Returns 0 on success or bails out via TI_ERR() with the
+ * constraint's error message.
+ */
+static int get_raid_variable_parms(struct dm_target *ti, char **argv,
+                                  struct variable_parms *vp)
+{
+       int p, value;
+       struct {
+               int action; /* -1: skip, 0: no power2 check, 1: power2 check */
+               char *errmsg;
+               int min, max;
+               int *var, *var2, *var3;
+       } argctr[] = {
+               { 1,
+                 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
+                 IO_SIZE_MIN, CHUNK_SIZE_MAX,
+                 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
+               { 0,
+                 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
+                 STRIPES_MIN, STRIPES_MAX,
+                 &vp->stripes_parm, &vp->stripes, NULL },
+               { 1,
+                 "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
+                 "min(BIO_MAX_SECTORS/2, chunk size)",
+                 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
+                 &vp->io_size_parm, &vp->io_size, NULL },
+               { 1,
+                 "Invalid recovery io size; must be -1 or "
+                 "2^^n and less equal BIO_MAX_SECTORS/2",
+                 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
+                 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
+               { 0,
+                 "Invalid recovery bandwidth percentage; "
+                 "must be -1 or > 0 and <= 100",
+                 BANDWIDTH_MIN, BANDWIDTH_MAX,
+                 &vp->bandwidth_parm, &vp->bandwidth, NULL },
+               /* Handle sync argument separately in loop. */
+               { -1,
+                 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
+               { 0,
+                 "Invalid number of recovery stripes; "
+                 "must be -1, > 0 and <= 64",
+                 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
+                 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
+       }, *varp;
+
+       /* Fetch # of variable raid parameters. */
+       if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
+           !range_ok(vp->raid_parms, 0, 7))
+               TI_ERR("Bad variable raid parameters number");
+
+       /* Preset variable RAID parameters. */
+       vp->chunk_size = CHUNK_SIZE_DEFAULT;
+       vp->io_size = IO_SIZE_DEFAULT;
+       vp->stripes = STRIPES_DEFAULT;
+       vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
+       vp->bandwidth = BANDWIDTH_DEFAULT;
+       vp->recovery = 1;
+       vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
+
+       /* Walk the array of argument constraints for all given ones. */
+       for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
+               BUG_ON(varp >= ARRAY_END(argctr));
+
+               /* Special case for "[no]sync" string argument. */
+               if (varp->action < 0) {
+                       if (!strcmp(*argv, "sync"))
+                               ;
+                       else if (!strcmp(*argv, "nosync"))
+                               vp->recovery = 0;
+                       else
+                               TI_ERR(varp->errmsg);
+
+                       argv++;
+                       continue;
+               }
+
+               /*
+                * Special case for io_size depending
+                * on previously set chunk size.
+                */
+               if (p == 2)
+                       varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
+
+               /* -1 selects the default and bypasses all checks. */
+               if (sscanf(*(argv++), "%d", &value) != 1 ||
+                   (value != -1 &&
+                    ((varp->action && !is_power_of_2(value)) ||
+                     !range_ok(value, varp->min, varp->max))))
+                       TI_ERR(varp->errmsg);
+
+               /* Remember the argument as given; apply it unless default. */
+               *varp->var = value;
+               if (value != -1) {
+                       if (varp->var2)
+                               *varp->var2 = value;
+                       if (varp->var3)
+                               *varp->var3 = value;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Parse optional locking parameters.
+ *
+ * If @argv[0] is (an abbreviation of) "locking", @argv[1] names the
+ * locking type. Only "none" is implemented; "cluster" and unknown
+ * types are rejected with -EINVAL.
+ *
+ * On success, *locking_parms is set to the number of arguments
+ * consumed (2 if a locking type was given, 0 otherwise) so that the
+ * caller can skip over them, and *locking_type to the locking type
+ * to use.
+ */
+static int get_raid_locking_parms(struct dm_target *ti, char **argv,
+                                 int *locking_parms,
+                                 struct dm_raid45_locking_type **locking_type)
+{
+       /* Defaults in case no locking arguments are present. */
+       *locking_parms = 0;
+       *locking_type = &locking_none;
+
+       if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
+               char *lckstr = argv[1];
+               size_t lcksz = strlen(lckstr);
+
+               if (!strnicmp(lckstr, "none", lcksz)) {
+                       /*
+                        * Two arguments consumed; this count must not
+                        * be reset to the default on the way out.
+                        */
+                       *locking_type = &locking_none;
+                       *locking_parms = 2;
+               } else if (!strnicmp(lckstr, "cluster", lcksz)) {
+                       DMERR("locking type \"%s\" not yet implemented",
+                             lckstr);
+                       return -EINVAL;
+               } else {
+                       DMERR("unknown locking type \"%s\"", lckstr);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Set backing device read ahead properties of RAID set.
+ *
+ * @sectors is rounded up to pages; if that yields 0, nothing is
+ * changed. Otherwise each component device's queue gets that many
+ * read ahead pages and the mapped device gets
+ * @stripes * pages * number of data devices.
+ */
+static void rs_set_read_ahead(struct raid_set *rs,
+                             unsigned sectors, unsigned stripes)
+{
+       unsigned pages = dm_div_up(sectors, SECTORS_PER_PAGE);
+       struct mapped_device *md = dm_table_get_md(rs->ti->table);
+       struct backing_dev_info *set_bdi =
+               &dm_disk(md)->queue->backing_dev_info;
+       unsigned d;
+
+       if (!pages)
+               return;
+
+       /* Read ahead for the RAID set... */
+       set_bdi->ra_pages = stripes * pages * rs->set.data_devs;
+
+       /* ...and for its component devices, highest index first. */
+       for (d = rs->set.raid_devs; d; d--) {
+               struct request_queue *q =
+                       bdev_get_queue(rs->dev[d - 1].dev->bdev);
+
+               q->backing_dev_info.ra_pages = pages;
+       }
+}
+
+/*
+ * Set congested function.
+ *
+ * Hooks rs_congested(), with the RAID set as its data, into the
+ * backing device info of the mapped device's request queue.
+ */
+static void rs_set_congested_fn(struct raid_set *rs)
+{
+       struct backing_dev_info *bdi =
+               &dm_disk(dm_table_get_md(rs->ti->table))->queue->backing_dev_info;
+
+       bdi->congested_fn = rs_congested;
+       bdi->congested_data = rs;
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N >= 0 and < #raid_devs: parity device index
+ *
+ * #raid_variable_params = 0-7; raid_params (-1 = default):
+ *   [chunk_size [#stripes [io_size [recover_io_size \
+ *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
+ *   o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ *     and <= CHUNK_SIZE_MAX)
+ *   o #stripes is number of stripes allocated to stripe cache
+ *     (must be > 1 and < STRIPES_MAX)
+ *   o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ *   o recover_io_size (io unit size per device for recovery in sectors;
+ *     must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ *   o %recovery_bandwidth is the maximum amount spent on recovery during
+ *     application io (1-100%)
+ *   o recovery switch = [sync|nosync]
+ *   o #recovery_stripes is the number of recovery stripes used for
+ *     parallel recovery of the RAID set
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 3)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ * of a failed devices content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset>   = begin at offset on <dev_path>
+ *
+ */
+/*
+ * MIN_PARMS is the length of the shortest valid table line: a single
+ * dirty log parameter, no parity index, no variable RAID parameters,
+ * no locking parameters and 3 <dev_path> <offset> pairs.
+ */
+#define        MIN_PARMS       13
+
+/*
+ * RAID set constructor.
+ *
+ * Parses and validates the table line in @argv (@argc arguments),
+ * allocates the RAID set context, takes references on all component
+ * devices and sets up read ahead, congestion callback, xor algorithm,
+ * recovery state and the work queue.
+ *
+ * Every optional argument group found on the way (additional dirty
+ * log parameters, RAID4 parity index, variable RAID parameters,
+ * locking parameters) raises the minimum argument count which is
+ * rechecked before @argv gets indexed beyond it.
+ *
+ * Returns 0 on success with ti->private pointing to the RAID set,
+ * or an error with all resources acquired so far released again.
+ */
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+       int dev_to_init, dl_parms, i, locking_parms,
+           parity_parm, pi = -1, r, raid_devs;
+       sector_t tmp, sectors_per_dev;
+       struct dm_raid45_locking_type *locking;
+       struct raid_set *rs;
+       struct raid_type *raid_type;
+       struct variable_parms parms;
+
+       /* Ensure minimum number of parameters. */
+       if (argc < MIN_PARMS)
+               TI_ERR("Not enough parameters");
+
+       /* Fetch # of dirty log parameters. */
+       if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
+           !range_ok(dl_parms, 1, 4711)) /* ;-) */
+               TI_ERR("Bad dirty log parameters number");
+
+       /* MIN_PARMS accounts for just one dirty log parameter. */
+       if (argc < (unsigned) (MIN_PARMS - 1 + dl_parms))
+               TI_ERR("Not enough parameters");
+
+       /* Check raid_type. */
+       raid_type = get_raid_type(argv[dl_parms + 2]);
+       if (!raid_type)
+               TI_ERR("Bad raid type");
+
+       /* In case of RAID4, parity drive is selectable. */
+       parity_parm = !!(raid_type->level == raid4);
+
+       /*
+        * Make room for the optional parity index. The 8 arguments
+        * required behind the variable RAID parameters also keep the
+        * parser below within argv for its maximum of 7 parameters.
+        */
+       if (argc < (unsigned) (MIN_PARMS - 1 + dl_parms + parity_parm))
+               TI_ERR("Not enough parameters");
+
+       /* Handle variable number of RAID parameters. */
+       r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
+                                   &parms);
+       if (r)
+               return r;
+
+       /* Variable RAID parameters don't count against the minimum. */
+       if (argc < (unsigned) (MIN_PARMS - 1 + dl_parms + parity_parm +
+                              parms.raid_parms))
+               TI_ERR("Not enough parameters");
+
+       /* Handle any locking parameters. */
+       r = get_raid_locking_parms(ti,
+                                  argv + dl_parms + parity_parm +
+                                  parms.raid_parms + 4,
+                                  &locking_parms, &locking);
+       if (r)
+               return r;
+
+       /* # of raid devices. */
+       i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
+       if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+           raid_devs < raid_type->minimal_devs)
+               TI_ERR("Invalid number of raid devices");
+
+       /* In case of RAID4, check parity drive index is in limits. */
+       if (raid_type->level == raid4) {
+               /* Fetch index of parity device. */
+               if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+                   (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
+                       TI_ERR("Invalid RAID4 parity device index");
+       }
+
+       /*
+        * Index of device to initialize starts at 0
+        *
+        * o -1 -> don't initialize a selected device;
+        *         initialize parity conforming to algorithm
+        * o 0..raid_devs-1 -> initialize respective device
+        *   (used for reconstruction of a replaced device)
+        */
+       if (sscanf(argv[i + 1], "%d", &dev_to_init) != 1 ||
+           !range_ok(dev_to_init, -1, raid_devs - 1))
+               TI_ERR("Invalid number for raid device to initialize");
+
+       /*
+        * Check # of raid device arguments.
+        *
+        * The first device/offset pair sits at index i + 2, i.e.
+        * behind any locking parameters; the subtraction can't wrap
+        * because of the minimum checks above.
+        */
+       if (argc - (unsigned) (i + 2) != 2U * raid_devs)
+               TI_ERR("Wrong number of raid device/offset arguments");
+
+       /*
+        * Check that the table length is divisible
+        * w/o rest by (raid_devs - parity_devs)
+        */
+       if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+                     &sectors_per_dev))
+               TI_ERR("Target length not divisible by number of data devices");
+
+       /*
+        * Check that the device size is
+        * divisible w/o rest by chunk size
+        */
+       if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
+               TI_ERR("Device length not divisible by chunk_size");
+
+       /****************************************************************
+        * Now that we checked the constructor arguments ->
+        * let's allocate the RAID set
+        ****************************************************************/
+       rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
+                          ti, dl_parms, argv);
+       if (IS_ERR(rs))
+               return PTR_ERR(rs);
+
+
+       rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
+       rs->set.pi = rs->set.pi_parm = pi;
+
+       /* Set RAID4 parity drive index. */
+       if (raid_type->level == raid4)
+               rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+
+       recover_set_bandwidth(rs, parms.bandwidth);
+
+       /* Use locking type to lock stripe access. */
+       rs->locking = locking;
+
+       /*
+        * Get the device/offset tuples, skipping any locking
+        * parameters too. From here on, i counts the devices
+        * the error path has to release.
+        */
+       argv += dl_parms + 6 + parity_parm + parms.raid_parms +
+               locking_parms;
+       r = dev_parms(rs, argv, &i);
+       if (r)
+               goto err;
+
+       /* Set backing device information (eg. read ahead). */
+       rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
+                             2 /* # of stripes */);
+       rs_set_congested_fn(rs); /* Set congested function. */
+       SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+       rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
+
+       /* Set for recovery of any nosync regions. */
+       if (parms.recovery)
+               SetRSRecover(rs);
+       else {
+               /*
+                * Need to free recovery stripe(s) here in case
+                * of nosync, because xor_optimize uses one.
+                */
+               set_start_recovery(rs);
+               set_end_recovery(rs);
+               stripe_recover_free(rs);
+       }
+
+       /*
+        * Enable parity chunk creation enforcement for
+        * little numbers of array members where it doesn't
+        * gain us performance to xor parity out and back in as
+        * with larger array member numbers.
+        */
+       if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
+               SetRSEnforceParityCreation(rs);
+
+       /*
+        * Make sure that dm core only hands maximum io size
+        * length down and pays attention to io boundaries.
+        */
+       ti->split_io = rs->set.io_size;
+       ti->private = rs;
+
+       /* Initialize work queue to handle this RAID set's io. */
+       r = rs_workqueue_init(rs);
+       if (r)
+               goto err;
+
+       rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
+       return 0;
+
+err:
+       context_free(rs, i);
+       return r;
+}
+
+/*
+ * Destruct a raid mapping
+ *
+ * The work queue is destroyed before all component devices and
+ * the RAID set context are released.
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+       unsigned devs;
+
+       destroy_workqueue(rs->io.wq);
+
+       devs = rs->set.raid_devs;
+       context_free(rs, devs);
+}
+
+/*
+ * Raid mapping function.
+ *
+ * Read ahead bios are rejected with -EIO. Any other bio gets its
+ * sector remapped relative to the target's begin, is queued to the
+ * RAID set's input list and the daemon is woken to process it.
+ *
+ * Returns DM_MAPIO_SUBMITTED for queued bios.
+ */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+                   union map_info *map_context)
+{
+       struct raid_set *rs = ti->private;
+
+       /* I don't want to waste stripe cache capacity. */
+       if (bio_rw(bio) == READA)
+               return -EIO;
+
+       /*
+        * Get io reference to be waiting for to drop
+        * to zero on device suspension/destruction.
+        */
+       io_get(rs);
+       bio->bi_sector -= ti->begin;    /* Remap sector. */
+
+       /*
+        * REMOVEME: statistics.
+        *
+        * Account before queueing: once the bio is on the input list
+        * and the daemon got woken, it may be processed and ended at
+        * any time, so it must not be touched here afterwards.
+        */
+       atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
+                               S_BIOS_READ : S_BIOS_WRITE));
+
+       /* Queue io to RAID set. */
+       mutex_lock(&rs->io.in_lock);
+       bio_list_add(&rs->io.in, bio);
+       mutex_unlock(&rs->io.in_lock);
+
+       /* Wake daemon to process input list. */
+       wake_do_raid(rs);
+
+       return DM_MAPIO_SUBMITTED;      /* Handle later. */
+}
+
+/*
+ * Device suspend.
+ *
+ * Flags the RAID set as suspended, stops region hash recovery if
+ * recovery is enabled, quiesces the worker and waits for all ios in
+ * flight before handing the event on to the dirty log (if its type
+ * implements a presuspend method).
+ */
+static void raid_presuspend(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+       struct dm_dirty_log *dl = rs->recover.dl;
+
+       SetRSSuspend(rs);
+
+       if (RSRecover(rs))
+               dm_rh_stop_recovery(rs->recover.rh);
+
+       /* Drop a pending delayed run, then wait for queued work to finish. */
+       cancel_delayed_work(&rs->io.dws_do_raid);
+       flush_workqueue(rs->io.wq);
+       wait_ios(rs);   /* Wait for completion of all ios being processed. */
+
+       /* A failing dirty log presuspend is only warned about. */
+       if (dl->type->presuspend && dl->type->presuspend(dl))
+               /* FIXME: need better error handling. */
+               DMWARN("log presuspend failed");
+}
+
+/*
+ * Post-suspend hook: hand the event on to the dirty log if its type
+ * implements a postsuspend method. A failure is only warned about.
+ */
+static void raid_postsuspend(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+       struct dm_dirty_log *log = rs->recover.dl;
+
+       if (!log->type->postsuspend)
+               return;
+
+       /* FIXME: need better error handling. */
+       if (log->type->postsuspend(log))
+               DMWARN("log postsuspend failed");
+}
+
+/*
+ * Device resume.
+ *
+ * Resumes the dirty log (if its type implements a resume method),
+ * recalculates the number of regions still to recover from the
+ * log's sync count, restarts recovery if it is enabled and finally
+ * clears the suspend flag again.
+ */
+static void raid_resume(struct dm_target *ti)
+{
+       struct raid_set *rs = ti->private;
+       struct recover *rec = &rs->recover;
+       struct dm_dirty_log *dl = rec->dl;
+
+/* NOTE(review): unindented, looks like a development debug message. */
+DMINFO("%s...", __func__);
+       if (dl->type->resume && dl->type->resume(dl))
+               /* Resume dirty log. */
+               /* FIXME: need better error handling. */
+               DMWARN("log resume failed");
+
+       /* Regions to recover = all regions - regions the log has in sync. */
+       rec->nr_regions_to_recover =
+               rec->nr_regions - dl->type->get_sync_count(dl);
+
+       /* Restart any unfinished recovery. */
+       if (RSRecover(rs)) {
+               set_start_recovery(rs);
+               dm_rh_start_recovery(rec->rh);
+       }
+
+       ClearRSSuspend(rs);
+}
+
+/*
+ * Return stripe cache size.
+ *
+ * The result is the to_sector() conversion of: number of stripes
+ * times (struct stripe + struct stripe_chunk + struct page_list +
+ * the io size in bytes for each RAID device + a recovery stripe
+ * term).
+ *
+ * The recovery term is only added while rs->recover.end_jiffies is
+ * 0. NOTE(review): presumably that means recovery hasn't ended yet
+ * and the recovery stripes are still allocated - confirm. Note that
+ * this term is multiplied by the number of stripes as well.
+ */
+static unsigned sc_size(struct raid_set *rs)
+{
+       return to_sector(atomic_read(&rs->sc.stripes) *
+                        (sizeof(struct stripe) +
+                         (sizeof(struct stripe_chunk) +
+                          (sizeof(struct page_list) +
+                           to_bytes(rs->set.io_size) *
+                           rs->set.raid_devs)) +
+                         (rs->recover.end_jiffies ?
+                          0 : rs->recover.recovery_stripes *
+                          to_bytes(rs->set.raid_devs * rs->recover.io_size))));
+}
+
+/*
+ * REMOVEME: status output for development.
+ *
+ * Appends version, xor algorithm, io/stripe counters, all statistics
+ * counters, stripe cache geometry and recovery progress to @result.
+ * *size is the offset to start emitting at on entry and is updated
+ * to the new offset on return; @maxlen is the size of @result.
+ */
+static void raid_devel_stats(struct dm_target *ti, char *result,
+                            unsigned *size, unsigned maxlen)
+{
+       unsigned sz = *size;    /* DMEMIT() works on sz/result/maxlen. */
+       unsigned long j;
+       char buf[BDEVNAME_SIZE], *p;
+       struct stats_map *sm;
+       struct raid_set *rs = ti->private;
+       struct recover *rec = &rs->recover;
+       struct timespec ts;
+
+       DMEMIT("%s %s=%u bw=%u\n",
+              version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
+       DMEMIT("act_ios=%d ", io_ref(rs));
+       DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
+       DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
+       DMEMIT("act_stripes_max=%d\n",
+              atomic_read(&rs->sc.active_stripes_max));
+
+       for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
+               DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+
+       DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
+       DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
+              atomic_read(&rs->sc.stripes), rs->set.io_size,
+              rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
+              sc_size(rs));
+
+       /* Recovery time so far, or total if recovery has an end stamp. */
+       j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+           rec->start_jiffies;
+       jiffies_to_timespec(j, &ts);
+
+       /*
+        * Zero pad the nanoseconds to their full 9 digits: only then
+        * does cutting the string 2 digits behind the dot yield
+        * hundredths of a second and stay within the formatted string.
+        */
+       snprintf(buf, sizeof(buf), "%ld.%09ld", ts.tv_sec, ts.tv_nsec);
+       p = strchr(buf, '.');
+       p[3] = 0;
+
+       DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
+              (unsigned long long) rec->nr_regions_recovered,
+              (unsigned long long) rec->nr_regions_to_recover,
+              (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+
+       *size = sz;
+}
+
+/*
+ * Status output of a RAID set.
+ *
+ * STATUSTYPE_INFO emits (after the optional development statistics):
+ * the number of RAID devices, each device's major:minor, "2", one
+ * 'A'/'D' character per device depending on DevFailed() (followed by
+ * 'p' for the parity device and 'i' for the device to initialize),
+ * the dirty log's sync count / number of regions and finally the
+ * dirty log's own status.
+ *
+ * STATUSTYPE_TABLE emits the dirty log's table status, RAID type
+ * name, number of variable RAID parameters followed by these, number
+ * of RAID devices, device to initialize and one device/offset pair
+ * per RAID device.
+ *
+ * Always returns 0.
+ */
+static int raid_status(struct dm_target *ti, status_type_t type,
+                      char *result, unsigned maxlen)
+{
+       unsigned p, sz = 0;     /* DMEMIT() works on sz/result/maxlen. */
+       char buf[BDEVNAME_SIZE];
+       struct raid_set *rs = ti->private;
+       struct dm_dirty_log *dl = rs->recover.dl;
+       /*
+        * Variable RAID parameters in constructor argument order;
+        * -2 marks the slot of the "sync"/"nosync" string argument.
+        */
+       int raid_parms[] = {
+               rs->set.chunk_size_parm,
+               rs->sc.stripes_parm,
+               rs->set.io_size_parm,
+               rs->recover.io_size_parm,
+               rs->recover.bandwidth_parm,
+               -2,
+               rs->recover.recovery_stripes,
+       };
+
+       switch (type) {
+       case STATUSTYPE_INFO:
+               /* REMOVEME: statistics. */
+               if (RSDevelStats(rs))
+                       raid_devel_stats(ti, result, &sz, maxlen);
+
+               DMEMIT("%u ", rs->set.raid_devs);
+
+               for (p = 0; p < rs->set.raid_devs; p++)
+                       DMEMIT("%s ",
+                              format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
+
+               DMEMIT("2 ");
+               for (p = 0; p < rs->set.raid_devs; p++) {
+                       /* 'A' unless DevFailed() reports the device, then 'D'. */
+                       DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
+
+                       if (p == rs->set.pi)
+                               DMEMIT("p");
+
+                       if (p == rs->set.dev_to_init)
+                               DMEMIT("i");
+               }
+
+               DMEMIT(" %llu/%llu ",
+                     (unsigned long long) dl->type->get_sync_count(dl),
+                     (unsigned long long) rs->recover.nr_regions);
+
+               /* Let the dirty log append its status behind ours. */
+               sz += dl->type->status(dl, type, result+sz, maxlen-sz);
+               break;
+       case STATUSTYPE_TABLE:
+               /* The dirty log's table line goes first. */
+               sz = rs->recover.dl->type->status(rs->recover.dl, type,
+                                                 result, maxlen);
+               DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
+
+               /* Only as many parameters as the constructor was given. */
+               for (p = 0; p < rs->set.raid_parms; p++) {
+                       if (raid_parms[p] > -2)
+                               DMEMIT("%d ", raid_parms[p]);
+                       else
+                               DMEMIT("%s ", rs->recover.recovery ?
+                                             "sync" : "nosync");
+               }
+
+               DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+
+               for (p = 0; p < rs->set.raid_devs; p++)
+                       DMEMIT("%s %llu ",
+                              format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
+                              (unsigned long long) rs->dev[p].start);
+       }
+
+       return 0;
+}
+
+/*
+ * Message interface
+ */
+/*
+ * Turn a delta into an absolute value.
+ *
+ * @action selects how @r relates to the actual value @act:
+ * "set" returns @r as is, "grow" returns @r + @act and "shrink"
+ * returns @act - @r. @action may be abbreviated, but at least
+ * its first 2 characters are compared.
+ *
+ * Returns -EINVAL for any other @action.
+ */
+static int _absolute(char *action, int act, int r)
+{
+       size_t n = strlen(action);
+
+       if (n < 2)
+               n = 2;
+
+       if (strncmp("set", action, n) == 0)
+               return r;
+
+       if (strncmp("grow", action, n) == 0)
+               return r + act;
+
+       if (strncmp("shrink", action, n) == 0)
+               return act - r;
+
+       return -EINVAL;
+}
+
+/*
+ * Change recovery io bandwidth.
+ *
+ * @argv[0] is the action handed to _absolute() and @argv[1] the
+ * bandwidth (delta); both the value as given and the resulting
+ * absolute bandwidth have to be within BANDWIDTH_MIN and
+ * BANDWIDTH_MAX.
+ *
+ * Returns 0 if the bandwidth got set, else -EINVAL.
+ */
+static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
+                           enum raid_set_flags flag)
+{
+       int cur = rs->recover.bandwidth, bw;
+
+       if (argc != 2)
+               return -EINVAL;
+
+       if (sscanf(argv[1], "%d", &bw) != 1 ||
+           !range_ok(bw, BANDWIDTH_MIN, BANDWIDTH_MAX))
+               return -EINVAL;
+
+       /* Make delta bandwidth absolute and check its range again. */
+       bw = _absolute(argv[0], cur, bw);
+       if (!range_ok(bw, BANDWIDTH_MIN, BANDWIDTH_MAX))
+               return -EINVAL;
+
+       recover_set_bandwidth(rs, bw);
+       return 0;
+}
+
+/*
+ * Set/reset development feature flags.
+ *
+ * @argv[0] is "on", "off" or "reset" (may be abbreviated, but at
+ * least 2 characters are compared):
+ * o "on":    set @flag; -EPERM if it was set already
+ * o "off":   clear @flag; -EPERM if it was clear already
+ * o "reset": for RS_DEVEL_STATS reset the statistics (-EPERM if
+ *            the flag isn't set); for any other flag, set it
+ *
+ * Returns 0 on success, -EINVAL on bad arguments.
+ */
+static int devel_flags(struct raid_set *rs, int argc, char **argv,
+                      enum raid_set_flags flag)
+{
+       const char *cmd;
+       size_t n;
+
+       if (argc != 1)
+               return -EINVAL;
+
+       cmd = argv[0];
+       n = strlen(cmd);
+       if (n < 2)
+               n = 2;
+
+       if (!strncmp(cmd, "on", n))
+               return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
+
+       if (!strncmp(cmd, "off", n))
+               return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
+
+       if (strncmp(cmd, "reset", n))
+               return -EINVAL;
+
+       if (flag != RS_DEVEL_STATS) {
+               set_bit(flag, &rs->io.flags);
+               return 0;
+       }
+
+       if (!test_bit(flag, &rs->io.flags))
+               return -EPERM;
+
+       stats_reset(rs);
+       return 0;
+}
+
+/*
+ * Resize the stripe cache.
+ *
+ * @argv[0] is the action handed to _absolute() and @argv[1] the
+ * (delta) number of stripes, which has to be > 0. The resulting
+ * absolute number has to be within STRIPES_MIN and STRIPES_MAX and
+ * differ from the actual one. The resize itself is left to the
+ * worker, which gets the target posted in sc.stripes_to_set.
+ *
+ * Returns 0 if a resize got requested, -EPERM while a previous
+ * request is still pending, else -EINVAL.
+ */
+static int sc_resize(struct raid_set *rs, int argc, char **argv,
+                    enum raid_set_flags flag)
+{
+       int cur, want;
+
+       if (argc != 2)
+               return -EINVAL;
+
+       /* Deny permission in case the daemon is still resizing!. */
+       if (atomic_read(&rs->sc.stripes_to_set))
+               return -EPERM;
+
+       if (sscanf(argv[1], "%d", &want) != 1 || want <= 0)
+               return -EINVAL;
+
+       /* Make delta stripes absolute. */
+       cur = atomic_read(&rs->sc.stripes);
+       want = _absolute(argv[0], cur, want);
+
+       /* Check range and that the # of stripes changes. */
+       if (!range_ok(want, STRIPES_MIN, STRIPES_MAX) ||
+           want == atomic_read(&rs->sc.stripes))
+               return -EINVAL;
+
+       /* We leave the resizing to the worker. */
+       atomic_set(&rs->sc.stripes_to_set, want);
+       wake_do_raid(rs);
+       return 0;
+}
+
+/*
+ * Change xor algorithm and number of chunks.
+ *
+ * @argv[0] names an entry of xor_funcs[] and @argv[1] the number of
+ * chunks, which has to be within 2 and XOR_CHUNKS_MAX and must not
+ * exceed the number of RAID devices. With xor_blocks_wrapper, at most
+ * MAX_XOR_BLOCKS + 1 chunks are accepted.
+ *
+ * On success, the new settings are stored under the xor lock and the
+ * xor speed is measured on a temporary stripe, if one can be
+ * allocated; otherwise the speed stays 0.
+ *
+ * Returns 0 on success, else -EINVAL.
+ */
+static int xor_set(struct raid_set *rs, int argc, char **argv,
+                  enum raid_set_flags flag)
+{
+       int chunks;
+       char *algorithm;
+       struct xor_func *f = ARRAY_END(xor_funcs);
+
+       if (argc != 2)
+               return -EINVAL;
+
+       algorithm = argv[0];
+       if (sscanf(argv[1], "%d", &chunks) != 1 ||
+           !range_ok(chunks, 2, XOR_CHUNKS_MAX) ||
+           chunks > rs->set.raid_devs)
+               return -EINVAL;
+
+       while (f-- > xor_funcs) {
+               unsigned io_size = 0;
+               struct stripe *stripe;
+
+               if (strcmp(algorithm, f->name))
+                       continue;
+
+               DMINFO("xor: %s", f->name);
+               if (f->f == xor_blocks_wrapper &&
+                   chunks > MAX_XOR_BLOCKS + 1) {
+                       DMERR("chunks > MAX_XOR_BLOCKS"
+                             " + 1");
+                       break;
+               }
+
+               /*
+                * Allocate the scratch stripe only now that all checks
+                * passed, so that no failure path has to free it.
+                */
+               stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client,
+                                     SC_GROW);
+
+               mutex_lock(&rs->io.xor_lock);
+               rs->xor.f = f;
+               rs->xor.chunks = chunks;
+               rs->xor.speed = 0;
+               mutex_unlock(&rs->io.xor_lock);
+
+               if (stripe) {
+                       rs->xor.speed = xor_speed(stripe);
+                       io_size = stripe->io.size;
+                       stripe_free(stripe, rs->sc.mem_cache_client);
+               }
+
+               rs_log(rs, io_size);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Allow writes after they got prohibited because of a device failure.
+ *
+ * This needs to be called after userspace updated metadata state
+ * based on an event being thrown during device failure processing.
+ */
+static int allow_writes(struct raid_set *rs, int argc, char **argv,
+                       enum raid_set_flags flag)
+{
+       /* Nothing to re-enable if writes were not prohibited. */
+       if (!TestClearRSProhibitWrites(rs))
+               return -EPERM;
+
+       DMINFO("%s waking", __func__);
+       wake_do_raid(rs);
+       return 0;
+}
+
+/* Parse the RAID message. */
+/*
+ * The keyword in argv[0] is compared against the handler names over
+ * max(strlen(argv[0]), 3) characters, i.e. a keyword may be abbreviated
+ * down to, but not below, its first 3 characters:
+ *
+ * 'all[ow_writes]'
+ * 'ban[dwidth] {se[t],g[row],sh[rink]} #'     # e.g 'ban se 50'
+ * 'ove[rwrite]  {on,of[f],r[eset]}'           # e.g. 'ove of'
+ * 'sta[tistics] {on,of[f],r[eset]}'           # e.g. 'stat of'
+ * 'str[ipe_cache] {se[t],g[row],sh[rink]} #'  # e.g. 'stripe set 1024'
+ * 'xor algorithm #chunks'                     # e.g. 'xor xor_8 5'
+ *
+ * The remaining arguments (argc - 1, argv + 1) are handed to the handler
+ * together with the flag from the table.
+ *
+ * Returns the handler's result, or -EINVAL for an empty message or an
+ * unknown keyword.
+ */
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+       /* Immutable dispatch table; set up once rather than on each call. */
+       static const struct {
+               const char *name;
+               int (*f) (struct raid_set *rs, int argc, char **argv,
+                         enum raid_set_flags flag);
+               enum raid_set_flags flag;
+       } msg_descr[] = {
+               { "allow_writes", allow_writes, 0 },
+               { "bandwidth", bandwidth_change, 0 },
+               { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
+               { "statistics", devel_flags, RS_DEVEL_STATS },
+               { "stripe_cache", sc_resize, 0 },
+               { "xor", xor_set, 0 },
+       };
+       struct raid_set *rs;
+       size_t len;
+       unsigned i = ARRAY_SIZE(msg_descr);
+
+       if (!argc)
+               return -EINVAL;
+
+       rs = ti->private;
+
+       /* Compare at least 3 characters to tell "sta..." from "str...". */
+       len = strlen(argv[0]);
+       if (len < 3)
+               len = 3;
+
+       /* Walk the table from its end, as before. */
+       while (i--) {
+               if (!strncmp(argv[0], msg_descr[i].name, len))
+                       return msg_descr[i].f(rs, argc - 1, argv + 1,
+                                             msg_descr[i].flag);
+       }
+
+       return -EINVAL;
+}
+/*
+ * END message interface
+ */
+
+/*
+ * Provide io hints.
+ *
+ * Sets the minimum io size to the set's chunk size and the optimal io
+ * size to the chunk size multiplied by the number of data devices.
+ *
+ * NOTE(review): blk_limits_io_min()/blk_limits_io_opt() presumably take
+ * byte counts -- confirm that rs->set.chunk_size is not kept in sectors.
+ */
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+       struct raid_set *rs = ti->private;
+
+       blk_limits_io_min(limits, rs->set.chunk_size);
+       blk_limits_io_opt(limits, rs->set.chunk_size * rs->set.data_devs);
+}
+
+/*
+ * Target type description for the "raid45" target, version 1.0.0.
+ * It is registered in dm_raid_init() and unregistered in dm_raid_exit()
+ * and wires up the constructor/destructor, map, suspend/resume, status,
+ * message and io_hints methods defined in this file.
+ */
+static struct target_type raid_target = {
+       .name = "raid45",
+       .version = {1, 0, 0},
+       .module = THIS_MODULE,
+       .ctr = raid_ctr,
+       .dtr = raid_dtr,
+       .map = raid_map,
+       .presuspend = raid_presuspend,
+       .postsuspend = raid_postsuspend,
+       .resume = raid_resume,
+       .status = raid_status,
+       .message = raid_message,
+       .io_hints = raid_io_hints,
+};
+
+/*
+ * Log the result of a target (un)registration.
+ *
+ * On r == 0, good_msg is logged together with the version string;
+ * otherwise an error is logged with bad_msg placed in front of
+ * "register" and the error code appended.
+ */
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+       if (!r) {
+               DMINFO("%s %s", good_msg, version);
+               return;
+       }
+
+       DMERR("Failed to %sregister target [%d]", bad_msg, r);
+}
+
+/* Module init: register the target and log the outcome. */
+static int __init dm_raid_init(void)
+{
+       int r;
+
+       r = dm_register_target(&raid_target);
+       init_exit("", "initialized", r);
+
+       /* Hand the registration result back to the module loader. */
+       return r;
+}
+
+/* Module exit: unregister the target and log the exit message. */
+static void __exit dm_raid_exit(void)
+{
+       dm_unregister_target(&raid_target);
+       /* r == 0 selects the DMINFO branch of init_exit(); "un" is unused. */
+       init_exit("un", "exit", 0);
+}
+
+/* Module hooks. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
diff --git a/drivers/md/dm-raid45.h b/drivers/md/dm-raid45.h

new file mode 100644 (file)

index 0000000..a55ee2e
--- /dev/null
+++ b/drivers/md/dm-raid45.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen (Mauelshagen@RedHat.com)
+ *
+ * Locking definitions for the device-mapper RAID45 target.
+ *
+ * This file is released under the GPL.
+ *
+ */
+
+#ifndef _DM_RAID45_H
+#define _DM_RAID45_H
+
+/* Factor out to dm.h! */
+/*
+ * Expands to three comma-separated expressions: ptr, str, strlen(ptr).
+ * Presumably meant to be pasted as the argument list of a strncmp()-like
+ * call (confirm against the users). Note that ptr is evaluated twice.
+ */
+#define        STR_LEN(ptr, str)       (ptr), (str), strlen((ptr))
+/* Reference to array end (one past the last element of array a). */
+#define ARRAY_END(a)    ((a) + ARRAY_SIZE(a))
+
+/*
+ * Lock type handed to dm_raid45_locking_type->lock().
+ * Presumably exclusive vs. shared access, going by the names -- confirm.
+ */
+enum dm_lock_type { DM_RAID45_EX, DM_RAID45_SHARED };
+
+/* Pair of methods implementing a stripe locking scheme. */
+struct dm_raid45_locking_type {
+       /*
+        * Request a lock on a stripe.
+        * Takes a sector key and the lock type; returns an opaque pointer.
+        * Presumably that pointer is the handle to pass to unlock() --
+        * confirm against the implementations.
+        */
+       void* (*lock)(sector_t key, enum dm_lock_type type);
+
+       /* Release a lock on a stripe; takes an opaque lock handle. */
+       void (*unlock)(void *lock_handle);
+};
+
+#endif
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c

index 7771ed2..0b29f36 100644 (file)
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -113,10 +113,11 @@ struct dm_region {
  /*
   * Conversion fns
   */
-static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
+region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
  {
         return sector >> rh->region_shift;
  }
+EXPORT_SYMBOL_GPL(dm_rh_sector_to_region);
  
  sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
  {
@@ -496,7 +497,7 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
  }
  EXPORT_SYMBOL_GPL(dm_rh_update_states);
  
-static void rh_inc(struct dm_region_hash *rh, region_t region)
+void dm_rh_inc(struct dm_region_hash *rh, region_t region)
  {
         struct dm_region *reg;
  
@@ -518,6 +519,7 @@ static void rh_inc(struct dm_region_hash *rh, region_t region)
  
         read_unlock(&rh->hash_lock);
  }
+EXPORT_SYMBOL_GPL(dm_rh_inc);
  
  void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
  {
@@ -526,7 +528,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
         for (bio = bios->head; bio; bio = bio->bi_next) {
                 if (bio->bi_rw & REQ_FLUSH)
                         continue;
-               rh_inc(rh, dm_rh_bio_to_region(rh, bio));
+               dm_rh_inc(rh, dm_rh_bio_to_region(rh, bio));
         }
  }
  EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
@@ -694,6 +696,28 @@ void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
  }
  EXPORT_SYMBOL_GPL(dm_rh_delay);
  
+/*
+ * Queue @bio on the delayed bio list of @region.
+ *
+ * The region is looked up via __rh_find() with hash_lock read-held.
+ */
+void dm_rh_delay_by_region(struct dm_region_hash *rh,
+                          struct bio *bio, region_t region)
+{
+       struct dm_region *entry;
+
+       /*
+        * FIXME: locking.
+        * The delayed bio list is modified while only the read side of
+        * hash_lock is held.
+        */
+       read_lock(&rh->hash_lock);
+       entry = __rh_find(rh, region);
+       bio_list_add(&entry->delayed_bios, bio);
+       read_unlock(&rh->hash_lock);
+}
+EXPORT_SYMBOL_GPL(dm_rh_delay_by_region);
+
  void dm_rh_stop_recovery(struct dm_region_hash *rh)
  {
         int i;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c

index 2e227fb..8ba897f 100644 (file)
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -438,14 +438,19 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
  
         dd_new = dd_old = *dd;
  
-       dd_new.dm_dev.mode |= new_mode;
+       dd_new.dm_dev.mode = new_mode;
         dd_new.dm_dev.bdev = NULL;
  
         r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
-       if (r)
+       if (r == -EROFS) {
+               dd_new.dm_dev.mode &= ~FMODE_WRITE;
+               r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
+       }
+       /* Bail out only if the (possibly read-only) reopen failed. */
+       if (r)
                 return r;
  
-       dd->dm_dev.mode |= new_mode;
+       dd->dm_dev.mode = new_mode;
         close_dev(&dd_old, md);
  
         return 0;
@@ -491,17 +495,25 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
                 dd->dm_dev.mode = mode;
                 dd->dm_dev.bdev = NULL;
  
-               if ((r = open_dev(dd, dev, t->md))) {
+               r = open_dev(dd, dev, t->md);
+               if (r == -EROFS) {
+                       dd->dm_dev.mode &= ~FMODE_WRITE;
+                       r = open_dev(dd, dev, t->md);
+               }
+               if (r) {
                         kfree(dd);
                         return r;
                 }
  
+               if (dd->dm_dev.mode != mode)
+                       t->mode = dd->dm_dev.mode;
+
                 format_dev_t(dd->dm_dev.name, dev);
  
                 atomic_set(&dd->count, 0);
                 list_add(&dd->list, &t->devices);
  
-       } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) {
+       } else if (dd->dm_dev.mode != mode) {
                 r = upgrade_mode(dd, mode, t->md);
                 if (r)
                         return r;
@@ -554,9 +566,12 @@ EXPORT_SYMBOL_GPL(dm_set_device_limits);
   */
  void dm_put_device(struct dm_target *ti, struct dm_dev *d)
  {
-       struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal,
-                                                 dm_dev);
+       struct dm_dev_internal *dd;
+
+       if (!d)
+               return;
  
+       dd = container_of(d, struct dm_dev_internal, dm_dev);
         if (atomic_dec_and_test(&dd->count)) {
                 close_dev(dd, ti->table->md);
                 list_del(&dd->list);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c

index e24143c..f3fd72d 100644 (file)
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -343,16 +343,25 @@ int dm_deleting_md(struct mapped_device *md)
  static int dm_blk_open(struct block_device *bdev, fmode_t mode)
  {
         struct mapped_device *md;
+       int retval = 0;
  
         spin_lock(&_minor_lock);
  
         md = bdev->bd_disk->private_data;
-       if (!md)
+       if (!md) {
+               retval = -ENXIO;
                 goto out;
+       }
  
         if (test_bit(DMF_FREEING, &md->flags) ||
             dm_deleting_md(md)) {
                 md = NULL;
+               retval = -ENXIO;
+               goto out;
+       }
+       if (get_disk_ro(md->disk) && (mode & FMODE_WRITE)) {
+               md = NULL;
+               retval = -EROFS;
                 goto out;
         }
  
@@ -362,7 +371,7 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
  out:
         spin_unlock(&_minor_lock);
  
-       return md ? 0 : -ENXIO;
+       return retval;
  }
  
  static int dm_blk_close(struct gendisk *disk, fmode_t mode)
@@ -421,19 +430,25 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
         if (!map || !dm_table_get_size(map))
                 goto out;
  
-       /* We only support devices that have a single target */
-       if (dm_table_get_num_targets(map) != 1)
-               goto out;
-
-       tgt = dm_table_get_target(map, 0);
-
         if (dm_suspended_md(md)) {
                 r = -EAGAIN;
                 goto out;
         }
  
-       if (tgt->type->ioctl)
-               r = tgt->type->ioctl(tgt, cmd, arg);
+       if (cmd == BLKRRPART) {
+               /* Emulate Re-read partitions table */
+               kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+               r = 0;
+       } else {
+               /* We only support devices that have a single target */
+               if (dm_table_get_num_targets(map) != 1)
+                       goto out;
+
+               tgt = dm_table_get_target(map, 0);
+
+               if (tgt->type->ioctl)
+                       r = tgt->type->ioctl(tgt, cmd, arg);
+       }
  
  out:
         dm_table_put(map);
@@ -2097,6 +2112,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
                 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
         write_unlock_irqrestore(&md->map_lock, flags);
  
+       dm_table_get(md->map);
+       if (!(dm_table_get_mode(t) & FMODE_WRITE))
+               set_disk_ro(md->disk, 1);
+       else
+               set_disk_ro(md->disk, 0);
+       dm_table_put(md->map);
+
         return old_map;
  }
  
@@ -2653,6 +2675,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
  {
         return md->disk;
  }
+EXPORT_SYMBOL_GPL(dm_disk);
  
  struct kobject *dm_kobject(struct mapped_device *md)
  {
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig

index c779509..5f510a7 100644 (file)
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -136,7 +136,7 @@ config PHANTOM
  
  config INTEL_MID_PTI
         tristate "Parallel Trace Interface for MIPI P1149.7 cJTAG standard"
-       depends on PCI
+       depends on X86_INTEL_MID
         default n
         help
           The PTI (Parallel Trace Interface) driver directs
@@ -427,7 +427,7 @@ config TI_DAC7512
  
  config VMWARE_BALLOON
         tristate "VMware Balloon Driver"
-       depends on X86
+       depends on X86 && !XEN
         help
           This is VMware physical memory management driver which acts
           like a "balloon" that can be inflated to reclaim physical pages
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig

index b982854..821b8b6 100644 (file)
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -295,9 +295,9 @@ source "drivers/net/wimax/Kconfig"
  
  source "drivers/net/wan/Kconfig"
  
-config XEN_NETDEV_FRONTEND
+config PARAVIRT_XEN_NETDEV_FRONTEND
         tristate "Xen network device frontend driver"
-       depends on XEN
+       depends on PARAVIRT_XEN
         select XEN_XENBUS_FRONTEND
         default y
         help
@@ -306,15 +306,15 @@ config XEN_NETDEV_FRONTEND
           domain 0).
  
           The corresponding Linux backend driver is enabled by the
-         CONFIG_XEN_NETDEV_BACKEND option.
+         PARAVIRT_XEN_NETDEV_BACKEND option.
  
           If you are compiling a kernel for use as Xen guest, you
           should say Y here. To compile this driver as a module, chose
           M here: the module will be called xen-netfront.
  
-config XEN_NETDEV_BACKEND
+config PARAVIRT_XEN_NETDEV_BACKEND
         tristate "Xen backend network device"
-       depends on XEN_BACKEND
+       depends on PARAVIRT_XEN_BACKEND
         help
           This driver allows the kernel to act as a Xen network driver
           domain which exports paravirtual network devices to other
@@ -322,7 +322,7 @@ config XEN_NETDEV_BACKEND
           system that implements a compatible front end.
  
           The corresponding Linux frontend driver is enabled by the
-         CONFIG_XEN_NETDEV_FRONTEND configuration option.
+         PARAVIRT_XEN_NETDEV_FRONTEND configuration option.
  
           The backend driver presents a standard network device
           endpoint for each paravirtual network device to the driver
@@ -336,7 +336,7 @@ config XEN_NETDEV_BACKEND
  
  config VMXNET3
         tristate "VMware VMXNET3 ethernet driver"
-       depends on PCI && INET
+       depends on PCI && INET && !XEN
         help
           This driver supports VMware's vmxnet3 virtual ethernet NIC.
           To compile this driver as a module, choose M here: the
diff --git a/drivers/net/Makefile b/drivers/net/Makefile

index a6b8ce1..5fdab97 100644 (file)
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -56,8 +56,8 @@ obj-$(CONFIG_WLAN) += wireless/
  obj-$(CONFIG_WIMAX) += wimax/
  
  obj-$(CONFIG_VMXNET3) += vmxnet3/
-obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
-obj-$(CONFIG_XEN_NETDEV_BACKEND) += xen-netback/
+obj-$(CONFIG_PARAVIRT_XEN_NETDEV_FRONTEND) += xen-netfront.o
+obj-$(CONFIG_PARAVIRT_XEN_NETDEV_BACKEND) += xen-netback/
  
  obj-$(CONFIG_USB_CATC)          += usb/
  obj-$(CONFIG_USB_KAWETH)        += usb/
diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c

index abb6ce7..b77398f 100644 (file)
--- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c
@@ -3313,7 +3313,17 @@ static int __devinit init_one(struct pci_dev *pdev,
          * register at least one net device.
          */
         for_each_port(adapter, i) {
+#ifndef CONFIG_XEN
                 err = register_netdev(adapter->port[i]);
+#else
+               rtnl_lock();
+               err = register_netdevice(adapter->port[i]);
+               if (!err) {
+                       adapter->port[i]->wanted_features &= ~NETIF_F_GRO;
+                       netdev_update_features(adapter->port[i]);
+               }
+               rtnl_unlock();
+#endif
                 if (err)
                         dev_warn(&pdev->dev,
                                  "cannot register net device %s, skipping\n",
diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c

index cfb60e1..d8336b8 100644 (file)
--- a/drivers/net/ethernet/chelsio/cxgb3/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c
@@ -59,11 +59,24 @@
   * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
   * directly.
   */
+#ifndef CONFIG_XEN
  #define FL0_PG_CHUNK_SIZE  2048
+#else
+/* Use skbuffs for XEN kernels. LRO is already disabled */
+#define FL0_PG_CHUNK_SIZE  0
+#endif
+
  #define FL0_PG_ORDER 0
  #define FL0_PG_ALLOC_SIZE (PAGE_SIZE << FL0_PG_ORDER)
+
+#ifndef CONFIG_XEN
  #define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)
  #define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)
+#else
+#define FL1_PG_CHUNK_SIZE 0
+#define FL1_PG_ORDER 0
+#endif
+
  #define FL1_PG_ALLOC_SIZE (PAGE_SIZE << FL1_PG_ORDER)
  
  #define SGE_RX_DROP_THRES 16
@@ -1268,7 +1281,27 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
  
         gen = q->gen;
         q->unacked += ndesc;
+#ifdef CONFIG_XEN
+       /*
+        * Some Guest OS clients get terrible performance when they have bad
+        * message size / socket send buffer space parameters.  For instance,
+        * if an application selects an 8KB message size and an 8KB send
+        * socket buffer size.  This forces the application into a single
+        * packet stop-and-go mode where it's only willing to have a single
+        * message outstanding.  The next message is only sent when the
+        * previous message is noted as having been sent.  Until we issue a
+        * kfree_skb() against the TX skb, the skb is charged against the
+        * application's send buffer space.  We only free up TX skbs when we
+        * get a TX credit return from the hardware / firmware which is fairly
+        * lazy about this.  So we request a TX WR Completion Notification on
+        * every TX descriptor in order to accelerate TX credit returns.  See
+        * also the change in handle_rsp_cntrl_info() to free up TX skb's when
+        * we receive the TX WR Completion Notifications ...
+        */
+       compl = F_WR_COMPL;
+#else
         compl = (q->unacked & 8) << (S_WR_COMPL - 3);
+#endif
         q->unacked &= 7;
         pidx = q->pidx;
         q->pidx += ndesc;
@@ -2154,8 +2187,35 @@ static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
  #endif
  
         credits = G_RSPD_TXQ0_CR(flags);
-       if (credits)
+       if (credits) {
                 qs->txq[TXQ_ETH].processed += credits;
+#ifdef CONFIG_XEN
+               /*
+                * In the normal Linux driver t3_eth_xmit() routine, we call
+                * skb_orphan() on unshared TX skb.  This results in a call to
+                * the destructor for the skb which frees up the send buffer
+                * space it was holding down.  This, in turn, allows the
+                * application to make forward progress generating more data
+                * which is important at 10Gb/s.  For Virtual Machine Guest
+                * Operating Systems this doesn't work since the send buffer
+                * space is being held down in the Virtual Machine.  Thus we
+                * need to get the TX skb's freed up as soon as possible in
+                * order to prevent applications from stalling.
+                *
+                * This code is largely copied from the corresponding code in
+                * sge_timer_tx() and should probably be kept in sync with any
+                * changes there.
+                */
+               if (__netif_tx_trylock(qs->tx_q)) {
+                       struct port_info *pi = netdev_priv(qs->netdev);
+                       struct adapter *adap = pi->adapter;
+
+                       reclaim_completed_tx(adap, &qs->txq[TXQ_ETH],
+                               TX_RECLAIM_CHUNK);
+                       __netif_tx_unlock(qs->tx_q);
+               }
+#endif
+       }
  
         credits = G_RSPD_TXQ2_CR(flags);
         if (credits)
diff --git a/drivers/net/ethernet/chelsio/cxgb3/version.h b/drivers/net/ethernet/chelsio/cxgb3/version.h

index 165bfb9..8b90ed3 100644 (file)
--- a/drivers/net/ethernet/chelsio/cxgb3/version.h
+++ b/drivers/net/ethernet/chelsio/cxgb3/version.h
@@ -35,7 +35,11 @@
  #define DRV_DESC "Chelsio T3 Network Driver"
  #define DRV_NAME "cxgb3"
  /* Driver version */
+#ifndef CONFIG_XEN
  #define DRV_VERSION "1.1.5-ko"
+#else
+#define DRV_VERSION "1.1.5-xen-ko"
+#endif
  
  /* Firmware version */
  #define FW_VERSION_MAJOR 7
diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c

index fea3641..3ec086d 100644 (file)
--- a/drivers/net/ethernet/dec/tulip/tulip_core.c
+++ b/drivers/net/ethernet/dec/tulip/tulip_core.c
@@ -1937,6 +1937,10 @@ static void __devexit tulip_remove_one (struct pci_dev *pdev)
                 return;
  
         tp = netdev_priv(dev);
+
+       /* shoot NIC in the head before deallocating descriptors */
+       pci_disable_device(tp->pdev);
+
         unregister_netdev(dev);
         pci_free_consistent (pdev,
                              sizeof (struct tulip_rx_desc) * RX_RING_SIZE +
diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c

index f4d2da0..e44d164 100644 (file)
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -103,6 +103,19 @@ static int __devinit ehea_probe_adapter(struct platform_device *dev,
  
  static int __devexit ehea_remove(struct platform_device *dev);
  
+static struct of_device_id ehea_module_device_table[] = {
+       {
+               .name = "lhea",
+               .compatible = "IBM,lhea",
+       },
+       {
+               .type = "network",
+               .compatible = "IBM,lhea-ethernet",
+       },
+       {},
+};
+MODULE_DEVICE_TABLE(of, ehea_module_device_table);
+
  static struct of_device_id ehea_device_table[] = {
         {
                 .name = "lhea",
@@ -110,7 +123,6 @@ static struct of_device_id ehea_device_table[] = {
         },
         {},
  };
-MODULE_DEVICE_TABLE(of, ehea_device_table);
  
  static struct of_platform_driver ehea_driver = {
         .driver = {
diff --git a/drivers/net/wireless/b43/main.c b/drivers/net/wireless/b43/main.c

index e4d6dc2..9625592 100644 (file)
--- a/drivers/net/wireless/b43/main.c
+++ b/drivers/net/wireless/b43/main.c
@@ -2077,10 +2077,13 @@ static void b43_release_firmware(struct b43_wldev *dev)
  static void b43_print_fw_helptext(struct b43_wl *wl, bool error)
  {
         const char text[] =
-               "You must go to " \
-               "http://wireless.kernel.org/en/users/Drivers/b43#devicefirmware " \
-               "and download the correct firmware for this driver version. " \
-               "Please carefully read all instructions on this website.\n";
+               "Please open a terminal and enter the command " \
+               "\"sudo /usr/sbin/install_bcm43xx_firmware\" to download " \
+               "the correct firmware for this driver version. " \
+               "For an off-line installation, go to " \
+               "http://en.opensuse.org/HCL/Network_Adapters_(Wireless)/" \
+               "Broadcom_BCM43xx and follow the instructions in the " \
+               "\"Installing firmware from RPM packages\" section.\n";
  
         if (error)
                 b43err(wl, text);
diff --git a/drivers/net/xen-netback/Makefile b/drivers/net/xen-netback/Makefile

index e346e81..e3072eb 100644 (file)
--- a/drivers/net/xen-netback/Makefile
+++ b/drivers/net/xen-netback/Makefile
@@ -1,3 +1,3 @@
-obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
+obj-$(CONFIG_PARAVIRT_XEN_NETDEV_BACKEND) := xen-netback.o
  
  xen-netback-y := netback.o xenbus.o interface.o
diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c

index f34b5b2..48532f2 100644 (file)
--- a/drivers/oprofile/buffer_sync.c
+++ b/drivers/oprofile/buffer_sync.c
@@ -8,6 +8,10 @@
   * @author Barry Kasindorf
   * @author Robert Richter <robert.richter@amd.com>
   *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
   * This is the core of the buffer management. Each
   * CPU buffer is processed and entered into the
   * global event buffer. Such processing is necessary
@@ -43,6 +47,11 @@ static cpumask_var_t marked_cpus;
  static DEFINE_SPINLOCK(task_mortuary);
  static void process_task_mortuary(void);
  
+#ifdef CONFIG_XEN
+#include <linux/percpu.h>
+static DEFINE_PER_CPU(int, current_domain) = COORDINATOR_DOMAIN;
+#endif
+
  /* Take ownership of the task struct and place it on the
   * list for processing. Only after two full buffer syncs
   * does the task eventually get freed, because by then
@@ -61,7 +70,6 @@ task_free_notify(struct notifier_block *self, unsigned long val, void *data)
         return NOTIFY_OK;
  }
  
-
  /* The task is on its way out. A sync of the buffer means we can catch
   * any remaining samples for this task.
   */
@@ -151,6 +159,12 @@ static void free_all_tasks(void)
  int sync_start(void)
  {
         int err;
+#ifdef CONFIG_XEN
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu)
+               per_cpu(current_domain, cpu) = COORDINATOR_DOMAIN;
+#endif
  
         if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL))
                 return -ENOMEM;
@@ -287,14 +301,32 @@ static void add_cpu_switch(int i)
         last_cookie = INVALID_COOKIE;
  }
  
-static void add_kernel_ctx_switch(unsigned int in_kernel)
+static void add_cpu_mode_switch(unsigned int cpu_mode)
  {
         add_event_entry(ESCAPE_CODE);
-       if (in_kernel)
+       switch (cpu_mode) {
+       case CPU_MODE_USER:
+               add_event_entry(USER_ENTER_SWITCH_CODE);
+               break;
+       case CPU_MODE_KERNEL:
                 add_event_entry(KERNEL_ENTER_SWITCH_CODE);
-       else
-               add_event_entry(KERNEL_EXIT_SWITCH_CODE);
+               break;
+       case CPU_MODE_XEN:
+               add_event_entry(XEN_ENTER_SWITCH_CODE);
+               break;
+       default:
+               break;
+       }
+}
+
+#ifdef CONFIG_XEN
+static void add_domain_switch(unsigned long domain_id)
+{
+       add_event_entry(ESCAPE_CODE);
+       add_event_entry(DOMAIN_SWITCH_CODE);
+       add_event_entry(domain_id);
  }
+#endif
  
  static void
  add_user_ctx_switch(struct task_struct const *task, unsigned long cookie)
@@ -374,12 +406,12 @@ static inline void add_sample_entry(unsigned long offset, unsigned long event)
   * for later lookup from userspace. Return 0 on failure.
   */
  static int
-add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel)
+add_sample(struct mm_struct *mm, struct op_sample *s, int cpu_mode)
  {
         unsigned long cookie;
         off_t offset;
  
-       if (in_kernel) {
+       if (cpu_mode >= CPU_MODE_KERNEL) {
                 add_sample_entry(s->eip, s->event);
                 return 1;
         }
@@ -504,7 +536,7 @@ void sync_buffer(int cpu)
         unsigned long val;
         struct task_struct *new;
         unsigned long cookie = 0;
-       int in_kernel = 1;
+       int cpu_mode = CPU_MODE_KERNEL;
         sync_buffer_state state = sb_buffer_start;
         unsigned int i;
         unsigned long available;
@@ -516,6 +548,13 @@ void sync_buffer(int cpu)
  
         add_cpu_switch(cpu);
  
+#ifdef CONFIG_XEN
+       /* We need to assign the first samples in this CPU buffer to the
+          same domain that we were processing at the last sync_buffer */
+       if (per_cpu(current_domain, cpu) != COORDINATOR_DOMAIN)
+               add_domain_switch(per_cpu(current_domain, cpu));
+#endif
+
         op_cpu_buffer_reset(cpu);
         available = op_cpu_buffer_entries(cpu);
  
@@ -532,10 +571,10 @@ void sync_buffer(int cpu)
                         }
                         if (flags & KERNEL_CTX_SWITCH) {
                                 /* kernel/userspace switch */
-                               in_kernel = flags & IS_KERNEL;
+                               cpu_mode = flags & CPU_MODE_MASK;
                                 if (state == sb_buffer_start)
                                         state = sb_sample_start;
-                               add_kernel_ctx_switch(flags & IS_KERNEL);
+                               add_cpu_mode_switch(cpu_mode);
                         }
                         if (flags & USER_CTX_SWITCH
                             && op_cpu_buffer_get_data(&entry, &val)) {
@@ -548,16 +587,30 @@ void sync_buffer(int cpu)
                                         cookie = get_exec_dcookie(mm);
                                 add_user_ctx_switch(new, cookie);
                         }
+#ifdef CONFIG_XEN
+                       if ((flags & DOMAIN_SWITCH)
+                           && op_cpu_buffer_get_data(&entry, &val)) {
+                               per_cpu(current_domain, cpu) = val;
+                               add_domain_switch(val);
+                       }
+#endif
                         if (op_cpu_buffer_get_size(&entry))
                                 add_data(&entry, mm);
                         continue;
                 }
  
+#ifdef CONFIG_XEN
+               if (per_cpu(current_domain, cpu) != COORDINATOR_DOMAIN) {
+                       add_sample_entry(sample->eip, sample->event);
+                       continue;
+               }
+#endif
+
                 if (state < sb_bt_start)
                         /* ignore sample */
                         continue;
  
-               if (add_sample(mm, sample, in_kernel))
+               if (add_sample(mm, sample, cpu_mode))
                         continue;
  
                 /* ignore backtraces if failed to add a sample */
@@ -568,6 +621,12 @@ void sync_buffer(int cpu)
         }
         release_mm(mm);
  
+#ifdef CONFIG_XEN
+       /* We reset domain to COORDINATOR at each CPU switch */
+       if (per_cpu(current_domain, cpu) != COORDINATOR_DOMAIN)
+               add_domain_switch(COORDINATOR_DOMAIN);
+#endif
+
         mark_done(cpu);
  
         mutex_unlock(&buffer_mutex);
diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c

index b8ef8dd..b5e539e 100644 (file)
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -8,6 +8,10 @@
   * @author Barry Kasindorf <barry.kasindorf@amd.com>
   * @author Robert Richter <robert.richter@amd.com>
   *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
   * Each CPU has a local buffer that stores PC value/event
   * pairs. We also log context switches when we notice them.
   * Eventually each CPU's buffer is processed into the global
@@ -38,6 +42,12 @@ static void wq_sync_buffer(struct work_struct *work);
  #define DEFAULT_TIMER_EXPIRE (HZ / 10)
  static int work_enabled;
  
+#ifndef CONFIG_XEN
+#define current_domain COORDINATOR_DOMAIN
+#else
+static int32_t current_domain = COORDINATOR_DOMAIN;
+#endif
+
  unsigned long oprofile_get_cpu_buffer_size(void)
  {
         return oprofile_cpu_buffer_size;
@@ -75,7 +85,7 @@ int alloc_cpu_buffers(void)
                 struct oprofile_cpu_buffer *b = &per_cpu(op_cpu_buffer, i);
  
                 b->last_task = NULL;
-               b->last_is_kernel = -1;
+               b->last_cpu_mode = -1;
                 b->tracing = 0;
                 b->buffer_size = buffer_size;
                 b->sample_received = 0;
@@ -180,7 +190,7 @@ unsigned long op_cpu_buffer_entries(int cpu)
  
  static int
  op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
-           int is_kernel, struct task_struct *task)
+           int cpu_mode, struct task_struct *task)
  {
         struct op_entry entry;
         struct op_sample *sample;
@@ -193,16 +203,15 @@ op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
                 flags |= TRACE_BEGIN;
  
         /* notice a switch from user->kernel or vice versa */
-       is_kernel = !!is_kernel;
-       if (cpu_buf->last_is_kernel != is_kernel) {
-               cpu_buf->last_is_kernel = is_kernel;
-               flags |= KERNEL_CTX_SWITCH;
-               if (is_kernel)
-                       flags |= IS_KERNEL;
+       if (cpu_buf->last_cpu_mode != cpu_mode) {
+               cpu_buf->last_cpu_mode = cpu_mode;
+               flags |= KERNEL_CTX_SWITCH | cpu_mode;
         }
  
         /* notice a task switch */
-       if (cpu_buf->last_task != task) {
+       /* if not processing other domain samples */
+       if (cpu_buf->last_task != task &&
+           current_domain == COORDINATOR_DOMAIN) {
                 cpu_buf->last_task = task;
                 flags |= USER_CTX_SWITCH;
         }
@@ -251,14 +260,14 @@ op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
  /*
   * This must be safe from any context.
   *
- * is_kernel is needed because on some architectures you cannot
+ * cpu_mode is needed because on some architectures you cannot
   * tell if you are in kernel or user space simply by looking at
- * pc. We tag this in the buffer by generating kernel enter/exit
- * events whenever is_kernel changes
+ * pc. We tag this in the buffer by generating kernel/user (and
+ * xen) enter events whenever cpu_mode changes
   */
  static int
  log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
-          unsigned long backtrace, int is_kernel, unsigned long event,
+          unsigned long backtrace, int cpu_mode, unsigned long event,
            struct task_struct *task)
  {
         struct task_struct *tsk = task ? task : current;
@@ -269,7 +278,7 @@ log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
                 return 0;
         }
  
-       if (op_add_code(cpu_buf, backtrace, is_kernel, tsk))
+       if (op_add_code(cpu_buf, backtrace, cpu_mode, tsk))
                 goto fail;
  
         if (op_add_sample(cpu_buf, pc, event))
@@ -416,6 +425,20 @@ void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
         log_sample(cpu_buf, pc, 0, is_kernel, event, NULL);
  }
  
+#ifdef CONFIG_XEN
+/*
+ * This is basically log_sample(b, ESCAPE_CODE, 1, cpu_mode, CPU_TRACE_BEGIN),
+ * as was previously accessible through oprofile_add_pc().
+ */
+void oprofile_add_mode(int cpu_mode)
+{
+       struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
+
+       if (op_add_code(cpu_buf, 1, cpu_mode, current))
+               cpu_buf->sample_lost_overflow++;
+}
+#endif
+
  void oprofile_add_trace(unsigned long pc)
  {
         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(op_cpu_buffer);
@@ -440,6 +463,28 @@ fail:
         return;
  }
  
+#ifdef CONFIG_XEN
+int oprofile_add_domain_switch(int32_t domain_id)
+{
+       struct op_entry entry;
+       struct op_sample *sample;
+
+       sample = op_cpu_buffer_write_reserve(&entry, 1);
+       if (!sample)
+               return 0;
+
+       sample->eip = ESCAPE_CODE;
+       sample->event = DOMAIN_SWITCH;
+
+       op_cpu_buffer_add_data(&entry, domain_id);
+       op_cpu_buffer_write_commit(&entry);
+
+       current_domain = domain_id;
+
+       return 1;
+}
+#endif
+
  /*
   * This serves to avoid cpu buffer overflow, and makes sure
   * the task mortuary progresses
diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h

index e1d097e..07c8976 100644 (file)
--- a/drivers/oprofile/cpu_buffer.h
+++ b/drivers/oprofile/cpu_buffer.h
@@ -41,7 +41,7 @@ struct op_entry;
  struct oprofile_cpu_buffer {
         unsigned long buffer_size;
         struct task_struct *last_task;
-       int last_is_kernel;
+       int last_cpu_mode;
         int tracing;
         unsigned long sample_received;
         unsigned long sample_lost_overflow;
@@ -63,7 +63,7 @@ static inline void op_cpu_buffer_reset(int cpu)
  {
         struct oprofile_cpu_buffer *cpu_buf = &per_cpu(op_cpu_buffer, cpu);
  
-       cpu_buf->last_is_kernel = -1;
+       cpu_buf->last_cpu_mode = -1;
         cpu_buf->last_task = NULL;
  }
  
@@ -113,9 +113,13 @@ int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val)
  }
  
  /* extra data flags */
-#define KERNEL_CTX_SWITCH      (1UL << 0)
-#define IS_KERNEL              (1UL << 1)
+#define CPU_MODE_USER          0
+#define CPU_MODE_KERNEL                1
+#define CPU_MODE_XEN           2
+#define CPU_MODE_MASK          3
  #define TRACE_BEGIN            (1UL << 2)
  #define USER_CTX_SWITCH                (1UL << 3)
+#define KERNEL_CTX_SWITCH      (1UL << 4)
+#define DOMAIN_SWITCH          (1UL << 5)
  
  #endif /* OPROFILE_CPU_BUFFER_H */
diff --git a/drivers/oprofile/event_buffer.h b/drivers/oprofile/event_buffer.h

index a8d5bb3..bae7b80 100644 (file)
--- a/drivers/oprofile/event_buffer.h
+++ b/drivers/oprofile/event_buffer.h
@@ -30,6 +30,9 @@ void wake_up_buffer_waiter(void);
  #define INVALID_COOKIE ~0UL
  #define NO_COOKIE 0UL
  
+/* Constant used to refer to coordinator domain (Xen) */
+#define COORDINATOR_DOMAIN -1
+
  extern const struct file_operations event_buffer_fops;
  
  /* mutex between sync_cpu_buffers() and the
diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c

index ed2c3ec..5ece800 100644 (file)
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -5,6 +5,10 @@
   * @remark Read the file COPYING
   *
   * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
   */
  
  #include <linux/kernel.h>
@@ -35,6 +39,34 @@ static DEFINE_MUTEX(start_mutex);
   */
  static int timer = 0;
  
+#ifdef CONFIG_XEN
+int oprofile_set_active(int active_domains[], unsigned int adomains)
+{
+       int err;
+
+       if (!oprofile_ops.set_active)
+               return -EINVAL;
+
+       mutex_lock(&start_mutex);
+       err = oprofile_ops.set_active(active_domains, adomains);
+       mutex_unlock(&start_mutex);
+       return err;
+}
+
+int oprofile_set_passive(int passive_domains[], unsigned int pdomains)
+{
+       int err;
+
+       if (!oprofile_ops.set_passive)
+               return -EINVAL;
+
+       mutex_lock(&start_mutex);
+       err = oprofile_ops.set_passive(passive_domains, pdomains);
+       mutex_unlock(&start_mutex);
+       return err;
+}
+#endif
+
  int oprofile_setup(void)
  {
         int err;
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h

index d32ef81..a56d53d 100644 (file)
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -48,4 +48,7 @@ static inline int op_nmi_timer_init(struct oprofile_operations *ops)
  int oprofile_set_ulong(unsigned long *addr, unsigned long val);
  int oprofile_set_timeout(unsigned long time);
  
+int oprofile_set_active(int active_domains[], unsigned int adomains);
+int oprofile_set_passive(int passive_domains[], unsigned int pdomains);
+
  #endif /* OPROF_H */
diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c

index 84a208d..51d26ea 100644 (file)
--- a/drivers/oprofile/oprofile_files.c
+++ b/drivers/oprofile/oprofile_files.c
@@ -5,11 +5,17 @@
   * @remark Read the file COPYING
   *
   * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
   */
  
  #include <linux/fs.h>
  #include <linux/oprofile.h>
  #include <linux/jiffies.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
  
  #include "event_buffer.h"
  #include "oprofile_stats.h"
@@ -175,6 +181,141 @@ static const struct file_operations dump_fops = {
         .llseek         = noop_llseek,
  };
  
+#ifdef CONFIG_XEN
+#include <linux/slab.h>
+
+#define TMPBUFSIZE 512
+
+struct domain_data {
+    unsigned int nr;
+    int ids[MAX_OPROF_DOMAINS + 1];
+    struct mutex mutex;
+    int (*set)(int[], unsigned int);
+};
+#define DEFINE_DOMAIN_DATA(what) \
+       struct domain_data what##_domains = { \
+               .mutex = __MUTEX_INITIALIZER(what##_domains.mutex), \
+               .set = oprofile_set_##what \
+       }
+
+static ssize_t domain_write(struct file *filp, char const __user *buf,
+                           size_t count, loff_t *offset)
+{
+       struct domain_data *dom = filp->private_data;
+       char *tmpbuf;
+       char *startp, *endp;
+       int i;
+       unsigned long val;
+       ssize_t retval = count;
+
+       if (*offset)
+               return -EINVAL;
+       if (count > TMPBUFSIZE - 1)
+               return -EINVAL;
+
+       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+               return -ENOMEM;
+
+       if (copy_from_user(tmpbuf, buf, count)) {
+               kfree(tmpbuf);
+               return -EFAULT;
+       }
+       tmpbuf[count] = 0;
+
+       mutex_lock(&dom->mutex);
+
+       startp = tmpbuf;
+       /* Parse one more than MAX_OPROF_DOMAINS, for easy error checking */
+       for (i = 0; i <= MAX_OPROF_DOMAINS; i++) {
+               val = simple_strtoul(startp, &endp, 0);
+               if (endp == startp)
+                       break;
+               while (ispunct(*endp) || isspace(*endp))
+                       endp++;
+               dom->ids[i] = val;
+               if (dom->ids[i] != val)
+                       /* Overflow, force error below */
+                       i = MAX_OPROF_DOMAINS + 1;
+               startp = endp;
+       }
+       /* Force error on trailing junk */
+       dom->nr = *startp ? MAX_OPROF_DOMAINS + 1 : i;
+
+       kfree(tmpbuf);
+
+       if (dom->nr > MAX_OPROF_DOMAINS
+           || dom->set(dom->ids, dom->nr)) {
+               dom->nr = 0;
+               retval = -EINVAL;
+       }
+
+       mutex_unlock(&dom->mutex);
+       return retval;
+}
+
+static ssize_t domain_read(struct file *filp, char __user *buf,
+                           size_t count, loff_t *offset)
+{
+       struct domain_data *dom = filp->private_data;
+       char *tmpbuf;
+       size_t len;
+       int i;
+       ssize_t retval;
+
+       if (!(tmpbuf = kmalloc(TMPBUFSIZE, GFP_KERNEL)))
+               return -ENOMEM;
+
+       mutex_lock(&dom->mutex);
+
+       len = 0;
+       for (i = 0; i < dom->nr; i++)
+               len += snprintf(tmpbuf + len,
+                               len < TMPBUFSIZE ? TMPBUFSIZE - len : 0,
+                               "%u ", dom->ids[i]);
+       WARN_ON(len > TMPBUFSIZE);
+       if (len != 0 && len <= TMPBUFSIZE)
+               tmpbuf[len-1] = '\n';
+
+       mutex_unlock(&dom->mutex);
+
+       retval = simple_read_from_buffer(buf, count, offset, tmpbuf, len);
+
+       kfree(tmpbuf);
+       return retval;
+}
+
+static DEFINE_DOMAIN_DATA(active);
+
+static int adomain_open(struct inode *inode, struct file *filp)
+{
+       filp->private_data = &active_domains;
+       return 0;
+}
+
+static const struct file_operations active_domain_ops = {
+       .open           = adomain_open,
+       .read           = domain_read,
+       .write          = domain_write,
+       .llseek         = default_llseek,
+};
+
+static DEFINE_DOMAIN_DATA(passive);
+
+static int pdomain_open(struct inode *inode, struct file *filp)
+{
+       filp->private_data = &passive_domains;
+       return 0;
+}
+
+static const struct file_operations passive_domain_ops = {
+       .open           = pdomain_open,
+       .read           = domain_read,
+       .write          = domain_write,
+       .llseek         = default_llseek,
+};
+
+#endif /* CONFIG_XEN */
+
  void oprofile_create_files(struct super_block *sb, struct dentry *root)
  {
         /* reinitialize default values */
@@ -185,6 +326,10 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root)
  
         oprofilefs_create_file(sb, root, "enable", &enable_fops);
         oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666);
+#ifdef CONFIG_XEN
+       oprofilefs_create_file(sb, root, "active_domains", &active_domain_ops);
+       oprofilefs_create_file(sb, root, "passive_domains", &passive_domain_ops);
+#endif
         oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops);
         oprofilefs_create_ulong(sb, root, "buffer_size", &oprofile_buffer_size);
         oprofilefs_create_ulong(sb, root, "buffer_watershed", &oprofile_buffer_watershed);
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig

index 848bfb8..2d94485 100644 (file)
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -44,6 +44,27 @@ config PCI_REALLOC_ENABLE_AUTO
  
           When in doubt, say N.
  
+config PCI_GUESTDEV
+       bool "PCI Device Reservation for Passthrough"
+       depends on PCI && ACPI && XEN
+       default y
+       help
+         Say Y here if you want to reserve PCI device for passthrough.
+
+config PCI_IOMULTI
+       tristate "PCI Device IO Multiplex for Passthrough"
+       depends on PCI && ACPI && XEN
+       default y
+       help
+         Say Y here if you need io multiplexing.
+
+config PCI_RESERVE
+       bool "PCI IO/MEMORY space reserve"
+       depends on PCI && XEN_PRIVILEGED_GUEST
+       default y
+       help
+         Say Y here if you need PCI IO/MEMORY space reserve
+
  config PCI_STUB
         tristate "PCI Stub driver"
         depends on PCI
@@ -53,9 +74,9 @@ config PCI_STUB
  
           When in doubt, say N.
  
-config XEN_PCIDEV_FRONTEND
+config PARAVIRT_XEN_PCIDEV_FRONTEND
          tristate "Xen PCI Frontend"
-        depends on PCI && X86 && XEN
+        depends on PCI && X86 && PARAVIRT_XEN
          select HOTPLUG
          select PCI_XEN
         select XEN_XENBUS_FRONTEND
@@ -64,10 +85,30 @@ config XEN_PCIDEV_FRONTEND
            The PCI device frontend driver allows the kernel to import arbitrary
            PCI devices from a PCI backend to support PCI driver domains.
  
+config XEN_PCIDEV_FRONTEND
+       def_bool y
+       prompt "Xen PCI Frontend" if X86_64 && !XEN_UNPRIVILEGED_GUEST
+       depends on PCI && XEN && (PCI_GOXEN_FE || PCI_GOANY || X86_64)
+       select HOTPLUG
+       help
+         The PCI device frontend driver allows the kernel to import arbitrary
+         PCI devices from a PCI backend to support PCI driver domains.
+
+config XEN_PCIDEV_FE_DEBUG
+        bool "Xen PCI Frontend debugging"
+        depends on XEN_PCIDEV_FRONTEND
+       help
+         Say Y here if you want the Xen PCI frontend to produce a bunch of debug
+         messages to the system log.  Select this if you are having a
+         problem with Xen PCI frontend support and want to see more of what is
+         going on.
+
+         When in doubt, say N.
+
  config HT_IRQ
         bool "Interrupts on hypertransport devices"
         default y
-       depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
+       depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
         help
            This allows native hypertransport devices to use interrupts.
  
@@ -112,7 +153,7 @@ config PCI_PASID
  
  config PCI_IOAPIC
         tristate "PCI IO-APIC hotplug support" if X86
-       depends on PCI
+       depends on PCI && !XEN
         depends on ACPI
         depends on HOTPLUG
         default !X86
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile

index 165274c..cae8caa 100644 (file)
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -7,6 +7,11 @@ obj-y          += access.o bus.o probe.o remove.o pci.o \
                         irq.o vpd.o
  obj-$(CONFIG_PROC_FS) += proc.o
  obj-$(CONFIG_SYSFS) += slot.o
+obj-$(CONFIG_PCI_GUESTDEV) += guestdev.o
+obj-$(CONFIG_PCI_IOMULTI) += pci-iomul.o
+iomul-$(CONFIG_PCI_IOMULTI) := iomulti.o
+obj-y += $(iomul-y) $(iomul-m)
+obj-$(CONFIG_PCI_RESERVE) += reserve.o
  
  obj-$(CONFIG_PCI_QUIRKS) += quirks.o
  
@@ -66,7 +71,7 @@ obj-$(CONFIG_PCI_SYSCALL) += syscall.o
  
  obj-$(CONFIG_PCI_STUB) += pci-stub.o
  
-obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
+obj-$(CONFIG_PARAVIRT_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
  
  obj-$(CONFIG_OF) += of.o
  
diff --git a/drivers/pci/guestdev.c b/drivers/pci/guestdev.c

new file mode 100644 (file)

index 0000000..2af2ff4
--- /dev/null
+++ b/drivers/pci/guestdev.c
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2008, 2009 NEC Corporation.
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/acpi.h>
+#include <asm/setup.h>
+
+#define HID_LEN 8
+#define UID_LEN 8
+#define DEV_LEN 2
+#define FUNC_LEN 1
+#define DEV_NUM_MAX 31
+#define FUNC_NUM_MAX 7
+#define INVALID_SEG (-1)
+#define INVALID_BBN (-1)
+#define GUESTDEV_STR_MAX 128
+
+#define GUESTDEV_FLAG_TYPE_MASK 0x3
+#define GUESTDEV_FLAG_DEVICEPATH 0x1
+#define GUESTDEV_FLAG_SBDF 0x2
+
+#define GUESTDEV_OPT_IOMUL     0x1
+
+struct guestdev {
+       int flags;
+       int options;
+       struct list_head root_list;
+       union {
+               struct devicepath {
+                       char hid[HID_LEN + 1];
+                       char uid[UID_LEN + 1];
+                       int seg;
+                       int bbn;
+                       struct devicepath_node *child;
+               } devicepath;
+               struct sbdf {
+                       int seg;
+                       int bus;
+                       int dev;
+                       int func;
+               } sbdf;
+       } u;
+};
+
+struct devicepath_node {
+       int dev;
+       int func;
+       struct devicepath_node *child;
+};
+
+struct pcidev_sbdf {
+       int seg;
+       int bus;
+       struct pcidev_sbdf_node *child;
+};
+
+struct pcidev_sbdf_node {
+       int dev;
+       int func;
+       struct pcidev_sbdf_node *child;
+};
+
+static char __initdata guestdev_param[COMMAND_LINE_SIZE];
+static LIST_HEAD(guestdev_list);
+
+/* Get hid and uid from "<hid>[:<uid>]-..."; TRUE on success, else FALSE */
+static int __init pci_get_hid_uid(char *str, char *hid, char *uid)
+{
+       char *sp, *ep;
+       int len;
+
+       sp = str;
+       ep = strchr(sp, ':');
+       if (!ep) {
+               ep = strchr(sp, '-');
+               if (!ep)
+                       goto format_err_end;
+       }
+       /* hid length (number of characters, excluding the terminator) */
+       len = ep - sp;
+       if (len <= 0 || HID_LEN < len)
+               goto format_err_end;
+
+       strlcpy(hid, sp, len + 1);      /* size argument counts the NUL */
+
+       if (*ep == '-') { /* no uid */
+               uid[0] = '\0';
+               return TRUE;
+       }
+
+       sp = ep + 1;
+       ep = strchr(sp, '-');
+       if (!ep)
+               ep = strchr(sp, '\0');
+
+       /* uid length (number of characters, excluding the terminator) */
+       len = ep - sp;
+       if (len <= 0 || UID_LEN < len)
+               goto format_err_end;
+
+       strlcpy(uid, sp, len + 1);      /* size argument counts the NUL */
+       return TRUE;
+
+format_err_end:
+       return FALSE;
+}
+
+/* Get device and function */
+static int __init pci_get_dev_func(char *str, int *dev, int *func)
+{
+       if (sscanf(str, "%02x.%01x", dev, func) != 2)
+               goto format_err_end;
+
+       if (*dev < 0 || DEV_NUM_MAX < *dev)
+               goto format_err_end;
+
+       if (*func < 0 || FUNC_NUM_MAX < *func)
+               goto format_err_end;
+
+       return TRUE;
+
+format_err_end:
+       return FALSE;
+}
+
+/* Check extended guestdev parameter format error */
+static int __init pci_check_extended_guestdev_format(char *str)
+{
+       int flg;
+       char *p;
+
+       /* Check extended format */
+       if (strpbrk(str, "(|)") == NULL)
+               return TRUE;
+
+       flg = 0;
+       p = str;
+       while (*p) {
+               switch (*p) {
+               case '(':
+                       /* Check nesting error */
+                       if (flg != 0)
+                               goto format_err_end;
+                       flg = 1;
+                       /* Check position of '(' is head or
+                          previous character of '(' is not '-'. */
+                       if (p == str || *(p - 1) != '-')
+                               goto format_err_end;
+                       break;
+               case ')':
+                       /* Check nesting error */
+                       if (flg != 1)
+                               goto format_err_end;
+                       flg = 0;
+                       /* Check next character of ')' is not '\0' */
+                       if (*(p + 1) != '\0')
+                               goto format_err_end;
+                       break;
+               case '|':
+                       /* Check position of '|' is outside of '(' and ')' */
+                       if (flg != 1)
+                               goto format_err_end;
+                       break;
+               default:
+                       break;
+               }
+               p++;
+       }
+       /* Check number of '(' and ')' are not equal */
+       if (flg != 0)
+               goto format_err_end;
+       return TRUE;
+
+format_err_end:
+       pr_err("PCI: The format of the guestdev parameter is illegal. [%s]\n",
+              str);
+       return FALSE;
+}
+
+/* Make guestdev strings */
+static void pci_make_guestdev_str(struct guestdev *gdev,
+                                       char *gdev_str, int buf_size)
+{
+       struct devicepath_node *node;
+       int count;
+
+       switch (gdev->flags & GUESTDEV_FLAG_TYPE_MASK) {
+       case GUESTDEV_FLAG_DEVICEPATH:
+               memset(gdev_str, 0, buf_size);
+
+               if (strlen(gdev->u.devicepath.uid))
+                       count = snprintf(gdev_str, buf_size, "%s:%s",
+                                               gdev->u.devicepath.hid,
+                                               gdev->u.devicepath.uid);
+               else
+                       count = snprintf(gdev_str, buf_size, "%s",
+                                                gdev->u.devicepath.hid);
+               if (count < 0)
+                       return;
+
+               node = gdev->u.devicepath.child;
+               while (node) {
+                       gdev_str += count;
+                       buf_size -= count;
+                       if (buf_size <= 0)
+                               return;
+                       count = snprintf(gdev_str, buf_size, "-%02x.%01x",
+                               node->dev, node->func);
+                       if (count < 0)
+                               return;
+                       node = node->child;
+               }
+               break;
+       case GUESTDEV_FLAG_SBDF:
+               snprintf(gdev_str, buf_size, "%04x:%02x:%02x.%01x",
+                                       gdev->u.sbdf.seg, gdev->u.sbdf.bus,
+                                       gdev->u.sbdf.dev, gdev->u.sbdf.func);
+               break;
+       default:
+               BUG();
+       }
+}
+
+/* Free guestdev and nodes */
+static void __init pci_free_guestdev(struct guestdev *gdev)
+{
+       struct devicepath_node *node, *next;
+
+       if (!gdev)
+               return;
+       if (gdev->flags & GUESTDEV_FLAG_DEVICEPATH) {
+               node = gdev->u.devicepath.child;
+               while (node) {
+                       next = node->child;
+                       kfree(node);
+                       node = next;
+               }
+       }
+       list_del(&gdev->root_list);
+       kfree(gdev);
+}
+
+/* Copy guestdev and nodes */
+struct guestdev __init *pci_copy_guestdev(struct guestdev *gdev_src)
+{
+       struct guestdev *gdev;
+       struct devicepath_node *node, *node_src, *node_upper;
+
+       BUG_ON(!(gdev_src->flags & GUESTDEV_FLAG_DEVICEPATH));
+
+       gdev = kzalloc(sizeof(*gdev), GFP_KERNEL);
+       if (!gdev)
+               goto allocate_err_end;
+
+       INIT_LIST_HEAD(&gdev->root_list);
+       gdev->flags = gdev_src->flags;
+       gdev->options = gdev_src->options;
+       strcpy(gdev->u.devicepath.hid, gdev_src->u.devicepath.hid);
+       strcpy(gdev->u.devicepath.uid, gdev_src->u.devicepath.uid);
+       gdev->u.devicepath.seg = gdev_src->u.devicepath.seg;
+       gdev->u.devicepath.bbn = gdev_src->u.devicepath.bbn;
+
+       node_upper = NULL;
+
+       node_src = gdev_src->u.devicepath.child;
+       while (node_src) {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (!node)
+                       goto allocate_err_end;
+               node->dev = node_src->dev;
+               node->func = node_src->func;
+               if (!node_upper)
+                       gdev->u.devicepath.child = node;
+               else
+                       node_upper->child = node;
+               node_upper = node;
+               node_src = node_src->child;
+       }
+
+       return gdev;
+
+allocate_err_end:
+       if (gdev)
+               pci_free_guestdev(gdev);
+       pr_err("PCI: failed to allocate memory\n");
+       return NULL;
+}
+
+/* Make guestdev from path strings */
+static int __init pci_make_devicepath_guestdev(char *path_str, int options)
+{
+       char hid[HID_LEN + 1], uid[UID_LEN + 1];
+       char *sp, *ep;
+       struct guestdev *gdev, *gdev_org;
+       struct devicepath_node *node, *node_tmp;
+       int dev, func, ret_val;
+
+       ret_val = 0;
+       gdev = gdev_org = NULL;
+       sp = path_str;
+       /* Look for end of hid:uid'-' */
+       ep = strchr(sp, '-');
+       /* Only hid, uid. (No dev, func) */
+       if (!ep)
+               goto format_err_end;
+
+       memset(hid, 0 ,sizeof(hid));
+       memset(uid, 0, sizeof(uid));
+       if (!pci_get_hid_uid(sp, hid, uid))
+               goto format_err_end;
+
+       gdev_org = kzalloc(sizeof(*gdev_org), GFP_KERNEL);
+       if (!gdev_org)
+               goto allocate_err_end;
+       INIT_LIST_HEAD(&gdev_org->root_list);
+       gdev_org->flags = GUESTDEV_FLAG_DEVICEPATH;
+       gdev_org->options = options;
+       strcpy(gdev_org->u.devicepath.hid, hid);
+       strcpy(gdev_org->u.devicepath.uid, uid);
+       gdev_org->u.devicepath.seg = INVALID_SEG;
+       gdev_org->u.devicepath.bbn = INVALID_BBN;
+
+       gdev = gdev_org;
+
+       sp = ep + 1;
+       ep = sp;
+       do {
+               if (*sp == '(') {
+                       sp++;
+                       if (strchr(sp, '|')) {
+                               gdev = pci_copy_guestdev(gdev_org);
+                               if (!gdev) {
+                                       ret_val = -ENOMEM;
+                                       goto end;
+                               }
+                       }
+                       continue;
+               }
+               if (gdev && pci_get_dev_func(sp, &dev, &func)) {
+                       node = kzalloc(sizeof(*node), GFP_KERNEL);
+                       if (!node)
+                               goto allocate_err_end;
+                       node->dev = dev;
+                       node->func = func;
+                       /* add node to end of guestdev */
+                       if (gdev->u.devicepath.child) {
+                               node_tmp = gdev->u.devicepath.child;
+                               while (node_tmp->child) {
+                                       node_tmp = node_tmp->child;
+                               }
+                               node_tmp->child = node;
+                       } else
+                               gdev->u.devicepath.child = node;
+               } else if (gdev) {
+                       pr_err("PCI: Can't obtain dev# and #func# from %s.\n",
+                              sp);
+                       ret_val = -EINVAL;
+                       if (gdev == gdev_org)
+                               goto end;
+                       pci_free_guestdev(gdev);
+                       gdev = NULL;
+               }
+
+               ep = strpbrk(sp, "-|)");
+               if (!ep)
+                       ep = strchr(sp, '\0');
+               /* Is *ep '|' OR ')' OR '\0' ? */
+               if (*ep != '-') {
+                       if (gdev)
+                               list_add_tail(&gdev->root_list, &guestdev_list);
+                       if (*ep == '|') {
+                               /* Between '|' and '|' ? */
+                               if (strchr(ep + 1, '|')) {
+                                       gdev = pci_copy_guestdev(gdev_org);
+                                       if (!gdev) {
+                                               ret_val = -ENOMEM;
+                                               goto end;
+                                       }
+                               } else {
+                                       gdev = gdev_org;
+                                       gdev_org = NULL;
+                               }
+                       } else {
+                               gdev_org = NULL;
+                               gdev = NULL;
+                       }
+               }
+               if (*ep == ')')
+                       ep++;
+               sp = ep + 1;
+       } while (*ep != '\0');
+
+       goto end;
+
+format_err_end:
+       pr_err("PCI: The format of the guestdev parameter is illegal. [%s]\n",
+              path_str);
+       ret_val = -EINVAL;
+       goto end;
+
+allocate_err_end:
+       pr_err("PCI: failed to allocate memory\n");
+       ret_val = -ENOMEM;
+       goto end;
+
+end:
+       if (gdev_org && (gdev_org != gdev))
+               pci_free_guestdev(gdev_org);
+       if (gdev)
+               pci_free_guestdev(gdev);
+       return ret_val;
+}
+
+/*
+ * Create an sbdf-type guestdev entry from "[<seg>:]<bus>:<dev>.<func>"
+ * (all fields hexadecimal) and append it to guestdev_list.
+ * Returns 0 on success, -EINVAL when str matches neither form and
+ * -ENOMEM when the entry cannot be allocated.
+ */
+static int __init pci_make_sbdf_guestdev(char* str, int options)
+{
+       struct guestdev *entry;
+       int seg, bus, dev, func;
+
+       /* Try the four-field form first, then fall back to segment 0. */
+       if (sscanf(str, "%x:%x:%x.%x", &seg, &bus, &dev, &func) != 4) {
+               if (sscanf(str, "%x:%x.%x", &bus, &dev, &func) != 3)
+                       return -EINVAL;
+               seg = 0;
+       }
+
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry) {
+               pr_err("PCI: failed to allocate memory\n");
+               return -ENOMEM;
+       }
+
+       INIT_LIST_HEAD(&entry->root_list);
+       entry->flags = GUESTDEV_FLAG_SBDF;
+       entry->options = options;
+       entry->u.sbdf.seg = seg;
+       entry->u.sbdf.bus = bus;
+       entry->u.sbdf.dev = dev;
+       entry->u.sbdf.func = func;
+
+       list_add_tail(&entry->root_list, &guestdev_list);
+       return 0;
+}
+
+/*
+ * Parse the "+<option>[+<option>...]" suffix of one guestdev entry.
+ * str points at the leading '+'.  Returns the OR of the GUESTDEV_OPT_*
+ * bits of every recognised option; unrecognised options are ignored.
+ * The string is not modified.
+ */
+static int __init pci_parse_options(const char *str)
+{
+       int options = 0;
+       const char *ep;
+       size_t len;
+
+       while (str) {
+               str++;          /* step over the '+' introducing this option */
+               ep = strchr(str, '+');
+
+               /*
+                * Compare by token length instead of chopping the string.
+                * The previous code did "ep = '\0'", which set the pointer
+                * to NULL rather than terminating the token: any option
+                * followed by another one failed to match and the loop
+                * stopped after the first option.
+                */
+               len = ep ? (size_t)(ep - str) : strlen(str);
+               if (len == strlen("iomul") && !strncmp(str, "iomul", len))
+                       options |= GUESTDEV_OPT_IOMUL;
+
+               str = ep;       /* NULL when this was the last option */
+       }
+       return options;
+}
+
+/*
+ * Parse guestdev parameter.
+ *
+ * guestdev_param is split in place at every ','.  For each entry a
+ * "+<option>..." suffix, if present, is parsed and cut off; the remainder is
+ * first tried as an sbdf specification and, if that yields -EINVAL, as a
+ * device path.  Afterwards every element of guestdev_list is logged.
+ *
+ * Always returns 0: an error that stops the loop early is kept in ret_val
+ * but is not returned to the initcall machinery.
+ */
+static int __init pci_parse_guestdev(void)
+{
+       int len;
+       char *sp, *ep, *op;
+       int options;
+       struct list_head *head;
+       struct guestdev *gdev;
+       char path_str[GUESTDEV_STR_MAX];
+       int ret_val = 0;
+
+       /* Nothing to do when the parameter is empty. */
+       len = strlen(guestdev_param);
+       if (len == 0)
+               return 0;
+
+       sp = guestdev_param;
+
+       do {
+               /* sp is the start of the current entry, ep its ',' (or NULL). */
+               ep = strchr(sp, ',');
+               /* Chop */
+               if (ep)
+                       *ep = '\0';
+               options = 0;
+               op = strchr(sp, '+');
+               if (op && (!ep || op < ep)) {
+                       /* Parse options first, then hide them from the parsers below. */
+                       options = pci_parse_options(op);
+                       *op = '\0';     /* Chop */
+               }
+               ret_val = pci_make_sbdf_guestdev(sp, options);
+               if (ret_val == -EINVAL) {
+                       /* Not an sbdf: try the device path form instead. */
+                       if (pci_check_extended_guestdev_format(sp)) {
+                               ret_val = pci_make_devicepath_guestdev(
+                                       sp, options);
+                               /* Only errors other than -EINVAL stop parsing. */
+                               if (ret_val && ret_val != -EINVAL)
+                                       break;
+                       }
+               } else if (ret_val)
+                       break;
+
+               /* Advance to the character after the ',' or finish. */
+               if (ep)
+                       ep++;
+               sp = ep;
+       } while (ep);
+
+       /* Report everything that ended up on the list. */
+       list_for_each(head, &guestdev_list) {
+               gdev = list_entry(head, struct guestdev, root_list);
+               pci_make_guestdev_str(gdev, path_str, GUESTDEV_STR_MAX);
+               printk(KERN_DEBUG
+                       "PCI: %s has been reserved for guest domain.\n",
+                       path_str);
+       }
+       return 0;
+}
+
+arch_initcall(pci_parse_guestdev);
+
+/*
+ * "guestdev=" boot parameter handler: stash the raw value in guestdev_param
+ * for pci_parse_guestdev().  Returns 1 when the value was stored and 0 when
+ * it is COMMAND_LINE_SIZE characters or longer.
+ */
+static int __init pci_guestdev_setup(char *str)
+{
+       if (strlen(str) < COMMAND_LINE_SIZE) {
+               strlcpy(guestdev_param, str, sizeof(guestdev_param));
+               return 1;
+       }
+       return 0;
+}
+
+__setup("guestdev=", pci_guestdev_setup);
+
+/*
+ * Free every node chained from sbdf->child.
+ *
+ * sbdf itself is not freed (the caller visible in this file passes a stack
+ * object).  The child pointer is cleared before the nodes are released so
+ * that calling this function a second time on the same sbdf is harmless;
+ * __pci_is_guestdev() does exactly that after pci_get_sbdf_from_pcidev()
+ * has already cleaned up on failure, which previously freed the same nodes
+ * twice.
+ */
+static void pci_free_sbdf(struct pcidev_sbdf *sbdf)
+{
+       struct pcidev_sbdf_node *node, *next;
+
+       node = sbdf->child;
+       sbdf->child = NULL;     /* detach: no dangling head after the frees */
+       while (node) {
+               next = node->child;
+               kfree(node);
+               node = next;
+       }
+       /* Skip kfree(sbdf) */
+}
+
+/*
+ * Callback type used when deciding whether a PCI device belongs to the sub
+ * tree specified by a guestdev device path: compares one node of the
+ * guestdev path with the corresponding node of the device's own path.
+ */
+typedef int (*pci_node_match_t)(const struct devicepath_node *gdev_node,
+                               const struct pcidev_sbdf_node *sbdf_node,
+                               int options);
+
+/* Exact comparison: both the device and the function number must agree. */
+static int pci_node_match(const struct devicepath_node *gdev_node,
+                         const struct pcidev_sbdf_node *sbdf_node,
+                         int options_unused)
+{
+       if (gdev_node->dev != sbdf_node->dev)
+               return 0;
+       return gdev_node->func == sbdf_node->func;
+}
+
+/*
+ * Check whether the device described by sbdf lies on or below the device
+ * path stored in gdev.  The segment and bus of the path's root are looked up
+ * through ACPI on first use and cached in gdev.  Each node of the guestdev
+ * path is compared with the matching node of sbdf using match(); extra
+ * trailing nodes in sbdf are accepted.
+ * Returns TRUE on a match, FALSE otherwise (including NULL arguments).
+ */
+static int pci_is_in_devicepath_sub_tree(struct guestdev *gdev,
+                                        struct pcidev_sbdf *sbdf,
+                                        pci_node_match_t match)
+{
+       struct devicepath_node *want;
+       struct pcidev_sbdf_node *have;
+       int seg, bbn;
+
+       if (!gdev || !sbdf)
+               return FALSE;
+
+       BUG_ON(!(gdev->flags & GUESTDEV_FLAG_DEVICEPATH));
+
+       /* Resolve seg and bbn once and remember them in gdev */
+       if (gdev->u.devicepath.seg == INVALID_SEG ||
+           gdev->u.devicepath.bbn == INVALID_BBN) {
+               if (!acpi_pci_get_root_seg_bbn(gdev->u.devicepath.hid,
+                                              gdev->u.devicepath.uid,
+                                              &seg, &bbn))
+                       return FALSE;
+               gdev->u.devicepath.seg = seg;
+               gdev->u.devicepath.bbn = bbn;
+       }
+
+       /* The root of both paths must be the same bus */
+       if (gdev->u.devicepath.seg != sbdf->seg ||
+           gdev->u.devicepath.bbn != sbdf->bus)
+               return FALSE;
+
+       /* Walk both chains in step, comparing dev and func */
+       have = sbdf->child;
+       for (want = gdev->u.devicepath.child; want; want = want->child) {
+               if (!have || !match(want, have, gdev->options))
+                       return FALSE;
+               have = have->child;
+       }
+       return TRUE;
+}
+
+/*
+ * Get sbdf from device.
+ *
+ * Walks from dev up through its parent bridges (dev->bus->self) and records
+ * the slot/function of each step.  Every new node is pushed at the head of
+ * sbdf->child, so the finished chain runs from the topmost device down to
+ * dev.  The segment and bus number are then parsed from the name of the
+ * topmost device.
+ *
+ * Returns TRUE on success.  On failure returns FALSE with every node freed
+ * and sbdf->child reset to NULL.
+ */
+static int pci_get_sbdf_from_pcidev(
+       struct pci_dev *dev, struct pcidev_sbdf *sbdf)
+{
+       struct pcidev_sbdf_node *node;
+
+       if (!dev)
+               return FALSE;
+
+       for(;;) {
+               node = kzalloc(sizeof(*node), GFP_KERNEL);
+               if (!node) {
+                       pr_err("PCI: failed to allocate memory\n");
+                       goto err_end;
+               }
+               node->dev = PCI_SLOT(dev->devfn);
+               node->func = PCI_FUNC(dev->devfn);
+
+               /* Push at the head; also correct when the chain is empty. */
+               node->child = sbdf->child;
+               sbdf->child = node;
+
+               if (!dev->bus)
+                       goto err_end;
+               if (!dev->bus->self)
+                       break;          /* reached the top of the hierarchy */
+               dev = dev->bus->self;
+       }
+       if (sscanf(dev_name(&dev->dev), "%04x:%02x", &sbdf->seg, &sbdf->bus) != 2)
+               goto err_end;
+       return TRUE;
+
+err_end:
+       pci_free_sbdf(sbdf);
+       /*
+        * Do not leave a pointer to freed nodes behind: the caller
+        * (__pci_is_guestdev) runs pci_free_sbdf() on this sbdf again when
+        * we return FALSE, which would otherwise free the chain twice.
+        */
+       sbdf->child = NULL;
+       return FALSE;
+}
+
+/*
+ * Callback type used when deciding whether a PCI device belongs to the sub
+ * tree specified by an sbdf-type guestdev: compares one pci_dev against it.
+ */
+typedef int (*pci_sbdf_match_t)(const struct guestdev *gdev,
+                               const  struct pci_dev *dev);
+
+/*
+ * Exact comparison: segment and bus (parsed from the device name), slot and
+ * function must all equal the values stored in gdev.
+ */
+static int pci_sbdf_match(const struct guestdev *gdev,
+                         const struct pci_dev *dev)
+{
+       int seg, bus;
+
+       if (sscanf(dev_name(&dev->dev), "%04x:%02x", &seg, &bus) != 2)
+               return FALSE;
+
+       if (gdev->u.sbdf.seg != seg || gdev->u.sbdf.bus != bus)
+               return 0;
+
+       return gdev->u.sbdf.dev == PCI_SLOT(dev->devfn) &&
+               gdev->u.sbdf.func == PCI_FUNC(dev->devfn);
+}
+
+/*
+ * Check dev and then each of its parent bridges against an sbdf-type
+ * guestdev.  Returns TRUE as soon as match() accepts one of them and FALSE
+ * once the top of the hierarchy is reached without a match.
+ */
+static int pci_is_in_sbdf_sub_tree(struct guestdev *gdev, struct pci_dev *dev,
+                                  pci_sbdf_match_t match)
+{
+       BUG_ON(!(gdev->flags & GUESTDEV_FLAG_SBDF));
+
+       while (!match(gdev, dev)) {
+               /* No parent bridge left to try */
+               if (!dev->bus || !dev->bus->self)
+                       return FALSE;
+               dev = dev->bus->self;
+       }
+       return TRUE;
+}
+
+/*
+ * Does PCI device belong to sub tree specified by guestdev parameter?
+ *
+ * Walks guestdev_list and tests dev against each entry: device-path entries
+ * with node_match, sbdf entries with sbdf_match.  Returns TRUE on the first
+ * entry that matches, FALSE if none does or if dev is NULL.
+ */
+static int __pci_is_guestdev(struct pci_dev *dev, pci_node_match_t node_match,
+                            pci_sbdf_match_t sbdf_match)
+{
+       struct guestdev *gdev;
+       struct pcidev_sbdf pcidev_sbdf, *sbdf = NULL;
+       struct list_head *head;
+       int result = FALSE;
+
+       if (!dev)
+               return FALSE;
+
+       list_for_each(head, &guestdev_list) {
+               gdev = list_entry(head, struct guestdev, root_list);
+               switch (gdev->flags & GUESTDEV_FLAG_TYPE_MASK) {
+               case GUESTDEV_FLAG_DEVICEPATH:
+                       /*
+                        * The device's own path is built lazily, the first
+                        * time a device-path entry is seen, and reused for
+                        * the remaining entries.
+                        */
+                       if (sbdf == NULL) {
+                               sbdf = &pcidev_sbdf;
+                               memset(sbdf, 0 ,sizeof(*sbdf));
+                               if (!pci_get_sbdf_from_pcidev(dev, sbdf))
+                                       goto out;
+                       }
+                       if (pci_is_in_devicepath_sub_tree(gdev, sbdf,
+                                                         node_match)) {
+                               result = TRUE;
+                               goto out;
+                       }
+                       break;
+               case GUESTDEV_FLAG_SBDF:
+                       if (pci_is_in_sbdf_sub_tree(gdev, dev, sbdf_match)) {
+                               result = TRUE;
+                               goto out;
+                       }
+                       break;
+               default:
+                       BUG();
+               }
+       }
+out:
+       /*
+        * Also reached when pci_get_sbdf_from_pcidev() failed, so this
+        * relies on pci_free_sbdf() being safe to call on an sbdf whose
+        * nodes were already released by that function.
+        */
+       if (sbdf)
+               pci_free_sbdf(sbdf);
+       return result;
+}
+
+/*
+ * Exported entry point: is dev (or a bridge above it) named by the guestdev
+ * parameter?  Uses the exact matchers pci_node_match and pci_sbdf_match.
+ */
+int pci_is_guestdev(struct pci_dev *dev)
+{
+       return __pci_is_guestdev(dev, pci_node_match, pci_sbdf_match);
+}
+EXPORT_SYMBOL_GPL(pci_is_guestdev);
+
+/*
+ * Policy selected by the "reassign_resources" boot parameter, as consumed by
+ * pci_is_guestdev_to_reassign():
+ *   0  parameter not given
+ *   1  parameter given: devices named by guestdev= only
+ *  -1  "all": every device
+ */
+static int reassign_resources;
+
+/*
+ * Handler for "reassign_resources" and "reassign_resources=all".
+ * Always returns 1.
+ */
+static int __init pci_set_reassign_resources(char *str)
+{
+       /*
+        * The __setup() key below has no trailing '=', so the handler is
+        * given everything after the key itself: "=all" rather than "all".
+        * Skip the separator, otherwise "all" can never match.
+        */
+       if (str && *str == '=')
+               str++;
+
+       if (str && !strcmp(str, "all"))
+               reassign_resources = -1;
+       else
+               reassign_resources = 1;
+
+       return 1;
+}
+__setup("reassign_resources", pci_set_reassign_resources);
+
+/*
+ * Should the resources of dev be reassigned?  FALSE when the
+ * reassign_resources parameter was not given, TRUE for every device when it
+ * was set to "all" (negative value), otherwise only for guestdev devices.
+ */
+int pci_is_guestdev_to_reassign(struct pci_dev *dev)
+{
+       if (!reassign_resources)
+               return FALSE;
+       if (reassign_resources < 0)
+               return TRUE;
+       return pci_is_guestdev(dev);
+}
+
+#if defined(CONFIG_PCI_IOMULTI) || defined(CONFIG_PCI_IOMULTI_MODULE)
+/*
+ * Node matcher for +iomul device-path entries.  Without the IOMUL option
+ * nothing matches.  Intermediate nodes (both have a child) must agree on
+ * device and function; the final node (neither has a child) only has to
+ * agree on the device number.  Paths of different depth do not match.
+ */
+static int pci_iomul_node_match(const struct devicepath_node *gdev_node,
+                               const struct pcidev_sbdf_node *sbdf_node,
+                               int options)
+{
+       if (!(options & GUESTDEV_OPT_IOMUL))
+               return 0;
+       if (gdev_node->dev != sbdf_node->dev)
+               return 0;
+
+       if (gdev_node->child != NULL && sbdf_node->child != NULL)
+               return gdev_node->func == sbdf_node->func;
+
+       return gdev_node->child == NULL && sbdf_node->child == NULL;
+}
+
+/*
+ * sbdf matcher for +iomul entries: the entry must carry the IOMUL option and
+ * agree with dev on segment, bus and device number.  The function number is
+ * not compared.
+ */
+static int pci_iomul_sbdf_match(const struct guestdev *gdev,
+                               const struct pci_dev *dev)
+{
+       int seg, bus;
+
+       if (sscanf(dev_name(&dev->dev), "%04x:%02x", &seg, &bus) != 2)
+               return FALSE;
+
+       if (!(gdev->options & GUESTDEV_OPT_IOMUL))
+               return 0;
+       if (gdev->u.sbdf.seg != seg || gdev->u.sbdf.bus != bus)
+               return 0;
+
+       return gdev->u.sbdf.dev == PCI_SLOT(dev->devfn);
+}
+
+/*
+ * Is dev named by a guestdev entry carrying the +iomul option?
+ * Same list walk as pci_is_guestdev(), but with the iomul matchers.
+ */
+int pci_is_iomuldev(struct pci_dev *dev)
+{
+       return __pci_is_guestdev(dev,
+                                pci_iomul_node_match, pci_iomul_sbdf_match);
+}
+#endif /* CONFIG_PCI_IOMULTI */
+
+/*
+ * Check whether the devicepath exists under the pci root bus.
+ * Follows the path node by node: each node must name a device present on
+ * the current bus, and the walk continues on that device's subordinate bus.
+ * Returns TRUE if every node was found, FALSE otherwise.
+ */
+static int __init pci_check_devicepath_exists(
+               struct guestdev *gdev, struct pci_bus *bus)
+{
+       struct devicepath_node *step;
+       struct pci_dev *found;
+
+       BUG_ON(!(gdev->flags & GUESTDEV_FLAG_DEVICEPATH));
+
+       for (step = gdev->u.devicepath.child; step; step = step->child) {
+               if (!bus)
+                       return FALSE;
+               found = pci_get_slot(bus, PCI_DEVFN(step->dev, step->func));
+               if (!found)
+                       return FALSE;
+               /* Descend, then drop the reference taken by pci_get_slot() */
+               bus = found->subordinate;
+               pci_dev_put(found);
+       }
+       return TRUE;
+}
+
+/*
+ * Check whether the guestdev exists in the PCI device tree.
+ *
+ * Runs as an fs_initcall.  For every entry on guestdev_list it tries to find
+ * the named device and logs "device ... does not exist" when it cannot.
+ * Entries are only reported, never removed.  Always returns 0.
+ */
+static int __init pci_check_guestdev_exists(void)
+{
+       struct list_head *head;
+       struct guestdev *gdev;
+       int seg, bbn;
+       struct pci_bus *bus;
+       struct pci_dev *dev;
+       char path_str[GUESTDEV_STR_MAX];
+
+       list_for_each(head, &guestdev_list) {
+               gdev = list_entry(head, struct guestdev, root_list);
+               switch (gdev->flags & GUESTDEV_FLAG_TYPE_MASK) {
+               case GUESTDEV_FLAG_DEVICEPATH:
+                       /* Resolve and cache seg/bbn of the path's root if not known yet. */
+                       if (gdev->u.devicepath.seg == INVALID_SEG ||
+                               gdev->u.devicepath.bbn == INVALID_BBN) {
+                               if (acpi_pci_get_root_seg_bbn(
+                                       gdev->u.devicepath.hid,
+                                       gdev->u.devicepath.uid, &seg, &bbn)) {
+                                       gdev->u.devicepath.seg = seg;
+                                       gdev->u.devicepath.bbn = bbn;
+                               } else {
+                                       pci_make_guestdev_str(gdev,
+                                               path_str, GUESTDEV_STR_MAX);
+                                       pr_info("PCI: "
+                                               "device %s does not exist\n",
+                                               path_str);
+                                       /* "continue" applies to the list walk: next entry. */
+                                       continue;
+                               }
+                       }
+
+                       bus = pci_find_bus(gdev->u.devicepath.seg,
+                                               gdev->u.devicepath.bbn);
+                       if (!bus || !pci_check_devicepath_exists(gdev, bus)) {
+                               pci_make_guestdev_str(gdev, path_str,
+                                       GUESTDEV_STR_MAX);
+                               pr_info("PCI: device %s does not exist\n",
+                                       path_str);
+                       }
+                       break;
+               case GUESTDEV_FLAG_SBDF:
+                       bus = pci_find_bus(gdev->u.sbdf.seg, gdev->u.sbdf.bus);
+                       if (bus) {
+                               dev = pci_get_slot(bus,
+                                       PCI_DEVFN(gdev->u.sbdf.dev,
+                                                       gdev->u.sbdf.func));
+                               if (dev) {
+                                       /* Found: release the reference and skip the message below. */
+                                       pci_dev_put(dev);
+                                       continue;
+                               }
+                       }
+                       /* Reached when either the bus or the device is missing. */
+                       pci_make_guestdev_str(gdev, path_str, GUESTDEV_STR_MAX);
+                       pr_info("PCI: device %s does not exist\n", path_str);
+                       break;
+               default:
+                       BUG();
+               }
+       }
+       return 0;
+}
+
+fs_initcall(pci_check_guestdev_exists);
+
diff --git a/drivers/pci/iomulti.c b/drivers/pci/iomulti.c

new file mode 100644 (file)

index 0000000..9b87951
--- /dev/null
+++ b/drivers/pci/iomulti.c
@@ -0,0 +1,904 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include "iomulti.h"
+#include "pci.h"
+#include <linux/export.h>
+#include <linux/sort.h>
+#include <asm/setup.h>
+
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+#define __pcihp_init __devinit
+#else
+#define __pcihp_init __init
+#endif
+
+#define PCI_BUS_MAX            255
+#define PCI_DEV_MAX            31
+
+/*
+ * Length of a resource; see pci_resource_len.  A resource whose start and
+ * end are both 0 counts as empty.
+ */
+static inline resource_size_t __pcihp_init pci_iomul_len(
+       const struct resource* r)
+{
+       if (r->start || r->start != r->end)
+               return r->end - r->start + 1;
+       return 0;
+}
+
+#define ROUND_UP(x, a)         (((x) + (a) - 1) & ~((a) - 1))
+/*
+ * Total I/O port space needed by pdev, rounded up to 4096.
+ * Stolen from pbus_size_io(): resources smaller than 0x400 are summed
+ * separately because they might be re-aligned for ISA.
+ */
+static unsigned long __devinit pdev_size_io(struct pci_dev *pdev)
+{
+       unsigned long small = 0, large = 0;
+       int idx;
+
+       for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
+               struct resource *res = &pdev->resource[idx];
+               unsigned long len;
+
+               if (res->flags & IORESOURCE_IO) {
+                       len = res->end - res->start + 1;
+
+                       /* Below 0x400: might be re-aligned for ISA */
+                       if (len < 0x400)
+                               small += len;
+                       else
+                               large += len;
+               }
+       }
+
+/* To be fixed in 2.5: we should have sort of HAVE_ISA
+   flag in the struct pci_bus. */
+#if defined(CONFIG_ISA) || defined(CONFIG_EISA)
+       small = (small & 0xff) + ((small & ~0xffUL) << 2);
+#endif
+       return ROUND_UP(small + large, 4096);
+}
+
+/*
+ * Bus number identifying the switch that this device's slot sits on:
+ * for a device with a PCI Express capability, the primary bus number of the
+ * bus the device is on (i.e. the primary bus number of the PCI-PCI bridge
+ * of the downstream port or root port, which is the secondary bus number of
+ * the upstream port's bridge); otherwise the number of the device's own bus.
+ */
+static inline unsigned char pci_dev_switch_busnr(struct pci_dev *pdev)
+{
+       return pci_find_capability(pdev, PCI_CAP_ID_EXP) ?
+               pdev->bus->primary : pdev->bus->number;
+}
+
+/*
+ * All known switches.  The lookup, add and delete helpers in this file
+ * assert that switch_list_lock is held.
+ */
+static LIST_HEAD(switch_list);
+static DEFINE_MUTEX(switch_list_lock);
+
+/*****************************************************************************/
+/* Non-zero when sw has a non-zero io_base that does not exceed io_limit. */
+int pci_iomul_switch_io_allocated(const struct pci_iomul_switch *sw)
+{
+       if (!sw->io_base)
+               return 0;
+       return sw->io_base <= sw->io_limit;
+}
+EXPORT_SYMBOL_GPL(pci_iomul_switch_io_allocated);
+
+/*
+ * Find the switch registered for (segment, bus) on switch_list.
+ * The caller must hold switch_list_lock.  Returns NULL when there is none.
+ */
+static struct pci_iomul_switch *pci_iomul_find_switch_locked(int segment,
+                                                            uint8_t bus)
+{
+       struct pci_iomul_switch *cur;
+       struct pci_iomul_switch *found = NULL;
+
+       BUG_ON(!mutex_is_locked(&switch_list_lock));
+       list_for_each_entry(cur, &switch_list, list) {
+               if (cur->segment != segment || cur->bus != bus)
+                       continue;
+               found = cur;
+               break;
+       }
+       return found;
+}
+
+/*
+ * Find the slot for (busnr, dev) among the slots of sw.
+ * The caller must hold sw->lock.  Returns NULL when there is none.
+ */
+static struct pci_iomul_slot *pci_iomul_find_slot_locked(
+       struct pci_iomul_switch *sw, uint8_t busnr, uint8_t dev)
+{
+       struct pci_iomul_slot *cur;
+       struct pci_iomul_slot *found = NULL;
+
+       BUG_ON(!mutex_is_locked(&sw->lock));
+       list_for_each_entry(cur, &sw->slots, sibling) {
+               if (cur->bus != busnr || cur->dev != dev)
+                       continue;
+               found = cur;
+               break;
+       }
+       return found;
+}
+
+/*
+ * Look up the switch and the slot that pdev belongs to.
+ *
+ * On successful exit, (*swp)->lock is held so that the slot can be used, and
+ * pci_iomul_switch_get() has been called on the switch.
+ * If the switch or the slot is not found, *swp and *slot are both NULL and
+ * no switch lock is held on return.
+ * switch_list_lock is taken for the duration of the lookup and always
+ * released before returning.
+ */
+void pci_iomul_get_lock_switch(struct pci_dev *pdev,
+                              struct pci_iomul_switch **swp,
+                              struct pci_iomul_slot **slot)
+{
+       mutex_lock(&switch_list_lock);
+
+       *swp = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
+                                           pci_dev_switch_busnr(pdev));
+       if (!*swp) {
+               *slot = NULL;
+               goto out;
+       }
+
+       /* The switch lock is taken while switch_list_lock is still held. */
+       mutex_lock(&(*swp)->lock);
+       *slot = pci_iomul_find_slot_locked(*swp, pdev->bus->number,
+                                          PCI_SLOT(pdev->devfn));
+       if (!*slot) {
+               /* No slot: undo the switch lock and report "not found". */
+               mutex_unlock(&(*swp)->lock);
+               *swp = NULL;
+       } else {
+               pci_iomul_switch_get(*swp);
+       }
+out:
+       mutex_unlock(&switch_list_lock);
+}
+EXPORT_SYMBOL_GPL(pci_iomul_get_lock_switch);
+
+/*
+ * Allocate and initialise a switch descriptor for (segment, bus).
+ * The caller must hold switch_list_lock; the descriptor is not put on
+ * switch_list here.
+ * Returns the new descriptor, or NULL if the allocation fails.
+ */
+static struct pci_iomul_switch *__devinit pci_iomul_switch_alloc(int segment,
+                                                                uint8_t bus)
+{
+       struct pci_iomul_switch *sw;
+
+       BUG_ON(!mutex_is_locked(&switch_list_lock));
+
+       sw = kmalloc(sizeof(*sw), GFP_KERNEL);
+       /*
+        * kmalloc() can fail; the caller already tests for NULL, so report
+        * the failure instead of dereferencing a NULL pointer below.
+        */
+       if (!sw)
+               return NULL;
+
+       mutex_init(&sw->lock);
+       kref_init(&sw->kref);
+       sw->io_region = NULL;
+       sw->count = 0;
+       sw->current_pdev = NULL;
+       sw->segment = segment;
+       sw->bus = bus;
+       sw->io_base = 0;
+       sw->io_limit = 0;
+       sw->func = NULL;
+       INIT_LIST_HEAD(&sw->slots);
+
+       return sw;
+}
+
+/* Put sw on switch_list.  The caller must hold switch_list_lock. */
+static void __devinit pci_iomul_switch_add_locked(struct pci_iomul_switch *sw)
+{
+       BUG_ON(!mutex_is_locked(&switch_list_lock));
+       list_add(&sw->list, &switch_list);
+}
+
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+/* Take sw off switch_list.  The caller must hold switch_list_lock. */
+static void __devinit pci_iomul_switch_del_locked(struct pci_iomul_switch *sw)
+{
+       BUG_ON(!mutex_is_locked(&switch_list_lock));
+       list_del(&sw->list);
+}
+#endif
+
+/*
+ * Initialise slot for pdev after checking that pdev is eligible for I/O port
+ * sharing: it must have a PCI Express capability and be an endpoint or a
+ * legacy endpoint.
+ * Returns 0 on success, -ENOSYS for devices without the capability and for
+ * root complex integrated endpoints, -EINVAL for any other device type.
+ * A message is logged for every rejected device.
+ */
+static int __devinit pci_iomul_slot_init(struct pci_dev *pdev,
+                                        struct pci_iomul_slot *slot)
+{
+       u16 rpcap;
+       u16 cap;
+
+       rpcap = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+       if (!rpcap) {
+               /* pci device isn't supported */
+               pr_info("PCI: sharing io port of non PCIe device %s "
+                       "isn't supported. ignoring.\n",
+                       pci_name(pdev));
+               return -ENOSYS;
+       }
+
+       /* Read the capability's flags word and dispatch on its type field. */
+       pci_read_config_word(pdev, rpcap + PCI_CAP_FLAGS, &cap);
+       switch ((cap & PCI_EXP_FLAGS_TYPE) >> 4) {
+       case PCI_EXP_TYPE_RC_END:
+               pr_info("PCI: io port sharing of root complex integrated "
+                       "endpoint %s isn't supported. ignoring.\n",
+                       pci_name(pdev));
+               return -ENOSYS;
+       case PCI_EXP_TYPE_ENDPOINT:
+       case PCI_EXP_TYPE_LEG_END:
+               break;
+       default:
+               pr_info("PCI: io port sharing of non endpoint %s "
+                       "doesn't make sense. ignoring.\n",
+                       pci_name(pdev));
+               return -EINVAL;
+       }
+
+       /* Record where the slot lives. */
+       kref_init(&slot->kref);
+       slot->switch_busnr = pci_dev_switch_busnr(pdev);
+       slot->segment = pci_domain_nr(pdev->bus);
+       slot->bus = pdev->bus->number;
+       slot->dev = PCI_SLOT(pdev->devfn);
+
+       return 0;
+}
+
+/*
+ * Allocate a zeroed slot and initialise it for pdev.
+ * Returns NULL when the allocation fails or when pci_iomul_slot_init()
+ * rejects the device.
+ */
+static struct pci_iomul_slot *__devinit pci_iomul_slot_alloc(
+       struct pci_dev *pdev)
+{
+       struct pci_iomul_slot *slot = kzalloc(sizeof(*slot), GFP_KERNEL);
+
+       if (slot && pci_iomul_slot_init(pdev, slot)) {
+               kfree(slot);
+               slot = NULL;
+       }
+       return slot;
+}
+
+/* Add slot to the slot list of sw.  The caller must hold sw->lock. */
+static void __devinit pci_iomul_slot_add_locked(struct pci_iomul_switch *sw,
+                                               struct pci_iomul_slot *slot)
+{
+       BUG_ON(!mutex_is_locked(&sw->lock));
+       list_add(&slot->sibling, &sw->slots);
+}
+
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+/* Remove slot from its sibling list.  The caller must hold sw->lock. */
+static void __devinit pci_iomul_slot_del_locked(struct pci_iomul_switch *sw,
+                                               struct pci_iomul_slot *slot)
+{
+       BUG_ON(!mutex_is_locked(&sw->lock));
+       list_del(&slot->sibling);
+}
+#endif
+
+/*****************************************************************************/
+/*
+ * Parse "[<segment>:]<bus>:<dev>" (hexadecimal fields) from str.
+ * The segment defaults to 0 when only two fields are present.
+ * Returns 0 and fills the three outputs on success; returns -EINVAL when
+ * str does not parse or a field is out of range (outputs untouched).
+ */
+static int __devinit pci_get_sbd(const char *str, int *segment__,
+                                uint8_t *bus__, uint8_t *dev__)
+{
+       int segment, bus, dev;
+
+       if (sscanf(str, "%x:%x:%x", &segment, &bus, &dev) != 3) {
+               if (sscanf(str, "%x:%x", &bus, &dev) != 2)
+                       return -EINVAL;
+               segment = 0;
+       }
+
+       if (segment < 0 || INT_MAX <= segment ||
+           bus < 0 || PCI_BUS_MAX < bus ||
+           dev < 0 || PCI_DEV_MAX < dev)
+               return -EINVAL;
+
+       *segment__ = segment;
+       *bus__ = bus;
+       *dev__ = dev;
+       return 0;
+}
+
+static char __devinitdata iomul_param[COMMAND_LINE_SIZE];
+#define TOKEN_MAX      10      /* SSSS:BB:DD length is 10 */
+/*
+ * Is pdev selected for I/O multiplexing?
+ *
+ * Returns 1 when iomul_param is "all" or contains a
+ * "[<segment>:]<bus>:<dev>" token naming pdev's segment, bus and device
+ * number.  Empty tokens and tokens longer than TOKEN_MAX are skipped.
+ * Otherwise falls back to the guestdev=<device>+iomul check.
+ */
+static int __devinit pci_is_iomul_dev_param(struct pci_dev *pdev)
+{
+       int len;
+       char *p;
+       char *next_str;
+
+       if (!strcmp(iomul_param, "all"))
+               return 1;
+       for (p = &iomul_param[0]; *p != '\0'; p = next_str + 1) {
+               /* The current token runs up to the next ',' or the end. */
+               next_str = strchr(p, ',');
+               if (next_str)
+                       len = next_str - p;
+               else
+                       len = strlen(p);
+
+               if (len > 0 && len <= TOKEN_MAX) {
+                       char tmp[TOKEN_MAX+1];
+                       int seg;
+                       uint8_t bus;
+                       uint8_t dev;
+
+                       /*
+                        * strlcpy() copies at most size - 1 characters and
+                        * then terminates, so the size must be len + 1 to
+                        * keep the whole token.  Passing len dropped the
+                        * last character and yielded a wrong device number.
+                        * len <= TOKEN_MAX guarantees the copy fits in tmp.
+                        */
+                       strlcpy(tmp, p, len + 1);
+                       if (!pci_get_sbd(tmp, &seg, &bus, &dev) &&
+                           pci_domain_nr(pdev->bus) == seg &&
+                           pdev->bus->number == bus &&
+                           PCI_SLOT(pdev->devfn) == dev)
+                               return 1;
+               }
+               /* Last token: stop before the loop step uses next_str. */
+               if (!next_str)
+                       break;
+       }
+
+       /* check guestdev=<device>+iomul option */
+       return pci_is_iomuldev(pdev);
+}
+
+/*
+ * "guestiomuldev=" boot parameter handler.
+ * Format: [<segment>:]<bus>:<dev>[,[<segment>:]<bus>:<dev>[,...]
+ * The value is only stored here and is parsed after pci bus scanning.
+ * Returns 0 (value not taken) outside the initial Xen domain or when the
+ * value is COMMAND_LINE_SIZE characters or longer, 1 otherwise.
+ */
+static int __init pci_iomul_param_setup(char *str)
+{
+       if (!is_initial_xendomain())
+               return 0;
+       if (strlen(str) >= COMMAND_LINE_SIZE)
+               return 0;
+
+       strlcpy(iomul_param, str, sizeof(iomul_param));
+       return 1;
+}
+__setup("guestiomuldev=", pci_iomul_param_setup);
+
+/*****************************************************************************/
+/*
+ * Program the I/O base/limit registers of bridge.
+ *
+ * Both addresses lose their low 12 bits and are repositioned 4 bits up.
+ * The resulting low byte of each goes into the 16-bit PCI_IO_BASE word
+ * (base in bits 0-7, limit in bits 8-15); bits 8-23 of each go into the
+ * 32-bit PCI_IO_BASE_UPPER16 register (base in the low half, limit in the
+ * high half).
+ */
+static void __devinit pci_iomul_set_bridge_io_window(struct pci_dev *bridge,
+                                                    uint32_t io_base,
+                                                    uint32_t io_limit)
+{
+       uint16_t l;
+       uint32_t upper16;
+
+       io_base >>= 12;
+       io_base <<= 4;
+       io_limit >>= 12;
+       io_limit <<= 4;
+       l = (io_base & 0xff) | ((io_limit & 0xff) << 8);
+       upper16 = ((io_base & 0xffff00) >> 8) |
+               (((io_limit & 0xffff00) >> 8) << 16);
+
+       /* Temporarily disable the I/O range before updating PCI_IO_BASE. */
+       pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff);
+       /* Update lower 16 bits of I/O base/limit. */
+       pci_write_config_word(bridge, PCI_IO_BASE, l);
+       /* Update upper 16 bits of I/O base/limit. */
+       pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, upper16);
+}
+
+/* Program bridge with an I/O window whose base lies above its limit. */
+static void __devinit pci_disable_bridge_io_window(struct pci_dev *bridge)
+{
+       /* set base = 0xffffff limit = 0x0 */
+       pci_iomul_set_bridge_io_window(bridge, 0xffffff, 0);
+}
+
+/*
+ * Record the I/O BARs of function func of pdev.
+ * A pci_iomul_func is filled with the location, the I/O size and a copy of
+ * every non-empty I/O resource among the first PCI_NUM_BARS resources.  It
+ * is stored in slot->func[func] only if at least one such BAR exists;
+ * otherwise it is freed again.
+ * Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int __devinit pci_iomul_func_scan(struct pci_dev *pdev,
+                                        struct pci_iomul_slot *slot,
+                                        uint8_t func)
+{
+       struct pci_iomul_func *f;
+       unsigned int bar;
+
+       f = kzalloc(sizeof(*f), GFP_KERNEL);
+       if (!f)
+               return -ENOMEM;
+
+       f->segment = slot->segment;
+       f->bus = slot->bus;
+       f->devfn = PCI_DEVFN(slot->dev, func);
+       f->io_size = pdev_size_io(pdev);
+
+       for (bar = 0; bar < PCI_NUM_BARS; bar++) {
+               if ((pci_resource_flags(pdev, bar) & IORESOURCE_IO) &&
+                   pci_resource_len(pdev, bar)) {
+                       f->io_bar |= 1 << bar;
+                       f->resource[bar] = pdev->resource[bar];
+               }
+       }
+
+       /* Nothing to multiplex for a function without I/O BARs */
+       if (!f->io_bar) {
+               kfree(f);
+               return 0;
+       }
+
+       slot->func[func] = f;
+       return 0;
+}
+
+/*
+ * This is the tricky part.
+ * Fake out the PCI resource assignment routines by setting flags to 0.
+ * The PCI resource allocation routines decide whether a resource should
+ * be allocated by checking flags; 0 means the resource isn't used.
+ * See pbus_size_io() and pdev_sort_resources().
+ *
+ * After the resources have been allocated, flags (IORESOURCE_IO) is exported
+ * to other parts, including user processes.
+ * So we have to set flags to IORESOURCE_IO, but at the same time
+ * we must prevent those resources from being reassigned on pci hot plug.
+ * To achieve that, set r->parent to a dummy resource.
+ */
+static inline void __devinit pci_iomul_disable_resource(struct resource *r)
+{
+       /* don't allocate this resource */
+       r->flags = 0;
+}
+
+/*
+ * Make dummy_parent a copy of r's range and flags, then request r as a
+ * child of it.  A failure of request_resource() is treated as a bug.
+ */
+static void __pcihp_init pci_iomul_reenable_resource(
+       struct resource *dummy_parent, struct resource *r)
+{
+       int ret;
+
+       dummy_parent->start = r->start;
+       dummy_parent->end = r->end;
+       dummy_parent->flags = r->flags;
+       dummy_parent->name = "PCI IOMUL dummy resource";
+
+       ret = request_resource(dummy_parent, r);
+       BUG_ON(ret);
+}
+
+/*
+ * Adjust the I/O resources of pdev that are recorded in func->io_bar.
+ *
+ * reassign: rebase each such resource to start at 0 (size preserved), write
+ *           it back with pci_update_resource() and refresh the copy in func;
+ *           afterwards do the same rebasing for the I/O resources of the
+ *           parent bridge and disable the bridge's I/O window.
+ * dealloc:  clear the resource flags so that the resource is not allocated.
+ *
+ * The "deallocating" message is logged regardless of the two flags.
+ */
+static void __devinit pci_iomul_fixup_ioresource(struct pci_dev *pdev,
+                                                struct pci_iomul_func *func,
+                                                int reassign, int dealloc)
+{
+       uint8_t i;
+       struct resource *r;
+
+       pr_info("PCI: deallocating io resource[%s]. io size 0x%lx\n",
+               pci_name(pdev), func->io_size);
+       for (i = 0; i < PCI_NUM_BARS; i++) {
+               r = &pdev->resource[i];
+               if (!(func->io_bar & (1 << i)))
+                       continue;
+
+               if (reassign) {
+                       /* Keep the size, move the range down to 0. */
+                       r->end -= r->start;
+                       r->start = 0;
+                       pci_update_resource(pdev, i);
+                       func->resource[i] = *r;
+               }
+
+               if (dealloc)
+                       /* don't allocate this resource */
+                       pci_iomul_disable_resource(r);
+       }
+
+       /* parent PCI-PCI bridge */
+       if (!reassign)
+               return;
+       /* NOTE(review): assumes pdev->bus->self is non-NULL here - confirm. */
+       pdev = pdev->bus->self;
+       if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+               return;
+       pci_disable_bridge_io_window(pdev);
+       for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+               r = &pdev->resource[i];
+               if (!(r->flags & IORESOURCE_IO))
+                       continue;
+
+               r->end -= r->start;
+               r->start = 0;
+               /* Only resources below PCI_BRIDGE_RESOURCES are written back. */
+               if (i < PCI_BRIDGE_RESOURCES)
+                       pci_update_resource(pdev, i);
+       }
+}
+
+/*
+ * Scan the I/O BARs of pdev into slot and take its I/O resources out of
+ * normal allocation.
+ *
+ * sw->func tracks one function per switch: at boot time (sw->io_base == 0)
+ * it is replaced whenever a function with a larger io_size shows up.  The
+ * function held in sw->func keeps its resource flags; every other function,
+ * including a previously tracked one that gets replaced, has its I/O
+ * resources disabled.
+ * Returns silently if the scan fails or pdev has no I/O BAR.
+ */
+static void __devinit __quirk_iomul_dealloc_ioresource(
+       struct pci_iomul_switch *sw,
+       struct pci_dev *pdev, struct pci_iomul_slot *slot)
+{
+       struct pci_iomul_func *f;
+       struct pci_iomul_func *__f;
+
+       if (pci_iomul_func_scan(pdev, slot, PCI_FUNC(pdev->devfn)))
+               return;
+
+       f = slot->func[PCI_FUNC(pdev->devfn)];
+       if (!f)
+               return;
+
+       __f = sw->func;
+       /* sw->io_base == 0 means that we are called at boot time.
+        * != 0 means that we are called by php after boot. */
+       if (!sw->io_base && (!__f || __f->io_size < f->io_size)) {
+               if (__f) {
+                       struct pci_bus *__pbus;
+                       struct pci_dev *__pdev;
+
+                       /* Disable the resources of the function being replaced. */
+                       __pbus = pci_find_bus(__f->segment, __f->bus);
+                       BUG_ON(!__pbus);
+                       __pdev = pci_get_slot(__pbus, __f->devfn);
+                       BUG_ON(!__pdev);
+                       pci_iomul_fixup_ioresource(__pdev, __f, 0, 1);
+                       pci_dev_put(__pdev);
+               }
+
+               /* Rebase only; the flags of the new tracked function stay set. */
+               pci_iomul_fixup_ioresource(pdev, f, 1, 0);
+               sw->func = f;
+       } else {
+               /* Rebase and disable. */
+               pci_iomul_fixup_ioresource(pdev, f, 1, 1);
+       }
+}
+
+/*
+ * Header fixup, registered for every vendor/device ID.
+ *
+ * Acts only on normal-header devices that are not host bridges and for
+ * which pci_is_iomul_dev_param() is true.  For those it finds or creates
+ * the switch and slot bookkeeping, disables the device and hands over to
+ * __quirk_iomul_dealloc_ioresource() with sw->lock held.
+ *
+ * Allocation failures are logged and the device is left untouched.
+ */
+static void __devinit quirk_iomul_dealloc_ioresource(struct pci_dev *pdev)
+{
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_slot *slot;
+
+       if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
+               return;
+       if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+               return; /* PCI Host Bridge isn't a target device */
+       if (!pci_is_iomul_dev_param(pdev))
+               return;
+
+       /* find or create the switch entry under the list lock */
+       mutex_lock(&switch_list_lock);
+       sw = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
+                                         pci_dev_switch_busnr(pdev));
+       if (!sw) {
+               sw = pci_iomul_switch_alloc(pci_domain_nr(pdev->bus),
+                                           pci_dev_switch_busnr(pdev));
+               if (!sw) {
+                       mutex_unlock(&switch_list_lock);
+                       /* newline-terminated so the log line is complete */
+                       pr_warn("PCI: can't allocate memory "
+                               "for sw of IO multiplexing %s\n",
+                               pci_name(pdev));
+                       return;
+               }
+               pci_iomul_switch_add_locked(sw);
+       }
+       /* hold a reference for the rest of this function */
+       pci_iomul_switch_get(sw);
+       mutex_unlock(&switch_list_lock);
+
+       mutex_lock(&sw->lock);
+       slot = pci_iomul_find_slot_locked(sw, pdev->bus->number,
+                                         PCI_SLOT(pdev->devfn));
+       if (!slot) {
+               slot = pci_iomul_slot_alloc(pdev);
+               if (!slot) {
+                       mutex_unlock(&sw->lock);
+                       pci_iomul_switch_put(sw);
+                       pr_warn("PCI: can't allocate memory "
+                               "for IO multiplexing %s\n", pci_name(pdev));
+                       return;
+               }
+               pci_iomul_slot_add_locked(sw, slot);
+       }
+
+       pr_info("PCI: disable device and release io resource[%s].\n",
+               pci_name(pdev));
+       pci_disable_device(pdev);
+
+       __quirk_iomul_dealloc_ioresource(sw, pdev, slot);
+
+       mutex_unlock(&sw->lock);
+       pci_iomul_switch_put(sw);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID,
+                        quirk_iomul_dealloc_ioresource);
+
+/*
+ * Read the I/O window of the bridge above sw->func and store it in
+ * sw->io_base / sw->io_limit.
+ *
+ * Does nothing when no function is recorded in sw->func; sw->io_base
+ * and sw->io_limit are then left as they were.
+ */
+static void __pcihp_init pci_iomul_read_bridge_io(struct pci_iomul_switch *sw)
+{
+       struct pci_iomul_func *f = sw->func;
+
+       struct pci_bus *pbus;
+       struct pci_dev *pdev;
+       struct pci_dev *bridge;
+
+       uint16_t l;
+       uint16_t base_upper16;
+       uint16_t limit_upper16;
+       uint32_t io_base;
+       uint32_t io_limit;
+
+       /*
+        * sw->func stays NULL when none of the configured devices has
+        * I/O space; there is no device to locate the bridge from then.
+        */
+       if (!f)
+               return;
+
+       pbus = pci_find_bus(f->segment, f->bus);
+       BUG_ON(!pbus);
+
+       pdev = pci_get_slot(pbus, f->devfn);
+       BUG_ON(!pdev);
+
+       bridge = pdev->bus->self;
+       /* one 16-bit read: low byte feeds the base, high byte the limit */
+       pci_read_config_word(bridge, PCI_IO_BASE, &l);
+       pci_read_config_word(bridge, PCI_IO_BASE_UPPER16, &base_upper16);
+       pci_read_config_word(bridge, PCI_IO_LIMIT_UPPER16, &limit_upper16);
+
+       /* base: bits 7:4 of the low byte -> 15:12, upper16 -> 31:16 */
+       io_base = (l & 0xf0) | ((uint32_t)base_upper16 << 8);
+       io_base <<= 8;
+       /* limit: same layout, with the low 12 bits forced to ones */
+       io_limit = (l >> 8) | ((uint32_t)limit_upper16 << 8);
+       io_limit <<= 8;
+       io_limit |= 0xfff;
+
+       sw->io_base = io_base;
+       sw->io_limit = io_limit;
+
+       /* drop the reference taken by pci_get_slot() */
+       pci_dev_put(pdev);
+       pr_info("PCI: bridge %s base 0x%x limit 0x%x\n",
+               pci_name(bridge), sw->io_base, sw->io_limit);
+}
+
+/*
+ * Program @bridge with the I/O window [@io_base, @io_limit] and make sure
+ * its I/O decoding is switched on.  Host bridges are left alone.
+ */
+static void __pcihp_init pci_iomul_setup_brige(struct pci_dev *bridge,
+                                              uint32_t io_base,
+                                              uint32_t io_limit)
+{
+       uint16_t command;
+
+       if ((bridge->class >> 8) == PCI_CLASS_BRIDGE_HOST)
+               return;
+
+       pci_iomul_set_bridge_io_window(bridge, io_base, io_limit);
+
+       /* nothing more to do when IO decoding is already on */
+       pci_read_config_word(bridge, PCI_COMMAND, &command);
+       if (command & PCI_COMMAND_IO)
+               return;
+
+       pr_info("PCI: forcibly enabling IO %s\n", pci_name(bridge));
+       pci_write_config_word(bridge, PCI_COMMAND, command | PCI_COMMAND_IO);
+}
+
+/* Sort key for one I/O BAR: its length and its BAR index. */
+struct __bar {
+       unsigned long size;     /* filled from pci_iomul_len() */
+       uint8_t bar;            /* index into the resource[] arrays */
+};
+
+/*
+ * sort() comparator ordering struct __bar by size, descending.
+ *
+ * Compares explicitly instead of returning the negated difference: the
+ * sizes are unsigned long, so a difference squeezed into an int can lose
+ * its sign or its magnitude.
+ */
+static int __pcihp_init pci_iomul_bar_cmp(const void *lhs__, const void *rhs__)
+{
+       const struct __bar *lhs = lhs__;
+       const struct __bar *rhs = rhs__;
+
+       if (lhs->size > rhs->size)
+               return -1;      /* larger BAR sorts first */
+       if (lhs->size < rhs->size)
+               return 1;
+       return 0;
+}
+
+/*
+ * Place every I/O BAR of @pdev flagged in f->io_bar back to back starting
+ * at @io_base, largest BAR first, and write the result to the device.
+ *
+ * Each affected pdev->resource[] entry must still start at 0 on entry
+ * (enforced by BUG_ON).
+ */
+static void __pcihp_init pci_iomul_setup_dev(struct pci_dev *pdev,
+                                            struct pci_iomul_func *f,
+                                            uint32_t io_base)
+{
+       struct __bar bars[PCI_NUM_BARS];
+       int i;
+       uint8_t num_bars = 0;
+       struct resource *r;
+
+       pr_info("PCI: Forcibly assign IO %s from 0x%x\n",
+               pci_name(pdev), io_base);
+
+       /* gather the BARs whose bit is set in the io_bar mask */
+       for (i = 0; i < PCI_NUM_BARS; i++) {
+               if (!(f->io_bar & (1 << i)))
+                       continue;
+
+               r = &f->resource[i];
+               bars[num_bars].size = pci_iomul_len(r);
+               bars[num_bars].bar = i;
+
+               num_bars++;
+       }
+
+       sort(bars, num_bars, sizeof(bars[0]), &pci_iomul_bar_cmp, NULL);
+
+       for (i = 0; i < num_bars; i++) {
+               struct resource *fr = &f->resource[bars[i].bar];
+               r = &pdev->resource[bars[i].bar];
+
+               /* shift the zero-based range up to the current io_base */
+               BUG_ON(r->start);
+               r->start += io_base;
+               r->end += io_base;
+
+               /* keep the saved copy in @f in step with the device */
+               fr->start = r->start;
+               fr->end = r->end;
+
+               /* pci_update_resource() check flags. */
+               r->flags = fr->flags;
+               pci_update_resource(pdev, bars[i].bar);
+               pci_iomul_reenable_resource(&f->dummy_parent, r);
+
+               /* next BAR goes directly after this one */
+               io_base += bars[i].size;
+       }
+}
+
+/*
+ * Take the I/O resources of @pdev and of its parent PCI-PCI bridge out of
+ * the resource tree and claim the switch-wide window sw->io_resource
+ * ("PCI IO Multiplexer") in the bridge resource's former parent.
+ *
+ * A copy of each released device BAR is kept in f->resource[].  A failed
+ * request_resource() is only logged.
+ */
+static void __pcihp_init pci_iomul_release_io_resource(
+       struct pci_dev *pdev, struct pci_iomul_switch *sw,
+       struct pci_iomul_slot *slot, struct pci_iomul_func *f)
+{
+       int i;
+       struct resource *r;
+
+       for (i = 0; i < PCI_NUM_BARS; i++) {
+               /* only I/O BARs that are currently linked into a parent */
+               if ((pci_resource_flags(pdev, i) & IORESOURCE_IO) &&
+                   pdev->resource[i].parent) {
+                       r = &pdev->resource[i];
+                       f->resource[i] = *r;
+                       release_resource(r);
+                       pci_iomul_reenable_resource(&f->dummy_parent, r);
+               }
+       }
+
+       /* parent PCI-PCI bridge */
+       pdev = pdev->bus->self;
+       if ((pdev->class >> 8) != PCI_CLASS_BRIDGE_HOST) {
+               for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
+                       /* remember the parent before release_resource() */
+                       struct resource *parent = pdev->resource[i].parent;
+
+                       if (!(pci_resource_flags(pdev, i) & IORESOURCE_IO) ||
+                           !parent)
+                               continue;
+
+                       r = &pdev->resource[i];
+
+                       sw->io_resource.flags = r->flags;
+                       sw->io_resource.start = sw->io_base;
+                       sw->io_resource.end = sw->io_limit;
+                       sw->io_resource.name = "PCI IO Multiplexer";
+
+                       release_resource(r);
+                       pci_iomul_reenable_resource(
+                               &slot->dummy_parent[i - PCI_BRIDGE_RESOURCES],
+                               r);
+
+                       /* newline-terminated so the log line is complete */
+                       if (request_resource(parent, &sw->io_resource))
+                               pr_err("PCI IOMul: can't allocate "
+                                      "resource. [0x%x, 0x%x]\n",
+                                      sw->io_base, sw->io_limit);
+               }
+       }
+}
+
+/*
+ * Final fixup, registered for every vendor/device ID, and also called
+ * directly from the bus notifier on BUS_NOTIFY_ADD_DEVICE.
+ *
+ * For a device with iomul bookkeeping it either releases the I/O
+ * resources (when the device is the one recorded in sw->func) or
+ * assigns its I/O BARs from sw->io_base.
+ *
+ * NOTE(review): pci_iomul_get_lock_switch() presumably returns with
+ * sw->lock held and a reference on sw, since both are dropped at "out"
+ * -- confirm against its definition.
+ */
+static void __pcihp_init quirk_iomul_reassign_ioresource(struct pci_dev *pdev)
+{
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_slot *slot;
+       struct pci_iomul_func *sf;
+       struct pci_iomul_func *f;
+
+       pci_iomul_get_lock_switch(pdev, &sw, &slot);
+       if (!sw || !slot)
+               return;
+
+       /* io_base still 0: fetch the window from the bridge first */
+       if (!sw->io_base)
+               pci_iomul_read_bridge_io(sw);
+       if (!pci_iomul_switch_io_allocated(sw))
+               goto out;
+
+       sf = sw->func;
+       f = slot->func[PCI_FUNC(pdev->devfn)];
+       if (!f)
+               /*
+                * (!sf || !f) case can happen when all the
+                * specified devices don't have io space
+                */
+               goto out;
+
+       /*
+        * Function 0 of a slot other than sf's slot: program the window
+        * into this device's parent bridge.
+        */
+       if (sf &&
+           (pci_domain_nr(pdev->bus) != sf->segment ||
+            pdev->bus->number != sf->bus ||
+            PCI_SLOT(pdev->devfn) != PCI_SLOT(sf->devfn)) &&
+           !PCI_FUNC(pdev->devfn)) {
+               pci_iomul_setup_brige(pdev->bus->self,
+                                     sw->io_base, sw->io_limit);
+       }
+
+       /* the function's I/O must fit inside the switch window */
+       BUG_ON(f->io_size > sw->io_limit - sw->io_base + 1);
+       if (/* f == sf */ sf &&
+           pci_domain_nr(pdev->bus) == sf->segment &&
+           pdev->bus->number == sf->bus &&
+           pdev->devfn == sf->devfn)
+               pci_iomul_release_io_resource(pdev, sw, slot, f);
+       else
+               pci_iomul_setup_dev(pdev, f, sw->io_base);
+
+out:
+       mutex_unlock(&sw->lock);
+       pci_iomul_switch_put(sw);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID,
+                       quirk_iomul_reassign_ioresource);
+
+/*****************************************************************************/
+#if defined(CONFIG_HOTPLUG_PCI) || defined(CONFIG_HOTPLUG_PCI_MODULE)
+/*
+ * Forget the iomul bookkeeping of a removed normal-header device.
+ *
+ * Frees the slot->func[] entry for @pdev's function and, once the slot
+ * has no functions left, unlinks the slot and drops a reference on it.
+ * Always returns 0.
+ */
+static int __devinit __pci_iomul_notifier_del_device(struct pci_dev *pdev)
+{
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_slot *slot;
+       int i;
+
+       pci_iomul_get_lock_switch(pdev, &sw, &slot);
+       if (!sw || !slot)
+               return 0;
+
+       /* don't leave sw->func pointing at the entry freed below */
+       if (sw->func == slot->func[PCI_FUNC(pdev->devfn)])
+               sw->func = NULL;
+       kfree(slot->func[PCI_FUNC(pdev->devfn)]);
+       slot->func[PCI_FUNC(pdev->devfn)] = NULL;
+       /* keep the slot while any other function is still recorded */
+       for (i = 0; i < PCI_NUM_FUNC; i++) {
+               if (slot->func[i])
+                       goto out;
+       }
+
+       pci_iomul_slot_del_locked(sw, slot);
+       pci_iomul_slot_put(slot);
+
+out:
+       mutex_unlock(&sw->lock);
+       pci_iomul_switch_put(sw);
+       return 0;
+}
+
+/*
+ * Forget the iomul switch entry for a removed bridge device.
+ *
+ * Looks the switch up by @pdev's segment and bus number.  A bridge with
+ * no switch entry is ignored.  Otherwise the entry is unlinked, its I/O
+ * window released and cleared, and one reference dropped.
+ * Always returns 0.
+ */
+static int __devinit __pci_iomul_notifier_del_switch(struct pci_dev *pdev)
+{
+       struct pci_iomul_switch *sw;
+
+       mutex_lock(&switch_list_lock);
+       sw = pci_iomul_find_switch_locked(pci_domain_nr(pdev->bus),
+                                         pdev->bus->number);
+       if (!sw) {
+               /*
+                * Nothing recorded for this bridge.  Return here rather
+                * than fall through to the put below, which would
+                * dereference the NULL pointer.
+                */
+               mutex_unlock(&switch_list_lock);
+               return 0;
+       }
+
+       pci_iomul_switch_del_locked(sw);
+
+       mutex_lock(&sw->lock);
+       if (sw->io_resource.parent)
+               release_resource(&sw->io_resource);
+       sw->io_base = 0;        /* to tell this switch is removed */
+       sw->io_limit = 0;
+       BUG_ON(!list_empty(&sw->slots));
+       mutex_unlock(&sw->lock);
+
+       mutex_unlock(&switch_list_lock);
+       pci_iomul_switch_put(sw);
+       return 0;
+}
+
+/*
+ * Route a device removal to the handler matching its header type.
+ * Unknown header types are logged and answered with -EIO.
+ */
+static int __devinit pci_iomul_notifier_del_device(struct pci_dev *pdev)
+{
+       if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL)
+               return __pci_iomul_notifier_del_device(pdev);
+
+       if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
+               return __pci_iomul_notifier_del_switch(pdev);
+
+       pr_warn("PCI IOMUL: device %s has unknown "
+               "header type %02x, ignoring.\n",
+               pci_name(pdev), pdev->hdr_type);
+       return -EIO;
+}
+
+/*
+ * Bus notifier callback: add events trigger the I/O reassignment quirk,
+ * delete events tear the bookkeeping down.  Other actions are ignored.
+ * Always answers NOTIFY_DONE.
+ */
+static int __devinit pci_iomul_notifier(struct notifier_block *nb,
+                                       unsigned long action, void *data)
+{
+       struct device *dev = data;
+       struct pci_dev *pdev = to_pci_dev(dev);
+
+       if (action == BUS_NOTIFY_ADD_DEVICE)
+               quirk_iomul_reassign_ioresource(pdev);
+       else if (action == BUS_NOTIFY_DEL_DEVICE)
+               pci_iomul_notifier_del_device(pdev);
+
+       return NOTIFY_DONE;
+}
+
+/* Notifier block registered on pci_bus_type by pci_iomul_hotplug_init(). */
+static struct notifier_block __devinitdata pci_iomul_nb = {
+       .notifier_call = pci_iomul_notifier,
+};
+
+/*
+ * Register the iomul bus notifier on the PCI bus type.
+ *
+ * Returns -ENODEV outside the initial Xen domain, otherwise the result
+ * of bus_register_notifier(), so a failed registration is reported
+ * rather than dropped.
+ */
+static int __init pci_iomul_hotplug_init(void)
+{
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       return bus_register_notifier(&pci_bus_type, &pci_iomul_nb);
+}
+late_initcall(pci_iomul_hotplug_init);
+#endif
diff --git a/drivers/pci/iomulti.h b/drivers/pci/iomulti.h

new file mode 100644 (file)

index 0000000..511ef5f
--- /dev/null
+++ b/drivers/pci/iomulti.h
@@ -0,0 +1,122 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+
+#define PCI_NUM_BARS           6
+#define PCI_NUM_FUNC           8
+
+/* Per-function bookkeeping for a device taking part in I/O multiplexing. */
+struct pci_iomul_func {
+       /* PCI address; used to look the device up again via
+        * pci_find_bus() / pci_get_slot() */
+       int             segment;
+       uint8_t         bus;
+       uint8_t         devfn;
+
+       /* only start and end are used */
+       /* NOTE(review): the remark above presumably refers to resource[]
+        * below rather than to io_size -- confirm */
+       unsigned long   io_size;
+       uint8_t         io_bar;         /* bit i set: BAR i is handled */
+       struct resource resource[PCI_NUM_BARS]; /* saved copies of the BARs */
+       struct resource dummy_parent;   /* passed to pci_iomul_reenable_resource() */
+};
+
+/* Bookkeeping for one switch whose downstream functions share I/O ports. */
+struct pci_iomul_switch {
+       /* list linkage; the quirk and hotplug code hold switch_list_lock
+        * around find/add/del on the switch list */
+       struct list_head        list;
+
+       /*
+        * This lock protects the following entries and the
+        * pci_iomul_slot/pci_iomul_func objects hanging off this switch.
+        */
+       struct mutex            lock;
+       struct kref             kref;   /* freed via pci_iomul_switch_release() */
+
+       struct resource         io_resource;    /* requested as "PCI IO Multiplexer" */
+       struct resource         *io_region;
+       unsigned int            count;
+       struct pci_dev          *current_pdev;
+
+       int                     segment;
+       uint8_t                 bus;
+
+       /* I/O window; io_base == 0 is used as "not set / switch removed" */
+       uint32_t                io_base;
+       uint32_t                io_limit;
+
+       /* func which has the largest io size */
+       struct pci_iomul_func   *func;
+
+       struct list_head        slots;
+};
+
+/* Take a reference on @sw. */
+static inline void pci_iomul_switch_get(struct pci_iomul_switch *sw)
+{
+       kref_get(&sw->kref);
+}
+
+/* kref release callback: free the switch that embeds @kref. */
+static inline void pci_iomul_switch_release(struct kref *kref)
+{
+       kfree(container_of(kref, struct pci_iomul_switch, kref));
+}
+
+/* Drop a reference on @sw; the last put frees it. */
+static inline void pci_iomul_switch_put(struct pci_iomul_switch *sw)
+{
+       struct kref *ref = &sw->kref;
+
+       kref_put(ref, pci_iomul_switch_release);
+}
+
+/* Bookkeeping for one device (slot) below an iomul switch. */
+struct pci_iomul_slot {
+       struct list_head        sibling;
+       struct kref             kref;   /* freed via pci_iomul_slot_release() */
+       /*
+        * busnr
+        * when pcie, the primary busnr of the PCI-PCI bridge on which
+        * this device sits.
+        */
+       uint8_t                 switch_busnr;
+       /* one entry per bridge resource, indexed by i - PCI_BRIDGE_RESOURCES */
+       struct resource         dummy_parent[PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES];
+
+       /* device */
+       int                     segment;
+       uint8_t                 bus;
+       uint8_t                 dev;
+
+       /* per-function data, indexed by PCI_FUNC(devfn); NULL when absent */
+       struct pci_iomul_func   *func[PCI_NUM_FUNC];
+};
+
+/* Take a reference on @slot. */
+static inline void pci_iomul_slot_get(struct pci_iomul_slot *slot)
+{
+       kref_get(&slot->kref);
+}
+
+/* kref release callback: free the slot that embeds @kref. */
+static inline void pci_iomul_slot_release(struct kref *kref)
+{
+       kfree(container_of(kref, struct pci_iomul_slot, kref));
+}
+
+/* Drop a reference on @slot; the last put frees it. */
+static inline void pci_iomul_slot_put(struct pci_iomul_slot *slot)
+{
+       struct kref *ref = &slot->kref;
+
+       kref_put(ref, pci_iomul_slot_release);
+}
+
+/*
+ * Helpers defined outside this header.
+ *
+ * NOTE(review): callers treat a zero result of
+ * pci_iomul_switch_io_allocated() as "no usable I/O window", and unlock
+ * sw->lock / put the switch after pci_iomul_get_lock_switch() hands back
+ * non-NULL pointers -- confirm the exact contract at the definitions.
+ */
+int pci_iomul_switch_io_allocated(const struct pci_iomul_switch *);
+void pci_iomul_get_lock_switch(struct pci_dev *, struct pci_iomul_switch **,
+                              struct pci_iomul_slot **);
diff --git a/drivers/pci/msi-xen.c b/drivers/pci/msi-xen.c

new file mode 100644 (file)

index 0000000..1aedcff
--- /dev/null
+++ b/drivers/pci/msi-xen.c
@@ -0,0 +1,1003 @@
+/*
+ * File:       msi.c
+ * Purpose:    PCI Message Signaled Interrupt (MSI)
+ *
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/ioport.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/msi.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+
+#include <xen/evtchn.h>
+
+#include "pci.h"
+#include "msi.h"
+
+static int pci_msi_enable = 1;
+/*
+ * Whether the segment-aware physdev operations are tried.  With
+ * CONFIG_XEN_COMPAT below 0x040200 this is a run-time flag that the
+ * fallback paths clear; otherwise it is the constant 1.
+ */
+#if CONFIG_XEN_COMPAT < 0x040200
+static int pci_seg_supported = 1;
+#else
+#define pci_seg_supported 1
+#endif
+
+/* All struct msi_dev_list entries, linked through e.list. */
+static LIST_HEAD(msi_dev_head);
+/* Held around lookups and insertions on msi_dev_head. */
+DEFINE_SPINLOCK(msi_dev_lock);
+
+/* One mapped MSI/MSI-X interrupt together with its sysfs object. */
+struct msi_pirq_entry {
+       struct list_head list;
+       int pirq;
+       int entry_nr;           /* >= 0 is reported as "msix", else "msi" */
+       struct msi_dev_list *dev_entry; /* owning per-device record */
+       struct kobject kobj;
+};
+
+/* Per-device MSI record, kept on msi_dev_head. */
+struct msi_dev_list {
+       struct pci_dev *dev;
+       spinlock_t pirq_list_lock;      /* taken around dev->msi_list add/del */
+       /* Store default pre-assigned irq */
+       unsigned int default_irq;
+       domid_t owner;                  /* from msi_get_dev_owner() */
+       struct msi_pirq_entry e;        /* list linkage and the plain-MSI entry */
+};
+
+/* Arch hooks */
+
+/* Default hook, used unless arch_msi_check_device is already defined:
+ * accepts every request by returning 0. */
+#ifndef arch_msi_check_device
+int arch_msi_check_device(struct pci_dev *dev, int nvec, int type)
+{
+       return 0;
+}
+#endif
+
+/*
+ * Set or clear the MSI enable bit in the capability at @pos.
+ * @pos must be non-zero (BUG otherwise).
+ */
+static void msi_set_enable(struct pci_dev *dev, int pos, int enable)
+{
+       u16 flags;
+
+       BUG_ON(!pos);
+
+       pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &flags);
+       if (enable)
+               flags |= PCI_MSI_FLAGS_ENABLE;
+       else
+               flags &= ~PCI_MSI_FLAGS_ENABLE;
+       pci_write_config_word(dev, pos + PCI_MSI_FLAGS, flags);
+}
+
+/*
+ * Set or clear the MSI-X enable bit of @dev.  Devices without an MSI-X
+ * capability are left untouched.
+ */
+static void msix_set_enable(struct pci_dev *dev, int enable)
+{
+       int cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+       u16 flags;
+
+       if (!cap)
+               return;
+
+       pci_read_config_word(dev, cap + PCI_MSIX_FLAGS, &flags);
+       if (enable)
+               flags |= PCI_MSIX_FLAGS_ENABLE;
+       else
+               flags &= ~PCI_MSIX_FLAGS_ENABLE;
+       pci_write_config_word(dev, cap + PCI_MSIX_FLAGS, flags);
+}
+
+/* Optional owner-lookup callback; NULL until one is registered. */
+static int (*get_owner)(struct pci_dev *dev);
+
+/*
+ * Return the domain owning @dev: the registered callback's answer when
+ * running in the initial Xen domain and that answer is non-negative,
+ * DOMID_SELF in every other case.
+ */
+static domid_t msi_get_dev_owner(struct pci_dev *dev)
+{
+       int domain;
+
+       if (!is_initial_xendomain() || !get_owner)
+               return DOMID_SELF;
+
+       domain = get_owner(dev);
+       if (domain < 0)
+               return DOMID_SELF;
+
+       dev_info(&dev->dev, "get owner: %u\n", domain);
+       return domain;
+}
+
+/*
+ * Find the msi_dev_list record for @dev, creating and linking a new one
+ * when none exists.
+ *
+ * Returns the record, or NULL when a new one could not be allocated.
+ */
+static struct msi_dev_list *get_msi_dev_pirq_list(struct pci_dev *dev)
+{
+       struct msi_dev_list *msi_dev_list, *ret = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&msi_dev_lock, flags);
+
+       /* the walk does not stop at a match; the last match is kept */
+       list_for_each_entry(msi_dev_list, &msi_dev_head, e.list)
+               if ( msi_dev_list->dev == dev )
+                       ret = msi_dev_list;
+
+       if ( ret ) {
+               spin_unlock_irqrestore(&msi_dev_lock, flags);
+               /* an owner of DOMID_IO is looked up again (lock not held) */
+               if (ret->owner == DOMID_IO)
+                       ret->owner = msi_get_dev_owner(dev);
+               return ret;
+       }
+
+       /* Has not allocate msi_dev until now. */
+       /* GFP_ATOMIC: msi_dev_lock is held with interrupts disabled */
+       ret = kzalloc(sizeof(struct msi_dev_list), GFP_ATOMIC);
+
+       /* Failed to allocate msi_dev structure */
+       if ( !ret ) {
+               spin_unlock_irqrestore(&msi_dev_lock, flags);
+               return NULL;
+       }
+
+       ret->dev = dev;
+       spin_lock_init(&ret->pirq_list_lock);
+       ret->owner = msi_get_dev_owner(dev);
+       ret->e.entry_nr = -1;
+       ret->e.dev_entry = ret;
+       list_add_tail(&ret->e.list, &msi_dev_head);
+       spin_unlock_irqrestore(&msi_dev_lock, flags);
+       return ret;
+}
+
+/*
+ * Allocate an msi_pirq_entry for (@pirq, @entry_nr) and append it to the
+ * device's msi_list under pirq_list_lock.
+ *
+ * Returns 0 on success, -ENOMEM when the allocation fails.
+ */
+static int attach_pirq_entry(int pirq, int entry_nr,
+                             struct msi_dev_list *msi_dev_entry)
+{
+       struct msi_pirq_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+       unsigned long flags;
+
+       if (!entry)
+               return -ENOMEM;
+       entry->pirq = pirq;
+       entry->entry_nr = entry_nr;
+       entry->dev_entry = msi_dev_entry;
+       /* kmalloc() does not zero; start with an all-zero kobject */
+       memset(&entry->kobj, 0, sizeof(entry->kobj));
+       spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
+       list_add_tail(&entry->list, &msi_dev_entry->dev->msi_list);
+       spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
+       return 0;
+}
+
+/*
+ * Remove and free the msi_pirq_entry with number @entry_nr from the
+ * device's msi_list.  Does nothing when no such entry exists.
+ *
+ * The whole walk runs under pirq_list_lock, the lock that
+ * attach_pirq_entry() holds while adding, so the list cannot change
+ * underneath the traversal.
+ */
+static void detach_pirq_entry(int entry_nr,
+                             struct msi_dev_list *msi_dev_entry)
+{
+       unsigned long flags;
+       struct msi_pirq_entry *pirq_entry;
+       struct msi_pirq_entry *found = NULL;
+
+       spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
+       list_for_each_entry(pirq_entry, &msi_dev_entry->dev->msi_list, list) {
+               if (pirq_entry->entry_nr == entry_nr) {
+                       list_del(&pirq_entry->list);
+                       found = pirq_entry;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
+
+       /* free outside the lock; kfree(NULL) is a no-op */
+       kfree(found);
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+/*
+ * pciback will provide device's owner
+ *
+ * Install @func as the owner-lookup callback.  Only one callback can be
+ * installed at a time: returns 0 on success, -EEXIST when one is
+ * already present.
+ */
+int register_msi_get_owner(int (*func)(struct pci_dev *dev))
+{
+       if (!get_owner) {
+               get_owner = func;
+               return 0;
+       }
+
+       pr_warning("register msi_get_owner again\n");
+       return -EEXIST;
+}
+EXPORT_SYMBOL(register_msi_get_owner);
+
+/*
+ * Remove the owner-lookup callback.  @func must be the callback that is
+ * currently installed: returns 0 on success, -EINVAL otherwise.
+ */
+int unregister_msi_get_owner(int (*func)(struct pci_dev *dev))
+{
+       if (get_owner == func) {
+               get_owner = NULL;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+EXPORT_SYMBOL(unregister_msi_get_owner);
+#endif
+
+/*
+ * Ask the hypervisor to unmap @pirq for domain @owner and, on success,
+ * remove the sysfs object @kobj if it had been registered.
+ *
+ * Returns 0 on success or the negative hypercall result.
+ */
+static int msi_unmap_pirq(struct pci_dev *dev, int pirq, domid_t owner,
+                         struct kobject *kobj)
+{
+       struct physdev_unmap_pirq unmap;
+       int rc;
+
+       unmap.domid = owner;
+       /* See comments in msi_map_vector, input parameter pirq means
+        * irq number only if the device belongs to dom0 itself.
+        */
+       unmap.pirq = (unmap.domid != DOMID_SELF)
+               ? pirq : evtchn_get_xen_pirq(pirq);
+
+       /* any non-zero result is logged; only negative ones abort */
+       if ((rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap)))
+               dev_warn(&dev->dev, "unmap irq %d failed\n", pirq);
+
+       if (rc < 0)
+               return rc;
+
+       /*
+        * Its possible that we get into this path when populate_msi_sysfs()
+        * fails, which means the entries were not registered with sysfs.
+        * In that case don't unregister them.
+        */
+       if (kobj->parent) {
+               kobject_del(kobj);
+               kobject_put(kobj);
+       }
+
+       if (unmap.domid == DOMID_SELF)
+               evtchn_map_pirq(pirq, 0);
+
+       return 0;
+}
+
+/*
+ * Return the start address of the BAR named by the MSI-X table offset
+ * register of the capability at @pos, or 0 when that BAR's resource is
+ * flagged disabled, unset or busy.
+ */
+static u64 find_table_base(struct pci_dev *dev, int pos)
+{
+       u8 bar;
+       u32 reg;
+       unsigned long flags;
+
+       pci_read_config_dword(dev, msix_table_offset_reg(pos), &reg);
+       /* the BIR mask selects the BAR index out of the register */
+       bar = reg & PCI_MSIX_FLAGS_BIRMASK;
+
+       flags = pci_resource_flags(dev, bar);
+       if (flags & (IORESOURCE_DISABLED | IORESOURCE_UNSET | IORESOURCE_BUSY))
+               return 0;
+
+       return pci_resource_start(dev, bar);
+}
+
+/*
+ * Protected by msi_lock
+ * NOTE(review): no lock of that name is visible here -- confirm which
+ * lock callers are expected to hold.
+ *
+ * Ask the hypervisor to map an MSI/MSI-X interrupt of @dev for @domid.
+ *
+ * Returns a negative error on failure.  On success, returns the result
+ * of evtchn_map_pirq() when @domid is DOMID_SELF, otherwise the Xen pirq.
+ */
+static int msi_map_vector(struct pci_dev *dev, int entry_nr, u64 table_base,
+                         domid_t domid)
+{
+       struct physdev_map_pirq map_irq;
+       int rc = -EINVAL;
+
+       map_irq.domid = domid;
+       map_irq.type = MAP_PIRQ_TYPE_MSI_SEG;
+       map_irq.index = -1;
+       map_irq.pirq = -1;
+       /* segment-aware form: segment number goes in bits 16 and up */
+       map_irq.bus = dev->bus->number | (pci_domain_nr(dev->bus) << 16);
+       map_irq.devfn = dev->devfn;
+       map_irq.entry_nr = entry_nr;
+       map_irq.table_base = table_base;
+
+       if (pci_seg_supported)
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+#if CONFIG_XEN_COMPAT < 0x040200
+       /* on -EINVAL, retry segment 0 devices with the non-segment type */
+       if (rc == -EINVAL && !pci_domain_nr(dev->bus)) {
+               map_irq.type = MAP_PIRQ_TYPE_MSI;
+               map_irq.index = -1;
+               map_irq.pirq = -1;
+               map_irq.bus = dev->bus->number;
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+               /* the fallback got past -EINVAL: skip the segment form from now on */
+               if (rc != -EINVAL)
+                       pci_seg_supported = 0;
+       }
+#endif
+       if (rc)
+               dev_warn(&dev->dev, "map irq failed\n");
+
+       if (rc < 0)
+               return rc;
+       /* This happens when MSI support is not enabled in older Xen. */
+       if (rc == 0 && map_irq.pirq < 0)
+               return -ENOSYS;
+
+       BUG_ON(map_irq.pirq <= 0);
+
+       /* If mapping of this particular MSI is on behalf of another domain,
+        * we do not need to get an irq in dom0. This also implies:
+        * dev->irq in dom0 will be 'Xen pirq' if this device belongs to
+        * to another domain, and will be 'Linux irq' if it belongs to dom0.
+        */
+       if (domid == DOMID_SELF) {
+               rc = evtchn_map_pirq(-1, map_irq.pirq);
+               dev_printk(KERN_DEBUG, &dev->dev,
+                          "irq %d (%d) for MSI/MSI-X\n",
+                          rc, map_irq.pirq);
+               return rc;
+       }
+       dev_printk(KERN_DEBUG, &dev->dev, "irq %d for dom%d MSI/MSI-X\n",
+                  map_irq.pirq, domid);
+       return map_irq.pirq;
+}
+
+/*
+ * Forward to pci_intx(), except for devices flagged with
+ * PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG, which are left untouched.
+ */
+static void pci_intx_for_msi(struct pci_dev *dev, int enable)
+{
+       if (dev->dev_flags & PCI_DEV_FLAGS_MSI_INTX_DISABLE_BUG)
+               return;
+
+       pci_intx(dev, enable);
+}
+
+/*
+ * Restore MSI/MSI-X state of @dev through the hypervisor.
+ *
+ * Does nothing unless MSI or MSI-X is enabled on the device.  Otherwise
+ * INTx and the enabled capability are switched off first, then the
+ * restore hypercall is issued.  Results other than 0 and -ENOSYS
+ * trigger a WARN.
+ */
+void pci_restore_msi_state(struct pci_dev *dev)
+{
+       int rc = -ENOSYS;
+
+       if (!dev->msi_enabled && !dev->msix_enabled)
+               return;
+
+       pci_intx_for_msi(dev, 0);
+       if (dev->msi_enabled) {
+               int pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+
+               msi_set_enable(dev, pos, 0);
+       }
+       if (dev->msix_enabled)
+               msix_set_enable(dev, 0);
+
+       if (pci_seg_supported) {
+               struct physdev_pci_device restore = {
+                       .seg = pci_domain_nr(dev->bus),
+                       .bus = dev->bus->number,
+                       .devfn = dev->devfn
+               };
+
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext,
+                                          &restore);
+       }
+#if CONFIG_XEN_COMPAT < 0x040200
+       /* on -ENOSYS, retry segment 0 devices with the non-segment call */
+       if (rc == -ENOSYS && !pci_domain_nr(dev->bus)) {
+               struct physdev_restore_msi restore = {
+                       .bus = dev->bus->number,
+                       .devfn = dev->devfn
+               };
+
+               pci_seg_supported = false;
+               rc = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore);
+       }
+#endif
+       WARN(rc && rc != -ENOSYS, "restore_msi -> %d\n", rc);
+}
+EXPORT_SYMBOL_GPL(pci_restore_msi_state);
+
+
+/* Map an embedded attribute / kobject back to its containing structure. */
+#define to_msi_attr(obj) container_of(obj, struct msi_attribute, attr)
+#define to_pirq_entry(obj) container_of(obj, struct msi_pirq_entry, kobj)
+
+/* sysfs attribute whose callbacks receive the msi_pirq_entry directly. */
+struct msi_attribute {
+       struct attribute        attr;
+       /* may be NULL; msi_irq_attr_show() then answers -EIO */
+       ssize_t (*show)(struct msi_pirq_entry *, struct msi_attribute *,
+                       char *buf);
+       ssize_t (*store)(struct msi_pirq_entry *, struct msi_attribute *,
+                        const char *buf, size_t count);
+};
+
+/* "mode" attribute: "msix" for entries with entry_nr >= 0, else "msi". */
+static ssize_t show_msi_mode(struct msi_pirq_entry *entry,
+                            struct msi_attribute *attr, char *buf)
+{
+       const char *mode = "msi";
+
+       if (entry->entry_nr >= 0)
+               mode = "msix";
+
+       return sprintf(buf, "%s\n", mode);
+}
+
+/*
+ * "xen_irq" attribute: the stored pirq, translated through
+ * evtchn_get_xen_pirq() when the device is owned by DOMID_SELF.
+ */
+static ssize_t show_xen_irq(struct msi_pirq_entry *entry,
+                           struct msi_attribute *attr, char *buf)
+{
+       int irq = entry->pirq;
+
+       if (entry->dev_entry->owner == DOMID_SELF)
+               irq = evtchn_get_xen_pirq(irq);
+
+       return sprintf(buf, "%d\n", irq);
+}
+
+/*
+ * sysfs show entry point: recover the msi_attribute and msi_pirq_entry
+ * from @attr and @kobj and call the attribute's show callback.
+ * Returns -EIO for attributes without one.
+ */
+static ssize_t msi_irq_attr_show(struct kobject *kobj,
+                                struct attribute *attr, char *buf)
+{
+       struct msi_attribute *msi_attr = to_msi_attr(attr);
+       struct msi_pirq_entry *pirq_entry = to_pirq_entry(kobj);
+
+       if (msi_attr->show)
+               return msi_attr->show(pirq_entry, msi_attr, buf);
+
+       return -EIO;
+}
+
+/* Read-only: only a show handler is provided. */
+static const struct sysfs_ops msi_irq_sysfs_ops = {
+       .show = msi_irq_attr_show,
+};
+
+/* World-readable "mode" file, backed by show_msi_mode(). */
+static struct msi_attribute mode_attribute =
+       __ATTR(mode, S_IRUGO, show_msi_mode, NULL);
+
+/* World-readable "xen_irq" file, backed by show_xen_irq(). */
+static struct msi_attribute xen_irq_attribute =
+       __ATTR(xen_irq, S_IRUGO, show_xen_irq, NULL);
+
+/* Attributes of msi_irq_ktype: mode and xen_irq. */
+static struct attribute *msi_irq_default_attrs[] = {
+       &mode_attribute.attr,
+       &xen_irq_attribute.attr,
+       NULL
+};
+
+/* Attributes of msi_pirq_ktype: mode only. */
+static struct attribute *msi_pirq_default_attrs[] = {
+       &mode_attribute.attr,
+       NULL
+};
+
+/*
+ * kobject release callback: drop the pci_dev reference held on behalf of
+ * the entry that embeds @kobj.
+ */
+static void msi_kobj_release(struct kobject *kobj)
+{
+       struct msi_pirq_entry *pirq_entry = to_pirq_entry(kobj);
+
+       pci_dev_put(pirq_entry->dev_entry->dev);
+}
+
+/* ktype used by populate_msi_sysfs() when the owner is DOMID_SELF. */
+static struct kobj_type msi_irq_ktype = {
+       .release = msi_kobj_release,
+       .sysfs_ops = &msi_irq_sysfs_ops,
+       .default_attrs = msi_irq_default_attrs,
+};
+
+/* ktype used by populate_msi_sysfs() when the owner is another domain. */
+static struct kobj_type msi_pirq_ktype = {
+       .release = msi_kobj_release,
+       .sysfs_ops = &msi_irq_sysfs_ops,
+       .default_attrs = msi_pirq_default_attrs,
+};
+
+/*
+ * Create the "msi_irqs" kset below @pdev and one kobject per interrupt:
+ * a single one for plain MSI, one per msi_list entry otherwise.
+ * Objects are named "<irq>" for DOMID_SELF owners and "xen-<irq>" for
+ * other domains.
+ *
+ * Returns 0 on success, -ENOMEM when the per-device record or the kset
+ * cannot be obtained, or the kobject_init_and_add() error.  On a
+ * failure in the per-entry loop the kobjects added so far are removed.
+ */
+static int populate_msi_sysfs(struct pci_dev *pdev)
+{
+       struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(pdev);
+       domid_t owner;
+       struct msi_pirq_entry *pirq_entry;
+       struct kobject *kobj;
+       int ret;
+       int count = 0;
+
+       /* the lookup allocates on first use and can come back empty */
+       if (!dev_entry)
+               return -ENOMEM;
+       owner = dev_entry->owner;
+
+       pdev->msi_kset = kset_create_and_add("msi_irqs", NULL, &pdev->dev.kobj);
+       if (!pdev->msi_kset)
+               return -ENOMEM;
+
+       if (pdev->msi_enabled) {
+               kobj = &dev_entry->e.kobj;
+               kobj->kset = pdev->msi_kset;
+               /* reference released by msi_kobj_release() */
+               pci_dev_get(pdev);
+               if (owner == DOMID_SELF)
+                       ret = kobject_init_and_add(kobj, &msi_irq_ktype, NULL,
+                                                  "%u", pdev->irq);
+               else
+                       ret = kobject_init_and_add(kobj, &msi_pirq_ktype, NULL,
+                                                  "xen-%u", pdev->irq);
+               if (ret)
+                       pci_dev_put(pdev);
+               return ret;
+       }
+
+       list_for_each_entry(pirq_entry, &pdev->msi_list, list) {
+               kobj = &pirq_entry->kobj;
+               kobj->kset = pdev->msi_kset;
+               pci_dev_get(pdev);
+               if (owner == DOMID_SELF)
+                       ret = kobject_init_and_add(kobj, &msi_irq_ktype, NULL,
+                                                  "%u", pirq_entry->pirq);
+               else
+                       ret = kobject_init_and_add(kobj, &msi_pirq_ktype, NULL,
+                                                  "xen-%u", pirq_entry->pirq);
+               if (ret)
+                       goto out_unroll;
+
+               count++;
+       }
+
+       return 0;
+
+out_unroll:
+       /* reference taken for the entry that failed */
+       pci_dev_put(pdev);
+       /* undo the first 'count' entries, which were added successfully */
+       list_for_each_entry(pirq_entry, &pdev->msi_list, list) {
+               if (!count)
+                       break;
+               kobject_del(&pirq_entry->kobj);
+               kobject_put(&pirq_entry->kobj);
+               count--;
+       }
+       return ret;
+}
+
+/**
+ * msi_capability_init - configure device's MSI capability structure
+ * @dev: pointer to the pci_dev data structure of MSI device function
+ * @nvec: number of interrupts to allocate (not consulted by this
+ *        implementation; exactly one vector is mapped)
+ *
+ * Disables MSI on the device, maps a single pirq for it through
+ * msi_map_vector(), then re-enables MSI with INTx turned off and records
+ * the pirq as the device's irq.  The sysfs entries are created afterwards.
+ *
+ * Returns zero on success and -EBUSY when msi_map_vector() fails.  This
+ * implementation never returns a positive value.
+ *
+ * NOTE(review): the result of get_msi_dev_pirq_list() is dereferenced
+ * without a NULL check, and the return value of populate_msi_sysfs() is
+ * ignored - confirm both are intended.
+ */
+static int msi_capability_init(struct pci_dev *dev, int nvec)
+{
+       struct msi_dev_list *dev_entry = get_msi_dev_pirq_list(dev);
+       int pos, pirq;
+       u16 control;
+
+       pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+       msi_set_enable(dev, pos, 0);    /* Disable MSI during set up */
+
+       /* The control word is read here but not used further below. */
+       pci_read_config_word(dev, msi_control_reg(pos), &control);
+
+       pirq = msi_map_vector(dev, 0, 0, dev_entry->owner);
+       if (pirq < 0)
+               return -EBUSY;
+
+       /* Set MSI enabled bits  */
+       pci_intx_for_msi(dev, 0);
+       msi_set_enable(dev, pos, 1);
+       dev->msi_enabled = 1;
+
+       /* msi_enabled must already be set: populate_msi_sysfs() tests it. */
+       dev->irq = dev_entry->e.pirq = pirq;
+       populate_msi_sysfs(dev);
+       return 0;
+}
+
+/**
+ * msix_capability_init - configure device's MSI-X capability
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of struct msix_entry entries; ->entry
+ *           selects the table slot and ->vector receives the mapped pirq
+ * @nvec: number of @entries
+ *
+ * Maps one pirq per requested table entry, attaches it to the device's
+ * pirq list and finally unmasks the function.  Entries that are already
+ * present on the list are reused (with a warning) instead of being mapped
+ * again.
+ *
+ * Returns zero on success, -ENOMEM when no per-device list is available
+ * and -ENODEV when the table base cannot be determined.  If mapping fails
+ * part-way, everything mapped so far is unwound and the function returns
+ * either a positive vector count for the caller to retry with, or -EBUSY
+ * when that count is not positive.
+ **/
+static int msix_capability_init(struct pci_dev *dev,
+                               struct msix_entry *entries, int nvec)
+{
+       u64 table_base;
+       int pirq, i, j, mapped, pos;
+       u16 control;
+       struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
+       struct msi_pirq_entry *pirq_entry;
+
+       if (!msi_dev_entry)
+               return -ENOMEM;
+
+       msix_set_enable(dev, 0);/* Ensure msix is disabled as I set it up */
+
+       pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+       pci_read_config_word(dev, pos + PCI_MSIX_FLAGS, &control);
+
+       /* Ensure MSI-X is disabled while it is set up */
+       control &= ~PCI_MSIX_FLAGS_ENABLE;
+       pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
+
+       table_base = find_table_base(dev, pos);
+       if (!table_base)
+               return -ENODEV;
+
+       /*
+        * Some devices require MSI-X to be enabled before we can touch the
+        * MSI-X registers.  We need to mask all the vectors to prevent
+        * interrupts coming in before they're fully set up.
+        */
+       control |= PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE;
+       pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
+
+       for (i = 0; i < nvec; i++) {
+               /* Reuse a pirq that is still attached for this table slot. */
+               mapped = 0;
+               list_for_each_entry(pirq_entry, &dev->msi_list, list) {
+                       if (pirq_entry->entry_nr == entries[i].entry) {
+                               dev_warn(&dev->dev,
+                                        "msix entry %d was not freed\n",
+                                        entries[i].entry);
+                               (entries + i)->vector = pirq_entry->pirq;
+                               mapped = 1;
+                               break;
+                       }
+               }
+               if (mapped)
+                       continue;
+               pirq = msi_map_vector(dev, entries[i].entry, table_base,
+                                     msi_dev_entry->owner);
+               if (pirq < 0)
+                       break;
+               attach_pirq_entry(pirq, entries[i].entry, msi_dev_entry);
+               (entries + i)->vector = pirq;
+       }
+
+       if (i != nvec) {
+               /*
+                * i is the index of the entry that failed, i.e. entries
+                * 0..i-1 were set up.
+                * NOTE(review): this reports one fewer than that count -
+                * confirm the conservative value is intended.
+                */
+               int avail = i - 1;
+               for (j = --i; j >= 0; j--) {
+                       /*
+                        * Find the list element belonging to the entry being
+                        * unwound.  This must be keyed on index j: entry i is
+                        * detached in the first iteration, so searching for
+                        * it again would run off the end of the list and
+                        * hand a bogus kobject to msi_unmap_pirq().
+                        */
+                       list_for_each_entry(pirq_entry, &dev->msi_list, list)
+                               if (pirq_entry->entry_nr == entries[j].entry)
+                                       break;
+                       msi_unmap_pirq(dev, entries[j].vector,
+                                      msi_dev_entry->owner,
+                                      &pirq_entry->kobj);
+                       detach_pirq_entry(entries[j].entry, msi_dev_entry);
+                       entries[j].vector = 0;
+               }
+               /* If we had some success report the number of irqs
+                * we succeeded in setting up.
+                */
+               if (avail <= 0)
+                       avail = -EBUSY;
+               return avail;
+       }
+
+       /* Set MSI-X enabled bits and unmask the function */
+       pci_intx_for_msi(dev, 0);
+       dev->msix_enabled = 1;
+       populate_msi_sysfs(dev);
+
+       control &= ~PCI_MSIX_FLAGS_MASKALL;
+       pci_write_config_word(dev, pos + PCI_MSIX_FLAGS, control);
+
+       return 0;
+}
+
+/**
+ * pci_msi_check_device - decide whether MSI or MSI-X may be used on a device
+ * @dev: PCI device function being queried
+ * @nvec: number of vectors the caller would like
+ * @type: capability ID to look for (MSI or MSI-X)
+ *
+ * Returns 0 when nothing forbids the requested mode.  Returns -EINVAL when
+ * MSI is globally disabled, @dev is NULL or flagged no_msi, any bus between
+ * the device and the root carries PCI_BUS_FLAGS_NO_MSI, or the capability
+ * is absent; -ERANGE when @nvec is below 1; otherwise whatever non-zero
+ * value arch_msi_check_device() reports.
+ **/
+static int pci_msi_check_device(struct pci_dev *dev, int nvec, int type)
+{
+       struct pci_bus *upstream;
+       int rc;
+
+       /* Global switch, a usable device pointer and no per-device veto. */
+       if (!(pci_msi_enable && dev && !dev->no_msi))
+               return -EINVAL;
+
+       /* Zero or negative vector counts make no sense to the list code. */
+       if (nvec <= 0)
+               return -ERANGE;
+
+       /*
+        * A bridge that cannot forward MSI transactions upstream marks its
+        * secondary bus NO_MSI; one such bus anywhere on the path is fatal.
+        */
+       upstream = dev->bus;
+       while (upstream) {
+               if (upstream->bus_flags & PCI_BUS_FLAGS_NO_MSI)
+                       return -EINVAL;
+               upstream = upstream->parent;
+       }
+
+       rc = arch_msi_check_device(dev, nvec, type);
+       if (rc)
+               return rc;
+
+       return pci_find_capability(dev, type) ? 0 : -EINVAL;
+}
+
+/**
+ * pci_enable_msi_block - configure device's MSI capability structure
+ * @dev: device to configure
+ * @nvec: number of interrupts to configure
+ *
+ * Allocate IRQs for a device with the MSI capability.
+ * This function returns a negative errno if an error occurs.  If it
+ * is unable to allocate the number of interrupts requested, it returns
+ * the number of interrupts it might be able to allocate.  If it successfully
+ * allocates at least the number of interrupts requested, it returns 0 and
+ * updates the @dev's irq member to the lowest new interrupt number; the
+ * other interrupt numbers allocated to this device are consecutive.
+ *
+ * In this implementation the maximum is fixed at one vector, so any @nvec
+ * above 1 yields a return value of 1.  Outside the initial Xen domain the
+ * request is forwarded to the PCI frontend when CONFIG_XEN_PCIDEV_FRONTEND
+ * is set and fails with -EOPNOTSUPP otherwise.
+ */
+extern int pci_frontend_enable_msi(struct pci_dev *dev);
+int pci_enable_msi_block(struct pci_dev *dev, unsigned int nvec)
+{
+       int temp, status, pos, maxvec;
+       u16 msgctl;
+       struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
+
+       pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+       if (!pos)
+               return -EINVAL;
+       /* msgctl is read, but the multi-message computation is disabled. */
+       pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
+       maxvec = 1 /* XXX << ((msgctl & PCI_MSI_FLAGS_QMASK) >> 1) */;
+       if (nvec > maxvec)
+               return maxvec;
+
+       status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSI);
+       if (status)
+               return status;
+
+       if (!is_initial_xendomain()) {
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+               int ret;
+
+               /* Remember the pre-MSI irq so that shutdown can restore it. */
+               temp = dev->irq;
+               ret = pci_frontend_enable_msi(dev);
+               if (ret)
+                       return ret;
+
+               /* Replace dev->irq by the result of evtchn_map_pirq(). */
+               dev->irq = evtchn_map_pirq(-1, dev->irq);
+               dev->msi_enabled = 1;
+               msi_dev_entry->default_irq = temp;
+               populate_msi_sysfs(dev);
+               return ret;
+#else
+               return -EOPNOTSUPP;
+#endif
+       }
+
+       /* Saved before msi_capability_init() overwrites dev->irq. */
+       temp = dev->irq;
+
+       /* Check whether driver already requested MSI-X irqs */
+       if (dev->msix_enabled) {
+               dev_info(&dev->dev, "can't enable MSI "
+                        "(MSI-X already enabled)\n");
+               return -EINVAL;
+       }
+
+       status = msi_capability_init(dev, nvec);
+       if ( !status )
+               msi_dev_entry->default_irq = temp;
+
+       return status;
+}
+EXPORT_SYMBOL(pci_enable_msi_block);
+
+extern void pci_frontend_disable_msi(struct pci_dev* dev);
+/*
+ * pci_msi_shutdown - turn MSI off and restore the device's default irq
+ * @dev: device to operate on; NULL is tolerated
+ *
+ * Does nothing unless MSI is globally enabled and currently enabled on
+ * @dev.  Outside the initial Xen domain the work is delegated to the PCI
+ * frontend (when CONFIG_XEN_PCIDEV_FRONTEND is set).  In the initial domain
+ * the pirq is unmapped, the owner is reset to DOMID_IO, the embedded
+ * kobject is cleared and the hardware is switched back to INTx.
+ */
+void pci_msi_shutdown(struct pci_dev *dev)
+{
+       int pirq, pos;
+       struct msi_dev_list *msi_dev_entry;
+
+       if (!pci_msi_enable || !dev || !dev->msi_enabled)
+               return;
+
+       /*
+        * Look the per-device entry up only once @dev is known to be
+        * non-NULL, matching msi_remove_pci_irq_vectors().
+        */
+       msi_dev_entry = get_msi_dev_pirq_list(dev);
+
+       if (!is_initial_xendomain()) {
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+               evtchn_map_pirq(dev->irq, 0);
+               pci_frontend_disable_msi(dev);
+               dev->irq = msi_dev_entry->default_irq;
+               dev->msi_enabled = 0;
+#endif
+               return;
+       }
+
+       pirq = dev->irq;
+       /* Restore dev->irq to its default pin-assertion vector */
+       dev->irq = msi_dev_entry->default_irq;
+       msi_unmap_pirq(dev, pirq, msi_dev_entry->owner,
+                      &msi_dev_entry->e.kobj);
+       msi_dev_entry->owner = DOMID_IO;
+       /* Leave the embedded kobject zeroed for the next enable. */
+       memset(&msi_dev_entry->e.kobj, 0, sizeof(msi_dev_entry->e.kobj));
+
+       /* Disable MSI mode */
+       pos = pci_find_capability(dev, PCI_CAP_ID_MSI);
+       msi_set_enable(dev, pos, 0);
+       pci_intx_for_msi(dev, 1);
+       dev->msi_enabled = 0;
+}
+
+/*
+ * pci_disable_msi - shut MSI down and remove the "msi_irqs" sysfs kset
+ * @dev: device to operate on
+ */
+void pci_disable_msi(struct pci_dev *dev)
+{
+       struct kset *irq_kset;
+
+       pci_msi_shutdown(dev);
+
+       /* Unregister first, then forget the pointer. */
+       irq_kset = dev->msi_kset;
+       kset_unregister(irq_kset);
+       dev->msi_kset = NULL;
+}
+EXPORT_SYMBOL(pci_disable_msi);
+
+/**
+ * pci_msix_table_size - return the number of device's MSI-X table entries
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ *
+ * Returns 0 when the device has no MSI-X capability; otherwise the table
+ * size decoded from the capability's control word.
+ */
+int pci_msix_table_size(struct pci_dev *dev)
+{
+       u16 flags;
+       int cap = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+       if (cap) {
+               pci_read_config_word(dev, msi_control_reg(cap), &flags);
+               return multi_msix_capable(flags);
+       }
+
+       return 0;
+}
+
+/**
+ * pci_enable_msix - configure device's MSI-X capability structure
+ * @dev: pointer to the pci_dev data structure of MSI-X device function
+ * @entries: pointer to an array of MSI-X entries
+ * @nvec: number of MSI-X irqs requested for allocation by device driver
+ *
+ * Setup the MSI-X capability structure of device function with the number
+ * of requested irqs upon its software driver call to request for
+ * MSI-X mode enabled on its hardware device function. A return of zero
+ * indicates the successful configuration of MSI-X capability structure
+ * with new allocated MSI-X irqs. A return of < 0 indicates a failure.
+ * Or a return of > 0 indicates that driver request is exceeding the number
+ * of irqs or MSI-X vectors available. Driver should use the returned value to
+ * re-send its request.
+ *
+ * Outside the initial Xen domain the request is handed to the PCI frontend
+ * when CONFIG_XEN_PCIDEV_FRONTEND is set and fails with -EOPNOTSUPP
+ * otherwise; that path does not go through pci_msi_check_device().
+ **/
+extern int pci_frontend_enable_msix(struct pci_dev *dev,
+               struct msix_entry *entries, int nvec);
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
+{
+       int status, nr_entries;
+       int i, j, temp;
+       struct msi_dev_list *msi_dev_entry = get_msi_dev_pirq_list(dev);
+
+       if (!entries)
+               return -EINVAL;
+
+       if (!is_initial_xendomain()) {
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+               struct msi_pirq_entry *pirq_entry;
+               int ret, irq;
+
+               /* Remember the pre-MSI-X irq so that shutdown can restore it. */
+               temp = dev->irq;
+               ret = pci_frontend_enable_msix(dev, entries, nvec);
+               if (ret) {
+                       dev_warn(&dev->dev,
+                                "got %x from frontend_enable_msix\n", ret);
+                       return ret;
+               }
+               dev->msix_enabled = 1;
+               msi_dev_entry->default_irq = temp;
+
+               /*
+                * Replace each vector returned by the frontend with the
+                * value obtained from evtchn_map_pirq(), reusing list
+                * entries that already exist for the same table slot.
+                */
+               for (i = 0; i < nvec; i++) {
+                       int mapped = 0;
+
+                       list_for_each_entry(pirq_entry, &dev->msi_list, list) {
+                               if (pirq_entry->entry_nr == entries[i].entry) {
+                                       irq = pirq_entry->pirq;
+                                       /* An existing entry must agree with the frontend. */
+                                       BUG_ON(entries[i].vector != evtchn_get_xen_pirq(irq));
+                                       entries[i].vector = irq;
+                                       mapped = 1;
+                                       break;
+                               }
+                       }
+                       if (mapped)
+                               continue;
+                       irq = evtchn_map_pirq(-1, entries[i].vector);
+                       attach_pirq_entry(irq, entries[i].entry, msi_dev_entry);
+                       entries[i].vector = irq;
+               }
+               populate_msi_sysfs(dev);
+               return 0;
+#else
+               return -EOPNOTSUPP;
+#endif
+       }
+
+       status = pci_msi_check_device(dev, nvec, PCI_CAP_ID_MSIX);
+       if (status)
+               return status;
+
+       /* Asking for more than the table holds: report the table size. */
+       nr_entries = pci_msix_table_size(dev);
+       if (nvec > nr_entries)
+               return nr_entries;
+
+       /* Check for any invalid entries */
+       for (i = 0; i < nvec; i++) {
+               if (entries[i].entry >= nr_entries)
+                       return -EINVAL;         /* invalid entry */
+               for (j = i + 1; j < nvec; j++) {
+                       if (entries[i].entry == entries[j].entry)
+                               return -EINVAL; /* duplicate entry */
+               }
+       }
+
+       temp = dev->irq;
+       /* Check whether driver already requested for MSI vector */
+       if (dev->msi_enabled) {
+               dev_info(&dev->dev, "can't enable MSI-X "
+                      "(MSI IRQ already assigned)\n");
+               return -EINVAL;
+       }
+
+       status = msix_capability_init(dev, entries, nvec);
+
+       /* Record the original irq only when MSI-X was really enabled. */
+       if ( !status )
+               msi_dev_entry->default_irq = temp;
+
+       return status;
+}
+EXPORT_SYMBOL(pci_enable_msix);
+
+extern void pci_frontend_disable_msix(struct pci_dev* dev);
+/*
+ * pci_msix_shutdown - turn MSI-X off for a device
+ * @dev: device to operate on; NULL is tolerated
+ *
+ * Does nothing unless MSI is globally enabled and MSI-X is currently
+ * enabled on @dev.  All vectors are released through
+ * msi_remove_pci_irq_vectors(); only the initial Xen domain touches the
+ * hardware enable bit and INTx.
+ */
+void pci_msix_shutdown(struct pci_dev *dev)
+{
+       if (!pci_msi_enable || !dev || !dev->msix_enabled)
+               return;
+
+       /*
+        * The statement governed by this "if" is picked by the preprocessor:
+        * with CONFIG_XEN_PCIDEV_FRONTEND a non-initial domain notifies the
+        * frontend and carries on; without it a non-initial domain returns
+        * here.
+        */
+       if (!is_initial_xendomain())
+#ifdef CONFIG_XEN_PCIDEV_FRONTEND
+               pci_frontend_disable_msix(dev);
+#else
+               return;
+#endif
+
+       msi_remove_pci_irq_vectors(dev);
+
+       /* Disable MSI mode */
+       if (is_initial_xendomain()) {
+               msix_set_enable(dev, 0);
+               pci_intx_for_msi(dev, 1);
+       }
+       dev->msix_enabled = 0;
+}
+
+/*
+ * pci_disable_msix - shut MSI-X down and remove the "msi_irqs" sysfs kset
+ * @dev: device to operate on
+ */
+void pci_disable_msix(struct pci_dev *dev)
+{
+       struct kset *irq_kset;
+
+       pci_msix_shutdown(dev);
+
+       /* Unregister first, then forget the pointer. */
+       irq_kset = dev->msi_kset;
+       kset_unregister(irq_kset);
+       dev->msi_kset = NULL;
+}
+EXPORT_SYMBOL(pci_disable_msix);
+
+/**
+ * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state
+ * @dev: pointer to the pci_dev data structure of MSI(X) device function
+ *
+ * Being called during hotplug remove, from which the device function
+ * is hot-removed. All previous assigned MSI/MSI-X irqs, if
+ * allocated for this device function, are reclaimed to unused state,
+ * which may be used later on.
+ *
+ * Every element of @dev's msi_list is unmapped, unlinked and freed while
+ * holding the per-device pirq_list_lock.  Afterwards the owner is reset to
+ * DOMID_IO and dev->irq is restored to the saved default irq.
+ **/
+void msi_remove_pci_irq_vectors(struct pci_dev *dev)
+{
+       unsigned long flags;
+       struct msi_dev_list *msi_dev_entry;
+       struct msi_pirq_entry *pirq_entry, *tmp;
+
+       if (!pci_msi_enable || !dev)
+               return;
+
+       msi_dev_entry = get_msi_dev_pirq_list(dev);
+
+       spin_lock_irqsave(&msi_dev_entry->pirq_list_lock, flags);
+       /* _safe variant: the current element is freed inside the loop. */
+       list_for_each_entry_safe(pirq_entry, tmp, &dev->msi_list, list) {
+               if (is_initial_xendomain())
+                       msi_unmap_pirq(dev, pirq_entry->pirq,
+                                      msi_dev_entry->owner,
+                                      &pirq_entry->kobj);
+               else
+                       evtchn_map_pirq(pirq_entry->pirq, 0);
+               list_del(&pirq_entry->list);
+               /*
+                * NOTE(review): the entry embeds a kobject and is freed
+                * directly here - confirm that msi_unmap_pirq() has dropped
+                * every reference to it by this point.
+                */
+               kfree(pirq_entry);
+       }
+       spin_unlock_irqrestore(&msi_dev_entry->pirq_list_lock, flags);
+       msi_dev_entry->owner = DOMID_IO;
+       dev->irq = msi_dev_entry->default_irq;
+}
+
+/*
+ * pci_no_msi - globally disable MSI/MSI-X support
+ *
+ * Clears pci_msi_enable, which the enable, shutdown and check routines in
+ * this file test before doing any work.
+ */
+void pci_no_msi(void)
+{
+       pci_msi_enable = 0;
+}
+
+/**
+ * pci_msi_enabled - is MSI enabled?
+ *
+ * Returns true if MSI has not been disabled by the command-line option
+ * pci=nomsi.  The value is the current state of the pci_msi_enable flag,
+ * which pci_no_msi() clears.
+ **/
+int pci_msi_enabled(void)
+{
+       return pci_msi_enable;
+}
+EXPORT_SYMBOL(pci_msi_enabled);
+
+/*
+ * pci_msi_init_pci_dev - per-device MSI initialisation
+ * @dev: device to initialise
+ *
+ * Sets up the (empty) msi_list and makes sure both MSI and MSI-X start
+ * out disabled in hardware.
+ */
+void pci_msi_init_pci_dev(struct pci_dev *dev)
+{
+       int msi_cap;
+
+       INIT_LIST_HEAD(&dev->msi_list);
+
+       /*
+        * Quiesce the MSI hardware so that no interrupts scream during
+        * boot.  Disabled is the power-on reset default, so this is
+        * normally a no-op.
+        */
+       msi_cap = pci_find_capability(dev, PCI_CAP_ID_MSI);
+       if (msi_cap != 0)
+               msi_set_enable(dev, msi_cap, 0);
+
+       msix_set_enable(dev, 0);
+}
diff --git a/drivers/pci/pci-iomul.c b/drivers/pci/pci-iomul.c

new file mode 100644 (file)

index 0000000..395c96b
--- /dev/null
+++ b/drivers/pci/pci-iomul.c
@@ -0,0 +1,440 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include "iomulti.h"
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <xen/public/iomulti.h>
+
+/*
+ * Per-open-file state of the I/O multiplexer device.  Allocated in
+ * pci_iomul_open(), bound to a device function by pci_iomul_setup() and
+ * freed in pci_iomul_release().
+ */
+struct pci_iomul_data {
+       /* Taken around every access to the fields below. */
+       struct mutex lock;
+
+       /* Device bound by setup; NULL until then.  Holds a reference. */
+       struct pci_dev *pdev;
+       /* Switch the device sits under; NULL until setup succeeds. */
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_slot *slot;    /* slot::kref */
+       struct pci_iomul_func **func;   /* when dereferencing,
+                                          sw->lock is necessary */
+};
+
+/*
+ * pci_iomul_func_ioport - translate a (BAR, offset) pair into a port number
+ * @func: function whose I/O BARs are consulted
+ * @bar: BAR index requested by the caller
+ * @offset: offset into that BAR
+ * @port: receives the resulting port number on success
+ *
+ * Returns 0 on success.  Returns -EINVAL when @bar is not flagged in
+ * func->io_bar or when the resulting port falls outside the BAR's
+ * [start, end] range.
+ *
+ * NOTE(review): only the first port is range-checked; a multi-byte access
+ * starting at the last port of the BAR would extend past it.
+ */
+static int pci_iomul_func_ioport(struct pci_iomul_func *func,
+                                uint8_t bar, uint64_t offset, int *port)
+{
+       /*
+        * @bar is supplied by the ioctl caller.  Reject indices that do not
+        * fit in the io_bar bitmask before shifting: a shift count at or
+        * beyond the operand width is undefined and could alias a valid bit,
+        * leading to an out-of-range resource[] index.
+        */
+       if (bar >= 8 * sizeof(func->io_bar))
+               return -EINVAL;
+       if (!(func->io_bar & (1ULL << bar)))
+               return -EINVAL;
+
+       *port = func->resource[bar].start + offset;
+       if (*port < func->resource[bar].start ||
+           *port > func->resource[bar].end)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Tell whether @iomul still refers to a usable function: the switch must
+ * have its I/O range allocated and the function slot must be populated.
+ * Both iomul->lock and the switch lock have to be held by the caller.
+ */
+static inline int pci_iomul_valid(struct pci_iomul_data *iomul)
+{
+       BUG_ON(!mutex_is_locked(&iomul->lock));
+       BUG_ON(!mutex_is_locked(&iomul->sw->lock));
+
+       if (!pci_iomul_switch_io_allocated(iomul->sw))
+               return 0;
+
+       return *iomul->func != NULL;
+}
+
+/*
+ * Take a reference on @pdev and turn on I/O space decoding in its
+ * PCI_COMMAND register.
+ */
+static void __pci_iomul_enable_io(struct pci_dev *pdev)
+{
+       uint16_t command;
+
+       pci_dev_get(pdev);
+
+       pci_read_config_word(pdev, PCI_COMMAND, &command);
+       command = command | PCI_COMMAND_IO;
+       pci_write_config_word(pdev, PCI_COMMAND, command);
+}
+
+/*
+ * Turn off I/O space decoding on @pdev and drop one device reference.
+ * Nothing is done when @iomul is no longer valid.
+ */
+static void __pci_iomul_disable_io(struct pci_iomul_data *iomul,
+                                  struct pci_dev *pdev)
+{
+       uint16_t command;
+
+       if (pci_iomul_valid(iomul)) {
+               pci_read_config_word(pdev, PCI_COMMAND, &command);
+               command = command & ~PCI_COMMAND_IO;
+               pci_write_config_word(pdev, PCI_COMMAND, command);
+               pci_dev_put(pdev);
+       }
+}
+
+/*
+ * open() handler: allocate an unbound pci_iomul_data, hang it off the file
+ * and mark the file non-seekable.  Returns -ENOMEM if allocation fails.
+ */
+static int pci_iomul_open(struct inode *inode, struct file *filp)
+{
+       struct pci_iomul_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+       if (!data)
+               return -ENOMEM;
+
+       /* Nothing is bound until the SETUP ioctl runs. */
+       mutex_init(&data->lock);
+       data->pdev = NULL;
+       data->sw = NULL;
+       data->slot = NULL;
+       data->func = NULL;
+
+       filp->private_data = (void*)data;
+       return nonseekable_open(inode, filp);
+}
+
+/*
+ * release() handler: undo whatever pci_iomul_setup() bound to this file.
+ *
+ * If a device was bound, I/O decoding is switched off when this file's
+ * device is the switch's current one, the switch's user count is dropped
+ * (releasing the I/O region when it reaches zero) and the device reference
+ * is put.  The slot and switch references are dropped after the locks have
+ * been released, and finally the per-file data is freed.  Always returns 0.
+ */
+static int pci_iomul_release(struct inode *inode, struct file *filp)
+{
+       struct pci_iomul_data *iomul =
+               (struct pci_iomul_data*)filp->private_data;
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_slot *slot = NULL;
+
+       mutex_lock(&iomul->lock);
+       sw = iomul->sw;
+       slot = iomul->slot;
+       if (iomul->pdev != NULL) {
+               if (sw != NULL) {
+                       mutex_lock(&sw->lock);
+                       /* Only the currently selected device has I/O enabled. */
+                       if (sw->current_pdev == iomul->pdev) {
+                               __pci_iomul_disable_io(iomul,
+                                                      sw->current_pdev);
+                               sw->current_pdev = NULL;
+                       }
+                       /* Last user gone: give the port range back. */
+                       sw->count--;
+                       if (sw->count == 0) {
+                               release_region(sw->io_region->start, sw->io_region->end - sw->io_region->start + 1);
+                               sw->io_region = NULL;
+                       }
+                       mutex_unlock(&sw->lock);
+               }
+               pci_dev_put(iomul->pdev);
+       }
+       mutex_unlock(&iomul->lock);
+
+       if (slot != NULL)
+               pci_iomul_slot_put(slot);
+       if (sw != NULL)
+               pci_iomul_switch_put(sw);
+       kfree(iomul);
+       return 0;
+}
+
+/*
+ * PCI_IOMUL_SETUP handler: bind this open file to one PCI function.
+ * @iomul: per-file state; must not be bound yet
+ * @arg: user pointer to the segment/bus/dev/func selection
+ *
+ * Looks the device up, locks its switch, claims the switch's I/O port
+ * range on first use and records device, switch, slot and function in
+ * @iomul.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, -ENODEV if the
+ * bus, device, switch, slot or function cannot be found or the switch has
+ * no I/O range allocated, and -EBUSY if the file is already bound or the
+ * port range cannot be claimed.
+ *
+ * NOTE(review): setup.func is used as an array index without a visible
+ * bounds check - confirm that its range is limited elsewhere.
+ */
+static long pci_iomul_setup(struct pci_iomul_data *iomul,
+                           struct pci_iomul_setup __user *arg)
+{
+       long error = 0;
+       struct pci_iomul_setup setup;
+       struct pci_iomul_switch *sw = NULL;
+       struct pci_iomul_slot *slot;
+       struct pci_bus *pbus;
+       struct pci_dev *pdev;
+
+       if (copy_from_user(&setup, arg, sizeof(setup)))
+               return -EFAULT;
+
+       pbus = pci_find_bus(setup.segment, setup.bus);
+       if (pbus == NULL)
+               return -ENODEV;
+       pdev = pci_get_slot(pbus, setup.dev);
+       if (pdev == NULL)
+               return -ENODEV;
+
+       mutex_lock(&iomul->lock);
+       /* A file can be bound only once. */
+       if (iomul->sw != NULL) {
+               error = -EBUSY;
+               goto out0;
+       }
+
+       pci_iomul_get_lock_switch(pdev, &sw, &slot);
+       if (sw == NULL || slot == NULL) {
+               error = -ENODEV;
+               goto out0;
+       }
+       if (!pci_iomul_switch_io_allocated(sw)) {
+               error = -ENODEV;
+               goto out;
+       }
+
+       if (slot->func[setup.func] == NULL) {
+               error = -ENODEV;
+               goto out;
+       }
+
+       /* First user of this switch claims the shared port range. */
+       if (sw->count == 0) {
+               BUG_ON(sw->io_region != NULL);
+               sw->io_region =
+                       request_region(sw->io_base,
+                                      sw->io_limit - sw->io_base + 1,
+                                      "PCI IO Multiplexer driver");
+               if (sw->io_region == NULL) {
+                       /*
+                        * sw->lock is released at the "out" label; it must
+                        * not be dropped here as well.
+                        */
+                       error = -EBUSY;
+                       goto out;
+               }
+       }
+       sw->count++;
+       pci_iomul_slot_get(slot);
+
+       iomul->pdev = pdev;
+       iomul->sw = sw;
+       iomul->slot = slot;
+       iomul->func = &slot->func[setup.func];
+
+out:
+       mutex_unlock(&sw->lock);
+out0:
+       mutex_unlock(&iomul->lock);
+       /* On failure nothing was recorded, so drop what was acquired. */
+       if (error != 0) {
+               if (sw != NULL)
+                       pci_iomul_switch_put(sw);
+               pci_dev_put(pdev);
+       }
+       return error;
+}
+
+/*
+ * Acquire iomul->lock and the bound switch's lock, in that order, and hand
+ * back the switch and the current function pointer.
+ *
+ * Returns 0 with both locks held, or -ENODEV with neither lock held when
+ * the file is not bound or is no longer valid.
+ */
+static int pci_iomul_lock(struct pci_iomul_data *iomul,
+                         struct pci_iomul_switch **sw,
+                         struct pci_iomul_func **func)
+{
+       mutex_lock(&iomul->lock);
+
+       *sw = iomul->sw;
+       if (*sw == NULL)
+               goto unlock_iomul;
+
+       mutex_lock(&(*sw)->lock);
+       if (!pci_iomul_valid(iomul))
+               goto unlock_switch;
+
+       *func = *iomul->func;
+       return 0;
+
+unlock_switch:
+       mutex_unlock(&(*sw)->lock);
+unlock_iomul:
+       mutex_unlock(&iomul->lock);
+       return -ENODEV;
+}
+
+/*
+ * PCI_IOMUL_DISABLE_IO handler: if this file's device is the one currently
+ * selected on its switch, switch its I/O decoding off and deselect it.
+ *
+ * Returns 0 on success and -ENODEV when the file is unbound, invalid or
+ * has no device recorded.
+ */
+static long pci_iomul_disable_io(struct pci_iomul_data *iomul)
+{
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_func *unused_func;
+       struct pci_dev *bound;
+       long rc = 0;
+
+       if (pci_iomul_lock(iomul, &sw, &unused_func) < 0)
+               return -ENODEV;
+
+       bound = iomul->pdev;
+       if (bound == NULL) {
+               rc = -ENODEV;
+       } else if (sw->current_pdev == bound) {
+               __pci_iomul_disable_io(iomul, bound);
+               sw->current_pdev = NULL;
+       }
+
+       mutex_unlock(&sw->lock);
+       mutex_unlock(&iomul->lock);
+       return rc;
+}
+
+/*
+ * Make @next_pdev the device whose I/O decoding is enabled on @sw.  Any
+ * previously selected device is disabled first; nothing happens when
+ * @next_pdev is already selected.
+ */
+static void pci_iomul_switch_to(
+       struct pci_iomul_data *iomul, struct pci_iomul_switch *sw,
+       struct pci_dev *next_pdev)
+{
+       struct pci_dev *active = sw->current_pdev;
+
+       if (active != next_pdev) {
+               if (active != NULL)
+                       __pci_iomul_disable_io(iomul, active);
+
+               __pci_iomul_enable_io(next_pdev);
+               sw->current_pdev = next_pdev;
+       }
+}
+
+/*
+ * PCI_IOMUL_IN handler: read 1, 2 or 4 bytes from an I/O port of the bound
+ * function and store the result in arg->value.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read or written, -ENODEV
+ * if the file is not bound or no longer valid, and -EINVAL for a bad
+ * BAR/offset or an unsupported access size.
+ */
+static long pci_iomul_in(struct pci_iomul_data *iomul,
+                        struct pci_iomul_in __user *arg)
+{
+       struct pci_iomul_in in;
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_func *func;
+
+       long error = 0;
+       int port;
+       uint32_t value = 0;
+
+       if (copy_from_user(&in, arg, sizeof(in)))
+               return -EFAULT;
+
+       /* On success both iomul->lock and sw->lock are held. */
+       if (pci_iomul_lock(iomul, &sw, &func) < 0)
+               return -ENODEV;
+
+       error = pci_iomul_func_ioport(func, in.bar, in.offset, &port);
+       if (error)
+               goto out;
+
+       /* Route the shared port range to this file's device first. */
+       pci_iomul_switch_to(iomul, sw, iomul->pdev);
+       switch (in.size) {
+       case 4:
+               value = inl(port);
+               break;
+       case 2:
+               value = inw(port);
+               break;
+       case 1:
+               value = inb(port);
+               break;
+       default:
+               error = -EINVAL;
+               break;
+       }
+
+out:
+       mutex_unlock(&sw->lock);
+       mutex_unlock(&iomul->lock);
+
+       /* Copy the result out only after the locks have been dropped. */
+       if (error == 0 && put_user(value, &arg->value))
+               return -EFAULT;
+       return error;
+}
+
+/*
+ * PCI_IOMUL_OUT handler: write 1, 2 or 4 bytes of arg->value to an I/O
+ * port of the bound function.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, -ENODEV if the
+ * file is not bound or no longer valid, and -EINVAL for a bad BAR/offset
+ * or an unsupported access size.
+ */
+static long pci_iomul_out(struct pci_iomul_data *iomul,
+                         struct pci_iomul_out __user *arg)
+{
+       /* Local copy uses the same type as the user argument. */
+       struct pci_iomul_out out;
+       struct pci_iomul_switch *sw;
+       struct pci_iomul_func *func;
+
+       long error = 0;
+       int port;
+
+       if (copy_from_user(&out, arg, sizeof(out)))
+               return -EFAULT;
+
+       /* On success both iomul->lock and sw->lock are held. */
+       if (pci_iomul_lock(iomul, &sw, &func) < 0)
+               return -ENODEV;
+
+       error = pci_iomul_func_ioport(func, out.bar, out.offset, &port);
+       if (error)
+               goto out;
+
+       /* Route the shared port range to this file's device first. */
+       pci_iomul_switch_to(iomul, sw, iomul->pdev);
+       switch (out.size) {
+       case 4:
+               outl(out.value, port);
+               break;
+       case 2:
+               outw(out.value, port);
+               break;
+       case 1:
+               outb(out.value, port);
+               break;
+       default:
+               error = -EINVAL;
+               break;
+       }
+
+out:
+       mutex_unlock(&sw->lock);
+       mutex_unlock(&iomul->lock);
+       return error;
+}
+
+/*
+ * ioctl entry point.  The caller needs both CAP_SYS_ADMIN and
+ * CAP_SYS_RAWIO (-EPERM otherwise).  Unknown commands yield -ENOSYS; all
+ * others return whatever their handler returns.
+ */
+static long pci_iomul_ioctl(struct file *filp,
+                           unsigned int cmd, unsigned long arg)
+{
+       struct pci_iomul_data *iomul =
+               (struct pci_iomul_data*)filp->private_data;
+
+       if (!(capable(CAP_SYS_ADMIN) && capable(CAP_SYS_RAWIO)))
+               return -EPERM;
+
+       switch (cmd) {
+       case PCI_IOMUL_SETUP:
+               return pci_iomul_setup(iomul,
+                                      (struct pci_iomul_setup __user *)arg);
+       case PCI_IOMUL_DISABLE_IO:
+               return pci_iomul_disable_io(iomul);
+       case PCI_IOMUL_IN:
+               return pci_iomul_in(iomul, (struct pci_iomul_in __user *)arg);
+       case PCI_IOMUL_OUT:
+               return pci_iomul_out(iomul,
+                                    (struct pci_iomul_out __user *)arg);
+       default:
+               return -ENOSYS;
+       }
+}
+
+/*
+ * File operations of the multiplexer device: the only entry points are
+ * open, release and the unlocked ioctl.
+ */
+static const struct file_operations pci_iomul_fops = {
+       .owner = THIS_MODULE,
+
+       .open = pci_iomul_open,
+       .release = pci_iomul_release,
+
+       .unlocked_ioctl = pci_iomul_ioctl,
+};
+
+/*
+ * Misc device registered by pci_iomul_init(); it uses a dynamically
+ * assigned minor and the node name "xen/pci_iomul".
+ */
+static struct miscdevice pci_iomul_miscdev = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "pci_iomul",
+       .nodename = "xen/pci_iomul",
+       .fops = &pci_iomul_fops,
+};
+
+/*
+ * Register the multiplexer misc device.  Only the initial Xen domain gets
+ * one; everywhere else -ENODEV is returned.  A registration failure is
+ * logged and its error code passed on.
+ */
+static int __init pci_iomul_init(void)
+{
+       int error;
+
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       error = misc_register(&pci_iomul_miscdev);
+       if (error) {
+               /* Terminate the message so that it is not merged with the next one. */
+               pr_alert("Couldn't register /dev/xen/pci_iomul\n");
+               return error;
+       }
+       pr_info("PCI IO multiplexer device installed\n");
+       return 0;
+}
+
+/* Module unload hook, only built when this is compiled as a module. */
+#ifdef MODULE
+static void __exit pci_iomul_cleanup(void)
+{
+       /* Undo the misc_register() done in pci_iomul_init(). */
+       misc_deregister(&pci_iomul_miscdev);
+}
+module_exit(pci_iomul_cleanup);
+#endif
+
+/*
+ * This must be called after pci fixup final which is called by
+ * device_initcall(pci_init).
+ */
+late_initcall(pci_iomul_init);
+
+MODULE_ALIAS("devname:xen/pci_iomul");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Isaku Yamahata <yamahata@valinux.co.jp>");
+MODULE_DESCRIPTION("PCI IO space multiplexing driver");
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c

index 111569c..472f30e 100644 (file)
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -482,7 +482,12 @@ pci_find_parent_resource(const struct pci_dev *dev, struct resource *res)
   * Restore the BAR values for a given device, so as to make it
   * accessible by its driver.
   */
+#ifndef CONFIG_XEN
  static void
+#else
+EXPORT_SYMBOL_GPL(pci_restore_bars);
+void
+#endif
  pci_restore_bars(struct pci_dev *dev)
  {
         int i;
@@ -3721,6 +3726,13 @@ resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
   */
  int pci_is_reassigndev(struct pci_dev *dev)
  {
+#ifdef CONFIG_PCI_GUESTDEV
+       int result;
+
+       result = pci_is_guestdev_to_reassign(dev);
+       if (result)
+               return result;
+#endif /* CONFIG_PCI_GUESTDEV */
         return (pci_specified_resource_alignment(dev) != 0);
  }
  
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h

index e494347..4352947 100644 (file)
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -326,4 +326,26 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe)
  }
  #endif
  
+#ifdef CONFIG_PCI_GUESTDEV
+extern int pci_is_guestdev_to_reassign(struct pci_dev *dev);
+extern int pci_is_iomuldev(struct pci_dev *dev);
+#else
+#define pci_is_iomuldev(dev)   0
+#endif
+
+#ifdef CONFIG_PCI_RESERVE
+unsigned long pci_reserve_size_io(struct pci_bus *bus);
+unsigned long pci_reserve_size_mem(struct pci_bus *bus);
+#else
+static inline unsigned long pci_reserve_size_io(struct pci_bus *bus)
+{
+       return 0;
+}
+
+static inline unsigned long pci_reserve_size_mem(struct pci_bus *bus)
+{
+       return 0;
+}
+#endif /* CONFIG_PCI_RESERVE */
+
  #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c

index 5e1ca3c..3ad6bba 100644 (file)
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1300,6 +1300,11 @@ static void pci_init_capabilities(struct pci_dev *dev)
         /* Vital Product Data */
         pci_vpd_pci22_init(dev);
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               return;
+#endif
+
         /* Alternative Routing-ID Forwarding */
         pci_enable_ari(dev);
  
@@ -1424,13 +1429,20 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
                 return 0; /* Already scanned the entire slot */
  
         dev = pci_scan_single_device(bus, devfn);
-       if (!dev)
+       if (!dev) {
+#ifdef pcibios_scan_all_fns
+               if (!pcibios_scan_all_fns(bus, devfn))
+#endif
                 return 0;
-       if (!dev->is_added)
+       } else if (!dev->is_added)
                 nr++;
  
         if (pci_ari_enabled(bus))
                 next_fn = next_ari_fn;
+#ifdef pcibios_scan_all_fns
+       else if (pcibios_scan_all_fns(bus, devfn))
+               next_fn = next_trad_fn;
+#endif
         else if (dev->multifunction)
                 next_fn = next_trad_fn;
  
diff --git a/drivers/pci/reserve.c b/drivers/pci/reserve.c

new file mode 100644 (file)

index 0000000..2be94ac
--- /dev/null
+++ b/drivers/pci/reserve.c
@@ -0,0 +1,137 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <asm/setup.h>
+
+static char pci_reserve_param[COMMAND_LINE_SIZE];
+
+/* pci_reserve=        [PCI]
+ * Format: <sbdf>+{IO|MEM}<size>[+{IO|MEM}<size>][,<sbdf>...]
+ * Format of sbdf: [<segment>:]<bus>:<dev>.<func> (all numbers hex; io/mem ok)
+ */
+static int pci_reserve_parse_size(const char *str,
+                                 unsigned long *io_size,
+                                 unsigned long *mem_size)
+{
+       if (sscanf(str, "io%lx", io_size) == 1 ||
+           sscanf(str, "IO%lx", io_size) == 1)
+               return 0;
+
+       if (sscanf(str, "mem%lx", mem_size) == 1 ||
+           sscanf(str, "MEM%lx", mem_size) == 1)
+               return 0;
+
+       return -EINVAL;
+}
+
+static int pci_reserve_parse_one(const char *str,
+                                int *seg, int *bus, int *dev, int *func,
+                                unsigned long *io_size,
+                                unsigned long *mem_size)
+{
+       char *p;
+
+       *io_size = 0;
+       *mem_size = 0;
+
+       if (sscanf(str, "%x:%x:%x.%x", seg, bus, dev, func) != 4) {
+               *seg = 0;
+               if (sscanf(str, "%x:%x.%x", bus, dev, func) != 3) {
+                       return -EINVAL;
+               }
+       }
+
+       p = strchr(str, '+');
+       if (p == NULL)
+               return -EINVAL;
+       if (pci_reserve_parse_size(++p, io_size, mem_size))
+               return -EINVAL;
+
+       p = strchr(p, '+');
+       return p ? pci_reserve_parse_size(p + 1, io_size, mem_size) : 0;
+}
+
+/*
+ * Return the size given on "pci_reserve=" for the bridge leading to @pbus:
+ * the IO size for @flags == IORESOURCE_IO, the MEM size for IORESOURCE_MEM,
+ * and 0 for any other @flags or when no entry matches.
+ */
+static unsigned long pci_reserve_size(struct pci_bus *pbus, int flags)
+{
+       char *sp = pci_reserve_param;
+       char *ep;
+       int seg, bus, dev, func;
+       int matched;
+       unsigned long io_size;
+       unsigned long mem_size;
+
+       if (flags != IORESOURCE_IO && flags != IORESOURCE_MEM)
+               return 0;
+       /* A bus without an upstream bridge device cannot match an sbdf. */
+       if (!pbus->self)
+               return 0;
+
+       do {
+               ep = strchr(sp, ',');
+               if (ep)
+                       *ep = '\0';     /* chomp */
+
+               matched = pci_reserve_parse_one(sp, &seg, &bus, &dev, &func,
+                                               &io_size, &mem_size) == 0 &&
+                         pci_domain_nr(pbus) == seg &&
+                         pbus->number == bus &&
+                         PCI_SLOT(pbus->self->devfn) == dev &&
+                         PCI_FUNC(pbus->self->devfn) == func;
+
+               /*
+                * Restore the chomp'ed ',' before returning as well: the
+                * parameter string is parsed again on every later call.
+                */
+               if (ep)
+                       *ep++ = ',';
+               if (matched)
+                       return flags == IORESOURCE_IO ? io_size : mem_size;
+
+               sp = ep;
+       } while (ep);
+
+       return 0;
+}
+
+unsigned long pci_reserve_size_io(struct pci_bus *pbus)
+{
+       return pci_reserve_size(pbus, IORESOURCE_IO);
+}
+
+unsigned long pci_reserve_size_mem(struct pci_bus *pbus)
+{
+       return pci_reserve_size(pbus, IORESOURCE_MEM);
+}
+
+static int __init pci_reserve_setup(char *str)
+{
+       if (!is_initial_xendomain() || strlen(str) >= sizeof(pci_reserve_param))
+               return 0;
+       strlcpy(pci_reserve_param, str, sizeof(pci_reserve_param));
+       return 1;
+}
+__setup("pci_reserve=", pci_reserve_setup);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c

index 8fa2d4b..35c093d 100644 (file)
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -708,7 +708,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
  {
         struct pci_dev *dev;
         struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
-       unsigned long size = 0, size0 = 0, size1 = 0;
+       unsigned long size = 0, size0 = 0, size1 = 0, res_size;
         resource_size_t children_add_size = 0;
  
         if (!b_res)
@@ -742,6 +742,11 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
         size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
                 calculate_iosize(size, min_size, add_size + size1,
                         resource_size(b_res), 4096);
+       res_size = pci_reserve_size_io(bus);
+       if (size0 < res_size)
+               size0 = ALIGN(res_size, 4096);
+       if (size1 < res_size)
+               size1 = ALIGN(res_size, 4096);
         if (!size0 && !size1) {
                 if (b_res->start || b_res->end)
                         dev_info(&bus->self->dev, "disabling bridge window "
@@ -854,6 +859,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
                         min_align = align1 >> 1;
                 align += aligns[order];
         }
+       size = max(size, (resource_size_t)pci_reserve_size_mem(bus));
         size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
         if (children_add_size > add_size)
                 add_size = children_add_size;
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig

index 8c8377d..c6914bb 100644 (file)
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -563,7 +563,7 @@ config RTC_DRV_DA9052
  
  config RTC_DRV_EFI
         tristate "EFI RTC"
-       depends on IA64
+       depends on IA64 || (XEN && EFI)
         help
           If you say yes here you will get support for the EFI
           Real Time Clock.
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig

index 29684c8..e64062c 100644 (file)
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -657,7 +657,7 @@ config SCSI_FLASHPOINT
  
  config VMWARE_PVSCSI
         tristate "VMware PVSCSI driver support"
-       depends on PCI && SCSI && X86
+       depends on PCI && SCSI && !XEN && X86
         help
           This driver supports VMware's para virtualized SCSI HBA.
           To compile this driver as a module, choose M here: the
diff --git a/drivers/scsi/arcmsr/arcmsr.h b/drivers/scsi/arcmsr/arcmsr.h

index 77b26f5..302efaf 100644 (file)
--- a/drivers/scsi/arcmsr/arcmsr.h
+++ b/drivers/scsi/arcmsr/arcmsr.h
@@ -46,7 +46,7 @@
  struct device_attribute;
  /*The limit of outstanding scsi command that firmware can handle*/
  #define ARCMSR_MAX_OUTSTANDING_CMD                                             256
-#ifdef CONFIG_XEN
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
         #define ARCMSR_MAX_FREECCB_NUM  160
  #else
         #define ARCMSR_MAX_FREECCB_NUM  320
diff --git a/drivers/scsi/device_handler/scsi_dh.c b/drivers/scsi/device_handler/scsi_dh.c

index 48e46f5..f34e729 100644 (file)
--- a/drivers/scsi/device_handler/scsi_dh.c
+++ b/drivers/scsi/device_handler/scsi_dh.c
@@ -388,7 +388,7 @@ int scsi_dh_activate(struct request_queue *q, activate_complete fn, void *data)
         struct device *dev = NULL;
  
         spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
+       sdev = scsi_device_from_queue(q);
         if (!sdev) {
                 spin_unlock_irqrestore(q->queue_lock, flags);
                 err = SCSI_DH_NOSYS;
@@ -483,7 +483,7 @@ int scsi_dh_attach(struct request_queue *q, const char *name)
                 return -EINVAL;
  
         spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
+       sdev = scsi_device_from_queue(q);
         if (!sdev || !get_device(&sdev->sdev_gendev))
                 err = -ENODEV;
         spin_unlock_irqrestore(q->queue_lock, flags);
@@ -511,7 +511,7 @@ void scsi_dh_detach(struct request_queue *q)
         struct scsi_device_handler *scsi_dh = NULL;
  
         spin_lock_irqsave(q->queue_lock, flags);
-       sdev = q->queuedata;
+       sdev = scsi_device_from_queue(q);
         if (!sdev || !get_device(&sdev->sdev_gendev))
                 sdev = NULL;
         spin_unlock_irqrestore(q->queue_lock, flags);
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c

index 3a6c474..8e8b96d 100644 (file)
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -93,6 +93,8 @@ static int max_requests = IBMVSCSI_MAX_REQUESTS_DEFAULT;
  static int max_events = IBMVSCSI_MAX_REQUESTS_DEFAULT + 2;
  static int fast_fail = 1;
  static int client_reserve = 1;
+/*host data buffer size*/
+#define buff_size 4096
  
  static struct scsi_transport_template *ibmvscsi_transport_template;
  
@@ -100,6 +102,9 @@ static struct scsi_transport_template *ibmvscsi_transport_template;
  
  static struct ibmvscsi_ops *ibmvscsi_ops;
  
+#define IBMVSCSI_PROC_NAME "ibmvscsi"
+/* The driver is named ibmvscsic, map ibmvscsi to module name */
+MODULE_ALIAS(IBMVSCSI_PROC_NAME);
  MODULE_DESCRIPTION("IBM Virtual SCSI");
  MODULE_AUTHOR("Dave Boutcher");
  MODULE_LICENSE("GPL");
@@ -1663,7 +1668,7 @@ static ssize_t show_host_srp_version(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
         int len;
  
-       len = snprintf(buf, PAGE_SIZE, "%s\n",
+       len = snprintf(buf, buff_size, "%s\n",
                        hostdata->madapter_info.srp_version);
         return len;
  }
@@ -1684,7 +1689,7 @@ static ssize_t show_host_partition_name(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
         int len;
  
-       len = snprintf(buf, PAGE_SIZE, "%s\n",
+       len = snprintf(buf, buff_size, "%s\n",
                        hostdata->madapter_info.partition_name);
         return len;
  }
@@ -1705,7 +1710,7 @@ static ssize_t show_host_partition_number(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
         int len;
  
-       len = snprintf(buf, PAGE_SIZE, "%d\n",
+       len = snprintf(buf, buff_size, "%d\n",
                        hostdata->madapter_info.partition_number);
         return len;
  }
@@ -1725,7 +1730,7 @@ static ssize_t show_host_mad_version(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
         int len;
  
-       len = snprintf(buf, PAGE_SIZE, "%d\n",
+       len = snprintf(buf, buff_size, "%d\n",
                        hostdata->madapter_info.mad_version);
         return len;
  }
@@ -1745,7 +1750,7 @@ static ssize_t show_host_os_type(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
         int len;
  
-       len = snprintf(buf, PAGE_SIZE, "%d\n", hostdata->madapter_info.os_type);
+       len = snprintf(buf, buff_size, "%d\n", hostdata->madapter_info.os_type);
         return len;
  }
  
@@ -1764,7 +1769,7 @@ static ssize_t show_host_config(struct device *dev,
         struct ibmvscsi_host_data *hostdata = shost_priv(shost);
  
         /* returns null-terminated host config data */
-       if (ibmvscsi_do_host_config(hostdata, buf, PAGE_SIZE) == 0)
+       if (ibmvscsi_do_host_config(hostdata, buf, buff_size) == 0)
                 return strlen(buf);
         else
                 return 0;
@@ -1796,7 +1801,7 @@ static struct device_attribute *ibmvscsi_attrs[] = {
  static struct scsi_host_template driver_template = {
         .module = THIS_MODULE,
         .name = "IBM POWER Virtual SCSI Adapter " IBMVSCSI_VERSION,
-       .proc_name = "ibmvscsi",
+       .proc_name = IBMVSCSI_PROC_NAME,
         .queuecommand = ibmvscsi_queuecommand,
         .eh_abort_handler = ibmvscsi_eh_abort_handler,
         .eh_device_reset_handler = ibmvscsi_eh_device_reset_handler,
@@ -2061,7 +2066,7 @@ static struct vio_driver ibmvscsi_driver = {
         .probe = ibmvscsi_probe,
         .remove = ibmvscsi_remove,
         .get_desired_dma = ibmvscsi_get_desired_dma,
-       .name = "ibmvscsi",
+       .name = IBMVSCSI_PROC_NAME,
         .pm = &ibmvscsi_pm_ops,
  };
  
diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c

index 35bd138..fa2e129 100644 (file)
--- a/drivers/scsi/megaraid/megaraid_mbox.c
+++ b/drivers/scsi/megaraid/megaraid_mbox.c
@@ -1588,13 +1588,20 @@ megaraid_mbox_build_cmd(adapter_t *adapter, struct scsi_cmnd *scp, int *busy)
                 case MODE_SENSE:
                 {
                         struct scatterlist      *sgl;
-                       caddr_t                 vaddr;
+                       struct page             *pg;
+                       unsigned char           *vaddr;
+                       unsigned long           flags;
  
                         sgl = scsi_sglist(scp);
-                       if (sg_page(sgl)) {
-                               vaddr = (caddr_t) sg_virt(&sgl[0]);
+                       pg = sg_page(sgl);
+                       if (pg) {
+                               local_irq_save(flags);
+                               vaddr = kmap_atomic(pg, KM_BIO_SRC_IRQ) + sgl->offset;
  
                                 memset(vaddr, 0, scp->cmnd[4]);
+
+                               kunmap_atomic(vaddr, KM_BIO_SRC_IRQ);
+                               local_irq_restore(flags);
                         }
                         else {
                                 con_log(CL_ANN, (KERN_WARNING
@@ -2332,9 +2339,20 @@ megaraid_mbox_dpc(unsigned long devp)
                 if (scp->cmnd[0] == INQUIRY && status == 0 && islogical == 0
                                 && IS_RAID_CH(raid_dev, scb->dev_channel)) {
  
+                       struct page             *pg;
+                       unsigned char           *vaddr;
+                       unsigned long           flags;
+
                         sgl = scsi_sglist(scp);
-                       if (sg_page(sgl)) {
-                               c = *(unsigned char *) sg_virt(&sgl[0]);
+                       pg = sg_page(sgl);
+                       if (pg) {
+                               local_irq_save(flags);
+                               vaddr = kmap_atomic(pg, KM_BIO_SRC_IRQ) + sgl->offset;
+
+                               c = *vaddr;
+
+                               kunmap_atomic(vaddr, KM_BIO_SRC_IRQ);
+                               local_irq_restore(flags);
                         } else {
                                 con_log(CL_ANN, (KERN_WARNING
                                                  "megaraid mailbox: invalid sg:%d\n",
diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c

index cf8dfab..e499f67 100644 (file)
--- a/drivers/scsi/scsi_devinfo.c
+++ b/drivers/scsi/scsi_devinfo.c
@@ -160,7 +160,7 @@ static struct {
         {"DGC", "RAID", NULL, BLIST_SPARSELUN}, /* Dell PV 650F, storage on LUN 0 */
         {"DGC", "DISK", NULL, BLIST_SPARSELUN}, /* Dell PV 650F, no storage on LUN 0 */
         {"EMC",  "Invista", "*", BLIST_SPARSELUN | BLIST_LARGELUN},
-       {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | BLIST_FORCELUN},
+       {"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | BLIST_REPORTLUN2},
         {"EMULEX", "MD21/S2     ESDI", NULL, BLIST_SINGLELUN},
         {"easyRAID", "16P", NULL, BLIST_NOREPORTLUN},
         {"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN},
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c

index 386f0c5..cc3f237 100644 (file)
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -25,6 +25,8 @@
  #include <linux/interrupt.h>
  #include <linux/blkdev.h>
  #include <linux/delay.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
  
  #include <scsi/scsi.h>
  #include <scsi/scsi_cmnd.h>
@@ -35,6 +37,7 @@
  #include <scsi/scsi_transport.h>
  #include <scsi/scsi_host.h>
  #include <scsi/scsi_ioctl.h>
+#include <scsi/scsi_netlink_ml.h>
  
  #include "scsi_priv.h"
  #include "scsi_logging.h"
@@ -43,6 +46,7 @@
  #include <trace/events/scsi.h>
  
  #define SENSE_TIMEOUT          (10*HZ)
+#define TEST_UNIT_READY_TIMEOUT        (30*HZ)
  
  /*
   * These should *probably* be handled by the host itself.
@@ -222,6 +226,80 @@ static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost,
  }
  #endif
  
+#ifdef CONFIG_SCSI_NETLINK
+/**
+ * scsi_post_sense_event - called to post a 'Sense Code' event
+ *
+ * @sdev:              SCSI device the sense code occurred on
+ * @sshdr:             SCSI sense code
+ *
+ * Multicasts the sense data to the SCSI_NL_GRP_ML_EVENTS netlink group.
+ * Nothing is returned; a message that cannot be sent is only logged.
+ * The skb is allocated with GFP_ATOMIC, so the send must not sleep either.
+ *
+ */
+static void scsi_post_sense_event(struct scsi_device *sdev,
+                       struct scsi_sense_hdr *sshdr)
+{
+       struct sk_buff *skb;
+       struct nlmsghdr *nlh;
+       struct scsi_nl_sense_msg *msg;
+       u32 len, skblen;
+       int err;
+
+       if (!scsi_nl_sock) {
+               err = -ENOENT;
+               goto send_fail;
+       }
+
+       len = SCSI_NL_MSGALIGN(sizeof(*msg));
+       skblen = NLMSG_SPACE(len);
+
+       skb = alloc_skb(skblen, GFP_ATOMIC);
+       if (!skb) {
+               err = -ENOBUFS;
+               goto send_fail;
+       }
+
+       nlh = nlmsg_put(skb, 0, 0, SCSI_TRANSPORT_MSG,
+                               skblen - sizeof(*nlh), 0);
+       if (!nlh) {
+               err = -ENOBUFS;
+               goto send_fail_skb;
+       }
+       msg = NLMSG_DATA(nlh);
+
+       INIT_SCSI_NL_HDR(&msg->snlh, SCSI_NL_TRANSPORT_ML,
+                        ML_NL_SCSI_SENSE, len);
+       msg->host_no = sdev->host->host_no;
+       msg->channel = sdev->channel;
+       msg->id = sdev->id;
+       msg->lun = sdev->lun;
+       msg->sense = (sshdr->response_code << 24) | (sshdr->sense_key << 16) |
+               (sshdr->asc << 8) | sshdr->ascq;
+
+       err = nlmsg_multicast(scsi_nl_sock, skb, 0, SCSI_NL_GRP_ML_EVENTS,
+                             GFP_ATOMIC);
+       if (err && (err != -ESRCH))
+               /* nlmsg_multicast already kfree_skb'd */
+               goto send_fail;
+
+       return;
+
+send_fail_skb:
+       kfree_skb(skb);
+send_fail:
+       sdev_printk(KERN_WARNING, sdev,
+                   "Dropped SCSI Msg %02x/%02x/%02x/%02x: err %d\n",
+                   sshdr->response_code, sshdr->sense_key,
+                   sshdr->asc, sshdr->ascq, err);
+       return;
+}
+#else
+static inline void scsi_post_sense_event(struct scsi_device *sdev,
+                          struct scsi_sense_hdr *sshdr) {}
+#endif
+
  /**
   * scsi_check_sense - Examine scsi cmd sense
   * @scmd:      Cmd to have sense checked.
@@ -244,6 +322,8 @@ static int scsi_check_sense(struct scsi_cmnd *scmd)
         if (scsi_sense_is_deferred(&sshdr))
                 return NEEDS_RETRY;
  
+       scsi_post_sense_event(sdev, &sshdr);
+
         if (sdev->scsi_dh_data && sdev->scsi_dh_data->scsi_dh &&
                         sdev->scsi_dh_data->scsi_dh->check_sense) {
                 int rc;
@@ -309,7 +389,8 @@ static int scsi_check_sense(struct scsi_cmnd *scmd)
                  * if the device is in the process of becoming ready, we
                  * should retry.
                  */
-               if ((sshdr.asc == 0x04) && (sshdr.ascq == 0x01))
+               if ((sshdr.asc == 0x04) &&
+                   (sshdr.ascq == 0x01 || sshdr.ascq == 0x0a))
                         return NEEDS_RETRY;
                 /*
                  * if the device is not started, we need to wake
@@ -953,7 +1034,7 @@ static int scsi_eh_tur(struct scsi_cmnd *scmd)
         int retry_cnt = 1, rtn;
  
  retry_tur:
-       rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, SENSE_TIMEOUT, 0);
+       rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, TEST_UNIT_READY_TIMEOUT, 0);
  
         SCSI_LOG_ERROR_RECOVERY(3, printk("%s: scmd %p rtn %x\n",
                 __func__, scmd, rtn));
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c

index 5dfd749..ae595be 100644 (file)
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1612,6 +1612,17 @@ out:
         spin_lock_irq(q->queue_lock);
  }
  
+struct scsi_device *scsi_device_from_queue(struct request_queue *q)
+{
+       struct scsi_device *sdev = NULL;
+
+       if (q->request_fn == scsi_request_fn)
+               sdev = q->queuedata;
+
+       return sdev;
+}
+EXPORT_SYMBOL_GPL(scsi_device_from_queue);
+
  u64 scsi_calculate_bounce_limit(struct Scsi_Host *shost)
  {
         struct device *host_dev;
diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c

index c77628a..a8a17d7 100644 (file)
--- a/drivers/scsi/scsi_netlink.c
+++ b/drivers/scsi/scsi_netlink.c
@@ -260,7 +260,7 @@ scsi_generic_msg_handler(struct sk_buff *skb)
  
                 /* if successful, scsi_host_lookup takes a shost reference */
                 shost = scsi_host_lookup(msg->host_no);
-               if (!shost) {
+               if (IS_ERR(shost)) {
                         err = -ENODEV;
                         goto driver_exit;
                 }
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c

index 01b0374..c97aa77 100644 (file)
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -693,7 +693,7 @@ static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
          * and displaying garbage for the Vendor, Product, or Revision
          * strings.
          */
-       if (sdev->inquiry_len < 36) {
+       if (sdev->inquiry_len < 36 && printk_ratelimit()) {
                 printk(KERN_INFO "scsi scan: INQUIRY result too short (%d),"
                                 " using 36\n", sdev->inquiry_len);
                 sdev->inquiry_len = 36;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c

index 5ba5c2a..383f931 100644 (file)
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1637,8 +1637,7 @@ sd_spinup_disk(struct scsi_disk *sdkp)
                  * Yes, this sense key/ASC combination shouldn't
                  * occur here.  It's characteristic of these devices.
                  */
-               } else if (sense_valid &&
-                               sshdr.sense_key == UNIT_ATTENTION &&
+               } else if (sshdr.sense_key == UNIT_ATTENTION &&
                                 sshdr.asc == 0x28) {
                         if (!spintime) {
                                 spintime_expire = jiffies + 5 * HZ;
@@ -2634,6 +2633,20 @@ static void sd_probe_async(void *data, async_cookie_t cookie)
         put_device(&sdkp->dev);
  }
  
+static int sd_get_index(int *index)
+{
+       int error = -ENOMEM;
+       do {
+               if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
+                       break;
+
+               spin_lock(&sd_index_lock);
+               error = ida_get_new(&sd_index_ida, index);
+               spin_unlock(&sd_index_lock);
+       } while (error == -EAGAIN);
+
+       return error;
+}
  /**
   *     sd_probe - called during driver initialization and whenever a
   *     new scsi device is attached to the system. It is called once
@@ -2676,15 +2689,7 @@ static int sd_probe(struct device *dev)
         if (!gd)
                 goto out_free;
  
-       do {
-               if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
-                       goto out_put;
-
-               spin_lock(&sd_index_lock);
-               error = ida_get_new(&sd_index_ida, &index);
-               spin_unlock(&sd_index_lock);
-       } while (error == -EAGAIN);
-
+       error = sd_get_index(&index);
         if (error) {
                 sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n");
                 goto out_put;
@@ -2894,6 +2899,42 @@ done:
         return ret;
  }
  
+/*
+* Each major represents 16 disks. A minor is used for the disk itself and 15
+* partitions. Mark each disk busy so that sd_probe can not reclaim this major.
+*/
+static int __init init_sd_ida(int *error)
+{
+       int *index, i, j, err;
+
+       index = kmalloc(SD_MAJORS * (256 / SD_MINORS) * sizeof(int), GFP_KERNEL);
+       if (!index)
+               return -ENOMEM;
+
+       /* Mark minors for all majors as busy */
+       for (i = 0; i < SD_MAJORS; i++)
+       {
+               for (j = 0; j < (256 / SD_MINORS); j++) {
+                       err = sd_get_index(&index[i * (256 / SD_MINORS) + j]);
+                       if (err) {
+                               kfree(index);
+                               return err;
+                       }
+               }
+       }
+
+       /* Mark minors for claimed majors as free */
+       for (i = 0; i < SD_MAJORS; i++)
+       {
+               if (error[i])
+                       continue;
+               for (j = 0; j < (256 / SD_MINORS); j++)
+                       ida_remove(&sd_index_ida, index[i * (256 / SD_MINORS) + j]);
+       }
+       kfree(index);
+       return 0;
+}
+
  /**
   *     init_sd - entry point for this driver (both when built in or when
   *     a module).
@@ -2903,16 +2944,26 @@ done:
  static int __init init_sd(void)
  {
         int majors = 0, i, err;
+       int error[SD_MAJORS];
  
         SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
  
         for (i = 0; i < SD_MAJORS; i++)
-               if (register_blkdev(sd_major(i), "sd") == 0)
+       {
+               error[i] = register_blkdev(sd_major(i), "sd");
+               if (error[i] == 0)
                         majors++;
+       }
  
         if (!majors)
                 return -ENODEV;
  
+       if (majors < SD_MAJORS) {
+               err = init_sd_ida(error);
+               if (err)
+                       return err;
+       }
+
         err = class_register(&sd_disk_class);
         if (err)
                 goto err_out;
diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c

index 83a1972..3b0f98c 100644 (file)
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -785,12 +785,24 @@ static void storvsc_command_completion(struct storvsc_cmd_request *cmd_request)
         /*
          * If there is an error; offline the device since all
          * error recovery strategies would have already been
-        * deployed on the host side.
+        * deployed on the host side. However, if the command
+        * were a pass-through command deal with it appropriately.
          */
-       if (vm_srb->srb_status == SRB_STATUS_ERROR)
-               scmnd->result = DID_TARGET_FAILURE << 16;
-       else
+       switch (vm_srb->srb_status) {
+       case SRB_STATUS_ERROR:
+               switch (scmnd->cmnd[0]) {
+               case ATA_16:
+               case ATA_12:
+                       scmnd->result = DID_PASSTHROUGH << 16;
+                       break;
+               default:
+                       scmnd->result = DID_TARGET_FAILURE << 16;
+               }
+               break;
+       default:
                 scmnd->result = vm_srb->scsi_status;
+       }
+
  
         /*
          * If the LUN is invalid; remove the device.
@@ -1211,7 +1223,12 @@ static int storvsc_host_reset_handler(struct scsi_cmnd *scmnd)
         /*
          * At this point, all outstanding requests in the adapter
          * should have been flushed out and return to us
+        * There is a potential race here where the host may be in
+        * the process of responding when we return from here.
+        * Just wait for all in-transit packets to be accounted for
+        * before we return from here.
          */
+       storvsc_wait_to_drain(stor_device);
  
         return SUCCESS;
  }
diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c

index 1e824fb..5d34c8b 100644 (file)
--- a/drivers/sfi/sfi_core.c
+++ b/drivers/sfi/sfi_core.c
@@ -486,6 +486,11 @@ void __init sfi_init(void)
         if (!acpi_disabled)
                 disable_sfi();
  
+#ifdef CONFIG_XEN
+       if (!is_initial_xendomain())
+               disable_sfi();
+#endif
+
         if (sfi_disabled)
                 return;
  
diff --git a/drivers/tty/hvc/Kconfig b/drivers/tty/hvc/Kconfig

index 0282a83..46a1a7c 100644 (file)
--- a/drivers/tty/hvc/Kconfig
+++ b/drivers/tty/hvc/Kconfig
@@ -59,7 +59,7 @@ config HVC_IUCV
  
  config HVC_XEN
         bool "Xen Hypervisor Console support"
-       depends on XEN
+       depends on PARAVIRT_XEN
         select HVC_DRIVER
         select HVC_IRQ
         default y
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c

index 94b6eda..ce7ca7e 100644 (file)
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -50,6 +50,7 @@
  #include <linux/uaccess.h>
  #include <linux/module.h>
  
+#include <linux/bootsplash.h>
  
  /* number of characters left in xmit buffer before select has we have room */
  #define WAKEUP_CHARS 256
@@ -1795,6 +1796,15 @@ do_it_again:
                         tty->minimum_to_wake = (minimum - (b - buf));
  
                 if (!input_available_p(tty, 0)) {
+                       dev_t i_rdev = file->f_dentry->d_inode->i_rdev;
+
+                       if (i_rdev == MKDEV(TTY_MAJOR, 0) ||
+                           i_rdev == MKDEV(TTY_MAJOR, 1) ||
+                           i_rdev == MKDEV(TTYAUX_MAJOR, 0) ||
+                           i_rdev == MKDEV(TTYAUX_MAJOR, 1)) {
+                               SPLASH_VERBOSE();
+                       }
+
                         if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) {
                                 retval = -EIO;
                                 break;
diff --git a/drivers/tty/serial/8250/8250.c b/drivers/tty/serial/8250/8250.c

index 5c27f7e..34704a5 100644 (file)
--- a/drivers/tty/serial/8250/8250.c
+++ b/drivers/tty/serial/8250/8250.c
@@ -92,6 +92,8 @@ static unsigned int skip_txen_test; /* force skip of txen test at init time */
  #define CONFIG_SERIAL_MANY_PORTS 1
  #endif
  
+#define arch_8250_sysrq_via_ctrl_o(a,b) 0
+
  /*
   * HUB6 is always on.  This will be removed once the header
   * files have been cleaned.
@@ -1368,7 +1370,11 @@ serial8250_rx_chars(struct uart_8250_port *up, unsigned char lsr)
  
         do {
                 if (likely(lsr & UART_LSR_DR))
+               {
                         ch = serial_in(up, UART_RX);
+                       if (arch_8250_sysrq_via_ctrl_o(ch, &up->port))
+                               goto ignore_char;
+               }
                 else
                         /*
                          * Intel 82571 has a Serial Over Lan device that will
diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig

index 591f801..4c97267 100644 (file)
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -5,6 +5,7 @@
  
  config SERIAL_8250
         tristate "8250/16550 and compatible serial support"
+       depends on !XEN_DISABLE_SERIAL
         select SERIAL_CORE
         ---help---
           This selects whether you want to include the driver for the standard
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c

index d939bd7..fef4cd3 100644 (file)
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -137,6 +137,8 @@ EXPORT_SYMBOL(tty_mutex);
  /* Spinlock to protect the tty->tty_files list */
  DEFINE_SPINLOCK(tty_files_lock);
  
+bool __read_mostly console_use_vt = true;
+
  static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
  static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
  ssize_t redirected_tty_write(struct file *, const char __user *,
@@ -1841,6 +1843,10 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
  #ifdef CONFIG_VT
         case MKDEV(TTY_MAJOR, 0): {
                 extern struct tty_driver *console_driver;
+
+               if (!console_use_vt)
+                       return get_tty_driver(device, index)
+                              ?: ERR_PTR(-ENODEV);
                 driver = tty_driver_kref_get(console_driver);
                 *index = fg_console;
                 *noctty = 1;
@@ -3390,7 +3396,8 @@ int __init tty_init(void)
                 WARN_ON(device_create_file(consdev, &dev_attr_active) < 0);
  
  #ifdef CONFIG_VT
-       vty_init(&console_fops);
+       if (console_use_vt)
+               vty_init(&console_fops);
  #endif
         return 0;
  }
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c

index 3b0c4e3..d0fbb40 100644 (file)
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -45,6 +45,8 @@
  
  #include <asm/irq_regs.h>
  
+#include <linux/bootsplash.h>
+
  extern void ctrl_alt_del(void);
  
  /*
@@ -1292,6 +1294,13 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw)
                                 pr_warning("can't emulate rawmode for keycode %d\n",
                                            keycode);
  
+       /* This code has to be redone for some non-x86 platforms */
+       if (down == 1 && (keycode == 0x3c || keycode == 0x01)) {
+               /* F2 and ESC on PC keyboard */
+               if (splash_verbose())
+                       return;
+       }
+
  #ifdef CONFIG_SPARC
         if (keycode == KEY_A && sparc_l1_a_state) {
                 sparc_l1_a_state = false;
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c

index 2156188..ac5904e 100644 (file)
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -4208,6 +4208,35 @@ void vcs_scr_updated(struct vc_data *vc)
         notify_update(vc);
  }
  
+#ifdef CONFIG_BOOTSPLASH
+/*
+ * Switch the console's default colour to @new_color and recolour the
+ * screen buffer to match.
+ *
+ * Each 16-bit cell starting at screenpos(vc, 0, 1) is inspected: an
+ * attribute nibble (0xf000 or 0x0f00 -- presumably background and
+ * foreground) that still equals the old default is replaced with the
+ * corresponding nibble of the new default; nibbles that were set to
+ * something else are left alone.  Afterwards vc_def_color and vc_color
+ * are set to @new_color and the attribute is refreshed via update_attr().
+ */
+void con_remap_def_color(struct vc_data *vc, int new_color)
+{
+       unsigned short *sbuf = screenpos(vc, 0, 1);
+       unsigned int len = vc->vc_screenbuf_size >> 1;
+       unsigned int old_attr, new_attr, diff, bits, c;
+
+       if (sbuf) {
+               old_attr = vc->vc_def_color << 8;
+               new_attr = new_color << 8;
+               /* XOR mask that turns the old default into the new one */
+               diff = old_attr ^ new_attr;
+               while (len--) {
+                       c = scr_readw(sbuf);
+                       bits = 0;
+                       if (((c ^ old_attr) & 0xf000) == 0)
+                               bits |= diff & 0xf000;
+                       if (((c ^ old_attr) & 0x0f00) == 0)
+                               bits |= diff & 0x0f00;
+                       /* one write per cell, only for matching nibbles */
+                       if (bits)
+                               scr_writew(c ^ bits, sbuf);
+                       sbuf++;
+               }
+       }
+       vc->vc_def_color = vc->vc_color = new_color;
+       update_attr(vc);
+}
+#endif
+
  /*
   *     Visible symbols for modules
   */
diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig

index a290be5..4f0d98f 100644 (file)
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -754,7 +754,7 @@ config FB_UVESA
  
  config FB_VESA
         bool "VESA VGA graphics support"
-       depends on (FB = y) && X86
+       depends on (FB = y) && X86 && !XEN_UNPRIVILEGED_GUEST
         select FB_CFB_FILLRECT
         select FB_CFB_COPYAREA
         select FB_CFB_IMAGEBLIT
@@ -2264,7 +2264,7 @@ config FB_VIRTUAL
  
  config XEN_FBDEV_FRONTEND
         tristate "Xen virtual frame buffer support"
-       depends on FB && XEN
+       depends on FB && PARAVIRT_XEN
         select FB_SYS_FILLRECT
         select FB_SYS_COPYAREA
         select FB_SYS_IMAGEBLIT
@@ -2424,6 +2424,10 @@ if FB || SGI_NEWPORT_CONSOLE
         source "drivers/video/logo/Kconfig"
  endif
  
+if FB
+       source "drivers/video/bootsplash/Kconfig"
+endif
+
  config FB_SH_MOBILE_MERAM
         tristate "SuperH Mobile MERAM read ahead support"
         depends on (SUPERH || ARCH_SHMOBILE)
diff --git a/drivers/video/Makefile b/drivers/video/Makefile

index 9356add..38fd67b 100644 (file)
--- a/drivers/video/Makefile
+++ b/drivers/video/Makefile
@@ -14,6 +14,7 @@ fb-objs                           := $(fb-y)
  obj-$(CONFIG_VT)                 += console/
  obj-$(CONFIG_LOGO)               += logo/
  obj-y                            += backlight/
+obj-$(CONFIG_BOOTSPLASH)         += bootsplash/
  
  obj-$(CONFIG_EXYNOS_VIDEO)     += exynos/
  
diff --git a/drivers/video/aty/radeon_monitor.c b/drivers/video/aty/radeon_monitor.c

index 9261c91..5c23eac 100644 (file)
--- a/drivers/video/aty/radeon_monitor.c
+++ b/drivers/video/aty/radeon_monitor.c
@@ -730,6 +730,25 @@ static void radeon_videomode_to_var(struct fb_var_screeninfo *var,
         var->vmode = mode->vmode;
  }
  
+#ifdef CONFIG_PPC_PSERIES
+/*
+ * Check the "model" property of the device-tree root node against @model.
+ *
+ * Only the first min(property length, strlen(@model)) bytes are compared,
+ * so this is a prefix match rather than an exact comparison.
+ *
+ * Returns 1 on a match, 0 otherwise (including when the root node or the
+ * property cannot be found, or @model is NULL).
+ */
+static int is_powerblade(const char *model)
+{
+       struct device_node *root;
+       const char *cp;
+       int len, l, rc = 0;
+
+       root = of_find_node_by_path("/");
+       if (!root)
+               return 0;
+
+       if (model) {
+               l = strlen(model);
+               cp = of_get_property(root, "model", &len);
+               if (cp)
+                       rc = memcmp(model, cp, min(len, l)) == 0;
+       }
+       /* drop the node reference on every path that obtained it */
+       of_node_put(root);
+       return rc;
+}
+#endif
+
  /*
   * Build the modedb for head 1 (head 2 will come later), check panel infos
   * from either BIOS or EDID, and pick up the default mode
@@ -865,6 +884,22 @@ void __devinit radeon_check_modes(struct radeonfb_info *rinfo, const char *mode_
                         has_default_mode = 1;
         }
  
+#ifdef CONFIG_PPC_PSERIES
+       if (!has_default_mode && (
+               is_powerblade("IBM,8842") || /* JS20 */
+               is_powerblade("IBM,8844") || /* JS21 */
+               is_powerblade("IBM,7998") || /* JS12/JS21/JS22 */
+               is_powerblade("IBM,0792") || /* QS21 */
+               is_powerblade("IBM,0793")    /* QS22 */
+           )) {
+               printk("Falling back to 800x600 on JSxx hardware\n");
+               if (fb_find_mode(&info->var, info, "800x600@60",
+                                info->monspecs.modedb,
+                                info->monspecs.modedb_len, NULL, 8) != 0)
+                       has_default_mode = 1;
+       }
+#endif
+
         /*
          * Still no mode, let's pick up a default from the db
          */
diff --git a/drivers/video/bootsplash/Kconfig b/drivers/video/bootsplash/Kconfig

new file mode 100644 (file)

index 0000000..17c4c04
--- /dev/null
+++ b/drivers/video/bootsplash/Kconfig
@@ -0,0 +1,17 @@
+#
+# Bootsplash configuration
+#
+
+menu "Bootsplash configuration"
+
+config BOOTSPLASH
+       bool "Bootup splash screen"
+       depends on FRAMEBUFFER_CONSOLE && FB_VESA
+       default n
+        ---help---
+          This option enables the Linux bootsplash screen. For more
+          information on the bootsplash screen have a look at
+          http://www.bootsplash.org/.
+          If you are unsure, say N
+endmenu
+
diff --git a/drivers/video/bootsplash/Makefile b/drivers/video/bootsplash/Makefile

new file mode 100644 (file)

index 0000000..8ba1c08
--- /dev/null
+++ b/drivers/video/bootsplash/Makefile
@@ -0,0 +1,5 @@
+# Makefile for the Linux bootsplash
+
+obj-$(CONFIG_BOOTSPLASH)               += bootsplash.o
+obj-$(CONFIG_BOOTSPLASH)               += decode-jpg.o
+obj-$(CONFIG_BOOTSPLASH)               += render.o
diff --git a/drivers/video/bootsplash/bootsplash.c b/drivers/video/bootsplash/bootsplash.c

new file mode 100644 (file)

index 0000000..d96edce
--- /dev/null
+++ b/drivers/video/bootsplash/bootsplash.c
@@ -0,0 +1,2492 @@
+/*
+ *           linux/drivers/video/bootsplash/bootsplash.c -
+ *                 splash screen handling functions.
+ *
+ *     (w) 2001-2004 by Volker Poplawski, <volker@poplawski.de>,
+ *                     Stefan Reinauer, <stepan@suse.de>,
+ *                     Steffen Winterfeldt, <snwint@suse.de>,
+ *                     Michael Schroeder <mls@suse.de>
+ *         2009-2011 Egbert Eich <eich@suse.de>
+ *
+ *        Ideas & SuSE screen work by Ken Wimer, <wimer@suse.de>
+ *
+ *  For more information on this code check http://www.bootsplash.org/
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fb.h>
+#include <linux/vt_kern.h>
+#include <linux/vmalloc.h>
+#include <linux/unistd.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+
+#include <asm/irq.h>
+
+#include "../console/fbcon.h"
+#include <linux/bootsplash.h>
+#include "decode-jpg.h"
+
+#ifndef DEBUG
+# define SPLASH_DEBUG(fmt, args...)
+#else
+# define SPLASH_DEBUG(fmt, args...) \
+       printk(KERN_WARNING "%s: " fmt "\n", __func__, ##args)
+#endif
+extern signed char con2fb_map[MAX_NR_CONSOLES];
+
+#define SPLASH_VERSION "3.2.0-2010/03/31"
+
+/* These errors have to match fbcon-jpegdec.h */
+static unsigned char *jpg_errors[] = {
+       "no SOI found",
+       "not 8 bit",
+       "height mismatch",
+       "width mismatch",
+       "bad width or height",
+       "too many COMPPs",
+       "illegal HV",
+       "quant table selector",
+       "picture is not YCBCR 221111",
+       "unknow CID in scan",
+       "dct not sequential",
+       "wrong marker",
+       "no EOI",
+       "bad tables",
+       "depth mismatch",
+       "scale error",
+       "out of memory"
+};
+
+static int splash_usesilent;
+static unsigned long splash_default = 0xf01;
+
+static int jpeg_get(unsigned char *buf, unsigned char *pic,
+                   int width, int height, enum splash_color_format cf,
+                   struct jpeg_decdata *decdata);
+static int splash_look_for_jpeg(struct vc_data *vc, int width, int height);
+
+/*
+ * Handler for the "splash=" boot parameter.
+ *
+ * An optional leading "silent" or "verbose" keyword selects the value of
+ * splash_usesilent.  Whatever follows the keyword and one separator
+ * character is parsed as a number into splash_default; if the parser
+ * reports -EINVAL, splash_default is set to 0.  Always returns 0.
+ */
+static int __init splash_setup(char *options)
+{
+       char *arg = options;
+
+       splash_usesilent = 0;
+
+       if (strncmp("silent", arg, 6) == 0) {
+               printk(KERN_INFO "bootsplash: silent mode.\n");
+               splash_usesilent = 1;
+               /* bare keyword: nothing else to parse */
+               if (arg[6] == '\0')
+                       return 0;
+               /* step over "silent" plus the separator after it */
+               arg += 7;
+       }
+       if (strncmp("verbose", arg, 7) == 0) {
+               printk(KERN_INFO "bootsplash: verbose mode.\n");
+               splash_usesilent = 0;
+               if (arg[7] == '\0')
+                       return 0;
+               /* step over "verbose" plus the separator after it */
+               arg += 8;
+       }
+       if (strict_strtoul(arg, 0, &splash_default) == -EINVAL)
+               splash_default = 0;
+
+       return 0;
+}
+
+__setup("splash=", splash_setup);
+
+
+/*
+ * Scan the box data in @buf (bounded by buf + num * 12) and report
+ * whether any record is an "inter" record, i.e. has the top bit of
+ * byte 1 set.  Records with the top bit of byte 3 set ("blend") occupy
+ * 24 bytes, all others 12.
+ *
+ * Returns 1 if an inter record is found, 0 otherwise.
+ */
+static int splash_hasinter(unsigned char *buf, int num)
+{
+       unsigned char *rec = buf;
+       unsigned char *limit = buf + num * 12;
+
+       while (rec < limit) {
+               if (rec[1] & 0x80)                      /* inter? */
+                       return 1;
+               if (rec[3] & 0x80)                      /* blend? */
+                       rec += 24;
+               else
+                       rec += 12;
+       }
+       return 0;
+}
+
+/*
+ * Decode one box record from @buf.
+ *
+ * @dp receives four little-endian 16-bit values taken from bytes 0..7.
+ * @cols receives four 4-byte colour entries: for a plain record all four
+ * are copies of the word at buf + 8.  When dp[1] has its top bit set the
+ * record is a blend record: dp[1] is complemented, colour entries 1..3
+ * come from buf + 12, + 16 and + 20, and *@blendp is set to 1.
+ *
+ * Returns the number of bytes consumed: 24 for a blend record, else 12.
+ * *@blendp is left untouched for plain records.
+ */
+static int boxextract(unsigned char *buf, unsigned short *dp,
+                     unsigned char *cols, int *blendp)
+{
+       unsigned int *colw = (unsigned int *)cols;
+       unsigned int *srcw = (unsigned int *)(buf + 8);
+       int k;
+
+       for (k = 0; k < 4; k++)
+               dp[k] = buf[2 * k] | buf[2 * k + 1] << 8;
+
+       colw[0] = colw[1] = colw[2] = colw[3] = srcw[0];
+
+       if (dp[1] <= 32767)
+               return 12;
+
+       dp[1] = ~dp[1];
+       colw[1] = srcw[1];
+       colw[2] = srcw[2];
+       colw[3] = srcw[3];
+       *blendp = 1;
+       return 24;
+}
+
+/*
+ * Paint the box records found in @buf onto the picture @pic.
+ *
+ * @pic:         destination pixel buffer
+ * @bytes:       byte stride between successive rows of @pic
+ * @buf, @num:   box data; records are decoded with boxextract() until
+ *               buf + num * 12 is reached
+ * @percent:     weight (65536 == 100%) used to interpolate between the two
+ *               records of an "inter" pair; a negative value skips such
+ *               pairs, a value below -1 makes the function return at once,
+ *               and 65536 skips boxes whose y-end has the top bit set
+ * @xoff, @yoff: offset added to every pixel position
+ * @overpaint:   when nonzero, boxes whose x-end has the top bit set are
+ *               skipped
+ * @cf:          colour format of @pic; the pixel size comes from
+ *               splash_octpp()
+ *
+ * Records whose first two coordinates are 32767/32767 load up to 32 words
+ * of stipple pattern (plus an optional stipple window); 32767/32766 is a
+ * "box copy" that re-reads a record at a relative offset.  Pixels are
+ * alpha-blended against the existing picture when alpha != 255, and a
+ * 2x2 ordered dither is applied for the 15 and 16 bit formats.
+ */
+static void boxit(unsigned char *pic, int bytes, unsigned char *buf, int num,
+                 int percent, int xoff, int yoff, int overpaint,
+                 enum splash_color_format cf)
+{
+       int x, y, p, doblend, r, g, b, a, add;
+       unsigned int i = 0;
+       unsigned short data1[4];
+       unsigned char cols1[16];
+       unsigned short data2[4];
+       unsigned char cols2[16];
+       unsigned char *bufend;
+       union pt picp;
+       unsigned int stipple[32], sti, stin, stinn, stixs, stixe, stiys, stiye;
+       int xs, xe, ys, ye, xo, yo;
+       int octpp = splash_octpp(cf);
+
+       SPLASH_DEBUG();
+       if (num == 0 || percent < -1)
+               return;
+       bufend = buf + num * 12;
+       stipple[0] = 0xffffffff;
+       stin = 1;
+       stinn = 0;
+       stixs = stixe = 0;
+       stiys = stiye = 0;
+       while (buf < bufend) {
+               doblend = 0;
+               buf += boxextract(buf, data1, cols1, &doblend);
+               if (data1[0] == 32767 && data1[1] == 32767) {
+                       /* box stipple */
+                       if (stinn == 32)
+                               continue;
+                       if (stinn == 0) {
+                               stixs = data1[2];
+                               stixe = data1[3];
+                               stiys = stiye = 0;
+                       } else if (stinn == 4) {
+                               stiys = data1[2];
+                               stiye = data1[3];
+                       }
+                       stipple[stinn++] = (cols1[0]  << 24) |
+                               (cols1[1]  << 16) |
+                               (cols1[2]  << 8)  |
+                               cols1[3] ;
+                       stipple[stinn++] = (cols1[4]  << 24) |
+                               (cols1[5]  << 16) |
+                               (cols1[6]  << 8)  |
+                               cols1[7] ;
+                       stipple[stinn++] = (cols1[8]  << 24) |
+                               (cols1[9]  << 16) |
+                               (cols1[10] << 8)  |
+                               cols1[11] ;
+                       stipple[stinn++] = (cols1[12] << 24) |
+                               (cols1[13] << 16) |
+                               (cols1[14] << 8)  |
+                               cols1[15] ;
+                       stin = stinn;
+                       continue;
+               }
+               stinn = 0;
+               if (data1[0] > 32767)
+                       buf += boxextract(buf, data2, cols2, &doblend);
+               if (data1[0] == 32767 && data1[1] == 32766) {
+                       /* box copy */
+                       i = 12 * (short)data1[3];
+                       doblend = 0;
+                       i += boxextract(buf + i, data1, cols1, &doblend);
+                       if (data1[0] > 32767)
+                               boxextract(buf + i, data2, cols2, &doblend);
+               }
+               if (data1[0] == 32767)
+                       continue;
+               if (data1[2] > 32767) {
+                       if (overpaint)
+                               continue;
+                       data1[2] = ~data1[2];
+               }
+               if (data1[3] > 32767) {
+                       if (percent == 65536)
+                               continue;
+                       data1[3] = ~data1[3];
+               }
+               if (data1[0] > 32767) {
+                       if (percent < 0)
+                               continue;
+                       data1[0] = ~data1[0];
+                       for (i = 0; i < 4; i++)
+                               data1[i] = (data1[i] * (65536 - percent)
+                                           + data2[i] * percent) >> 16;
+                       for (i = 0; i < 16; i++)
+                               cols1[i] = (cols1[i] * (65536 - percent)
+                                           + cols2[i] * percent) >> 16;
+               }
+               *(unsigned int *)cols2 = *(unsigned int *)cols1;
+               a = cols2[3];
+               if (a == 0 && !doblend)
+                       continue;
+
+               if (stixs >= 32768) {
+                       xo = xs = (stixs ^ 65535) + data1[0];
+                       xe = stixe ? stixe + data1[0] : data1[2];
+               } else if (stixe >= 32768) {
+                       xs = stixs ? data1[2] - stixs : data1[0];
+                       xe = data1[2] - (stixe ^ 65535);
+                       xo = xe + 1;
+               } else {
+                       xo = xs = stixs;
+                       xe = stixe ? stixe : data1[2];
+               }
+               if (stiys >= 32768) {
+                       yo = ys = (stiys ^ 65535) + data1[1];
+                       ye = stiye ? stiye + data1[1] : data1[3];
+               } else if (stiye >= 32768) {
+                       ys = stiys ? data1[3] - stiys : data1[1];
+                       ye = data1[3] - (stiye ^ 65535);
+                       yo = ye + 1;
+               } else {
+                       yo = ys = stiys;
+                       ye = stiye ? stiye : data1[3];
+               }
+               xo = 32 - (xo & 31);
+               yo = stin - (yo % stin);
+               if (xs < data1[0])
+                       xs = data1[0];
+               if (xe > data1[2])
+                       xe = data1[2];
+               if (ys < data1[1])
+                       ys = data1[1];
+               if (ye > data1[3])
+                       ye = data1[3];
+
+               for (y = ys; y <= ye; y++) {
+                       sti = stipple[(y + yo) % stin];
+                       x = (xs + xo) & 31;
+                       if (x)
+                               sti = (sti << x) | (sti >> (32 - x));
+                       if (doblend) {
+                               p = data1[3] - data1[1];
+                               if (p != 0)
+                                       p = ((y - data1[1]) << 16) / p;
+                               for (i = 0; i < 8; i++)
+                                       cols2[i + 8] = (cols1[i] * (65536 - p)
+                                                       + cols1[i + 8] * p)
+                                               >> 16;
+                       }
+                       add = (xs & 1);
+                       add ^= (add ^ y) & 1 ? 1 : 3; /*2x2 ordered dithering*/
+                       picp.ub = (pic + (xs + xoff) * octpp
+                                  + (y + yoff) * bytes);
+                       for (x = xs; x <= xe; x++) {
+                               if (!(sti & 0x80000000)) {
+                                       sti <<= 1;
+                                       switch (octpp) {
+                                       case 2:
+                                               picp.us++;
+                                               break;
+                                       case 3:
+                                               picp.ub += 3;
+                                               break;
+                                       case 4:
+                                               picp.ul++;
+                                               break;
+                                       }
+                                       add ^= 3;
+                                       continue;
+                               }
+                               sti = (sti << 1) | 1;
+                               if (doblend) {
+                                       p = data1[2] - data1[0];
+                                       if (p != 0)
+                                               p = ((x - data1[0]) << 16) / p;
+                                       for (i = 0; i < 4; i++)
+                                               cols2[i] = (cols2[i + 8] * (65536 - p)
+                                                           + cols2[i + 12] * p)
+                                                       >> 16;
+                                       a = cols2[3];
+                               }
+                               r = cols2[0];
+                               g = cols2[1];
+                               b = cols2[2];
+#define CLAMP(x) ((x) >= 256 ? 255 : (x))
+#define BLEND(x, v, a) ((x * (255 - a) + v * a) / 255)
+                               switch (cf) {
+                               case SPLASH_DEPTH_15:
+                                       if (a != 255) {
+                                               i = *picp.us;
+                                               r = BLEND((i>>7 & 0xf8), r, a);
+                                               g = BLEND((i>>2 & 0xf8), g, a);
+                                               b = BLEND((i<<3 & 0xf8), b, a);
+                                       }
+                                       r += add * 2 + 1;
+                                       g += add;
+                                       b += add * 2 + 1;
+                                       i =     ((CLAMP(r) & 0xf8) <<  7) |
+                                               ((CLAMP(g) & 0xf8) <<  2) |
+                                               ((CLAMP(b))        >>  3);
+                                       *(picp.us++) = i;
+                                       break;
+                               case SPLASH_DEPTH_16:
+                                       if (a != 255) {
+                                               i = *picp.us;
+                                               r = BLEND((i>>8 & 0xf8), r, a);
+                                               g = BLEND((i>>3 & 0xfc), g, a);
+                                               b = BLEND((i<<3 & 0xf8), b, a);
+                                       }
+                                       r += add * 2 + 1;
+                                       g += add;
+                                       b += add * 2 + 1;
+                                               i = ((CLAMP(r) & 0xf8) <<  8) |
+                                                   ((CLAMP(g) & 0xfc) <<  3) |
+                                                   ((CLAMP(b))        >>  3);
+                                       *(picp.us++) = i;
+                                       break;
+                               case SPLASH_DEPTH_24_PACKED:
+                                       if (a != 255) {
+                                               i = *picp.ub;
+                                               r = BLEND((i & 0xff), r, a);
+                                               i = *(picp.ub + 1);
+                                               g = BLEND((i & 0xff), g, a);
+                                               i = *(picp.ub + 2);
+                                               b = BLEND((i & 0xff), b, a);
+                                       }
+                                       *(picp.ub++) = CLAMP(r);
+                                       *(picp.ub++) = CLAMP(g);
+                                       *(picp.ub++) = CLAMP(b);
+                                       break;
+                               case SPLASH_DEPTH_24:
+                                       if (a != 255) {
+                                               i = *picp.ul;
+                                               r = BLEND((i>>16 & 0xff), r, a);
+                                               g = BLEND((i>>8  & 0xff), g, a);
+                                               b = BLEND((i     & 0xff), b, a);
+                                       }
+                                       i = ((CLAMP(r) << 16)
+                                            | (CLAMP(g) << 8)
+                                            | (CLAMP(b)));
+                                       *(picp.ul++) = i;
+                                       break;
+                               default:
+                                       break;
+                               }
+                               add ^= 3;
+                       }
+               }
+       }
+}
+
+/*
+ * Work out where a picture of size @pic_w x @pic_h should be placed on a
+ * screen of size @screen_w x @screen_h.
+ *
+ * The box records in @buf (@num * 12 bytes) are walked with the same
+ * record interpretation as boxit() to find the bounding box of the area
+ * the boxes cover.  Per axis:
+ *  - equal picture and screen size gives offset 0;
+ *  - a bounding box centred within +-10% of the picture centre gives a
+ *    centred picture;
+ *  - otherwise the box centre is scaled to the screen and the offset is
+ *    clamped so the boxes stay on screen.
+ *
+ * Results are stored in *@x_off and *@y_off.
+ */
+static void box_offsets(unsigned char *buf, int num,
+                       int screen_w, int screen_h, int pic_w, int pic_h,
+                       int *x_off, int *y_off)
+{
+       int a, doblend;
+       int x_min = pic_w, x_max = 0;
+       int y_min = pic_h, y_max = 0;
+       unsigned int i = 0;
+       unsigned short data1[4];
+       unsigned char cols1[16];
+       unsigned short data2[4];
+       unsigned char cols2[16];
+       unsigned char *bufend;
+       unsigned int stinn, stixs, stixe, stiys, stiye;
+       int xs, xe, ys, ye;
+
+       SPLASH_DEBUG();
+
+       /*
+        * NOTE(review): these values are always overwritten by the final
+        * block below; an early return may have been intended -- confirm.
+        */
+       if ((screen_w == pic_w && screen_h == pic_h) || num == 0)
+               *x_off = *y_off = 0;
+
+       bufend = buf + num * 12;
+       stinn = 0;
+       stixs = stixe = 0;
+       stiys = stiye = 0;
+
+       while (buf < bufend) {
+               doblend = 0;
+               buf += boxextract(buf, data1, cols1, &doblend);
+               if (data1[0] == 32767 && data1[1] == 32767) {
+                       /* box stipple */
+                       if (stinn == 32)
+                               continue;
+                       if (stinn == 0) {
+                               stixs = data1[2];
+                               stixe = data1[3];
+                               stiys = stiye = 0;
+                       } else if (stinn == 4) {
+                               stiys = data1[2];
+                               stiye = data1[3];
+                       }
+                       /*
+                        * Each stipple record carries four pattern words;
+                        * count them as boxit() does so that the second
+                        * record is recognised as the y window.
+                        */
+                       stinn += 4;
+                       continue;
+               }
+               stinn = 0;
+               if (data1[0] > 32767)
+                       buf += boxextract(buf, data2, cols2, &doblend);
+               if (data1[0] == 32767 && data1[1] == 32766) {
+                       /* box copy */
+                       i = 12 * (short)data1[3];
+                       doblend = 0;
+                       i += boxextract(buf + i, data1, cols1, &doblend);
+                       if (data1[0] > 32767)
+                               boxextract(buf + i, data2, cols2, &doblend);
+               }
+               if (data1[0] == 32767)
+                       continue;
+               if (data1[2] > 32767)
+                       data1[2] = ~data1[2];
+               if (data1[3] > 32767)
+                       data1[3] = ~data1[3];
+               if (data1[0] > 32767) {
+                       data1[0] = ~data1[0];
+                       for (i = 0; i < 4; i++)
+                               data1[i] = (data1[i] * (65536 - 1)
+                                           + data2[i] * 1) >> 16;
+               }
+               *(unsigned int *)cols2 = *(unsigned int *)cols1;
+               a = cols2[3];
+               if (a == 0 && !doblend)
+                       continue;
+
+               if (stixs >= 32768) {
+                       xs = (stixs ^ 65535) + data1[0];
+                       xe = stixe ? stixe + data1[0] : data1[2];
+               } else if (stixe >= 32768) {
+                       xs = stixs ? data1[2] - stixs : data1[0];
+                       xe = data1[2] - (stixe ^ 65535);
+               } else {
+                       xs = stixs;
+                       xe = stixe ? stixe : data1[2];
+               }
+               if (stiys >= 32768) {
+                       ys = (stiys ^ 65535) + data1[1];
+                       ye = stiye ? stiye + data1[1] : data1[3];
+               } else if (stiye >= 32768) {
+                       ys = stiys ? data1[3] - stiys : data1[1];
+                       ye = data1[3] - (stiye ^ 65535);
+               } else {
+                       ys = stiys;
+                       ye = stiye ? stiye : data1[3];
+               }
+               if (xs < data1[0])
+                       xs = data1[0];
+               if (xe > data1[2])
+                       xe = data1[2];
+               if (ys < data1[1])
+                       ys = data1[1];
+               if (ye > data1[3])
+                       ye = data1[3];
+
+               if (xs < x_min)
+                       x_min = xs;
+               if (xe > x_max)
+                       x_max = xe;
+               if (ys < y_min)
+                       y_min = ys;
+               if (ye > y_max)
+                       y_max = ye;
+       }
+       {
+               int x_center = (x_min + x_max) / 2;
+               int y_center = (y_min + y_max) / 2;
+
+               if (screen_w == pic_w)
+                       *x_off = 0;
+               else {
+                       if (x_center < (pic_w + pic_w / 5) >> 1 &&
+                           x_center > (pic_w - pic_w / 5) >> 1) {
+                               *x_off = (screen_w - pic_w) >> 1;
+                       } else {
+                               int x = x_center * screen_w / pic_w;
+                               *x_off = x - x_center;
+                               if (x_min + *x_off < 0)
+                                       *x_off = 0;
+                               if (x_max + *x_off > screen_w)
+                                       *x_off = screen_w - pic_w;
+                       }
+               }
+               if (screen_h == pic_h)
+                       *y_off = 0;
+               else {
+                       if (y_center < (pic_h + pic_h / 5) >> 1 &&
+                           y_center > (pic_h - pic_h / 5) >> 1)
+                               *y_off = (screen_h - pic_h) >> 1;
+                       else {
+                               int x = y_center * screen_h / pic_h;
+                               *y_off = x - y_center;
+                               if (y_min + *y_off < 0)
+                                       *y_off = 0;
+                               /* clamp against the y offset, not the x one */
+                               if (y_max + *y_off > screen_h)
+                                       *y_off = screen_h - pic_h;
+                       }
+               }
+       }
+}
+
+/*
+ * Trial-decode a JPEG to find out whether the decoder accepts it.
+ *
+ * @jpeg:   start of the JPEG stream
+ * @width:  picture width in pixels
+ * @height: picture height in pixels
+ *
+ * Both dimensions are rounded up to the next multiple of 16 and the
+ * picture is decoded at SPLASH_DEPTH_16 into a scratch buffer of two
+ * bytes per (padded) pixel.  The scratch buffer and the decoder state
+ * are released again before returning.
+ *
+ * Returns 0 if jpeg_decode() reported no error, -1 if an allocation
+ * failed or the decoder returned an error code.
+ */
+static int splash_check_jpeg(unsigned char *jpeg,
+                            int width, int height)
+{
+       const int padded_w = (width + 15) & ~15;
+       const int padded_h = (height + 15) & ~15;
+       struct jpeg_decdata *decoder; /* private decoder data */
+       unsigned char *scratch;
+       int rc;
+
+       scratch = vmalloc(padded_w * padded_h * 2);
+       if (!scratch) {
+               printk(KERN_INFO "bootsplash: no memory for decoded picture.\n");
+               return -1;
+       }
+
+       decoder = vmalloc(sizeof(*decoder));
+       if (!decoder) {
+               printk(KERN_INFO "bootsplash: not enough memory.\n");
+               vfree(scratch);
+               return -1;
+       }
+
+       /* test decode: use fixed depth of 16 */
+       rc = jpeg_decode(jpeg, scratch, padded_w, padded_h,
+                        SPLASH_DEPTH_16, decoder);
+       if (rc != 0)
+               printk(KERN_INFO "bootsplash: "
+                      "error while decompressing picture: %s (%d)\n",
+                      jpg_errors[rc - 1], rc);
+
+       vfree(decoder);
+       vfree(scratch);
+
+       return rc != 0 ? -1 : 0;
+}
+
+/*
+ * Release every splash_data entry attached to @vc.
+ *
+ * @vc:   console whose vc_splash_data list is torn down
+ * @info: framebuffer whose splash_data reference is cleared; may be NULL
+ *
+ * The picture slot (pic) and the image data (imgd) carry a ref_cnt;
+ * each is only released once its count drops to zero.
+ *
+ * Allocator pairing: splash_data and splash_pic_data objects are
+ * obtained with kzalloc() and therefore released with kfree(); the
+ * decoded picture, the silent-jpeg/sboxes buffer and the image data are
+ * vmalloc() memory and released with vfree().
+ */
+static void splash_free(struct vc_data *vc, struct fb_info *info)
+{
+       struct splash_data *sd;
+       struct splash_data *next;
+
+       SPLASH_DEBUG();
+       for (sd = vc->vc_splash_data; sd; sd = next) {
+               /* remember the successor before sd itself is freed */
+               next = sd->next;
+
+               if (--sd->pic->ref_cnt == 0) {
+                       vfree(sd->pic->splash_pic);
+                       kfree(sd->pic);
+               }
+               if (--sd->imgd->ref_cnt == 0) {
+                       vfree(sd->imgd->splash_sboxes);
+                       vfree(sd->imgd);
+               }
+               kfree(sd);
+       }
+       vc->vc_splash_data = NULL;
+       if (info)
+               info->splash_data = 0;
+}
+
+/*
+ * Synthesise the two 12-byte box records used for pre-v3 headers.
+ *
+ * @data:     splash entry; data->imgd must already point at the image
+ *            data block, which has room for two box records directly
+ *            behind the structure
+ * @pxo/@pyo: top-left corner of the box
+ * @pwi/@phe: width and height of the box
+ * @pr/@pg/@pb: colour components stored in the record
+ *
+ * Each record is: x0, y0, x1, y1 as little-endian 16-bit values, then
+ * three colour bytes and one trailing byte.  The second record is a
+ * copy of the first; afterwards the low x0 word of the first record is
+ * inverted and the trailing byte of the second record is set to 0xff.
+ *
+ * The records are written behind *data->imgd, which is where the
+ * caller makes splash_boxes point.  Writing behind *data itself would
+ * run past the end of that allocation.
+ *
+ * Returns the number of records written: 2, or 0 if the box is empty
+ * or no image data is attached.
+ */
+static int splash_mkpenguin(struct splash_data *data,
+                           int pxo, int pyo, int pwi, int phe,
+                           int pr, int pg, int pb)
+{
+       unsigned char *box;
+       int x_end, y_end;
+
+       if (pwi == 0 || phe == 0)
+               return 0;
+       if (!data->imgd)
+               return 0;
+
+       box = (unsigned char *)data->imgd + sizeof(*data->imgd);
+
+       /* convert width/height into inclusive end coordinates */
+       x_end = pwi + pxo - 1;
+       y_end = phe + pyo - 1;
+
+       box[0] = pxo;
+       box[1] = pxo >> 8;
+       box[2] = pyo;
+       box[3] = pyo >> 8;
+       box[4] = x_end;
+       box[5] = x_end >> 8;
+       box[6] = y_end;
+       box[7] = y_end >> 8;
+       box[8] = pr;
+       box[9] = pg;
+       box[10] = pb;
+       box[11] = 0;
+
+       /* second record starts out as a copy of the first */
+       memcpy(box + 12, box, 12);
+
+       box[0] ^= 0xff;
+       box[1] ^= 0xff;
+       box[23] = 0xff;
+
+       return 2;
+}
+
+/*
+ * Byte offsets of the header fields for each header version
+ * (row 0 = V1, row 1 = V2, row 2 = V3).  Column 0 is the header length;
+ * every other column is the offset of the named field, or -1 when that
+ * version does not carry the field.
+ */
+static const int splash_offsets[3][16] = {
+       /* V1 */
+       {
+               20, -1, 16, -1,         /* len, unit, size, state */
+               -1, -1,  8, 10,         /* fgcol, col, xo, yo */
+               12, 14, -1, -1,         /* wi, he, boxcnt, ssize */
+               -1, -1, -1, -1,         /* sboxcnt, percent, overok, palcnt */
+       },
+       /* V2 */
+       {
+               35,  8, 12,  9,         /* len, unit, size, state */
+               10, 11, 16, 18,         /* fgcol, col, xo, yo */
+               20, 22, -1, -1,         /* wi, he, boxcnt, ssize */
+               -1, -1, -1, -1,         /* sboxcnt, percent, overok, palcnt */
+       },
+       /* V3 */
+       {
+               38,  8, 12,  9,         /* len, unit, size, state */
+               10, 11, 16, 18,         /* fgcol, col, xo, yo */
+               20, 22, 24, 28,         /* wi, he, boxcnt, ssize */
+               32, 34, 36, 37,         /* sboxcnt, percent, overok, palcnt */
+       },
+};
+
+/*
+ * Field selectors for one row of splash_offsets[].  Each macro expands
+ * to an element of a variable named "offsets", so the function using
+ * them must have such a variable in scope.
+ */
+#define SPLASH_OFF_LEN     offsets[0]
+#define SPLASH_OFF_UNIT    offsets[1]
+#define SPLASH_OFF_SIZE    offsets[2]
+#define SPLASH_OFF_STATE   offsets[3]
+#define SPLASH_OFF_FGCOL   offsets[4]
+#define SPLASH_OFF_COL     offsets[5]
+#define SPLASH_OFF_XO      offsets[6]
+#define SPLASH_OFF_YO      offsets[7]
+#define SPLASH_OFF_WI      offsets[8]
+#define SPLASH_OFF_HE      offsets[9]
+#define SPLASH_OFF_BOXCNT  offsets[10]
+#define SPLASH_OFF_SSIZE   offsets[11]
+#define SPLASH_OFF_SBOXCNT offsets[12]
+#define SPLASH_OFF_PERCENT offsets[13]
+#define SPLASH_OFF_OVEROK  offsets[14]
+#define SPLASH_OFF_PALCNT  offsets[15]
+
+/*
+ * Read one byte at @pos + @off.  An offset of -1 means "field not
+ * present" and yields 0.
+ */
+static inline int splash_getb(unsigned char *pos, int off)
+{
+       if (off == -1)
+               return 0;
+
+       return pos[off];
+}
+
+/*
+ * Read a little-endian 16-bit value at @pos + @off.  An offset of -1
+ * means "field not present" and yields 0.
+ */
+static inline int splash_gets(unsigned char *pos, int off)
+{
+       int lo, hi;
+
+       if (off == -1)
+               return 0;
+
+       lo = pos[off];
+       hi = pos[off + 1];
+       return lo | (hi << 8);
+}
+
+/*
+ * Read a little-endian 32-bit value at @pos + @off and return it as an
+ * int (so 0xffffffff comes back as -1).  An offset of -1 means "field
+ * not present" and yields 0.
+ *
+ * The value is assembled in unsigned arithmetic: shifting a promoted
+ * byte >= 0x80 left by 24 as a signed int would shift into the sign
+ * bit, which is undefined behaviour.
+ */
+static inline int splash_geti(unsigned char *pos, int off)
+{
+       unsigned int value;
+
+       if (off == -1)
+               return 0;
+
+       value = (unsigned int)pos[off] |
+               (unsigned int)pos[off + 1] << 8 |
+               (unsigned int)pos[off + 2] << 16 |
+               (unsigned int)pos[off + 3] << 24;
+
+       return (int)value;
+}
+
+/*
+ * Make sure @sd has a picture slot that is not shared (ref_cnt == 1)
+ * and that has no decoded picture attached.  A slot shared with other
+ * users (ref_cnt > 1) is replaced by a freshly allocated one.
+ *
+ * NOTE(review): a slot that is already exclusive has its splash_pic
+ * pointer overwritten without a vfree() here, and a shared slot that
+ * gets replaced does not have its ref_cnt decremented - confirm that
+ * this is accounted for elsewhere.
+ *
+ * Returns 0 on success, -1 if the allocation failed (@sd unchanged).
+ */
+static int splash_pivot_reset_pic(struct splash_data *sd)
+{
+       if (sd->pic->ref_cnt > 1) {
+               struct splash_pic_data *fresh;
+
+               fresh = kzalloc(sizeof(struct splash_pic_data), GFP_KERNEL);
+               if (!fresh)
+                       return -1;
+               sd->pic = fresh;
+       }
+       sd->pic->ref_cnt = 1;
+       sd->pic->splash_pic_size = 0;
+       sd->pic->splash_pic = NULL;
+
+       return 0;
+}
+
+/*
+ * Move @new to the head of @vc's splash list, making it the current
+ * splash.  Nothing happens if the list is empty or @new already is the
+ * head.
+ *
+ * The old head gets its picture slot reset and its text area restored
+ * from the image data.  If @new is found further down the list it is
+ * unlinked, pushed to the front, inherits state, percent and the silent
+ * flag of the old head, has its box offsets cleared and its picture
+ * slot reset as well.
+ */
+static void splash_pivot_current(struct vc_data *vc, struct splash_data *new)
+{
+       struct splash_data *head = vc->vc_splash_data;
+       struct splash_data *prev;
+       int state, percent, silent;
+
+       if (!head || head == new)
+               return;
+
+       /* snapshot the live state before touching anything */
+       state = head->splash_state;
+       percent = head->splash_percent;
+       silent = head->splash_dosilent;
+
+       if (splash_pivot_reset_pic(head) < 0)
+               return;
+       head->splash_vc_text_wi = head->imgd->splash_text_wi;
+       head->splash_vc_text_he = head->imgd->splash_text_he;
+
+       /* locate the entry preceding @new */
+       prev = head;
+       while (prev->next && prev->next != new)
+               prev = prev->next;
+       if (!prev->next)
+               return;
+
+       /* unlink @new and push it to the front */
+       prev->next = new->next;
+       new->next = vc->vc_splash_data;
+       vc->vc_splash_data = new;
+
+       /* copy the current states */
+       new->splash_state = state;
+       new->splash_percent = percent;
+       new->splash_dosilent = silent;
+       new->splash_vc_text_wi = new->imgd->splash_text_wi;
+       new->splash_vc_text_he = new->imgd->splash_text_he;
+
+       new->splash_boxes_xoff = 0;
+       new->splash_boxes_yoff = 0;
+       new->splash_sboxes_xoff = 0;
+       new->splash_sboxes_yoff = 0;
+
+       /* an allocation failure leaves @new with its shared slot */
+       splash_pivot_reset_pic(new);
+}
+
+/*
+ * Apply one box-update record to a box table.
+ *
+ * @boxes:    destination table of 12-byte box records
+ * @capacity: number of records the table holds
+ * @rec:      record: 16-bit little-endian start index followed by
+ *            @cnt box records
+ * @end:      first byte behind the input buffer
+ * @cnt:      number of box records in the update (> 0)
+ *
+ * The record length is validated against @end before any byte of the
+ * record is read.  If the first box starts with the marker 0x7ffd7fff
+ * the update is only applied when its first 8 bytes equal those of the
+ * box currently stored at the start index.
+ *
+ * Returns 1 if the table was modified, 0 otherwise.
+ */
+static int splash_patch_boxes(void *boxes, int capacity,
+                             unsigned char *rec, unsigned char *end,
+                             int cnt)
+{
+       unsigned char *payload = rec + 2;
+       unsigned char *dst;
+       int first;
+
+       if (payload + cnt * 12 > end)
+               return 0;
+
+       first = splash_gets(rec, 0);
+       if (cnt + first > capacity)
+               return 0;
+
+       dst = (unsigned char *)boxes + first * 12;
+       if (splash_geti(payload, 0) == 0x7ffd7fff &&
+           memcmp(payload, dst, 8) != 0)
+               return 0;
+
+       memcpy(dst, payload, cnt * 12);
+       return 1;
+}
+
+/*
+ * Handle an "update" header (size field 0xffffffff) for the current
+ * splash of @vc.
+ *
+ * @vc:      console whose current splash is updated
+ * @offsets: header layout row from splash_offsets[]
+ * @ndata:   start of the header
+ * @len:     header length for this version
+ * @end:     first byte behind the input buffer
+ * @update:  if non-NULL, receives what changed: -1 if state or colours
+ *           changed, otherwise bit 0 for the boxes and bit 1 for the
+ *           silent boxes; left untouched if @vc has no splash
+ *
+ * State and colour bytes with the value 255 mean "leave unchanged".
+ * Box counts are only honoured if the complete header lies inside the
+ * buffer.
+ *
+ * Always returns 0.
+ */
+static int update_boxes(struct vc_data *vc,
+              const int *offsets,
+              unsigned char *ndata, int len, unsigned char * end,
+              int *update)
+{
+       struct splash_data *sd = vc->vc_splash_data;
+       struct splash_img_data *imgd;
+       int boxcnt = 0;
+       int sboxcnt = 0;
+       int up = 0;
+       int val;
+
+       if (!sd)
+               return 0;
+       imgd = sd->imgd;
+
+       val = splash_getb(ndata, SPLASH_OFF_STATE);
+       if (val != 255) {
+               sd->splash_state = val; /*@!@*/
+               up = -1;
+       }
+       val = splash_getb(ndata, SPLASH_OFF_FGCOL);
+       if (val != 255) {
+               imgd->splash_fg_color = val;
+               up = -1;
+       }
+       val = splash_getb(ndata, SPLASH_OFF_COL);
+       if (val != 255) {
+               imgd->splash_color = val;
+               up = -1;
+       }
+
+       if (ndata + len <= end) {
+               boxcnt = splash_gets(ndata, SPLASH_OFF_BOXCNT);
+               sboxcnt = splash_gets(ndata, SPLASH_OFF_SBOXCNT);
+       }
+
+       if (boxcnt) {
+               if (splash_patch_boxes(imgd->splash_boxes,
+                                      imgd->splash_boxcount,
+                                      ndata + len, end, boxcnt))
+                       up |= 1;
+               /* the silent-box record follows the box record */
+               len += boxcnt * 12 + 2;
+       }
+       if (sboxcnt &&
+           splash_patch_boxes(imgd->splash_sboxes,
+                              imgd->splash_sboxcount,
+                              ndata + len, end, sboxcnt))
+               up |= 2;
+
+       if (update)
+               *update = up;
+
+       return 0;
+}
+
+/*
+ * Scan the buffer [@start, @end) for bootsplash records ("BOOTSPL1" to
+ * "BOOTSPL3") and act on them.
+ *
+ * @start:  first byte of the buffer
+ * @end:    first byte behind the buffer
+ * @update: if non-NULL, preset to -1 and filled in by update_boxes()
+ *          when an update record is processed
+ *
+ * Record handling, by size field:
+ *  - 0xffffffff (v2/v3 only): update record, handed to update_boxes()
+ *  - 0: free all splash data of the addressed console
+ *  - otherwise: a picture; it is test-decoded, copied and pushed onto
+ *    the console's splash list.  Pictures whose size does not match the
+ *    framebuffer stay on the list but are not selected here.
+ *
+ * After the scan the last matching picture is made current.  If none
+ * matched, the closest picture on console 0 is selected instead.
+ *
+ * Returns the console number the record applied to, 0 if the fallback
+ * selection on console 0 succeeded, or -1 on error.
+ *
+ * Allocator pairing: sd and pic come from kzalloc() and are released
+ * with kfree(); imgd is vmalloc() memory and released with vfree().
+ */
+static int splash_getraw(unsigned char *start, unsigned char *end, int *update)
+{
+       unsigned char *ndata;
+       int version;
+       int splash_size;
+       int unit;
+       int width, height;
+       int silentsize;
+       int boxcnt;
+       int sboxcnt;
+       int palcnt;
+       int len;
+       const int *offsets;
+       struct vc_data *vc = NULL;
+       struct vc_data *vc_found = NULL;
+       struct fb_info *info = NULL;
+       struct splash_data *sd;
+       struct splash_img_data *imgd;
+       struct splash_pic_data *pic;
+       struct splash_data *splash_found = NULL;
+       int unit_found = -1;
+       int oldpercent, oldsilent;
+
+       if (update)
+               *update = -1;
+
+       if (!update ||
+           start[7] < '2' ||
+           start[7] > '3' ||
+           splash_geti(start, 12) != (int)0xffffffff)
+               printk(KERN_INFO "bootsplash %s: looking for picture...\n",
+                      SPLASH_VERSION);
+
+       oldpercent = -3;
+       oldsilent = -1;
+       /* the 8-byte signature must lie completely inside the buffer */
+       for (ndata = start; end - ndata >= 8; ndata++) {
+               if (ndata[0] != 'B' ||
+                   ndata[1] != 'O' ||
+                   ndata[2] != 'O' ||
+                   ndata[3] != 'T')
+                       continue;
+               if (ndata[4] != 'S' ||
+                   ndata[5] != 'P' ||
+                   ndata[6] != 'L' ||
+                   ndata[7]  < '1' ||
+                   ndata[7]  > '3')
+                       continue;
+
+               version = ndata[7] - '0';
+               offsets = splash_offsets[version - 1];
+               len = SPLASH_OFF_LEN;
+
+               /* unit, state, colours and size must be readable */
+               if (end - ndata < SPLASH_OFF_SIZE + 4)
+                       continue;
+
+               unit = splash_getb(ndata, SPLASH_OFF_UNIT);
+               if (unit >= MAX_NR_CONSOLES)
+                       continue;
+
+               if (unit)
+                       vc_allocate(unit);
+
+               vc = vc_cons[unit].d;
+               if (!vc)
+                       continue;
+
+               info = registered_fb[(int)con2fb_map[unit]];
+
+               splash_size = splash_geti(ndata, SPLASH_OFF_SIZE);
+
+               /*
+                * Update. Wonder what should happen here now
+                * since we can have multiple splash_data records
+                */
+               if (splash_size == (int)0xffffffff && version > 1) {
+                       if (update_boxes(vc, offsets, ndata, len, end, update) < 0)
+                               return -1;
+
+                       return unit;
+               }
+
+               if (splash_size == 0) {
+                       printk(KERN_INFO
+                              "bootsplash: ...found, freeing memory.\n");
+                       if (vc->vc_splash_data)
+                               splash_free(vc, info);
+                       return unit;
+               }
+               if (ndata + len + splash_size > end) {
+                       printk(KERN_ERR
+                              "bootsplash: ...found, but truncated!\n");
+                       return -1;
+               }
+               /* the rest of the header is only read once it is known
+                * to be inside the buffer */
+               boxcnt = splash_gets(ndata, SPLASH_OFF_BOXCNT);
+               palcnt = 3 * splash_getb(ndata, SPLASH_OFF_PALCNT);
+               silentsize = splash_geti(ndata, SPLASH_OFF_SSIZE);
+               if (silentsize)
+                       printk(KERN_INFO
+                              "bootsplash: silentjpeg size %d bytes\n",
+                              silentsize);
+               if (silentsize >= splash_size) {
+                       printk(KERN_ERR "bootsplash: bigger than splashsize!\n");
+                       return -1;
+               }
+               splash_size -= silentsize;
+               if (!splash_usesilent)
+                       silentsize = 0;
+
+               sboxcnt = splash_gets(ndata, SPLASH_OFF_SBOXCNT);
+               if (vc->vc_splash_data) {
+                       oldpercent = vc->vc_splash_data->splash_percent;/*@!@*/
+                       oldsilent = vc->vc_splash_data->splash_dosilent;/*@!@*/
+               }
+               sd = kzalloc(sizeof(*sd), GFP_KERNEL);
+               if (!sd)
+                       break;
+               /* pre-v3 headers get room for two synthesised boxes */
+               imgd = vmalloc(sizeof(*imgd)
+                              + splash_size + (version < 3 ? 2 * 12 : 0));
+               if (!imgd) {
+                       kfree(sd);
+                       break;
+               }
+               pic = kzalloc(sizeof(*pic), GFP_KERNEL);
+               if (!pic) {
+                       vfree(imgd);
+                       kfree(sd);
+                       break;
+               }
+               memset(imgd, 0, sizeof(*imgd));
+               sd->imgd = imgd;
+               sd->pic = pic;
+               imgd->ref_cnt = 1;
+               pic->ref_cnt = 1;
+               jpeg_get_size(ndata + len + boxcnt * 12 + palcnt,
+                             &imgd->splash_width, &imgd->splash_height);
+               if (splash_check_jpeg(ndata + len + boxcnt * 12 + palcnt,
+                                     imgd->splash_width,
+                                     imgd->splash_height)) {
+                       /* skip the undecodable picture, drop all three
+                        * objects allocated for it */
+                       ndata += len + splash_size - 1;
+                       kfree(pic);
+                       vfree(imgd);
+                       kfree(sd);
+                       continue;
+               }
+               if (silentsize) {
+                       imgd->splash_silentjpeg = vmalloc(silentsize);
+                       if (imgd->splash_silentjpeg) {
+                               memcpy(imgd->splash_silentjpeg,
+                                      ndata + len + splash_size, silentsize);
+                               /* silent boxes precede the silent jpeg */
+                               imgd->splash_sboxes = imgd->splash_silentjpeg;
+                               imgd->splash_silentjpeg += 12 * sboxcnt;
+                               imgd->splash_sboxcount = sboxcnt;
+                       }
+               }
+               imgd->splash_fg_color = splash_getb(ndata, SPLASH_OFF_FGCOL);
+               imgd->splash_color = splash_getb(ndata, SPLASH_OFF_COL);
+               imgd->splash_overpaintok = splash_getb(ndata, SPLASH_OFF_OVEROK);
+               imgd->splash_text_xo = splash_gets(ndata, SPLASH_OFF_XO);
+               imgd->splash_text_yo = splash_gets(ndata, SPLASH_OFF_YO);
+               imgd->splash_text_wi = splash_gets(ndata, SPLASH_OFF_WI);
+               imgd->splash_text_he = splash_gets(ndata, SPLASH_OFF_HE);
+               if (version == 1) {
+                       imgd->splash_text_xo *= 8;
+                       imgd->splash_text_wi *= 8;
+                       imgd->splash_text_yo *= 16;
+                       imgd->splash_text_he *= 16;
+                       imgd->splash_color    = (splash_default >> 8) & 0x0f;
+                       imgd->splash_fg_color = (splash_default >> 4) & 0x0f;
+               }
+
+               /* fake penguin box for older formats */
+               if (version == 1)
+                       boxcnt = splash_mkpenguin(sd, imgd->splash_text_xo + 10,
+                                                 imgd->splash_text_yo + 10,
+                                                 imgd->splash_text_wi - 20,
+                                                 imgd->splash_text_he - 20,
+                                                 0xf0, 0xf0, 0xf0);
+               else if (version == 2)
+                       boxcnt = splash_mkpenguin(sd,
+                                                 splash_gets(ndata, 24),
+                                                 splash_gets(ndata, 26),
+                                                 splash_gets(ndata, 28),
+                                                 splash_gets(ndata, 30),
+                                                 splash_getb(ndata, 32),
+                                                 splash_getb(ndata, 33),
+                                                 splash_getb(ndata, 34));
+
+               memcpy((char *)imgd
+                      + sizeof(*imgd) + (version < 3 ? boxcnt * 12 : 0),
+                      ndata + len,
+                      splash_size);
+               imgd->splash_boxcount = boxcnt;
+               imgd->splash_boxes = (unsigned char *)imgd + sizeof(*imgd);
+               imgd->splash_palette = imgd->splash_boxes + boxcnt * 12;
+               imgd->splash_jpeg = imgd->splash_palette + palcnt;
+
+               sd->splash_state = splash_getb(ndata, SPLASH_OFF_STATE);/*@!@*/
+               sd->splash_percent = oldpercent == -3 ?
+                       splash_gets(ndata, SPLASH_OFF_PERCENT) :
+                       oldpercent; /*@!@*/
+               sd->pic->splash_pic = NULL;
+               sd->pic->splash_pic_size = 0;
+
+               sd->splash_dosilent = imgd->splash_silentjpeg != 0 ?
+                       (oldsilent == -1 ? 1 : oldsilent) :
+                       0; /* @!@ */
+
+               sd->splash_vc_text_wi = imgd->splash_text_wi;
+               sd->splash_vc_text_he = imgd->splash_text_he;
+
+               sd->next = vc->vc_splash_data;
+               vc->vc_splash_data = sd;
+
+               if (info) {
+                       width = info->var.xres;
+                       height = info->var.yres;
+                       if (imgd->splash_width != width ||
+                           imgd->splash_height != height) {
+                               /* wrong size: stays on the list, but is
+                                * not selected as current here */
+                               ndata += len + splash_size - 1;
+                               continue;
+                       }
+               }
+               printk(KERN_INFO
+                      "bootsplash: ...found (%dx%d, %d bytes, v%d).\n",
+                      imgd->splash_width, imgd->splash_height,
+                      splash_size, version);
+               if (version == 1) {
+                       printk(KERN_WARNING
+                              "bootsplash: Using deprecated v1 header. "
+                              "Updating your splash utility recommended.\n");
+                       printk(KERN_INFO
+                              "bootsplash: Find the latest version at "
+                      "http://www.bootsplash.org/\n");
+               }
+
+               splash_found = sd;
+               unit_found = unit;
+               /* remember the owning console: vc may be overwritten
+                * (even with NULL) by later records in the buffer */
+               vc_found = vc;
+       }
+
+       if (splash_found) {
+               splash_pivot_current(vc_found, splash_found);
+               return unit_found;
+       } else {
+               vc = vc_cons[0].d;
+               if (vc) {
+                       info = registered_fb[(int)con2fb_map[0]];
+                       if (info) {
+                               width = info->var.xres;
+                               height = info->var.yres;
+                       } else
+                               width = height = 0;
+                       /* splash_look_for_jpeg() returns 0 on success */
+                       if (splash_look_for_jpeg(vc, width, height) < 0)
+                               return -1;
+                       return 0;
+               }
+       }
+
+       printk(KERN_ERR "bootsplash: ...no good signature found.\n");
+       return -1;
+}
+
+/*
+ * Redraw the console text of @vc between vc_top and vc_bottom through
+ * update_region() and then call splash_clear_margins() for @info.
+ *
+ * The region starts vc_top rows behind vc_origin.  The count passed is
+ * half the byte size of those rows.
+ * NOTE(review): presumably update_region() counts 16-bit character
+ * cells while vc_size_row is in bytes - confirm.
+ */
+static void splash_update_redraw(struct vc_data *vc, struct fb_info *info)
+{
+       update_region(vc,
+                     vc->vc_origin + vc->vc_size_row * vc->vc_top,
+                     vc->vc_size_row * (vc->vc_bottom - vc->vc_top) / 2);
+       splash_clear_margins(vc, info, 0);
+}
+
+/*
+ * Leave silent mode on console 0.
+ *
+ * Takes the console lock unless an oops is in progress.  If silent
+ * mode is enabled globally and the current splash of console 0 is
+ * active, has a silent jpeg and is still silent, the silent flag is
+ * cleared.  The console is then redrawn, but only if console 0 is the
+ * foreground console and its framebuffer has splash data attached.
+ *
+ * Always returns 0.
+ */
+int splash_do_verbose(void)
+{
+       struct vc_data *vc;
+       struct splash_data *sd;
+       struct fb_info *info;
+
+       SPLASH_DEBUG();
+       if (!oops_in_progress)
+               console_lock();
+
+       if (!splash_usesilent)
+               goto done;
+
+       vc = vc_cons[0].d;
+       sd = vc ? vc->vc_splash_data : NULL;
+       if (!sd || !sd->splash_state ||
+           !sd->imgd->splash_silentjpeg ||
+           !sd->splash_dosilent)
+               goto done;
+
+       /* the flag is dropped even if no redraw follows */
+       sd->splash_dosilent = 0;
+       if (fg_console != vc->vc_num)
+               goto done;
+
+       info = registered_fb[(int)con2fb_map[0]];
+       if (info && info->splash_data)
+               splash_update_redraw(vc, info);
+
+ done:
+       if (!oops_in_progress)
+               console_unlock();
+
+       return 0;
+}
+
+/*
+ * Work handler: runs splash_do_verbose() and discards its result.
+ * The work_struct argument is not used.
+ */
+static void splash_verbose_callback(struct work_struct *ignored)
+{
+       splash_do_verbose();
+}
+
+/* work item through which splash_verbose() defers to the handler above */
+static DECLARE_WORK(splash_work, splash_verbose_callback);
+
+/*
+ * Request the switch from silent to verbose mode.
+ *
+ * While an oops is in progress the switch is done immediately and the
+ * result of splash_do_verbose() is returned.  Otherwise the switch is
+ * handed to splash_work and 0 is returned.
+ */
+int splash_verbose(void)
+{
+       if (oops_in_progress)
+               return splash_do_verbose();
+
+       schedule_work(&splash_work);
+       return 0;
+}
+
+/*
+ * Detach the splash from framebuffer @info and give console @vc the
+ * full screen again.
+ *
+ * The console geometry is derived from the framebuffer resolution and
+ * the console font: columns from xres / font width, rows from
+ * yres / font height.  vc_resize() (columns first, then rows) is only
+ * called if the console does not already have that geometry.
+ */
+static void splash_off(struct vc_data *vc, struct fb_info *info)
+{
+       int cols = info->var.xres / vc->vc_font.width;
+       int rows = info->var.yres / vc->vc_font.height;
+       SPLASH_DEBUG();
+
+       info->splash_data = 0;
+       if (rows != vc->vc_rows || cols != vc->vc_cols)
+               vc_resize(vc, cols, rows);
+}
+
+/*
+ * Look for the splash whose picture size is closest to @width x
+ * @height and make it the current one.
+ *
+ * @vc:     console whose splash list is searched
+ * @width:  screen width in pixels
+ * @height: screen height in pixels
+ *
+ * Closeness is |dw| * height + |dh| * width; the first entry with the
+ * smallest value wins.  The text area of the chosen entry keeps its
+ * margins, i.e. it grows or shrinks by the same amount as the picture.
+ * If the picture size differs from the screen, the box offsets are
+ * recomputed with box_offsets().  If it matches, all box offsets are
+ * reset to 0, including the regular boxes: splash_pivot_current()
+ * leaves them untouched when the entry already was the current one.
+ *
+ * Returns 0 if a splash was selected, -1 if the list is empty.
+ */
+static int splash_look_for_jpeg(struct vc_data *vc, int width, int height)
+{
+       struct splash_data *sd, *found = NULL;
+       struct splash_img_data *imgd;
+       int found_delta = 0;
+
+       for (sd = vc->vc_splash_data; sd; sd = sd->next) {
+               int delta_x = abs(sd->imgd->splash_width - width) * height;
+               int delta_y = abs(sd->imgd->splash_height - height) * width;
+
+               if (!found || found_delta > delta_x + delta_y) {
+                       found = sd;
+                       found_delta = delta_x + delta_y;
+               }
+       }
+
+       if (!found)
+               return -1;
+
+       imgd = found->imgd;
+       SPLASH_DEBUG("bootsplash: "
+                    "scalable image found (%dx%d scaled to %dx%d).",
+                    imgd->splash_width,
+                    imgd->splash_height,
+                    width, height);
+
+       splash_pivot_current(vc, found);
+
+       /* textarea margins are constant independent from image size */
+       if (imgd->splash_height != height)
+               found->splash_vc_text_he = height
+                       - (imgd->splash_height - imgd->splash_text_he);
+       else
+               found->splash_vc_text_he = imgd->splash_text_he;
+       if (imgd->splash_width != width)
+               found->splash_vc_text_wi = width
+                       - (imgd->splash_width - imgd->splash_text_wi);
+       else
+               found->splash_vc_text_wi = imgd->splash_text_wi;
+
+       if (imgd->splash_width != width ||
+           imgd->splash_height != height) {
+               box_offsets(imgd->splash_boxes,
+                           imgd->splash_boxcount,
+                           width, height,
+                           imgd->splash_width,
+                           imgd->splash_height,
+                           &found->splash_boxes_xoff,
+                           &found->splash_boxes_yoff);
+               SPLASH_DEBUG("bootsplash: offsets for boxes: x=%d y=%d",
+                            found->splash_boxes_xoff,
+                            found->splash_boxes_yoff);
+
+               if (imgd->splash_sboxes) {
+                       box_offsets(imgd->splash_sboxes,
+                                   imgd->splash_sboxcount,
+                                   width, height,
+                                   imgd->splash_width,
+                                   imgd->splash_height,
+                                   &found->splash_sboxes_xoff,
+                                   &found->splash_sboxes_yoff);
+                       SPLASH_DEBUG("bootsplash: "
+                                    "offsets sboxes: x=%d y=%d",
+                                    found->splash_sboxes_xoff,
+                                    found->splash_sboxes_yoff);
+               }
+       } else {
+               /* exact fit: no box may keep an offset from an earlier,
+                * differently sized screen */
+               found->splash_boxes_xoff = 0;
+               found->splash_boxes_yoff = 0;
+               found->splash_sboxes_xoff = 0;
+               found->splash_sboxes_yoff = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Apply the splash colours to console @vc.
+ *
+ * The default colour is built from the splash background colour (high
+ * nibble) and foreground colour (low nibble) and installed with
+ * con_remap_def_color() if it differs from the console's current
+ * default.  If @vc is the foreground console and @info has splash data
+ * attached, the screen is redrawn.  Finally color_set is flagged.
+ *
+ * Returns -1 if @vc has no splash, 0 otherwise (including the case
+ * where the splash is switched off and nothing is done).
+ */
+static int splash_recolor(struct vc_data *vc, struct fb_info *info)
+{
+       struct splash_data *sd;
+       int color;
+
+       SPLASH_DEBUG();
+       sd = vc->vc_splash_data;
+       if (!sd)
+               return -1;
+       if (!sd->splash_state)
+               return 0;
+
+       color = sd->imgd->splash_color << 4 |
+               sd->imgd->splash_fg_color;
+       if (vc->vc_def_color != color)
+               con_remap_def_color(vc, color);
+
+       if (fg_console == vc->vc_num && info && info->splash_data)
+               splash_update_redraw(vc, info);
+
+       vc->vc_splash_data->color_set = 1;
+       return 0;
+}
+
+int splash_prepare(struct vc_data *vc, struct fb_info *info)
+{
+       int err;
+       int width, height, octpp, size, sbytes;
+       enum splash_color_format cf = SPLASH_DEPTH_UNKNOWN;
+       int pic_update = 0;
+       struct jpeg_decdata *decdata; /* private decoder data */
+
+       SPLASH_DEBUG("vc_num: %i", vc->vc_num);
+
+#if 0 /* Nouveau fb sets a different ops, so we can't use the condition */
+       if (info->fbops->fb_imageblit != cfb_imageblit) {
+               printk(KERN_ERR "bootsplash: "
+                      "found, but framebuffer can't "
+                      "handle it!\n");
+               return -1;
+       }
+#endif
+
+       if (!vc->vc_splash_data || !vc->vc_splash_data->splash_state) {
+               splash_off(vc, info);
+               return -1;
+       }
+
+       width = info->var.xres;
+       height = info->var.yres;
+       switch (info->var.bits_per_pixel) {
+       case 16:
+               if ((info->var.red.length +
+                    info->var.green.length +
+                    info->var.blue.length) == 15)
+                       cf = SPLASH_DEPTH_15;
+               else
+                       cf = SPLASH_DEPTH_16;
+               break;
+       case 24:
+               cf = SPLASH_DEPTH_24_PACKED;
+               break;
+       case 32:
+               cf = SPLASH_DEPTH_24;
+               break;
+       }
+       if (cf == SPLASH_DEPTH_UNKNOWN) {
+               printk(KERN_INFO "bootsplash: unsupported pixel format: %i\n",
+                      info->var.bits_per_pixel);
+               splash_off(vc, info);
+               return -2;
+       }
+       octpp = splash_octpp(cf);
+
+       if (splash_look_for_jpeg(vc, width, height) < 0) {
+               printk(KERN_INFO "bootsplash: no matching splash %dx%d\n",
+                      width, height);
+               splash_off(vc, info);
+               return -2;
+       }
+
+       sbytes = ((width + 15) & ~15) * octpp;
+       size = sbytes * ((height + 15) & ~15);
+
+       if (size != vc->vc_splash_data->pic->splash_pic_size) {
+               if (vc->vc_splash_data->pic->ref_cnt > 1) {
+                       struct splash_pic_data *pic;
+                       pic = kzalloc(sizeof(struct splash_pic_data),
+                                     GFP_KERNEL);
+                       if (!pic)
+                               return -2;
+                       vc->vc_splash_data->pic = pic;
+               }
+               vc->vc_splash_data->pic->ref_cnt = 1;
+               vc->vc_splash_data->pic->splash_pic = NULL;
+               vc->vc_splash_data->pic->splash_pic_size = 0;
+       }
+       if (!vc->vc_splash_data->pic->splash_pic) {
+               vc->vc_splash_data->pic->splash_pic = vmalloc(size);
+               pic_update = 1;
+       }
+       if (!vc->vc_splash_data->pic->splash_pic) {
+               printk(KERN_INFO "bootsplash: not enough memory.\n");
+               splash_off(vc, info);
+               return -3;
+       }
+
+       decdata = vmalloc(sizeof(*decdata));
+       if (!decdata) {
+               printk(KERN_INFO "bootsplash: not enough memory.\n");
+               splash_off(vc, info);
+               return -3;
+       }
+
+       if (vc->vc_splash_data->imgd->splash_silentjpeg &&
+           vc->vc_splash_data->splash_dosilent) {
+               pic_update = 1;
+               err = jpeg_get(vc->vc_splash_data->imgd->splash_silentjpeg,
+                              vc->vc_splash_data->pic->splash_pic,
+                              width, height, cf, decdata);
+               if (err) {
+                       printk(KERN_INFO "bootsplash: "
+                              "error while decompressing silent picture: "
+                              "%s (%d)\n",
+                              jpg_errors[err - 1], err);
+                       vc->vc_splash_data->splash_dosilent = 0;
+               } else {
+                       if (vc->vc_splash_data->imgd->splash_sboxcount)
+                               boxit(vc->vc_splash_data->pic->splash_pic,
+                                     sbytes,
+                                     vc->vc_splash_data->imgd->splash_sboxes,
+                                     vc->vc_splash_data->imgd->splash_sboxcount,
+                                     vc->vc_splash_data->splash_percent,
+                                     vc->vc_splash_data->splash_sboxes_xoff,
+                                     vc->vc_splash_data->splash_sboxes_yoff,
+                                     vc->vc_splash_data->splash_percent < 0 ?
+                                     1 : 0,
+                                     cf);
+                       splashcopy(info->screen_base,
+                                  vc->vc_splash_data->pic->splash_pic,
+                                  info->var.yres,
+                                  info->var.xres,
+                                  info->fix.line_length, sbytes,
+                                  octpp);
+               }
+       } else
+               vc->vc_splash_data->splash_dosilent = 0;
+
+       if (pic_update) {
+               err = jpeg_get(vc->vc_splash_data->imgd->splash_jpeg,
+                              vc->vc_splash_data->pic->splash_pic,
+                              width, height, cf, decdata);
+               if (err) {
+                       printk(KERN_INFO "bootsplash: "
+                              "error while decompressing picture: %s (%d) .\n",
+                              jpg_errors[err - 1], err);
+                       splash_off(vc, info);
+                       return -4;
+               }
+       }
+
+       vfree(decdata);
+
+       vc->vc_splash_data->pic->splash_pic_size = size;
+       vc->vc_splash_data->pic->splash_pic_stride = sbytes;
+
+       if (vc->vc_splash_data->imgd->splash_boxcount)
+               boxit(vc->vc_splash_data->pic->splash_pic,
+                     sbytes,
+                     vc->vc_splash_data->imgd->splash_boxes,
+                     vc->vc_splash_data->imgd->splash_boxcount,
+                     vc->vc_splash_data->splash_percent,
+                     vc->vc_splash_data->splash_boxes_xoff,
+                     vc->vc_splash_data->splash_boxes_yoff,
+                     0,
+                     cf);
+       if (vc->vc_splash_data->splash_state) {
+               int cols = vc->vc_splash_data->splash_vc_text_wi
+                       / vc->vc_font.width;
+               int rows = vc->vc_splash_data->splash_vc_text_he
+                       / vc->vc_font.height;
+
+               info->splash_data = vc->vc_splash_data;
+
+               info->splash_data->need_sync = 0;
+               /* XEN fb needs some sync after the direct modification of
+                * fb area; maybe other FBs would need similar hack, but
+                * so far I don't care.
+                */
+               if (!strcmp(info->fix.id, "xen")) {
+                       info->splash_data->need_sync = 1;
+                       /* sync the whole splash once */
+                       splash_sync_region(info, 0, 0,
+                                          info->var.xres, info->var.yres);
+               }
+
+               /* vc_resize also calls con_switch which resets yscroll */
+               if (rows != vc->vc_rows || cols != vc->vc_cols)
+                       vc_resize(vc, cols, rows);
+               if (!vc->vc_splash_data->color_set)
+                       splash_recolor(vc, NULL);
+       } else {
+               SPLASH_DEBUG("Splash Status is off\n");
+               splash_off(vc, info);
+               return -5;
+       }
+       return 0;
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+#include <linux/proc_fs.h>
+
+/* Forward declarations of the proc handlers and helpers defined below. */
+static int splash_read_proc(char *buffer, char **start, off_t offset, int size,
+                           int *eof, void *data);
+static int splash_write_proc(struct file *file, const char *buffer,
+                            unsigned long count, void *data);
+static int splash_status(struct vc_data *vc);
+static int splash_proc_register(void);
+
+/* The "splash" proc entry; assigned in splash_proc_register(). */
+static struct proc_dir_entry *proc_splash;
+
+/*
+ * Log the splash state of console @vc and bring the screen in line with it.
+ *
+ * Nothing beyond the log message happens when no framebuffer is registered
+ * for the console. Otherwise the picture is prepared if @vc is the
+ * foreground console; then the colors are re-applied when the splash is
+ * on, or the splash is switched off and the default color is put back to
+ * 0x07 when it is off.
+ *
+ * Always returns 0.
+ */
+static int splash_status(struct vc_data *vc)
+{
+       struct fb_info *fb;
+       const char *state = "off";
+
+       if (vc->vc_splash_data && vc->vc_splash_data->splash_state)
+               state = "on";
+       printk(KERN_INFO "bootsplash: status on console %d changed to %s\n",
+              vc->vc_num, state);
+
+       fb = registered_fb[(int) con2fb_map[vc->vc_num]];
+       if (!fb)
+               return 0;
+
+       if (fg_console == vc->vc_num)
+               splash_prepare(vc, fb);
+
+       /* The state is looked at again only after splash_prepare() has run. */
+       if (!vc->vc_splash_data || !vc->vc_splash_data->splash_state) {
+               splash_off(vc, fb);
+               if (vc->vc_def_color != 0x07)
+                       con_remap_def_color(vc, 0x07);
+               return 0;
+       }
+
+       splash_recolor(vc, fb);
+       return 0;
+}
+
+/*
+ * Make console @unit_t show the same splash image as console @unit_s.
+ *
+ * A new splash_data is allocated for the target console. It shares the
+ * source's image data (imgd, reference counted) and copies the state,
+ * percent and silent flag; the box offsets start at 0. The picture
+ * descriptor is shared with the source when the target framebuffer needs
+ * a buffer of the same size, otherwise an empty descriptor is allocated.
+ * Finally splash_status() is run on the target console.
+ *
+ * Returns 0 on success and -1 on failure. On failure neither the target
+ * console nor any reference count has been modified.
+ *
+ * NOTE(review): splash data already attached to the target console is
+ * overwritten without being released (unchanged from before) - confirm
+ * that callers only copy onto consoles without splash data.
+ */
+int splash_copy_current_img(int unit_s, int unit_t)
+{
+       struct fb_info *info;
+       struct vc_data *vc_s;
+       struct vc_data *vc_t;
+       struct splash_data *sd_s;
+       struct splash_data *sd_t;
+       int size;
+
+       /* The indices are signed, so negative values must be rejected too. */
+       if (unit_s < 0 || unit_s >= MAX_NR_CONSOLES ||
+           unit_t < 0 || unit_t >= MAX_NR_CONSOLES)
+               return -1;
+
+       vc_s = vc_cons[unit_s].d;
+       if (!vc_s) {
+               printk(KERN_WARNING "bootsplash: "
+                      "copy: source (%i) is invalid.\n", unit_s);
+               return -1;
+       }
+       sd_s = vc_s->vc_splash_data;
+       if (!sd_s || !sd_s->imgd) {
+               printk(KERN_INFO "bootsplash: "
+                      "copy: source_vc (%i) doesn't have valid splash data.\n",
+                      unit_s);
+               return -1;
+       }
+       vc_allocate(unit_t);
+       vc_t = vc_cons[unit_t].d;
+       if (!vc_t) {
+               printk(KERN_WARNING "bootsplash: copy: dest (%i) is invalid.\n",
+                      unit_t);
+               return -1;
+       }
+
+       /* The picture size depends on the target's framebuffer geometry. */
+       info = registered_fb[(int) con2fb_map[vc_t->vc_num]];
+       if (!info) {
+               printk(KERN_WARNING "bootsplash: "
+                      "copy: dest (%i) has no framebuffer.\n", unit_t);
+               return -1;
+       }
+       size = (((info->var.xres + 15) & ~15)
+               * ((info->var.bits_per_pixel + 1) >> 3))
+               * ((info->var.yres + 15) & ~15);
+
+       sd_t = kzalloc(sizeof(*sd_t), GFP_KERNEL);
+       if (!sd_t)
+               return -1;
+
+       /*
+        * Share the source picture only if it exists and has the size the
+        * target needs; otherwise start with an empty descriptor.
+        */
+       if (!sd_s->pic || size != sd_s->pic->splash_pic_size) {
+               sd_t->pic = kzalloc(sizeof(*sd_t->pic), GFP_KERNEL);
+               if (!sd_t->pic) {
+                       kfree(sd_t);
+                       return -1;
+               }
+               sd_t->pic->ref_cnt = 1;
+       } else {
+               sd_t->pic = sd_s->pic;
+               sd_t->pic->ref_cnt++;
+       }
+
+       /* Nothing can fail from here on, so references may be taken. */
+       sd_t->imgd = sd_s->imgd;
+       sd_t->imgd->ref_cnt++;
+
+       /* now recreate all the rest */
+       sd_t->splash_state = sd_s->splash_state;
+       sd_t->splash_percent = sd_s->splash_percent;
+       sd_t->splash_dosilent = sd_s->splash_dosilent;
+       sd_t->splash_vc_text_wi = sd_s->imgd->splash_text_wi;
+       sd_t->splash_vc_text_he = sd_s->imgd->splash_text_he;
+
+       sd_t->splash_boxes_xoff = 0;
+       sd_t->splash_boxes_yoff = 0;
+       sd_t->splash_sboxes_xoff = 0;
+       sd_t->splash_sboxes_yoff = 0;
+
+       /* Publish the fully initialised data only now. */
+       vc_t->vc_splash_data = sd_t;
+
+       splash_status(vc_t);
+
+       return 0;
+}
+
+/*
+ * Legacy procfs read handler for the "splash" entry.
+ *
+ * Formats one status line for console 0 into @buffer:
+ *   Splash screen v<version> (0x<color>, <xres>x<yres>[, silent]): on|off
+ * The resolution is reported as 0x0 when no framebuffer is registered, and
+ * the color falls back to splash_default >> 4 when there is no image data.
+ *
+ * @buffer: buffer supplied by procfs; the line is written at its start
+ * @start:  set to the position inside @buffer that matches @offset
+ * @offset: read position requested by the caller
+ * @size:   maximum number of bytes the caller accepts
+ * @eof:    set to 1 once the end of the line has been handed out
+ * @data:   unused
+ *
+ * Returns the number of bytes available at *@start (at most @size), or 0
+ * when @offset is at or past the end of the line.
+ */
+static int splash_read_proc(char *buffer, char **start, off_t offset, int size,
+                       int *eof, void *data)
+{
+       struct vc_data *vc = vc_cons[0].d;
+       struct fb_info *info = registered_fb[(int)con2fb_map[0]];
+       struct splash_data *sd = vc ? vc->vc_splash_data : NULL;
+       int color = splash_default >> 4;
+       int status = 0;
+       int silent = 0;
+       int xres = 0;
+       int yres = 0;
+       int len;
+       int remaining;
+
+       if (sd) {
+               status = sd->splash_state & 1;
+               silent = sd->splash_dosilent;
+               if (sd->imgd)
+                       color = sd->imgd->splash_color << 4 |
+                               sd->imgd->splash_fg_color;
+       }
+
+       if (info) {
+               xres = info->var.xres;
+               yres = info->var.yres;
+       }
+
+       len = sprintf(buffer, "Splash screen v%s (0x%02x, %dx%d%s): %s\n",
+                     SPLASH_VERSION, color, xres, yres,
+                     silent ? ", silent" : "",
+                     status ? "on" : "off");
+       if (offset >= len) {
+               *eof = 1;
+               return 0;
+       }
+
+       /*
+        * The requested data starts @offset bytes into the buffer. Pointing
+        * *start in front of the buffer would make procfs treat the value
+        * as a position increment instead of a data pointer.
+        */
+       *start = buffer + offset;
+
+       remaining = len - offset;
+       if (size < remaining)
+               return size;
+
+       *eof = 1;
+       return remaining;
+}
+
+/*
+ * Store a new progress value for the splash on console @vc and update the
+ * screen if that console is the one being displayed.
+ *
+ * @pe is clamped first: values below -2 become 0, values above 65535
+ * become 65535, and any value above 32767 is then incremented by one (so
+ * the largest stored value is 65536).
+ *
+ * Nothing is drawn when the console has no splash data, the value did not
+ * change, the console is not in the foreground, the splash is off, no
+ * framebuffer is registered, or the screen is blanked. Otherwise a full
+ * redraw is done if the image does not allow overpainting, the value is
+ * 65536 or the value went down; in all other cases only the silent-mode
+ * boxes are painted straight into the framebuffer.
+ */
+void splash_set_percent(struct vc_data *vc, int pe)
+{
+       struct fb_info *info;
+       struct fbcon_ops *ops;
+       struct splash_data *sd;
+       struct splash_data *fb_sd;
+       enum splash_color_format cf;
+       u32 bpp;
+       int prev;
+
+       SPLASH_DEBUG(" console: %d val: %d\n", vc->vc_num, pe);
+
+       if (pe < -2)
+               pe = 0;
+       else if (pe > 65535)
+               pe = 65535;
+       if (pe > 32767)
+               pe++;
+
+       sd = vc->vc_splash_data;
+       if (!sd)
+               return;
+       prev = sd->splash_percent;
+       if (prev == pe)
+               return;
+       sd->splash_percent = pe;
+
+       if (fg_console != vc->vc_num || !sd->splash_state)
+               return;
+       info = registered_fb[(int) con2fb_map[vc->vc_num]];
+       if (!info)
+               return;
+       ops = info->fbcon_par;
+       if (ops->blank_state)
+               return;
+
+       /* Cases in which the picture is rebuilt rather than painted over. */
+       if (!sd->imgd->splash_overpaintok || pe == 65536 || pe < prev) {
+               if (splash_hasinter(sd->imgd->splash_boxes,
+                                   sd->imgd->splash_boxcount))
+                       splash_status(vc);
+               else
+                       splash_prepare(vc, info);
+               return;
+       }
+
+       /* Map the framebuffer depth to a splash color format. */
+       bpp = info->var.bits_per_pixel;
+       if (bpp == 16) {
+               if (info->var.red.length + info->var.green.length +
+                   info->var.blue.length == 15)
+                       cf = SPLASH_DEPTH_15;
+               else
+                       cf = SPLASH_DEPTH_16;
+       } else if (bpp == 24) {
+               cf = SPLASH_DEPTH_24_PACKED;
+       } else if (bpp == 32) {
+               cf = SPLASH_DEPTH_24;
+       } else {
+               /* depth the splash code cannot draw in */
+               return;
+       }
+
+       fb_sd = info->splash_data;
+       if (!fb_sd)
+               return;
+       if (!fb_sd->imgd->splash_silentjpeg || !fb_sd->splash_dosilent)
+               return;
+
+       boxit(info->screen_base,
+             info->fix.line_length,
+             fb_sd->imgd->splash_sboxes,
+             fb_sd->imgd->splash_sboxcount,
+             fb_sd->splash_percent,
+             fb_sd->splash_sboxes_xoff,
+             fb_sd->splash_sboxes_yoff,
+             1,
+             cf);
+       /* FIXME: get a proper width/height */
+       splash_sync_region(info,
+                          fb_sd->splash_sboxes_xoff,
+                          fb_sd->splash_sboxes_yoff,
+                          info->var.xres - fb_sd->splash_sboxes_xoff,
+                          8);
+}
+
+/*
+ * Parse a console number of one or two decimal digits at @buffer.
+ *
+ * On a match *@unit receives the number and the returned pointer is
+ * positioned after the digits and after one directly following space, if
+ * there is one. Without a leading digit *@unit is set to -1 and @buffer
+ * is returned unchanged.
+ */
+static const char *get_unit(const char *buffer, int *unit)
+{
+       const char *p = buffer;
+
+       *unit = -1;
+       if (*p < '0' || *p > '9')
+               return p;
+
+       *unit = *p++ - '0';
+       if (*p >= '0' && *p <= '9')
+               *unit = *unit * 10 + (*p++ - '0');
+       if (*p == ' ')
+               p++;
+       return p;
+}
+
+/*
+ * Legacy procfs write handler for the "splash" entry.
+ *
+ * Nothing is done when @buffer is NULL or splash_default is 0. A leading
+ * "@<unit>" selects the console to act on (default 0); an unknown unit
+ * ends the call. The commands, matched in this order, are:
+ *
+ *   redraw           run splash_status() on the console
+ *   show... / hide...  set the progress value: 0 if the argument starts
+ *                    with 'p', 65535 if there is no argument, else the
+ *                    number given (0 if it cannot be parsed); "hide"
+ *                    uses 65535 minus that value
+ *   copy <s> <t>     splash_copy_current_img(<s>, <t>)
+ *   silent, verbose  switch silent mode on or off (if a silent jpeg exists)
+ *   freesilent       drop the silent jpeg and its boxes
+ *   BOOTSPL...       raw splash data, handed to splash_getraw()
+ *   t...             toggle the splash state
+ *   <number>         bit 0 selects the state; values above 1 also set
+ *                    the colors from bits 8-15 and 4-7
+ *
+ * The whole handler runs under console_lock(). It always returns @count.
+ *
+ * NOTE(review): @buffer is read directly and without regard to @count.
+ * If procfs passes a user-space pointer here (as the legacy write_proc
+ * prototype suggests) the data has to be copied in first - confirm.
+ */
+static int splash_write_proc(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+       int new, unit;
+       unsigned long uval;
+       struct vc_data *vc;
+       struct splash_data *vc_splash_data;
+
+       SPLASH_DEBUG();
+
+       if (!buffer || !splash_default)
+               return count;
+
+       console_lock();
+       unit = 0;
+       if (buffer[0] == '@') {
+               buffer++;
+               buffer = get_unit(buffer, &unit);
+               if (unit < 0 || unit >= MAX_NR_CONSOLES || !vc_cons[unit].d) {
+                       console_unlock();
+                       return count;
+               }
+       }
+       SPLASH_DEBUG(" unit: %i", unit);
+       vc = vc_cons[unit].d;
+       vc_splash_data = vc->vc_splash_data;
+
+       if (!strncmp(buffer, "redraw", 6)) {
+               SPLASH_DEBUG(" redraw");
+               splash_status(vc);
+               console_unlock();
+               return count;
+       }
+
+       if (!strncmp(buffer, "show", 4) || !strncmp(buffer, "hide", 4)) {
+               long int pe;
+
+               SPLASH_DEBUG("show/hide");
+               if (buffer[4] == ' ' && buffer[5] == 'p')
+                       pe = 0;
+               else if (buffer[4] == '\n')
+                       pe = 65535;
+               else if (strict_strtol(buffer + 5, 0, &pe))
+                       /* any parse error leaves pe unset, not only -EINVAL */
+                       pe = 0;
+               if (pe < -2)
+                       pe = 0;
+               if (pe > 65535)
+                       pe = 65535;
+               if (*buffer == 'h')
+                       pe = 65535 - pe;
+               splash_set_percent(vc, pe);
+               console_unlock();
+               return count;
+       }
+
+       if (!strncmp(buffer, "copy", 4)) {
+               buffer += 4;
+               if (buffer[0] == ' ')
+                       buffer++;
+               buffer = get_unit(buffer, &unit);
+               if (unit < 0 || unit >= MAX_NR_CONSOLES) {
+                       console_unlock();
+                       return count;
+               }
+               buffer = get_unit(buffer, &new);
+               if (new < 0 || new >= MAX_NR_CONSOLES) {
+                       console_unlock();
+                       return count;
+               }
+               splash_copy_current_img(unit, new);
+               console_unlock();
+               return count;
+       }
+
+       if (!strncmp(buffer, "silent\n", 7)
+           || !strncmp(buffer, "verbose\n", 8)) {
+               SPLASH_DEBUG(" silent/verbose");
+
+               if (vc_splash_data &&
+                   vc_splash_data->imgd->splash_silentjpeg) {
+                       if (vc_splash_data->splash_dosilent !=
+                           (buffer[0] == 's')) {
+                               vc_splash_data->splash_dosilent =
+                                       buffer[0] == 's';
+                               splash_status(vc);
+                       }
+               }
+               console_unlock();
+               return count;
+       }
+
+       if (!strncmp(buffer, "freesilent\n", 11)) {
+               SPLASH_DEBUG(" freesilent");
+
+               if (vc_splash_data &&
+                   vc_splash_data->imgd->splash_silentjpeg) {
+                       struct splash_data *sd;
+                       printk(KERN_INFO "bootsplash: freeing silent jpeg\n");
+                       for (sd = vc_splash_data; sd; sd = sd->next) {
+                               sd->imgd->splash_silentjpeg = 0;
+                               vfree(sd->imgd->splash_sboxes);
+                               sd->imgd->splash_sboxes = 0;
+                               sd->imgd->splash_sboxcount = 0;
+                       }
+                       if (vc_splash_data->splash_dosilent)
+                               splash_status(vc);
+
+                       vc->vc_splash_data->splash_dosilent = 0;
+               }
+               console_unlock();
+               return count;
+       }
+
+       if (!strncmp(buffer, "BOOTSPL", 7)) {
+               int up = -1;
+
+               SPLASH_DEBUG(" BOOTSPL");
+               unit = splash_getraw((unsigned char *)buffer,
+                                    (unsigned char *)buffer + count,
+                                    &up);
+               SPLASH_DEBUG(" unit: %i up: %i", unit, up);
+               if (unit >= 0) {
+                       struct fb_info *info;
+
+                       /* The console named by the data may not exist. */
+                       vc = vc_cons[unit].d;
+                       info = vc ? registered_fb[(int) con2fb_map[vc->vc_num]]
+                               : NULL;
+                       if (!info) {
+                               console_unlock();
+                               return count;
+                       }
+
+                       if (up == -1) {
+                               splash_status(vc);
+                       } else {
+                               struct splash_data *splash_data
+                                       = info->splash_data;
+                               struct fbcon_ops *ops = info->fbcon_par;
+                               enum splash_color_format cf = SPLASH_DEPTH_UNKNOWN;
+
+                               /* vc may differ from the one picked above */
+                               vc_splash_data = vc->vc_splash_data;
+
+                               switch (info->var.bits_per_pixel) {
+                               case 16:
+                                       if ((info->var.red.length +
+                                            info->var.green.length +
+                                            info->var.blue.length) == 15)
+                                               cf = SPLASH_DEPTH_15;
+                                       else
+                                               cf = SPLASH_DEPTH_16;
+                                       break;
+                               case 24:
+                                       cf = SPLASH_DEPTH_24_PACKED;
+                                       break;
+                               case 32:
+                                       cf = SPLASH_DEPTH_24;
+                                       break;
+                               }
+                               /* up == 0 disables both drawing branches */
+                               if (cf == SPLASH_DEPTH_UNKNOWN)
+                                       up = 0;
+                               if (ops->blank_state ||
+                                   !vc_splash_data ||
+                                   !splash_data)
+                                       up = 0;
+                               if ((up & 2) != 0
+                                   && splash_data->imgd->splash_silentjpeg
+                                   && splash_data->splash_dosilent) {
+                                       boxit(info->screen_base,
+                                             info->fix.line_length,
+                                             splash_data->imgd->splash_sboxes,
+                                             splash_data->imgd->splash_sboxcount,
+                                             splash_data->splash_percent,
+                                             splash_data->splash_sboxes_xoff,
+                                             splash_data->splash_sboxes_yoff,
+                                             1,
+                                             cf);
+                               } else if ((up & 1) != 0) {
+                                       boxit(info->screen_base,
+                                             info->fix.line_length,
+                                             splash_data->imgd->splash_boxes,
+                                             splash_data->imgd->splash_boxcount,
+                                             splash_data->splash_percent,
+                                             splash_data->splash_boxes_xoff,
+                                             splash_data->splash_boxes_yoff,
+                                             1,
+                                             cf);
+                               }
+                       }
+               }
+               console_unlock();
+               return count;
+       }
+
+       if (!vc_splash_data) {
+               console_unlock();
+               return count;
+       }
+
+       if (buffer[0] == 't') {
+               vc_splash_data->splash_state ^= 1;
+               SPLASH_DEBUG(" t");
+               splash_status(vc);
+               console_unlock();
+               return count;
+       }
+       /* Treat every parse failure alike; uval is only set on success. */
+       if (strict_strtoul(buffer, 0, &uval))
+               uval = 1;
+       if (uval > 1) {
+               /* expert user */
+               vc_splash_data->imgd->splash_color    = uval >> 8 & 0xff;
+               vc_splash_data->imgd->splash_fg_color = uval >> 4 & 0x0f;
+       }
+       if ((uval & 1) == vc_splash_data->splash_state)
+               splash_recolor(vc, NULL);
+       else {
+               vc_splash_data->splash_state = uval & 1;
+               splash_status(vc);
+       }
+       console_unlock();
+       return count;
+}
+
+/*
+ * Create the "splash" proc entry and hook up its read and write handlers.
+ *
+ * Returns 0 on success and 1 if the entry could not be created.
+ */
+static int splash_proc_register(void)
+{
+       proc_splash = create_proc_entry("splash", 0, NULL);
+       if (!proc_splash)
+               return 1;
+
+       proc_splash->read_proc = splash_read_proc;
+       proc_splash->write_proc = splash_write_proc;
+       return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+#define INIT_CONSOLE 0
+
+/*
+ * One-time setup of the splash for console INIT_CONSOLE.
+ *
+ * The call is a no-op until both the console and a framebuffer with at
+ * least 16 bits per pixel exist. Once they do, the proc entry is
+ * registered (with CONFIG_PROC_FS) and all later calls return at once.
+ * Unless the console already has splash data, the data is then read from
+ * "/bootsplash" or, if that cannot be opened, from "/initrd.image" (at
+ * most the last 2MB of the file) and handed to splash_getraw(). When the
+ * data was found for INIT_CONSOLE the splash state is taken from bit 0 of
+ * splash_default. A "/bootsplash" file is unlinked after processing.
+ */
+void splash_init(void)
+{
+       static bool done;
+       const int max_len = 1024*1024*2;
+       struct fb_info *info;
+       struct vc_data *vc;
+       bool from_ramfs;
+       off_t start = 0;
+       char *mem;
+       int fd;
+       int len;
+
+       if (done)
+               return;
+
+       vc = vc_cons[INIT_CONSOLE].d;
+       info = registered_fb[(int)con2fb_map[INIT_CONSOLE]];
+       if (!vc || !info)
+               return;
+       if (info->var.bits_per_pixel < 16)      /* not supported */
+               return;
+#ifdef CONFIG_PROC_FS
+       splash_proc_register();
+#endif
+       done = true;
+       if (vc->vc_splash_data)
+               return;
+
+       fd = sys_open("/bootsplash", O_RDONLY, 0);
+       from_ramfs = (fd >= 0);
+       if (!from_ramfs)
+               fd = sys_open("/initrd.image", O_RDONLY, 0);
+       if (fd < 0)
+               return;
+
+       /* whence 2: seek to the end to learn the file size */
+       len = (int)sys_lseek(fd, (off_t)0, 2);
+       if (len <= 0) {
+               sys_close(fd);
+               return;
+       }
+
+       /* Don't look for more than the last 2MB */
+       if (len > max_len) {
+               printk(KERN_INFO "bootsplash: "
+                      "scanning last %dMB of initrd for signature\n",
+                      max_len>>20);
+               start = (off_t)(len - max_len);
+               len = max_len;
+       }
+       /* whence 0: absolute position */
+       sys_lseek(fd, start, 0);
+
+       mem = vmalloc(len);
+       if (mem) {
+               console_lock();
+               if ((int)sys_read(fd, mem, len) == len) {
+                       if (splash_getraw((unsigned char *)mem,
+                                         (unsigned char *)mem + len,
+                                         NULL) == INIT_CONSOLE
+                           && vc->vc_splash_data)
+                               vc->vc_splash_data->splash_state =
+                                       splash_default & 1;
+               }
+               console_unlock();
+               vfree(mem);
+       }
+
+       sys_close(fd);
+       if (from_ramfs)
+               sys_unlink("/bootsplash");
+}
+
+#define SPLASH_ALIGN 15
+
+/*
+ * Build the fixed-point weight table for scaling a line of @from pixels
+ * to @to pixels.
+ *
+ * *@shift receives the number of fractional bits used for the weights; it
+ * is derived from the position of the highest set bit of the larger of
+ * @from and @to.
+ *
+ * Table layout (entry 0 always holds the number of rows):
+ *  - @from > @to (shrinking): each row is a count followed by that many
+ *    weights.
+ *  - otherwise (enlarging or same size): each row is a triple of
+ *    (column count - 1, weight, weight); the last row consists of a
+ *    single column count only.
+ *
+ * Returns the table, allocated with vmalloc(), or NULL if the allocation
+ * fails or if @from or @to is 0 (which would otherwise divide by zero or
+ * never terminate).
+ */
+static u32 *do_coefficients(u32 from, u32 to, u32 *shift)
+{
+       u32 *coefficients;
+       u32 larger = from > to ? from : to;
+       u32 rnd = larger >> 1;
+       u32 upper = 31;
+       u32 left;
+       int n = 1;
+       int col_cnt;
+       int row_cnt = 0;
+       int m;
+
+       if (!from || !to)
+               return NULL;
+
+       /* Find the highest set bit; 1U keeps the shift by 31 well defined. */
+       while (upper > 0 && !((1U << upper) & larger))
+               upper--;
+       upper++;
+
+       *shift = 32 - 8 - 1 - upper;
+
+       if (from > to) {
+               /* the "+ 1" is one table entry, not one byte */
+               coefficients = vmalloc(sizeof(u32) *
+                                      ((from / to + 2) * from + 1));
+               if (!coefficients)
+                       return NULL;
+
+               left = to;
+               while (1) {
+                       u32 sum = left;
+                       col_cnt = 0;
+                       /* slot m is filled with the row's count afterwards */
+                       m = n++;
+                       while (sum < from) {
+                               coefficients[n++] =
+                                       ((left << *shift) + rnd) / from;
+                               col_cnt++;
+                               left = to;
+                               sum += left;
+                       }
+                       left = sum - from;
+                       coefficients[n++] =
+                               (((to - left) << *shift) + rnd) / from;
+                       col_cnt++;
+                       coefficients[m] = col_cnt;
+                       row_cnt++;
+                       if (!left) {
+                               coefficients[0] = row_cnt;
+                               return coefficients;
+                       }
+               }
+       } else {
+               /* the "+ 1" is one table entry, not one byte */
+               coefficients = vmalloc(sizeof(u32) * (3 * from + 1));
+               if (!coefficients)
+                       return NULL;
+
+               left = 0;
+               while (1) {
+                       u32 diff;
+                       u32 sum = left;
+                       col_cnt = 0;
+                       row_cnt++;
+                       while (sum < to) {
+                               col_cnt++;
+                               sum += from;
+                       }
+                       left = sum - to;
+                       diff = from - left;
+                       if (!left) {
+                               coefficients[n] = col_cnt;
+                               coefficients[0] = row_cnt;
+                               return coefficients;
+                       }
+                       coefficients[n++] = col_cnt - 1;
+                       coefficients[n++] = ((diff << *shift) + rnd) / from;
+                       coefficients[n++] = ((left << *shift) + rnd) / from;
+               }
+       }
+}
+
+
+/*
+ * One pixel split into its color channels. The scaling helpers also use
+ * it to accumulate weighted channel sums.
+ */
+struct pixel {
+       u32 red;
+       u32 green;
+       u32 blue;
+};
+
+/*
+ * Store pixel @pix at @buf in color format @cf and advance @buf past it
+ * (2, 2, 3 or 4 bytes). SPLASH_DEPTH_UNKNOWN stores nothing.
+ *
+ * The packed 24-bit case writes the bytes blue, green, red one by one.
+ * That is the order get_pixel() reads them in, and it avoids a 16-bit
+ * store to a possibly odd address.
+ *
+ * @buf is evaluated several times and must be a modifiable byte pointer.
+ */
+#define put_pixel(pix, buf, cf)                                                \
+       switch (cf) {                                                   \
+       case SPLASH_DEPTH_15:                                           \
+               *(u16 *)(buf) = (u16)((pix).red << 10 |                 \
+                                     (pix).green << 5 | (pix).blue);   \
+               (buf) += 2;                                             \
+               break;                                                  \
+       case SPLASH_DEPTH_16:                                           \
+               *(u16 *)(buf) = (u16)((pix).red << 11 |                 \
+                                     (pix).green << 5 | (pix).blue);   \
+               (buf) += 2;                                             \
+               break;                                                  \
+       case SPLASH_DEPTH_24_PACKED:                                    \
+               (buf)[0] = (pix).blue;                                  \
+               (buf)[1] = (pix).green;                                 \
+               (buf)[2] = (pix).red;                                   \
+               (buf) += 3;                                             \
+               break;                                                  \
+       case SPLASH_DEPTH_24:                                           \
+               *(u32 *)(buf) = (u32)((pix).red << 16 |                 \
+                                     (pix).green << 8 | (pix).blue);   \
+               (buf) += 4;                                             \
+               break;                                                  \
+       case SPLASH_DEPTH_UNKNOWN:                                      \
+               break;                                                  \
+       }
+
+/*
+ * Load one pixel in color format @depth from @buf into @pix and advance
+ * @buf past it (2, 2, 3 or 4 bytes). The 15/16-bit formats are read as
+ * one u16 and split into 5-5-5 or 5-6-5 bit fields; the 24-bit formats
+ * are read byte by byte as blue, green, red, with one extra byte skipped
+ * for SPLASH_DEPTH_24. SPLASH_DEPTH_UNKNOWN reads nothing and leaves
+ * @pix untouched.
+ *
+ * @buf is evaluated several times and is modified by the macro.
+ */
+#define get_pixel(pix, buf, depth)                                    \
+       switch (depth) {                                               \
+       case SPLASH_DEPTH_15:                                          \
+               (pix).red = ((*(u16 *)(buf)) >> 10) & 0x1f;            \
+               (pix).green = ((*(u16 *)(buf)) >> 5) & 0x1f;           \
+               (pix).blue = (*(u16 *)(buf)) & 0x1f;                   \
+               (buf) += 2;                                            \
+               break;                                                 \
+       case SPLASH_DEPTH_16:                                          \
+               (pix).red = ((*(u16 *)(buf)) >> 11) & 0x1f;            \
+               (pix).green = ((*(u16 *)(buf)) >> 5) & 0x3f;           \
+               (pix).blue = (*(u16 *)(buf)) & 0x1f;                   \
+               (buf) += 2;                                            \
+               break;                                                 \
+       case SPLASH_DEPTH_24_PACKED:                                   \
+               (pix).blue = *(((buf))++);                             \
+               (pix).green = *(((buf))++);                            \
+               (pix).red = *(((buf))++);                              \
+               break;                                                 \
+       case SPLASH_DEPTH_24:                                          \
+               (pix).blue = *(((buf))++);                             \
+               (pix).green = *(((buf))++);                            \
+               (pix).red = *(((buf))++);                              \
+               (buf)++;                                               \
+               break;                                                 \
+        case SPLASH_DEPTH_UNKNOWN:                                    \
+               break;                                                 \
+       }
+
+/*
+ * Shrink one source line horizontally and add the result, weighted by
+ * @y_coeff, to the per-channel sums in @row_buffer.
+ *
+ * @x_coeff[0] is the number of groups in the table; each group is a count
+ * followed by that many weights, and produces one output pixel. The table
+ * is replayed from its start until @src_w source pixels have been read
+ * through *@src_p, which is advanced accordingly. The last source pixel
+ * of a group is not re-read: it is also the first pixel of the next
+ * group. Each weighted sum is rounded and shifted right by @x_shift
+ * before it is scaled by @y_coeff.
+ */
+static inline void
+scale_x_down(enum splash_color_format cf, int src_w,
+            unsigned char **src_p, u32 *x_coeff,
+            u32 x_shift,  u32 y_coeff, struct pixel *row_buffer)
+{
+       const u32 groups = x_coeff[0];
+       const u32 half = (1 << (x_shift - 1));
+       struct pixel in, acc;
+       u32 pos, weight, g;
+       int taps, t;
+       int consumed = 0;
+       int out = 0;
+
+       while (consumed < src_w) {
+               pos = 1;
+               get_pixel(in, *src_p, cf);
+               consumed++;
+               for (g = 0; g < groups; g++) {
+                       taps = x_coeff[pos++];
+                       acc.red = 0;
+                       acc.green = 0;
+                       acc.blue = 0;
+
+                       /* all taps but the last consume a source pixel */
+                       for (t = 1; t < taps; t++) {
+                               weight = x_coeff[pos++];
+                               acc.red += in.red * weight;
+                               acc.green += in.green * weight;
+                               acc.blue += in.blue * weight;
+                               get_pixel(in, *src_p, cf);
+                               consumed++;
+                       }
+
+                       /* last tap: its pixel stays for the next group */
+                       weight = x_coeff[pos++];
+                       acc.red += in.red * weight;
+                       acc.green += in.green * weight;
+                       acc.blue += in.blue * weight;
+
+                       acc.red = (acc.red + half) >> x_shift;
+                       acc.green = (acc.green + half) >> x_shift;
+                       acc.blue = (acc.blue + half) >> x_shift;
+
+                       row_buffer[out].red += acc.red * y_coeff;
+                       row_buffer[out].green += acc.green * y_coeff;
+                       row_buffer[out].blue += acc.blue * y_coeff;
+                       out++;
+               }
+       }
+}
+
+/*
+ * scale_x_up - stretch one source row horizontally and add it to row_buffer.
+ * @cf:         pixel format handed to get_pixel()
+ * @src_w:      number of source pixels to consume from the row
+ * @src_p:      in/out read cursor; get_pixel() advances *src_p
+ * @x_coeff:    table; x_coeff[0] is the entry count.  Each of the first
+ *              x_coeff[0] - 1 entries is: repeat count, weight of the
+ *              current pixel, weight of the next pixel.  One final repeat
+ *              count follows them.
+ * @x_shift:    right shift that normalises a two-pixel blend
+ * @y_coeff:    vertical weight this row contributes with
+ * @row_buffer: accumulator; every output pixel is added ("+=") to its entry
+ *
+ * Per entry the current pixel is replicated "repeat count" times, then one
+ * output pixel is blended from the current and the next source pixel.
+ *
+ * NOTE(review): rnd is 1 << (x_shift - 1), so x_shift is assumed to be >= 1;
+ * confirm against do_coefficients().
+ */
+static inline void
+scale_x_up(enum splash_color_format cf, int src_w,
+          unsigned char **src_p, u32 *x_coeff,
+          u32 x_shift,  u32 y_coeff, struct pixel *row_buffer)
+{
+       u32 curr_x_coeff = 1;
+       struct pixel curr_pixel, tmp_pixel;
+       u32 x_array_size = x_coeff[0];
+       int x_column_num;
+       int i;
+       int l, m;
+       int k = 0;
+       u32 rnd = (1 << (x_shift - 1));
+
+       /* i counts consumed source pixels, k indexes the output pixel */
+       for (i = 0; i < src_w;) {
+               /* restart the table and fetch a fresh pixel */
+               curr_x_coeff = 1;
+               get_pixel(tmp_pixel, *src_p, cf);
+               i++;
+               for (l = 0; l < x_array_size - 1; l++) {
+                       /* plain copies of the current source pixel */
+                       x_column_num = x_coeff[curr_x_coeff++];
+                       for (m = 0; m < x_column_num; m++) {
+                               row_buffer[k].red += tmp_pixel.red * y_coeff;
+                               row_buffer[k].green += tmp_pixel.green * y_coeff;
+                               row_buffer[k].blue += tmp_pixel.blue * y_coeff;
+                               k++;
+                       }
+                       /* weighted share of the current pixel ... */
+                       curr_pixel.red = tmp_pixel.red * x_coeff[curr_x_coeff];
+                       curr_pixel.green = tmp_pixel.green
+                               * x_coeff[curr_x_coeff];
+                       curr_pixel.blue = tmp_pixel.blue * x_coeff[curr_x_coeff];
+                       curr_x_coeff++;
+                       /* ... blended with the weighted share of the next one */
+                       get_pixel(tmp_pixel, *src_p, cf);
+                       i++;
+                       row_buffer[k].red += ((curr_pixel.red
+                                              + (tmp_pixel.red
+                                                 * x_coeff[curr_x_coeff])
+                                              + rnd) >> x_shift) * y_coeff;
+                       row_buffer[k].green += ((curr_pixel.green
+                                                + (tmp_pixel.green
+                                                   * x_coeff[curr_x_coeff])
+                                                + rnd) >> x_shift) * y_coeff;
+                       row_buffer[k].blue += ((curr_pixel.blue
+                                               + (tmp_pixel.blue
+                                                  * x_coeff[curr_x_coeff])
+                                               + rnd) >> x_shift) * y_coeff;
+                       k++;
+                       curr_x_coeff++;
+               }
+               /* trailing repeat count: copies of the last pixel read */
+               for (m = 0; m < x_coeff[curr_x_coeff]; m++) {
+                       row_buffer[k].red += tmp_pixel.red * y_coeff;
+                       row_buffer[k].green += tmp_pixel.green * y_coeff;
+                       row_buffer[k].blue += tmp_pixel.blue * y_coeff;
+                       k++;
+               }
+       }
+}
+
+/*
+ * scale_y_down - scale an image whose height shrinks.
+ * @src:   source pixels; rows are (src_w + SPLASH_ALIGN) & ~SPLASH_ALIGN
+ *         pixels apart
+ * @dst:   destination pixels; rows are (dst_w + SPLASH_ALIGN) & ~SPLASH_ALIGN
+ *         pixels apart
+ * @cf:    pixel format of both buffers
+ * @src_w: source width in pixels
+ * @src_h: source height in rows
+ * @dst_w: destination width in pixels
+ * @dst_h: destination height in rows
+ *
+ * The vertical table from do_coefficients() is walked like the horizontal
+ * one in scale_x_down(): y_coeff[0] is the number of groups, each group is a
+ * row count followed by that many weights and produces one destination row.
+ * The last source row of a group is scaled again as the first row of the
+ * next group.  Each source row is scaled horizontally (up or down, depending
+ * on the widths) straight into the row accumulator.
+ *
+ * Returns 0 on success or -ENOMEM if a work buffer could not be allocated.
+ */
+static int scale_y_down(unsigned char *src, unsigned char *dst,
+                       enum splash_color_format cf,
+                       int src_w, int src_h, int dst_w, int dst_h)
+{
+       int octpp = splash_octpp(cf);
+       int src_x_bytes = octpp * ((src_w + SPLASH_ALIGN) & ~SPLASH_ALIGN);
+       int dst_x_bytes = octpp * ((dst_w + SPLASH_ALIGN) & ~SPLASH_ALIGN);
+       int j;
+       struct pixel *row_buffer;
+       u32 x_shift, y_shift;
+       u32 *x_coeff;
+       u32 *y_coeff;
+       u32 curr_y_coeff = 1;
+       unsigned char *src_p;
+       unsigned char *src_p_line = src;
+       /* same type as dst: the two are assigned to each other below */
+       unsigned char *dst_p_line;
+       int r, s;
+       int y_array_rows;
+       int y_column_num;
+       int k;
+       u32 rnd;
+       int xup;
+
+       row_buffer = vmalloc(sizeof(struct pixel)
+                                            * (dst_w + 1));
+       x_coeff = do_coefficients(src_w, dst_w, &x_shift);
+       y_coeff = do_coefficients(src_h, dst_h, &y_shift);
+       if (!row_buffer || !x_coeff || !y_coeff) {
+               vfree(row_buffer);
+               vfree(x_coeff);
+               vfree(y_coeff);
+               return -ENOMEM;
+       }
+       y_array_rows = y_coeff[0];
+       rnd = (1 << (y_shift - 1));
+       xup = (src_w <= dst_w) ? 1 : 0;
+
+       dst_p_line = dst;
+
+       /* j counts the source rows that have been left behind */
+       for (j = 0; j < src_h;) {
+               curr_y_coeff = 1;
+               for (r = 0; r < y_array_rows; r++) {
+                       y_column_num = y_coeff[curr_y_coeff++];
+                       /* the x scalers only add, so start from zero */
+                       for (k = 0; k < dst_w + 1; k++) {
+                               row_buffer[k].red = 0;
+                               row_buffer[k].green = 0;
+                               row_buffer[k].blue = 0;
+                       }
+                       /* first row of the group: the row we are standing on */
+                       src_p = src_p_line;
+                       if (xup)
+                               scale_x_up(cf,  src_w, &src_p, x_coeff,
+                                          x_shift, y_coeff[curr_y_coeff],
+                                          row_buffer);
+                       else
+                               scale_x_down(cf,  src_w, &src_p, x_coeff,
+                                            x_shift, y_coeff[curr_y_coeff],
+                                            row_buffer);
+                       curr_y_coeff++;
+                       /* remaining rows of the group, one weight each */
+                       for (s = 1; s < y_column_num; s++) {
+                               src_p = src_p_line = src_p_line + src_x_bytes;
+                               j++;
+                               if (xup)
+                                       scale_x_up(cf,  src_w, &src_p,
+                                                  x_coeff, x_shift,
+                                                  y_coeff[curr_y_coeff],
+                                                  row_buffer);
+                               else
+                                       scale_x_down(cf,  src_w, &src_p,
+                                                    x_coeff, x_shift,
+                                                    y_coeff[curr_y_coeff],
+                                                    row_buffer);
+                               curr_y_coeff++;
+                       }
+                       /* normalise the vertical sum and emit the row */
+                       for (k = 0; k < dst_w; k++) {
+                               row_buffer[k].red = (row_buffer[k].red + rnd)
+                                       >> y_shift;
+                               row_buffer[k].green = (row_buffer[k].green
+                                                      + rnd)
+                                       >> y_shift;
+                               row_buffer[k].blue = (row_buffer[k].blue + rnd)
+                                       >> y_shift;
+                               put_pixel(row_buffer[k], dst, cf);
+                       }
+                       /* restart dst at the beginning of the next output row */
+                       dst = dst_p_line = dst_p_line + dst_x_bytes;
+               }
+               src_p_line = src_p_line + src_x_bytes;
+               j++;
+       }
+       vfree(row_buffer);
+       vfree(x_coeff);
+       vfree(y_coeff);
+       return 0;
+}
+
+/*
+ * scale_y_up - scale an image whose height grows (or stays the same).
+ * @src:   source pixels; rows are (src_w + SPLASH_ALIGN) & ~SPLASH_ALIGN
+ *         pixels apart
+ * @dst:   destination pixels; rows are (dst_w + SPLASH_ALIGN) & ~SPLASH_ALIGN
+ *         pixels apart
+ * @cf:    pixel format of both buffers
+ * @src_w: source width in pixels
+ * @src_h: source height in rows
+ * @dst_w: destination width in pixels
+ * @dst_h: destination height in rows
+ *
+ * Two row buffers are used alternately: one holds the previous horizontally
+ * scaled source row, the other the current one.  The vertical table is
+ * walked like the horizontal one in scale_x_up(): per entry the previous row
+ * is written "repeat count" times, then one row blended from the previous
+ * and the current row is written; a final repeat count closes the table.
+ *
+ * Returns 0 on success or -ENOMEM if a work buffer could not be allocated.
+ */
+static int scale_y_up(unsigned char *src, unsigned char *dst,
+                     enum splash_color_format cf,
+                     int src_w, int src_h, int dst_w, int dst_h)
+{
+       int octpp = splash_octpp(cf);
+       int src_x_bytes = octpp * ((src_w + SPLASH_ALIGN) & ~SPLASH_ALIGN);
+       int dst_x_bytes = octpp * ((dst_w + SPLASH_ALIGN) & ~SPLASH_ALIGN);
+       int j;
+       u32 x_shift, y_shift;
+       u32 *x_coeff;
+       u32 *y_coeff;
+       struct pixel *row_buf_list[2];
+       struct pixel *row_buffer;
+       u32 curr_y_coeff = 1;
+       unsigned char *src_p;
+       unsigned char *src_p_line = src;
+       /* same type as dst: the two are assigned to each other below */
+       unsigned char *dst_p_line;
+       int r, s;
+       int y_array_rows;
+       int y_column_num;
+       int k;
+       u32 rnd;
+       int bi;
+       int xup;
+
+       x_coeff = do_coefficients(src_w, dst_w, &x_shift);
+       y_coeff = do_coefficients(src_h, dst_h, &y_shift);
+       /* one allocation carries both row buffers back to back */
+       row_buf_list[0] = vmalloc(2 * sizeof(struct pixel)
+                                                 * (dst_w + 1));
+       if (!row_buf_list[0] || !x_coeff || !y_coeff) {
+               vfree(row_buf_list[0]);
+               vfree(x_coeff);
+               vfree(y_coeff);
+               return -ENOMEM;
+       }
+       row_buf_list[1] = row_buf_list[0] + (dst_w + 1);
+
+       y_array_rows = y_coeff[0];
+       rnd = (1 << (y_shift - 1));
+       bi = 1;
+       xup = (src_w <= dst_w) ? 1 : 0;
+
+       dst_p_line = dst;
+       src_p = src_p_line;
+
+       row_buffer = row_buf_list[0];
+
+       /* j counts consumed source rows */
+       for (j = 0; j < src_h;) {
+               /* the x scalers only add, so both buffers start from zero */
+               memset(row_buf_list[0], 0, (2 * sizeof(struct pixel)
+                                           * (dst_w + 1)));
+               curr_y_coeff = 1;
+               if (xup)
+                       scale_x_up(cf,  src_w, &src_p, x_coeff,
+                                  x_shift, 1, row_buffer);
+               else
+                       scale_x_down(cf,  src_w, &src_p, x_coeff, x_shift, 1,
+                                    row_buffer);
+               src_p = src_p_line = src_p_line + src_x_bytes;
+               j++;
+               for (r = 0; r < y_array_rows - 1; r++) {
+                       struct pixel *old_row_buffer = row_buffer;
+                       u32 prev_y_coeff_val;
+
+                       /* plain copies of the row we already have */
+                       y_column_num = y_coeff[curr_y_coeff];
+                       for (s = 0; s < y_column_num; s++) {
+                               for (k = 0; k < dst_w; k++)
+                                       put_pixel(row_buffer[k], dst, cf);
+                               dst = dst_p_line = dst_p_line + dst_x_bytes;
+                       }
+                       curr_y_coeff++;
+                       /* switch buffers and scale the next source row */
+                       row_buffer = row_buf_list[(bi++) % 2];
+                       prev_y_coeff_val = y_coeff[curr_y_coeff++];
+                       if (xup)
+                               scale_x_up(cf,  src_w, &src_p, x_coeff,
+                                          x_shift, 1, row_buffer);
+                       else
+                               scale_x_down(cf,  src_w, &src_p, x_coeff,
+                                            x_shift, 1, row_buffer);
+                       src_p = src_p_line = src_p_line + src_x_bytes;
+                       j++;
+                       /* one output row blended from the old and the new row */
+                       for (k = 0; k < dst_w; k++) {
+                               struct pixel pix;
+                               pix.red = ((old_row_buffer[k].red
+                                           * prev_y_coeff_val)
+                                          + (row_buffer[k].red
+                                             * y_coeff[curr_y_coeff])
+                                          + rnd) >> y_shift;
+                               pix.green = ((old_row_buffer[k].green
+                                             * prev_y_coeff_val)
+                                            + (row_buffer[k].green
+                                               * y_coeff[curr_y_coeff])
+                                            + rnd) >> y_shift;
+                               pix.blue = ((old_row_buffer[k].blue
+                                            * prev_y_coeff_val)
+                                           + (row_buffer[k].blue
+                                              * y_coeff[curr_y_coeff])
+                                           + rnd) >> y_shift;
+                               /* clear the old row: it is the next accumulator */
+                               old_row_buffer[k].red = 0;
+                               old_row_buffer[k].green = 0;
+                               old_row_buffer[k].blue = 0;
+                               put_pixel(pix, dst, cf);
+                       }
+                       dst = dst_p_line = dst_p_line + dst_x_bytes;
+                       curr_y_coeff++;
+               }
+               /* trailing repeat count: copies of the last scaled row */
+               for (r = 0; r < y_coeff[curr_y_coeff]; r++) {
+                       for (k = 0; k < dst_w; k++)
+                               put_pixel(row_buffer[k], dst, cf);
+
+                       dst = dst_p_line = dst_p_line + dst_x_bytes;
+               }
+       }
+       vfree(row_buf_list[0]);
+       vfree(x_coeff);
+       vfree(y_coeff);
+
+       return 0;
+}
+
+/*
+ * jpeg_get - decode a JPEG into @pic, rescaling it when its size differs.
+ * @buf:     JPEG data
+ * @pic:     destination buffer; it is decoded or scaled into with both
+ *           dimensions rounded up to a multiple of 16
+ * @width:   wanted width in pixels
+ * @height:  wanted height in pixels
+ * @cf:      pixel format of @pic
+ * @decdata: scratch state passed through to jpeg_decode()
+ *
+ * If the size stored in the JPEG matches, the image is decoded straight into
+ * @pic.  Otherwise it is decoded into a temporary buffer and scaled.
+ *
+ * Returns 0 on success, the jpeg_decode() error code if decoding failed, or
+ * 17 if the temporary buffer or the scaler's work buffers could not be
+ * allocated.
+ */
+static int jpeg_get(unsigned char *buf, unsigned char *pic,
+                   int width, int height, enum splash_color_format cf,
+                   struct jpeg_decdata *decdata)
+{
+       int my_width, my_height;
+       int err;
+       int octpp = splash_octpp(cf);
+
+       jpeg_get_size(buf, &my_width, &my_height);
+
+       if (my_height != height || my_width != width) {
+               int pad_w = (my_width + 15) & ~15;
+               int pad_h = (my_height + 15) & ~15;
+               /*
+                * Both dimensions are 16-bit fields taken from the file, so
+                * their product times octpp does not fit into an int.  Do
+                * the arithmetic in 64 bits and refuse anything an int could
+                * not hold; a wrapped size would make the buffer smaller
+                * than what jpeg_decode() writes.
+                */
+               long long my_size = (long long)pad_w * pad_h * octpp;
+               unsigned char *mem;
+
+               if (my_size < 0 || my_size > 0x7fffffffLL)
+                       return 17;
+               mem = vmalloc(my_size);
+               if (!mem)
+                       return 17;
+               err = jpeg_decode(buf, mem, pad_w, pad_h, cf, decdata);
+               if (err) {
+                       vfree(mem);
+                       return err;
+               }
+               printk(KERN_INFO
+                      "bootsplash: scaling image from %dx%d to %dx%d\n",
+                      my_width, my_height, width, height);
+               if (my_height <= height)
+                       err = scale_y_up(mem, pic, cf, my_width, my_height,
+                                        ((width + 15) & ~15),
+                                        ((height + 15) & ~15));
+               else
+                       err = scale_y_down(mem, pic, cf, my_width, my_height,
+                                          ((width + 15) & ~15),
+                                          ((height + 15) & ~15));
+               vfree(mem);
+               if (err < 0)
+                       return 17;
+       } else {
+               err = jpeg_decode(buf, pic, ((width + 15) & ~15),
+                                 ((height + 15) & ~15), cf, decdata);
+               if (err)
+                       return err;
+       }
+       return 0;
+}
diff --git a/drivers/video/bootsplash/decode-jpg.c b/drivers/video/bootsplash/decode-jpg.c

new file mode 100644 (file)

index 0000000..617632e
--- /dev/null
+++ b/drivers/video/bootsplash/decode-jpg.c
@@ -0,0 +1,1045 @@
+/*
+ *    linux/drivers/video/bootsplash/decode-jpg.c - a tiny jpeg decoder.
+ *
+ *      (w) August 2001 by Michael Schroeder, <mls@suse.de>
+ *
+ */
+
+#include <linux/string.h>
+#include <asm/byteorder.h>
+#include <linux/bootsplash.h>
+#include "decode-jpg.h"
+
+/* Fixed-point arithmetic: values carry ISHIFT fractional bits. */
+#define ISHIFT 11
+
+/* IFIX:   scale a constant by 1 << ISHIFT, add .5 and truncate to int */
+/* IMULT:  multiply two values and drop ISHIFT bits of the product */
+/* ITOINT: drop the ISHIFT fractional bits */
+#define IFIX(a) ((int)((a) * (1 << ISHIFT) + .5))
+#define IMULT(a, b) (((a) * (b)) >> ISHIFT)
+#define ITOINT(a) ((a) >> ISHIFT)
+
+/*
+ * special markers: values stored in struct in's marker field.
+ * M_BADHUFF is set by dec_rec2() when no Huffman code matches;
+ * M_EOF makes fillbits() call the optional refill hook.
+ */
+#define M_BADHUFF      -1
+#define M_EOF          0x80
+
+/* Bit-level reader state for the entropy-coded data. */
+struct in {
+       unsigned char *p;       /* next byte to read */
+       unsigned int bits;      /* bit accumulator, newest bits lowest */
+       int left;               /* number of unread bits in 'bits' */
+       int marker;             /* marker met in the stream, 0 if none */
+
+       int (*func)(void *);    /* optional hook fillbits() calls on M_EOF */
+       void *data;             /* argument passed to func */
+};
+
+/*********************************/
+struct dec_hufftbl;
+struct enc_hufftbl;
+
+/*
+ * Huffman table reference used by struct scan.  The decoder code in this
+ * file only ever uses the dhuff member.
+ */
+union hufftblp {
+       struct dec_hufftbl *dhuff;
+       struct enc_hufftbl *ehuff;
+};
+
+/*
+ * Per-component decoding state.  decode_mcus() walks an array of these,
+ * moving on to the next element when its block countdown equals 'next'.
+ */
+struct scan {
+       int dc;                 /* old dc value */
+
+       union hufftblp hudc;
+       union hufftblp huac;
+       int next;               /* when to switch to next scan */
+
+       int cid;                /* component id */
+       int hv;                 /* horiz/vert, copied from comp */
+       int tq;                 /* quant tbl, copied from comp */
+};
+
+/*********************************/
+
+/* number of bits looked up at once in dec_hufftbl's llvals[] */
+#define DECBITS 10             /* seems to be the optimum */
+
+/*
+ * Decoding table for one Huffman table.  llvals[] is the fast path, indexed
+ * by the next DECBITS bits of the stream (see DEC_REC).  maxcode[], valptr[]
+ * and vals[] serve the bit-by-bit fallback in dec_rec2().
+ */
+struct dec_hufftbl {
+       int maxcode[17];
+       int valptr[16];
+       unsigned char vals[256];
+       unsigned int llvals[1 << DECBITS];
+};
+
+static void decode_mcus(struct in *, int *, int, struct scan *, int *);
+static int dec_readmarker(struct in *);
+static void dec_makehuff(struct dec_hufftbl *, int *, unsigned char *);
+
+static void setinput(struct in *, unsigned char *);
+/*********************************/
+
+#undef PREC
+#define PREC int
+
+static void idctqtab(unsigned char *, PREC *);
+static void idct(int *, int *, PREC *, PREC, int);
+static void scaleidctqtab(PREC *, PREC);
+
+/*********************************/
+
+static void initcol(PREC[][64]);
+
+static void col221111(int *out, unsigned char *pic, int width);
+static void col221111_15(int *out, unsigned char *pic, int width);
+static void col221111_16(int *out, unsigned char *pic, int width);
+static void col221111_32(int *out, unsigned char *pic, int width);
+
+/*********************************/
+
+/* JPEG marker codes (the byte that follows 0xff) */
+#define M_SOI  0xd8    /* start of image */
+#define M_APP0 0xe0    /* application segment 0 */
+#define M_DQT  0xdb    /* define quantization table(s) */
+#define M_SOF0 0xc0    /* start of frame, baseline DCT */
+#define M_DHT   0xc4   /* define Huffman table(s) */
+#define M_DRI  0xdd    /* define restart interval */
+#define M_SOS  0xda    /* start of scan */
+#define M_RST0 0xd0    /* restart marker 0 */
+#define M_EOI  0xd9    /* end of image */
+#define M_COM  0xfe    /* comment */
+
+static unsigned char *datap;
+
+/* Return the byte at the header cursor datap and step past it. */
+static int getbyte(void)
+{
+       unsigned char c = *datap;
+
+       datap++;
+       return c;
+}
+
+/* Return the big-endian 16-bit value at datap and step past it. */
+static int getword(void)
+{
+       int hi = datap[0];
+       int lo = datap[1];
+
+       datap += 2;
+       return (hi << 8) | lo;
+}
+
+/* One component entry of the frame header, filled in by jpeg_decode(). */
+struct comp {
+       int cid;                /* component id */
+       int hv;                 /* sampling factors: horiz << 4 | vert */
+       int tq;                 /* quantization table selector */
+};
+
+/* size of the comps[] and dscans[] arrays */
+#define MAXCOMP 4
+/* Parameters collected from the headers plus restart bookkeeping. */
+struct jpginfo {
+       int nc;                 /* number of components */
+       int ns;                 /* number of scans */
+       int dri;                /* restart interval */
+       int nm;                 /* mcus til next marker */
+       int rm;                 /* next restart marker */
+};
+
+/*
+ * Decoder state.  Everything lives in file-scope statics.
+ * NOTE(review): this looks non-reentrant - presumably callers serialise
+ * decoding; confirm.
+ */
+static struct jpginfo info;
+static struct comp comps[MAXCOMP];
+
+static struct scan dscans[MAXCOMP];
+
+/* raw quantization tables as read by readtables(), indexed by selector */
+static unsigned char quant[4][64];
+
+/* readtables() stores a table at index class * 2 + id: 0-1 DC, 2-3 AC */
+static struct dec_hufftbl dhuff[4];
+
+#define dec_huffdc (dhuff + 0)
+#define dec_huffac (dhuff + 2)
+
+static struct in in;
+
+/*
+ * readtables - parse marker segments at datap until marker @till shows up.
+ * @till: marker code (the byte after 0xff) at which to stop
+ *
+ * Quantization tables are copied to quant[], Huffman tables are built into
+ * dhuff[] and the restart interval is stored in info.dri.  Every other
+ * segment is skipped by means of its length field.
+ *
+ * Returns 0 when @till was found (datap then points just behind the marker)
+ * or when a 0xc2 marker is met first, -1 on malformed input.
+ */
+static int readtables(int till)
+{
+       int m, l, i, j, lq, pq, tq;
+       int tc, th, tt;
+
+       for (;;) {
+               if (getbyte() != 0xff)
+                       return -1;
+               m = getbyte();
+               if (m == till)
+                       break;
+
+               switch (m) {
+               /*
+                * NOTE(review): 0xc2 looks like the progressive-DCT frame
+                * marker; returning success lets the caller read the frame
+                * header that follows - confirm intent.
+                */
+               case 0xc2:
+                       return 0;
+
+               case M_DQT:
+                       lq = getword();
+                       while (lq > 2) {
+                               pq = getbyte();
+                               /* low nibble: table id, high nibble: precision */
+                               tq = pq & 15;
+                               if (tq > 3)
+                                       return -1;
+                               pq >>= 4;
+                               /* only 8-bit table entries are handled */
+                               if (pq != 0)
+                                       return -1;
+                               for (i = 0; i < 64; i++)
+                                       quant[tq][i] = getbyte();
+                               lq -= 64 + 1;
+                       }
+                       break;
+
+               case M_DHT:
+                       l = getword();
+                       while (l > 2) {
+                               int hufflen[16], k;
+                               int nvals;
+                               unsigned char huffvals[256];
+
+                               /* high nibble: table class, low nibble: id */
+                               tc = getbyte();
+                               th = tc & 15;
+                               tc >>= 4;
+                               tt = tc * 2 + th;
+                               if (tc > 1 || th > 1)
+                                       return -1;
+                               nvals = 0;
+                               for (i = 0; i < 16; i++) {
+                                       hufflen[i] = getbyte();
+                                       nvals += hufflen[i];
+                               }
+                               /*
+                                * The per-length symbol counts come straight
+                                * from the file and can add up to 16 * 255.
+                                * Refuse a table that would not fit into
+                                * huffvals[] instead of writing past it.
+                                */
+                               if (nvals > (int)sizeof(huffvals))
+                                       return -1;
+                               l -= 1 + 16;
+                               k = 0;
+                               for (i = 0; i < 16; i++) {
+                                       for (j = 0; j < hufflen[i]; j++)
+                                               huffvals[k++] = getbyte();
+                                       l -= hufflen[i];
+                               }
+                               dec_makehuff(dhuff + tt, hufflen,
+                                            huffvals);
+                       }
+                       break;
+
+               case M_DRI:
+                       l = getword();
+                       info.dri = getword();
+                       break;
+
+               default:
+                       /* unknown segment: skip its payload */
+                       l = getword();
+                       while (l-- > 2)
+                               getbyte();
+                       break;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Prepare for the first MCU: arm the restart countdown, expect M_RST0 as the
+ * first restart marker and clear the DC predictor of every scan.
+ */
+static void dec_initscans(void)
+{
+       int idx = info.ns;
+
+       info.rm = M_RST0;
+       info.nm = info.dri + 1;
+       while (idx-- > 0)
+               dscans[idx].dc = 0;
+}
+
+/*
+ * Consume the restart marker that is due.  Returns -1 if the stream does not
+ * hold the expected marker; otherwise reloads the restart countdown, moves
+ * on to the next expected marker and clears all DC predictors.
+ */
+static int dec_checkmarker(void)
+{
+       int idx;
+
+       if (dec_readmarker(&in) != info.rm)
+               return -1;
+
+       info.nm = info.dri;
+       /* masking bit 3 wraps 0xd7 + 1 back to 0xd0 (M_RST0) */
+       info.rm = (info.rm + 1) & ~0x08;
+       idx = info.ns;
+       while (idx-- > 0)
+               dscans[idx].dc = 0;
+       return 0;
+}
+
+/*
+ * jpeg_get_size - read the image dimensions from a JPEG header.
+ * @buf:    JPEG data
+ * @width:  receives the width stored in the frame header
+ * @height: receives the height stored in the frame header
+ *
+ * The result of readtables() is not checked, so for malformed input the
+ * values stored are whatever follows the point where parsing stopped.
+ */
+void jpeg_get_size(unsigned char *buf, int *width, int *height)
+{
+       /* step over the two leading marker bytes */
+       datap = buf + 2;
+       (void)readtables(M_SOF0);
+       /* step over the segment length (2 bytes) and the precision byte */
+       datap += 3;
+       *height = getword();
+       *width = getword();
+}
+
+/*
+ * jpeg_decode - decode a baseline JPEG into a pixel buffer.
+ * @buf:     JPEG data
+ * @pic:     output buffer, written in 16x16 pixel blocks
+ * @width:   output width; must be a multiple of 16 and equal the image
+ *           width rounded up to a multiple of 16
+ * @height:  output height, same rules as @width
+ * @cf:      output pixel format
+ * @decdata: caller-provided scratch space (coefficients, dequantization
+ *           tables, idct output)
+ *
+ * Only three-component images with sampling factors 0x22/0x11/0x11 and
+ * component ids 1, 2, 3 are accepted.
+ *
+ * Returns 0 on success, -1 if an argument is NULL, otherwise one of the
+ * ERR_* codes.
+ */
+int jpeg_decode(unsigned char *buf, unsigned char *pic,
+               int width, int height, enum splash_color_format cf,
+               struct jpeg_decdata *decdata)
+{
+       int i, j, m, tac, tdc;
+       int mcusx, mcusy, mx, my;
+       int max[6];
+
+       if (!decdata || !buf || !pic)
+               return -1;
+       datap = buf;
+       if (getbyte() != 0xff)
+               return ERR_NO_SOI;
+       if (getbyte() != M_SOI)
+               return ERR_NO_SOI;
+       if (readtables(M_SOF0))
+               return ERR_BAD_TABLES;
+       /* frame header: length (ignored), precision, height, width */
+       getword();
+       i = getbyte();
+       if (i != 8)
+               return ERR_NOT_8BIT;
+       if (((getword() + 15) & ~15) != height)
+               return ERR_HEIGHT_MISMATCH;
+       if (((getword() + 15) & ~15) != width)
+               return ERR_WIDTH_MISMATCH;
+       if ((height & 15) || (width & 15))
+               return ERR_BAD_WIDTH_OR_HEIGHT;
+       info.nc = getbyte();
+       if (info.nc > MAXCOMP)
+               return ERR_TOO_MANY_COMPPS;
+       for (i = 0; i < info.nc; i++) {
+               int h, v;
+               comps[i].cid = getbyte();
+               comps[i].hv = getbyte();
+               v = comps[i].hv & 15;
+               h = comps[i].hv >> 4;
+               comps[i].tq = getbyte();
+               if (h > 3 || v > 3)
+                       return ERR_ILLEGAL_HV;
+               if (comps[i].tq > 3)
+                       return ERR_QUANT_TABLE_SELECTOR;
+       }
+       if (readtables(M_SOS))
+               return ERR_BAD_TABLES;
+       /* scan header: length (ignored), then the component count */
+       getword();
+       info.ns = getbyte();
+       if (info.ns != 3)
+               return ERR_NOT_YCBCR_221111;
+       for (i = 0; i < 3; i++) {
+               dscans[i].cid = getbyte();
+               /* high nibble: DC table id, low nibble: AC table id */
+               tdc = getbyte();
+               tac = tdc & 15;
+               tdc >>= 4;
+               if (tdc > 1 || tac > 1)
+                       return ERR_QUANT_TABLE_SELECTOR;
+               /* find the frame component this scan entry refers to */
+               for (j = 0; j < info.nc; j++)
+                       if (comps[j].cid == dscans[i].cid)
+                               break;
+               if (j == info.nc)
+                       return ERR_UNKNOWN_CID_IN_SCAN;
+               dscans[i].hv = comps[j].hv;
+               dscans[i].tq = comps[j].tq;
+               dscans[i].hudc.dhuff = dec_huffdc + tdc;
+               dscans[i].huac.dhuff = dec_huffac + tac;
+       }
+
+       /* the three bytes that close the scan header */
+       i = getbyte();
+       j = getbyte();
+       m = getbyte();
+
+       if (i != 0 || j != 63 || m != 0)
+               return ERR_NOT_SEQUENTIAL_DCT;
+
+       if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3)
+               return ERR_NOT_YCBCR_221111;
+
+       if (dscans[0].hv != 0x22 ||
+           dscans[1].hv != 0x11 ||
+           dscans[2].hv != 0x11)
+               return ERR_NOT_YCBCR_221111;
+
+       /* one MCU covers 16x16 pixels */
+       mcusx = width >> 4;
+       mcusy = height >> 4;
+
+
+       idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
+       idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
+       idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
+       initcol(decdata->dquant);
+       /* the entropy-coded data starts right behind the scan header */
+       setinput(&in, datap);
+
+#if 0
+       /* landing zone */
+       img[len] = 0;
+       img[len + 1] = 0xff;
+       img[len + 2] = M_EOF;
+#endif
+
+       dec_initscans();
+
+       /*
+        * decode_mcus() counts its six blocks down and moves to the next
+        * scan when the count equals 'next': four blocks use dscans[0],
+        * then one each uses dscans[1] and dscans[2].
+        */
+       dscans[0].next = 6 - 4;
+       dscans[1].next = 6 - 4 - 1;
+       dscans[2].next = 6 - 4 - 1 - 1; /* 411 encoding */
+       for (my = 0; my < mcusy; my++) {
+               for (mx = 0; mx < mcusx; mx++) {
+                       /* a restart marker is due once the countdown hits 0 */
+                       if (info.dri && !--info.nm)
+                               if (dec_checkmarker())
+                                       return ERR_WRONG_MARKER;
+
+                       /* six blocks of 64 coefficients, then one idct each */
+                       decode_mcus(&in, decdata->dcts, 6, dscans, max);
+                       idct(decdata->dcts, decdata->out, decdata->dquant[0],
+                            IFIX(128.5), max[0]);
+                       idct(decdata->dcts + 64,
+                            decdata->out + 64,
+                            decdata->dquant[0], IFIX(128.5), max[1]);
+                       idct(decdata->dcts + 128,
+                            decdata->out + 128,
+                            decdata->dquant[0], IFIX(128.5), max[2]);
+                       idct(decdata->dcts + 192,
+                            decdata->out + 192,
+                            decdata->dquant[0], IFIX(128.5), max[3]);
+                       idct(decdata->dcts + 256,
+                            decdata->out + 256,
+                            decdata->dquant[1], IFIX(0.5), max[4]);
+                       idct(decdata->dcts + 320,
+                            decdata->out + 320,
+                            decdata->dquant[2], IFIX(0.5), max[5]);
+
+                       /*
+                        * Convert the MCU into the output format.  The
+                        * second argument is the MCU's top-left pixel,
+                        * the third the length of an output row in bytes.
+                        */
+                       switch (cf) {
+                       case SPLASH_DEPTH_24:
+                               col221111_32(decdata->out,
+                                            (pic + (my * 16 * mcusx + mx)
+                                             * 16 * 4),
+                                            mcusx * 16 * 4);
+                               break;
+                       case SPLASH_DEPTH_24_PACKED:
+                               col221111(decdata->out,
+                                         (pic + (my * 16 * mcusx + mx)
+                                          * 16 * 3),
+                                         mcusx * 16 * 3);
+                               break;
+                       case SPLASH_DEPTH_16:
+                               col221111_16(decdata->out,
+                                            (pic + (my * 16 * mcusx + mx)
+                                             * 16 * 2), mcusx * 16 * 2);
+                               break;
+                       case SPLASH_DEPTH_15:
+                               col221111_15(decdata->out,
+                                            (pic + (my * 16 * mcusx + mx)
+                                             * 16 * 2), mcusx * 16 * 2);
+                               break;
+                       default:
+                               return ERR_DEPTH_MISMATCH;
+                               break;
+                       }
+               }
+       }
+
+       /* the data must end with the end-of-image marker */
+       m = dec_readmarker(&in);
+       if (m != M_EOI)
+               return ERR_NO_EOI;
+
+       return 0;
+}
+
+/****************************************************************/
+/**************       huffman decoder             ***************/
+/****************************************************************/
+
+static int fillbits(struct in *, int, unsigned int);
+static int dec_rec2(struct in *, struct dec_hufftbl *, int *, int, int);
+
+/*
+ * Point the bit reader at @p and empty its bit buffer.  The func and data
+ * members are left as they are.
+ */
+static void setinput(struct in *in, unsigned char *p)
+{
+       in->marker = 0;
+       in->bits = 0;
+       in->left = 0;
+       in->p = p;
+}
+
+/*
+ * fillbits - top up the bit buffer from the byte stream.
+ * @in: reader state
+ * @le: number of unread bits currently held in @bi
+ * @bi: current bit buffer
+ *
+ * Returns the new number of unread bits; the new bit buffer is handed back
+ * through in->bits.  Bytes are appended while at most 24 bits are held.
+ * Once a marker has been met no further bytes are read and zero bits are
+ * supplied instead.
+ */
+static int fillbits(struct in *in, int le, unsigned int bi)
+{
+       int b, m;
+
+       if (in->marker) {
+               /* a marker is pending: pad with 16 zero bits if running low */
+               if (le <= 16)
+                       in->bits = bi << 16, le += 16;
+               return le;
+       }
+       while (le <= 24) {
+               b = *in->p++;
+               if (b == 0xff) {
+                       /* 0xff 0x00 stands for a data byte 0xff */
+                       m = *in->p++;
+                       if (m != 0) {
+                               if (m == M_EOF) {
+                                       /* the hook returns 0 to carry on reading */
+                                       if (in->func) {
+                                               m = in->func(in->data);
+                                               if  (m == 0)
+                                                       continue;
+                                       }
+                               }
+                               /* remember the marker and pad with zero bits */
+                               in->marker = m;
+                               if (le <= 16)
+                                       bi = bi << 16, le += 16;
+                               break;
+                       }
+               }
+               bi = bi << 8 | b;
+               le += 8;
+       }
+       in->bits = bi;          /* tmp... 2 return values needed */
+       return le;
+}
+
+/*
+ * Refill the bit buffer and report the marker met in the stream, if any.
+ * A reported marker is consumed: the marker field is cleared and the bits
+ * still buffered are dropped.  Returns 0 when no marker is pending.
+ */
+static int dec_readmarker(struct in *in)
+{
+       int found;
+
+       in->left = fillbits(in, in->left, in->bits);
+       found = in->marker;
+       if (found != 0) {
+               in->left = 0;
+               in->marker = 0;
+       }
+       return found;
+}
+
+/*
+ * The bit buffer is kept in two locals while decoding: 'le' is the number
+ * of unread bits, 'bi' holds the bits.  LEBI_DCL declares them, LEBI_GET
+ * loads them from the reader state and LEBI_PUT stores them back.
+ */
+#define LEBI_DCL       int le, bi
+#define LEBI_GET(in)   (le = in->left, bi = in->bits)
+#define LEBI_PUT(in)   (in->left = le, in->bits = bi)
+
+/*
+ * Take n bits from the buffer, calling fillbits() first if fewer than n
+ * are held.  Evaluates to the bits taken.
+ */
+#define GETBITS(in, n) \
+       (                                                               \
+        (le < (n) ? le = fillbits(in, le, bi), bi = in->bits : 0),     \
+        (le -= (n)),                                                   \
+        bi >> le & ((1 << (n)) - 1)                                    \
+                                                                       )
+
+/* Give the last n bits taken back to the buffer. */
+#define UNGETBITS(in, n) (                     \
+                         le += (n)             \
+                                               )
+
+
+/*
+ * dec_rec2 - slow path of DEC_REC.
+ * @in:   reader state; the caller has stored its bit buffer with LEBI_PUT
+ * @hu:   Huffman table in use
+ * @runp: receives the run length that belongs to the decoded symbol
+ * @c:    the DECBITS bits DEC_REC has already taken from the stream
+ * @i:    the llvals[] entry found for @c (0 if the code is longer than
+ *        DECBITS bits)
+ *
+ * Returns the decoded coefficient value.  If no code matches, in->marker is
+ * set to M_BADHUFF and 0 is returned; *runp is not written and the bit
+ * buffer is not stored back in that case.
+ */
+static int dec_rec2(struct in *in, struct dec_hufftbl *hu, int *runp, int c, int i)
+{
+       LEBI_DCL;
+
+       LEBI_GET(in);
+       if (i) {
+               /*
+                * The table knows the symbol: give back the bits that were
+                * read beyond the code; i >> 16 is the number of value bits
+                * that still have to be read.
+                */
+               UNGETBITS(in, i & 127);
+               *runp = i >> 8 & 15;
+               i >>= 16;
+       } else {
+               /* code longer than DECBITS: extend it bit by bit */
+               for (i = DECBITS;
+                    (c = ((c << 1) | GETBITS(in, 1))) >= (hu->maxcode[i]);
+                    i++)
+                       ;
+               if (i >= 16) {
+                       in->marker = M_BADHUFF;
+                       return 0;
+               }
+               /* symbol byte: high nibble run, low nibble value size */
+               i = hu->vals[hu->valptr[i] + c - hu->maxcode[i - 1] * 2];
+               *runp = i >> 4;
+               i &= 15;
+       }
+       if (i == 0) {           /* sigh, 0xf0 is 11 bit */
+               LEBI_PUT(in);
+               return 0;
+       }
+       /* receive part */
+       c = GETBITS(in, i);
+       /*
+        * A value whose top bit is clear stands for c - (2^i - 1).  This is
+        * written without "-1 << i": left-shifting a negative value is
+        * undefined behaviour in C.
+        */
+       if (c < (1 << (i - 1)))
+               c -= (1 << i) - 1;
+       LEBI_PUT(in);
+       return c;
+}
+
+/*
+ * Decode one symbol: evaluates to the coefficient value and leaves the run
+ * length in r.  DECBITS bits are taken and looked up in hu->llvals[].  If
+ * bit 7 of the entry is set the entry is complete: its low 7 bits say how
+ * many bits to give back, bits 8-11 are the run and the value sits above
+ * bit 16.  Otherwise dec_rec2() finishes the job; the local bit buffer is
+ * stored before and reloaded after the call.
+ */
+#define DEC_REC(in, hu, r, i)   (                                      \
+                                 r = GETBITS(in, DECBITS),             \
+                                 i = hu->llvals[r],                    \
+                                 i & 128 ?                             \
+                                 (                                     \
+                                  UNGETBITS(in, i & 127),              \
+                                  r = i >> 8 & 15,                     \
+                                  i >> 16                              \
+                                                                       ) \
+                                 :                                     \
+                                 (                                     \
+                                  LEBI_PUT(in),                        \
+                                  i = dec_rec2(in, hu, &r, r, i),      \
+                                  LEBI_GET(in),                        \
+                                  i                                    \
+                                                                       ) \
+                                                                       )
+
+static void decode_mcus(struct in *in, int *dct, int n, struct scan *sc, int *maxp)
+{
+       struct dec_hufftbl *hu;
+       int i, r, t;
+       LEBI_DCL;
+
+       memset(dct, 0, n * 64 * sizeof(*dct));  /* n blocks of 64 coefficients */
+       LEBI_GET(in);
+       while (n-- > 0) {
+               hu = sc->hudc.dhuff;
+               *dct++ = (sc->dc += DEC_REC(in, hu, r, t));     /* DC is a delta */
+
+               hu = sc->huac.dhuff;
+               i = 63;                 /* AC slots still free in this block */
+               while (i > 0) {
+                       t = DEC_REC(in, hu, r, t);
+                       if (t == 0 && r == 0) { /* nothing more: skip the rest */
+                               dct += i;
+                               break;
+                       }
+                       dct += r;       /* r slots stay zero before the value */
+                       *dct++ = t;
+                       i -= r + 1;
+               }
+               *maxp++ = 64 - i;       /* slots consumed in this block */
+               if (n == sc->next)      /* move on to the next scan entry */
+                       sc++;
+       }
+       LEBI_PUT(in);
+}
+
+static void dec_makehuff(struct dec_hufftbl *hu, int *hufflen, unsigned char *huffvals)
+{
+       int code, k, i, j, d, x, c, v;
+       for (i = 0; i < (1 << DECBITS); i++)
+               hu->llvals[i] = 0;      /* 0 = value and size unknown */
+
+/*
+ * llvals layout:
+ *
+ * value v already known, run r, backup u bits:
+ *  vvvvvvvvvvvvvvvv 0000 rrrr 1 uuuuuuu
+ * value unknown, size b bits, run r, backup u bits:
+ *  000000000000bbbb 0000 rrrr 0 uuuuuuu
+ * value and size unknown:
+ *  0000000000000000 0000 0000 0 0000000
+ */
+       code = 0;
+       k = 0;
+       for (i = 0; i < 16; i++, code <<= 1) {  /* sizes */
+               hu->valptr[i] = k;
+               for (j = 0; j < hufflen[i]; j++) {
+                       hu->vals[k] = *huffvals++;
+                       if (i < DECBITS) {
+                               c = code << (DECBITS - 1 - i);
+                               v = hu->vals[k] & 0x0f; /* size */
+                               for (d = 1 << (DECBITS - 1 - i); --d >= 0;) {
+                                       if (v + i < DECBITS) {
+                                               /* both fit in table */
+                                               x = d >> (DECBITS - 1 - v -
+                                                         i);
+                                               /* was (-1 << v): undefined */
+                                               if (v && x < (1 << (v - 1)))
+                                                       x -= (1 << v) - 1;
+                                               x = x << 16 |
+                                                 (hu->vals[k] & 0xf0) << 4
+                                                 | (DECBITS - (i + 1 + v))
+                                                 | 128;
+                                       } else
+                                               x = v << 16
+                                                 | (hu->vals[k] & 0xf0) << 4
+                                                 | (DECBITS - (i + 1));
+                                       hu->llvals[c | d] = x;
+                               }
+                       }
+                       code++;
+                       k++;
+               }
+               hu->maxcode[i] = code;  /* first code past this length */
+       }
+       hu->maxcode[16] = 0x20000;      /* always terminate decode */
+}
+
+/****************************************************************/
+/**************             idct                  ***************/
+/****************************************************************/
+
+#define ONE ((PREC)IFIX(1.))
+#define S2  ((PREC)IFIX(0.382683432))
+#define C2  ((PREC)IFIX(0.923879532))
+#define C4  ((PREC)IFIX(0.707106781))
+
+#define S22 ((PREC)IFIX(2 * 0.382683432))
+#define C22 ((PREC)IFIX(2 * 0.923879532))
+#define IC4 ((PREC)IFIX(1 / 0.707106781))
+
+#define C3IC1 ((PREC)IFIX(0.847759065))        /* c3/c1 */
+#define C5IC1 ((PREC)IFIX(0.566454497))        /* c5/c1 */
+#define C7IC1 ((PREC)IFIX(0.198912367))        /* c7/c1 */
+
+#define XPP(a, b) (t = a + b, b = a - b, a = t)
+#define XMP(a, b) (t = a - b, b = a + b, a = t)
+#define XPM(a, b) (t = a + b, b = b - a, a = t)
+
+#define ROT(a, b, s, c) (t = IMULT(a + b, s),          \
+                        a = IMULT(a, c - s) + t,       \
+                        b = IMULT(b, c + s) - t)
+
+#define IDCT                                   \
+       (                                       \
+        XPP(t0, t1),                           \
+        XMP(t2, t3),                           \
+        t2 = IMULT(t2, IC4) - t3,              \
+        XPP(t0, t3),                           \
+        XPP(t1, t2),                           \
+        XMP(t4, t7),                           \
+        XPP(t5, t6),                           \
+        XMP(t5, t7),                           \
+        t5 = IMULT(t5, IC4),                   \
+        ROT(t4, t6, S22, C22),                 \
+        t6 -= t7,                              \
+        t5 -= t6,                              \
+        t4 -= t5,                              \
+        XPP(t0, t7),                           \
+        XPP(t1, t6),                           \
+        XPP(t2, t5),                           \
+        XPP(t3, t4)                            \
+                       )
+
+static unsigned char zig2[64] = {
+       0, 2, 3, 9, 10, 20, 21, 35,
+       14, 16, 25, 31, 39, 46, 50, 57,
+       5, 7, 12, 18, 23, 33, 37, 48,
+       27, 29, 41, 44, 52, 55, 59, 62,
+       15, 26, 30, 40, 45, 51, 56, 58,
+       1, 4, 8, 11, 19, 22, 34, 36,
+       28, 42, 43, 53, 54, 60, 61, 63,
+       6, 13, 17, 24, 32, 38, 47, 49
+};
+
+void idct(int *in, int *out, PREC *quant, PREC off, int max)
+{
+       PREC t0, t1, t2, t3, t4, t5, t6, t7, t;
+       PREC tmp[64], *tmpp;
+       int i, j;
+       unsigned char *zig2p;
+
+       t0 = off;               /* off only enters the very first term */
+       if (max == 1) {         /* only in[0] is read: flat output block */
+               t0 += in[0] * quant[0];
+               for (i = 0; i < 64; i++)
+                       out[i] = ITOINT(t0);
+               return;
+       }
+       zig2p = zig2;           /* zig2 gives the order in which in[] is read */
+       tmpp = tmp;
+       for (i = 0; i < 8; i++) {       /* pass 1: fills column i of tmp */
+               j = *zig2p++;
+               t0 += in[j] * quant[j];
+               j = *zig2p++;
+               t5 = in[j] * quant[j];
+               j = *zig2p++;
+               t2 = in[j] * quant[j];
+               j = *zig2p++;
+               t7 = in[j] * quant[j];
+               j = *zig2p++;
+               t1 = in[j] * quant[j];
+               j = *zig2p++;
+               t4 = in[j] * quant[j];
+               j = *zig2p++;
+               t3 = in[j] * quant[j];
+               j = *zig2p++;
+               t6 = in[j] * quant[j];
+               IDCT;
+               tmpp[0 * 8] = t0;
+               tmpp[1 * 8] = t1;
+               tmpp[2 * 8] = t2;
+               tmpp[3 * 8] = t3;
+               tmpp[4 * 8] = t4;
+               tmpp[5 * 8] = t5;
+               tmpp[6 * 8] = t6;
+               tmpp[7 * 8] = t7;
+               tmpp++;
+               t0 = 0;         /* later iterations start without off */
+       }
+       for (i = 0; i < 8; i++) {       /* pass 2: row i of tmp -> row i of out */
+               t0 = tmp[8 * i + 0];
+               t1 = tmp[8 * i + 1];
+               t2 = tmp[8 * i + 2];
+               t3 = tmp[8 * i + 3];
+               t4 = tmp[8 * i + 4];
+               t5 = tmp[8 * i + 5];
+               t6 = tmp[8 * i + 6];
+               t7 = tmp[8 * i + 7];
+               IDCT;
+               out[8 * i + 0] = ITOINT(t0);
+               out[8 * i + 1] = ITOINT(t1);
+               out[8 * i + 2] = ITOINT(t2);
+               out[8 * i + 3] = ITOINT(t3);
+               out[8 * i + 4] = ITOINT(t4);
+               out[8 * i + 5] = ITOINT(t5);
+               out[8 * i + 6] = ITOINT(t6);
+               out[8 * i + 7] = ITOINT(t7);
+       }
+}
+
+static unsigned char zig[64] = {
+       0, 1, 5, 6, 14, 15, 27, 28,
+       2, 4, 7, 13, 16, 26, 29, 42,
+       3, 8, 12, 17, 25, 30, 41, 43,
+       9, 11, 18, 24, 31, 40, 44, 53,
+       10, 19, 23, 32, 39, 45, 52, 54,
+       20, 22, 33, 38, 46, 51, 55, 60,
+       21, 34, 37, 47, 50, 56, 59, 61,
+       35, 36, 48, 49, 57, 58, 62, 63
+};
+
+static PREC aaidct[8] = {
+       IFIX(0.3535533906), IFIX(0.4903926402),
+       IFIX(0.4619397663), IFIX(0.4157348062),
+       IFIX(0.3535533906), IFIX(0.2777851165),
+       IFIX(0.1913417162), IFIX(0.0975451610)
+};
+
+
+static void idctqtab(unsigned char *qin, PREC *qout)
+{
+       int k, z;
+
+       for (k = 0; k < 64; k++) {
+               z = zig[k];     /* slot for row k >> 3, column k & 7 */
+               qout[z] = qin[z] * IMULT(aaidct[k >> 3], aaidct[k & 7]);
+       }
+}
+
+static void scaleidctqtab(PREC *q, PREC sc)
+{
+       PREC *end = q + 64;     /* one past the last of the 64 entries */
+
+       for (; q < end; q++)
+               *q = IMULT(*q, sc);
+}
+
+/****************************************************************/
+/**************          color decoder            ***************/
+/****************************************************************/
+
+#define ROUND
+
+/*
+ * YCbCr Color transformation:
+ *
+ * y:0..255   Cb:-128..127   Cr:-128..127
+ *
+ *      R = Y                + 1.40200 * Cr
+ *      G = Y - 0.34414 * Cb - 0.71414 * Cr
+ *      B = Y + 1.77200 * Cb
+ *
+ * =>
+ *      Cr *= 1.40200;
+ *      Cb *= 1.77200;
+ *      Cg = 0.19421 * Cb + .50937 * Cr;
+ *      R = Y + Cr;
+ *      G = Y - Cg;
+ *      B = Y + Cb;
+ *
+ * =>
+ *      Cg = (50 * Cb + 130 * Cr + 128) >> 8;
+ */
+
+/* Fold the Cb (1.772) and Cr (1.402) colour gains into quant tables 1 and 2. */
+static void initcol(PREC q[][64])
+{
+       scaleidctqtab(q[1], IFIX(1.77200));     /* Cb *= 1.77200 */
+       scaleidctqtab(q[2], IFIX(1.40200));     /* Cr *= 1.40200 */
+}
+
+/* This is optimized for the stupid sun SUNWspro compiler. */
+#define STORECLAMP(a, x)                       \
+       (                                       \
+        (a) = (x),                             \
+        (unsigned int)(x) >= 256 ?             \
+        ((a) = (x) < 0 ? 0 : 255)              \
+        :                                      \
+        0                                      \
+                                               )
+
+#define CLAMP(x) ((unsigned int)(x) >= 256 ? ((x) < 0 ? 0 : 255) : (x))
+
+#ifdef ROUND
+
+#define CBCRCG(yin, xin)                       \
+       (                                       \
+        cb = outc[0 + yin * 8 + xin],          \
+        cr = outc[64 + yin * 8 + xin],         \
+        cg = (50 * cb + 130 * cr + 128) >> 8   \
+                                               )
+
+#else
+
+#define CBCRCG(yin, xin)                       \
+       (                                       \
+        cb = outc[0 + yin * 8 + xin],          \
+        cr = outc[64 + yin * 8 + xin],         \
+        cg = (3 * cb + 8 * cr) >> 4            \
+                                               )
+
+#endif
+
+#define PIC(yin, xin, p, xout)                 \
+       (                                       \
+        y = outy[(yin) * 8 + xin],             \
+        STORECLAMP(p[(xout) * 3 + 0], y + cr), \
+        STORECLAMP(p[(xout) * 3 + 1], y - cg), \
+        STORECLAMP(p[(xout) * 3 + 2], y + cb)  \
+                                               )
+
+#ifdef __LITTLE_ENDIAN
+#define PIC_15(yin, xin, p, xout, add)                         \
+       (                                                       \
+        y = outy[(yin) * 8 + xin],                             \
+        y = ((CLAMP(y + cr + add * 2 + 1) & 0xf8) <<  7) |     \
+        ((CLAMP(y - cg + add * 2 + 1) & 0xf8) <<  2) |         \
+        ((CLAMP(y + cb + add * 2 + 1)) >>  3),                 \
+        p[(xout) * 2 + 0] = y & 0xff,                          \
+        p[(xout) * 2 + 1] = y >> 8                             \
+                                                )
+
+#define PIC_16(yin, xin, p, xout, add)                     \
+       (                                                   \
+        y = outy[(yin) * 8 + xin],                         \
+        y = ((CLAMP(y + cr + add * 2 + 1) & 0xf8) <<  8) | \
+        ((CLAMP(y - cg + add) & 0xfc) <<  3) |             \
+        ((CLAMP(y + cb + add * 2 + 1)) >>  3),     \
+        p[(xout) * 2 + 0] = y & 0xff,                      \
+        p[(xout) * 2 + 1] = y >> 8                         \
+                                                )
+#else
+#define PIC_15(yin, xin, p, xout, add)                         \
+       (                                                       \
+        y = outy[(yin) * 8 + xin],                             \
+        y = ((CLAMP(y + cr + add * 2 + 1) & 0xf8) <<  7) |     \
+        ((CLAMP(y - cg + add * 2 + 1) & 0xf8) <<  2) |         \
+        ((CLAMP(y + cb + add * 2 + 1)) >>  3),                 \
+        p[(xout) * 2 + 0] = y >> 8,                            \
+        p[(xout) * 2 + 1] = y & 0xff                           \
+                                                )
+
+#define PIC_16(yin, xin, p, xout, add)                     \
+       (                                                   \
+        y = outy[(yin) * 8 + xin],                         \
+        y = ((CLAMP(y + cr + add * 2 + 1) & 0xf8) <<  8) | \
+        ((CLAMP(y - cg + add) & 0xfc) << 3) |              \
+        ((CLAMP(y + cb + add * 2 + 1)) >>  3),             \
+        p[(xout) * 2 + 0] = y >> 8,                        \
+        p[(xout) * 2 + 1] = y & 0xff                       \
+                                                )
+#endif
+
+#define PIC_32(yin, xin, p, xout)              \
+       (                                       \
+        y = outy[(yin) * 8 + xin],             \
+        STORECLAMP(p[(xout) * 4 + 0], y + cb), \
+        STORECLAMP(p[(xout) * 4 + 1], y - cg), \
+        STORECLAMP(p[(xout) * 4 + 2], y + cr), \
+        p[(xout) * 4 + 3] = 0                  \
+                                               )
+
+#define PIC221111(xin)                                                 \
+       (                                                               \
+        CBCRCG(0, xin),                                                \
+        PIC(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0),    \
+        PIC(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1),    \
+        PIC(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0),    \
+        PIC(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1)     \
+                                                               )
+
+#define PIC221111_15(xin)                                               \
+       (                                                               \
+        CBCRCG(0, xin),                                                \
+        PIC_15(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0, 3), \
+        PIC_15(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1, 0), \
+        PIC_15(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0, 1), \
+        PIC_15(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1, 2) \
+                                                                       )
+
+#define PIC221111_16(xin)                                               \
+       (                                                               \
+        CBCRCG(0, xin),                                                \
+        PIC_16(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0, 3), \
+        PIC_16(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1, 0), \
+        PIC_16(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0, 1), \
+        PIC_16(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1, 2) \
+                                                                       )
+
+#define PIC221111_32(xin)                                      \
+       (                                                       \
+        CBCRCG(0, xin),                                                \
+        PIC_32(xin / 4 * 8 + 0, (xin & 3) * 2 + 0, pic0, xin * 2 + 0), \
+        PIC_32(xin / 4 * 8 + 0, (xin & 3) * 2 + 1, pic0, xin * 2 + 1), \
+        PIC_32(xin / 4 * 8 + 1, (xin & 3) * 2 + 0, pic1, xin * 2 + 0), \
+        PIC_32(xin / 4 * 8 + 1, (xin & 3) * 2 + 1, pic1, xin * 2 + 1)  \
+                                                               )
+
+/* 4 Y + Cb + Cr blocks in out[] -> 16x16 pixels at pic, 3 bytes per pixel */
+static void col221111(int *out, unsigned char *pic, int width)
+{
+       int half, pair, k;
+       unsigned char *pic0 = pic;              /* upper row of each pair */
+       unsigned char *pic1 = pic + width;      /* row directly below pic0 */
+       int *outy = out;                        /* luma blocks come first */
+       int *outc = out + 64 * 4;               /* Cb block, Cr 64 further on */
+       int cr, cg, cb, y;
+
+       for (half = 0; half < 2; half++) {
+               for (pair = 0; pair < 4; pair++) {
+                       for (k = 0; k < 8; k++)
+                               PIC221111(k);
+                       outc += 8;
+                       outy += 16;
+                       pic0 += 2 * width;
+                       pic1 += 2 * width;
+               }
+               /* step from Y blocks 0/1 to Y blocks 2/3 */
+               outy += 64 * 2 - 16 * 4;
+       }
+}
+
+/* 4 Y + Cb + Cr blocks in out[] -> 16x16 pixels at pic, 2 bytes each (5/5/5) */
+static void col221111_15(int *out, unsigned char *pic, int width)
+{
+       int half, pair, k;
+       unsigned char *pic0 = pic;              /* upper row of each pair */
+       unsigned char *pic1 = pic + width;      /* row directly below pic0 */
+       int *outy = out;                        /* luma blocks come first */
+       int *outc = out + 64 * 4;               /* Cb block, Cr 64 further on */
+       int cr, cg, cb, y;
+
+       for (half = 0; half < 2; half++) {
+               for (pair = 0; pair < 4; pair++) {
+                       for (k = 0; k < 8; k++)
+                               PIC221111_15(k);
+                       outc += 8;
+                       outy += 16;
+                       pic0 += 2 * width;
+                       pic1 += 2 * width;
+               }
+               /* step from Y blocks 0/1 to Y blocks 2/3 */
+               outy += 64 * 2 - 16 * 4;
+       }
+}
+
+/* 4 Y + Cb + Cr blocks in out[] -> 16x16 pixels at pic, 2 bytes each (5/6/5) */
+static void col221111_16(int *out, unsigned char *pic, int width)
+{
+       int half, pair, k;
+       unsigned char *pic0 = pic;              /* upper row of each pair */
+       unsigned char *pic1 = pic + width;      /* row directly below pic0 */
+       int *outy = out;                        /* luma blocks come first */
+       int *outc = out + 64 * 4;               /* Cb block, Cr 64 further on */
+       int cr, cg, cb, y;
+
+       for (half = 0; half < 2; half++) {
+               for (pair = 0; pair < 4; pair++) {
+                       for (k = 0; k < 8; k++)
+                               PIC221111_16(k);
+                       outc += 8;
+                       outy += 16;
+                       pic0 += 2 * width;
+                       pic1 += 2 * width;
+               }
+               /* step from Y blocks 0/1 to Y blocks 2/3 */
+               outy += 64 * 2 - 16 * 4;
+       }
+}
+
+/* 4 Y + Cb + Cr blocks in out[] -> 16x16 pixels at pic, 4 bytes per pixel */
+static void col221111_32(int *out, unsigned char *pic, int width)
+{
+       int half, pair, k;
+       unsigned char *pic0 = pic;              /* upper row of each pair */
+       unsigned char *pic1 = pic + width;      /* row directly below pic0 */
+       int *outy = out;                        /* luma blocks come first */
+       int *outc = out + 64 * 4;               /* Cb block, Cr 64 further on */
+       int cr, cg, cb, y;
+
+       for (half = 0; half < 2; half++) {
+               for (pair = 0; pair < 4; pair++) {
+                       for (k = 0; k < 8; k++)
+                               PIC221111_32(k);
+                       outc += 8;
+                       outy += 16;
+                       pic0 += 2 * width;
+                       pic1 += 2 * width;
+               }
+               /* step from Y blocks 0/1 to Y blocks 2/3 */
+               outy += 64 * 2 - 16 * 4;
+       }
+}
diff --git a/drivers/video/bootsplash/decode-jpg.h b/drivers/video/bootsplash/decode-jpg.h

new file mode 100644 (file)

index 0000000..df324ec
--- /dev/null
+++ b/drivers/video/bootsplash/decode-jpg.h
@@ -0,0 +1,37 @@
+/*
+ *    linux/drivers/video/bootsplash/decode-jpg.h - a tiny jpeg decoder.
+ *
+ *      (w) August 2001 by Michael Schroeder, <mls@suse.de>
+ */
+
+#ifndef __DECODE_JPG_H
+#define __DECODE_JPG_H
+
+#define ERR_NO_SOI 1
+#define ERR_NOT_8BIT 2
+#define ERR_HEIGHT_MISMATCH 3
+#define ERR_WIDTH_MISMATCH 4
+#define ERR_BAD_WIDTH_OR_HEIGHT 5
+#define ERR_TOO_MANY_COMPPS 6
+#define ERR_ILLEGAL_HV 7
+#define ERR_QUANT_TABLE_SELECTOR 8
+#define ERR_NOT_YCBCR_221111 9
+#define ERR_UNKNOWN_CID_IN_SCAN 10
+#define ERR_NOT_SEQUENTIAL_DCT 11
+#define ERR_WRONG_MARKER 12
+#define ERR_NO_EOI 13
+#define ERR_BAD_TABLES 14
+#define ERR_DEPTH_MISMATCH 15
+
+struct jpeg_decdata {
+       int dcts[6 * 64 + 16];
+       int out[64 * 6];
+       int dquant[3][64];
+};
+
+extern int jpeg_decode(unsigned char *buf, unsigned char *pic,
+                      int width, int height, enum splash_color_format cf,
+                      struct jpeg_decdata *);
+extern void jpeg_get_size(unsigned char *, int *, int *);
+
+#endif
diff --git a/drivers/video/bootsplash/render.c b/drivers/video/bootsplash/render.c

new file mode 100644 (file)

index 0000000..2d89dde
--- /dev/null
+++ b/drivers/video/bootsplash/render.c
@@ -0,0 +1,517 @@
+/*
+ *    linux/drivers/video/bootsplash/render.c - splash screen render functions.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fb.h>
+#include <linux/vt_kern.h>
+#include <linux/selection.h>
+#include <asm/irq.h>
+
+#include "../console/fbcon.h"
+#include <linux/bootsplash.h>
+
+#ifndef DEBUG
+# define SPLASH_DEBUG(fmt, args...)
+#else
+# define SPLASH_DEBUG(fmt, args...) \
+       printk(KERN_WARNING "%s: " fmt "\n", __func__, ##args)
+#endif
+
+/* fake a region sync */
+void splash_sync_region(struct fb_info *info, int x, int y,
+                       int width, int height)
+{
+       struct splash_data *sd = info->splash_data;
+       struct fb_copyarea area;
+
+       if (!sd || !sd->need_sync)
+               return;
+       /* self-copy only marks the area dirty; needed by Xen fb (bnc#739020) */
+       area.dx = x;
+       area.dy = y;
+       area.sx = x;
+       area.sy = y;
+       area.width = width;
+       area.height = height;
+       info->fbops->fb_copyarea(info, &area);
+}
+
+void splash_putcs(struct vc_data *vc, struct fb_info *info,
+                  const unsigned short *s, int count, int ypos, int xpos)
+{
+       struct splash_data *sd;
+       unsigned short charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff;
+       int bgshift = (vc->vc_hi_font_mask) ? 13 : 12;
+       int fgshift = (vc->vc_hi_font_mask) ? 9 : 8;
+       union pt src;
+       union pt dst, splashsrc;
+       unsigned int d, x, y;
+       u32 dd, fgx, bgx;
+       u16 c = scr_readw(s);   /* colours are taken from the first cell only */
+       int fg_color, bg_color, transparent;
+       int n;
+       int octpp = (info->var.bits_per_pixel + 1) >> 3;   /* bytes per pixel */
+       int drawn_width;
+
+       if (!oops_in_progress
+           && (console_blanked || info->splash_data->splash_dosilent))
+               return;
+       sd = info->splash_data;
+
+       fg_color = attr_fgcol(fgshift, c);
+       bg_color = attr_bgcol(bgshift, c);
+       /* background cells show the picture instead of a solid colour */
+       transparent = sd->imgd->splash_color == bg_color;
+       xpos = xpos * vc->vc_font.width + sd->imgd->splash_text_xo;
+       ypos = ypos * vc->vc_font.height + sd->imgd->splash_text_yo;
+       splashsrc.ub = (u8 *)(sd->pic->splash_pic
+                             + ypos * sd->pic->splash_pic_stride
+                             + xpos * octpp);
+       dst.ub = (u8 *)(info->screen_base
+                       + ypos * info->fix.line_length
+                       + xpos * octpp);
+       fgx = ((u32 *)info->pseudo_palette)[fg_color];
+       if (transparent && sd->imgd->splash_color == 15) {
+               if (fgx == 0xffea)
+                       fgx = 0xfe4a;
+               else if (fgx == 0x57ea)
+                       fgx = 0x0540;
+               else if (fgx == 0xffff)
+                       fgx = 0x52aa;
+       }
+       bgx = ((u32 *)info->pseudo_palette)[bg_color];
+       d = 0;
+       drawn_width = 0;
+       while (count--) {
+               c = scr_readw(s++);
+               src.ub = vc->vc_font.data
+                       + ((c & charmask)
+                          * vc->vc_font.height
+                          * ((vc->vc_font.width + 7) >> 3));
+               for (y = 0; y < vc->vc_font.height; y++) {
+                       for (x = 0; x < vc->vc_font.width; ) {
+                               if ((x & 7) == 0)
+                                       d = *src.ub++;  /* next 8 glyph bits */
+                               switch (octpp) {
+                               case 2: /* two pixels per 32-bit write */
+                                       if (d & 0x80)
+                                               dd = fgx;
+                                       else
+                                               dd = (transparent ?
+                                                     *splashsrc.us : bgx);
+                                       splashsrc.us += 1;
+                                       if (d & 0x40)
+                                               dd |= fgx << 16;
+                                       else
+                                               dd |= (transparent ? *splashsrc.us : bgx) << 16;
+                                       splashsrc.us += 1;
+                                       d <<= 2;
+                                       x += 2;
+                                       fb_writel(dd, dst.ul);
+                                       dst.ul += 1;
+                                       break;
+                               case 3: /* one pixel, written byte by byte */
+                                       for (n = 0; n <= 16; n += 8) {
+                                               if (d & 0x80)
+                                                       dd = (fgx >> n) & 0xff;
+                                               else    /* one picture byte, not a u32 */
+                                                       dd = (transparent ? *splashsrc.ub : ((bgx >> n) & 0xff));
+                                               splashsrc.ub += 1;
+                                               fb_writeb(dd, dst.ub);
+                                               dst.ub += 1;
+                                       }
+                                       d <<= 1;
+                                       x += 1;
+                                       break;
+                               case 4: /* one pixel per 32-bit write */
+                                       if (d & 0x80)
+                                               dd = fgx;
+                                       else
+                                               dd = (transparent ? *splashsrc.ul : bgx);
+                                       splashsrc.ul += 1;
+                                       d <<= 1;
+                                       x += 1;
+                                       fb_writel(dd, dst.ul);
+                                       dst.ul += 1;
+                                       break;
+                               }
+                       }
+                       dst.ub += info->fix.line_length
+                               - vc->vc_font.width * octpp;
+                       splashsrc.ub += sd->pic->splash_pic_stride
+                               - vc->vc_font.width * octpp;
+               }
+               /* back to the top row, one glyph further to the right */
+               dst.ub -= info->fix.line_length * vc->vc_font.height
+                       - vc->vc_font.width * octpp;
+               splashsrc.ub -= sd->pic->splash_pic_stride * vc->vc_font.height
+                       - vc->vc_font.width * octpp;
+               drawn_width += vc->vc_font.width;
+       }
+       splash_sync_region(info, xpos, ypos, drawn_width, vc->vc_font.height);
+}
+
+static void splash_renderc(struct fb_info *info,
+                          int fg_color, int bg_color,
+                          u8 *src,
+                          int ypos, int xpos,
+                          int height, int width)
+{
+       struct splash_data *sd;
+       int transparent;
+       u32 dd, fgx, bgx;
+       union pt dst, splashsrc;
+       unsigned int d, x, y;
+       int n;
+       int octpp = (info->var.bits_per_pixel + 1) >> 3;   /* bytes per pixel */
+
+       if (!oops_in_progress
+           && (console_blanked || info->splash_data->splash_dosilent))
+               return;
+
+       sd = info->splash_data;
+
+       /* background pixels show the picture instead of a solid colour */
+       transparent = sd->imgd->splash_color == bg_color;
+       splashsrc.ub = (u8 *)(sd->pic->splash_pic
+                            + ypos * sd->pic->splash_pic_stride
+                            + xpos * octpp);
+       dst.ub = (u8 *)(info->screen_base
+                      + ypos * info->fix.line_length
+                      + xpos * octpp);
+       fgx = ((u32 *)info->pseudo_palette)[fg_color];
+       if (transparent && (sd->imgd->splash_color == 15)) {
+               if (fgx == 0xffea)
+                       fgx = 0xfe4a;
+               else if (fgx == 0x57ea)
+                       fgx = 0x0540;
+               else if (fgx == 0xffff)
+                       fgx = 0x52aa;
+       }
+       bgx = ((u32 *)info->pseudo_palette)[bg_color];
+       d = 0;
+       for (y = 0; y < height; y++) {
+               for (x = 0; x < width; ) {
+                       if ((x & 7) == 0)
+                               d = *src++;     /* next 8 bitmap bits */
+                       switch (octpp) {
+                       case 2: /* two pixels per 32-bit write */
+                               if (d & 0x80)
+                                       dd = fgx;
+                               else
+                                       dd = (transparent ? *splashsrc.us : bgx);
+                               splashsrc.us += 1;
+                               if (d & 0x40)
+                                       dd |= fgx << 16;
+                               else
+                                       dd |= (transparent ? *splashsrc.us : bgx) << 16;
+                               splashsrc.us += 1;
+                               d <<= 2;
+                               x += 2;
+                               fb_writel(dd, dst.ul);
+                               dst.ul += 1;
+                               break;
+                       case 3: /* one pixel, written byte by byte */
+                               for (n = 0; n <= 16; n += 8) {
+                                       if (d & 0x80)
+                                               dd = (fgx >> n) & 0xff;
+                                       else    /* byte n of bgx, not always byte 0 */
+                                               dd = (transparent ? *splashsrc.ub : ((bgx >> n) & 0xff));
+                                       splashsrc.ub += 1;
+                                       fb_writeb(dd, dst.ub);
+                                       dst.ub += 1;
+                               }
+                               d <<= 1;
+                               x += 1;
+                               break;
+                       case 4: /* one pixel per 32-bit write */
+                               if (d & 0x80)
+                                       dd = fgx;
+                               else
+                                       dd = (transparent ? *splashsrc.ul : bgx);
+                               splashsrc.ul += 1;
+                               d <<= 1;
+                               x += 1;
+                               fb_writel(dd, dst.ul);
+                               dst.ul += 1;
+                               break;
+                       }
+               }
+               dst.ub += info->fix.line_length - width * octpp;
+               splashsrc.ub += sd->pic->splash_pic_stride - width * octpp;
+       }
+       splash_sync_region(info, xpos, ypos, width, height);
+}
+
+void splashcopy(u8 *dst, u8 *src, int height, int width,
+               int dstbytes, int srcbytes, int octpp)
+{
+       int row, n;
+       int rowbytes = width * octpp;
+
+       for (row = 0; row < height; row++) {
+               union pt to, from;
+               to.ul = (u32 *)dst;
+               from.ul = (u32 *)src;
+               for (n = rowbytes / 4; n > 0; n--)      /* whole 32-bit words */
+                       fb_writel(*from.ul++, to.ul++);
+               if (rowbytes & 2)                       /* 16-bit tail */
+                       fb_writew(*from.us++, to.us++);
+               if (rowbytes & 1)                       /* last odd byte */
+                       fb_writeb(*from.ub, to.ub);
+               dst += dstbytes;
+               src += srcbytes;
+       }
+}
+
+/*
+ * Fill a height x width pixel rectangle at dst with the pixel value bgx.
+ *
+ * dst      - first byte of the top-left pixel
+ * height   - number of rows
+ * width    - number of pixels per row
+ * dstbytes - byte distance between the starts of consecutive rows
+ * bgx      - pixel value to store
+ * octpp    - bytes per pixel
+ *
+ * Even pixel sizes are written with 32-bit stores plus a 16-bit/8-bit
+ * tail; for octpp == 2 the value is duplicated into both halves of the
+ * word first.  Odd pixel sizes are written byte by byte, cycling through
+ * the three low bytes of bgx.
+ * NOTE(review): octpp == 1 also takes the byte-wise path and therefore
+ * cycles through three bytes of bgx as well - confirm that is intended.
+ */
+static void splashset(u8 *dst, int height, int width,
+                     int dstbytes, u32 bgx, int octpp)
+{
+       int i;
+
+       width *= octpp;         /* from here on: row length in bytes */
+       if (octpp == 2)
+               bgx |= bgx << 16;       /* two pixels per 32-bit store */
+       while (height-- > 0) {
+               union pt p;
+               p.ul = (u32 *)dst;
+               if (!(octpp & 1)) {
+                       for (i = 0; i < width / 4; i++)
+                               fb_writel(bgx, p.ul++);
+                       if (width & 2)
+                               fb_writew(bgx, p.us++);
+                       if (width & 1)
+                               fb_writeb(bgx, p.ub);
+               } else { /* slow! */
+                       for (i = 0; i < width; i++)
+                               fb_writeb((bgx >> ((i % 3) * 8)) & 0xff,
+                                         p.ub++);
+               }
+               /*
+                * Step to the next row on both paths.  This used to happen
+                * in the fast path only, so the byte-wise path rewrote the
+                * first row height times and left the others untouched.
+                */
+               dst += dstbytes;
+       }
+}
+
+/*
+ * Repaint a pixel rectangle of the screen from the splash picture in
+ * info->splash_data->pic.  The rectangle at column sx, row sy of the
+ * picture is copied to the same position in info->screen_base, then
+ * splash_sync_region() is called for the area.
+ */
+static void splashfill(struct fb_info *info, int sy, int sx,
+                      int height, int width)
+{
+       struct splash_data *sd = info->splash_data;
+       int octpp = (info->var.bits_per_pixel + 1) >> 3;
+       u8 *to = (u8 *)(info->screen_base
+                       + sy * info->fix.line_length + sx * octpp);
+       u8 *from = (u8 *)(sd->pic->splash_pic
+                         + sy * sd->pic->splash_pic_stride
+                         + sx * octpp);
+
+       splashcopy(to, from, height, width,
+                  info->fix.line_length, sd->pic->splash_pic_stride,
+                  octpp);
+       splash_sync_region(info, sx, sy, width, height);
+}
+
+/*
+ * Clear a rectangle of character cells.  sy/sx/height/width are given
+ * in cells and converted to pixels here, shifted by the text area
+ * origin of the splash image.
+ *
+ * When the background color equals the splash image's transparent
+ * color the area is restored from the splash picture; otherwise it is
+ * filled with the background color taken from info->pseudo_palette.
+ * Nothing is drawn while the console is blanked or the splash is in
+ * silent mode, unless an oops is in progress.
+ */
+void splash_clear(struct vc_data *vc, struct fb_info *info, int sy,
+                       int sx, int height, int width)
+{
+       struct splash_data *sd = info->splash_data;
+       int bgshift = (vc->vc_hi_font_mask) ? 13 : 12;
+       int bg_color = attr_bgcol_ec(bgshift, vc, info);
+       int octpp = (info->var.bits_per_pixel + 1) >> 3;
+       u32 bgx;
+       u8 *dst;
+
+       if (!oops_in_progress) {
+               if (console_blanked || sd->splash_dosilent)
+                       return;
+       }
+
+       /* character cells -> pixels inside the text area */
+       sy = sy * vc->vc_font.height + sd->imgd->splash_text_yo;
+       sx = sx * vc->vc_font.width + sd->imgd->splash_text_xo;
+       height *= vc->vc_font.height;
+       width *= vc->vc_font.width;
+
+       if (sd->imgd->splash_color == bg_color) {
+               /* transparent background: show the picture again */
+               splashfill(info, sy, sx, height, width);
+               return;
+       }
+
+       bgx = ((u32 *)info->pseudo_palette)[bg_color];
+       dst = (u8 *)(info->screen_base
+                    + sy * info->fix.line_length
+                    + sx * octpp);
+       splashset(dst, height, width, info->fix.line_length, bgx, octpp);
+       splash_sync_region(info, sx, sy, width, height);
+}
+
+/*
+ * Move a rectangle of character cells from (sy, sx) to (dy, dx).  The
+ * cell coordinates are converted to pixels, shifted by the text area
+ * origin of the splash image, and handed to the driver's fb_copyarea().
+ * Nothing is done while the console is blanked or the splash is in
+ * silent mode, unless an oops is in progress.
+ */
+void splash_bmove(struct vc_data *vc, struct fb_info *info, int sy,
+               int sx, int dy, int dx, int height, int width)
+{
+       struct splash_data *sd = info->splash_data;
+       struct fb_copyarea area;
+
+       if (!oops_in_progress) {
+               if (console_blanked || sd->splash_dosilent)
+                       return;
+       }
+
+       area.width = width * vc->vc_font.width;
+       area.height = height * vc->vc_font.height;
+       area.sx = sx * vc->vc_font.width + sd->imgd->splash_text_xo;
+       area.sy = sy * vc->vc_font.height + sd->imgd->splash_text_yo;
+       area.dx = dx * vc->vc_font.width + sd->imgd->splash_text_xo;
+       area.dy = dy * vc->vc_font.height + sd->imgd->splash_text_yo;
+
+       info->fbops->fb_copyarea(info, &area);
+}
+
+/*
+ * Repaint the screen area around the text area from the splash
+ * picture.  The text area starts at the splash image's text origin and
+ * is vc_cols x vc_rows character cells large.  With bottom_only set
+ * only the strip below the text area is repainted, otherwise the top,
+ * left and right margins are repainted as well.
+ * Nothing is done while the console is blanked or the splash is in
+ * silent mode, unless an oops is in progress.
+ */
+void splash_clear_margins(struct vc_data *vc, struct fb_info *info,
+                               int bottom_only)
+{
+       struct splash_data *sd = info->splash_data;
+       unsigned int text_w = vc->vc_cols * vc->vc_font.width;
+       unsigned int text_h = vc->vc_rows * vc->vc_font.height;
+       int xo, yo;
+
+       SPLASH_DEBUG();
+
+       if (!oops_in_progress) {
+               if (console_blanked || sd->splash_dosilent)
+                       return;
+       }
+
+       xo = sd->imgd->splash_text_xo;
+       yo = sd->imgd->splash_text_yo;
+
+       if (!bottom_only) {
+               /* top margin: full width, above the text area */
+               splashfill(info, 0, 0, yo, info->var.xres);
+               /* left margin: beside the text area */
+               splashfill(info, yo, 0, text_h, xo);
+               /* right margin: from the text area's right edge on */
+               splashfill(info, yo, xo + text_w, text_h,
+                          info->var.xres - xo - text_w);
+       }
+       /* bottom margin: full width, below the text area */
+       splashfill(info, yo + text_h, 0,
+                  info->var.yres - yo - text_h, info->var.xres);
+}
+
+/*
+ * Draw the cursor on top of the splash screen.
+ *
+ * The cursor bitmap is prepared in info->fb_cursordata: for an enabled
+ * cursor the image data is combined with the mask (XOR for ROP_XOR, AND
+ * for every other rop), for a disabled one the image data is copied
+ * unchanged.  cursor->image.data is then pointed at that buffer and the
+ * bitmap is rendered through splash_renderc() at the cursor position
+ * shifted by the text area origin.
+ *
+ * Always returns 0; nothing is drawn unless info->state is
+ * FBINFO_STATE_RUNNING.
+ */
+int splash_cursor(struct fb_info *info, struct fb_cursor *cursor)
+{
+       struct splash_data *sd;
+       unsigned int pitch, nbytes;
+       int i;
+
+       if (info->state != FBINFO_STATE_RUNNING)
+               return 0;
+
+       sd = info->splash_data;
+
+       /* bytes per bitmap row, and size of the whole bitmap */
+       pitch = (cursor->image.width + 7) >> 3;
+       nbytes = pitch * cursor->image.height;
+
+       if (!cursor->enable) {
+               if (info->fb_cursordata != cursor->image.data)
+                       memcpy(info->fb_cursordata, cursor->image.data,
+                              nbytes);
+       } else if (cursor->rop == ROP_XOR) {
+               for (i = 0; i < nbytes; i++)
+                       info->fb_cursordata[i] = cursor->image.data[i]
+                               ^ cursor->mask[i];
+       } else {
+               /* ROP_COPY and everything else */
+               for (i = 0; i < nbytes; i++)
+                       info->fb_cursordata[i] = cursor->image.data[i]
+                               & cursor->mask[i];
+       }
+
+       cursor->image.data = info->fb_cursordata;
+       splash_renderc(info, cursor->image.fg_color, cursor->image.bg_color,
+                      (u8 *)info->fb_cursordata,
+                      cursor->image.dy + sd->imgd->splash_text_yo,
+                      cursor->image.dx + sd->imgd->splash_text_xo,
+                      cursor->image.height,
+                      cursor->image.width);
+       return 0;
+}
+
+/*
+ * Redraw width character cells of text row y, starting at column dx, by
+ * re-rendering them with splash_putcs() instead of copying pixels.
+ *
+ * The cells are read from the screen buffer at vc->vc_origin.  They are
+ * drawn in runs of equal attribute (high byte of the cell).  A cell is
+ * left out of the runs when the cell (dx - sx) positions further on
+ * lies inside the redrawn range and holds the same value.
+ * NOTE(review): presumably that cell is what the screen currently shows
+ * at this position, so an equal value means no redraw is needed -
+ * confirm against the caller.
+ *
+ * Nothing is drawn while the console is blanked or the splash is in
+ * silent mode; this check has no oops_in_progress exception.
+ */
+void splash_bmove_redraw(struct vc_data *vc, struct fb_info *info,
+                        int y, int sx, int dx, int width)
+{
+       struct splash_data *sd;
+       /* first cell to redraw: row y, column dx, two bytes per cell */
+       unsigned short *d = (unsigned short *) (vc->vc_origin
+                                               + vc->vc_size_row * y
+                                               + dx * 2);
+       /* cell compared against *d, see the header comment */
+       unsigned short *s = d + (dx - sx);
+       /* start of the run collected so far but not yet drawn */
+       unsigned short *start = d;
+       /* [ls, le) is the range of cells being redrawn */
+       unsigned short *ls = d;
+       unsigned short *le = d + width;
+       unsigned short c;
+       /* text column at which the pending run will be drawn */
+       int x = dx;
+       /* 1 never equals (c & 0xff00), so the first cell sets attr */
+       unsigned short attr = 1;
+
+       if (console_blanked || info->splash_data->splash_dosilent)
+               return;
+
+       /* sd is assigned here but not referenced below */
+       sd = info->splash_data;
+
+       do {
+               c = scr_readw(d);
+               if (attr != (c & 0xff00)) {
+                       /* attribute change: draw the pending run first */
+                       attr = c & 0xff00;
+                       if (d > start) {
+                               splash_putcs(vc, info, start, d - start, y, x);
+                               x += d - start;
+                               start = d;
+                       }
+               }
+               if (s >= ls && s < le && c == scr_readw(s)) {
+                       /*
+                        * Skip this cell: draw whatever run precedes it
+                        * and restart the run at the following cell.
+                        */
+                       if (d > start) {
+                               splash_putcs(vc, info, start, d - start, y, x);
+                               x += d - start + 1;
+                               start = d + 1;
+                       } else {
+                               x++;
+                               start++;
+                       }
+               }
+               s++;
+               d++;
+       } while (d < le);
+       /* draw the run that is still pending at the end of the range */
+       if (d > start)
+               splash_putcs(vc, info, start, d - start, y, x);
+}
+
+/*
+ * Blank or unblank the splash screen.
+ *
+ * Blanking fills the whole visible screen (xres x yres) with the pixel
+ * value 0 and calls splash_sync_region() for it.  Unblanking repaints
+ * all four margins from the splash picture.
+ */
+void splash_blank(struct vc_data *vc, struct fb_info *info, int blank)
+{
+       int octpp = (info->var.bits_per_pixel + 1) >> 3;
+
+       SPLASH_DEBUG();
+
+       if (!blank) {
+               /*
+                * Only the margins are repainted here; the screen
+                * update itself is done in fbcon_blank.
+                */
+               splash_clear_margins(vc, info, 0);
+               return;
+       }
+
+       splashset((u8 *)info->screen_base, info->var.yres, info->var.xres,
+                 info->fix.line_length, 0, octpp);
+       splash_sync_region(info, 0, 0, info->var.xres, info->var.yres);
+}
diff --git a/drivers/video/console/bitblit.c b/drivers/video/console/bitblit.c

index 28b1a83..1220773 100644 (file)
--- a/drivers/video/console/bitblit.c
+++ b/drivers/video/console/bitblit.c
@@ -19,6 +19,9 @@
  #include <asm/types.h>
  #include "fbcon.h"
  
+#include <linux/bootsplash.h>
+
+
  /*
   * Accelerated handlers.
   */
@@ -48,6 +51,12 @@ static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy,
  {
         struct fb_copyarea area;
  
+       if (SPLASH_DATA(info)) {
+               splash_bmove(vc, info,
+                       sy, sx, dy, dx, height, width);
+               return;
+       }
+
         area.sx = sx * vc->vc_font.width;
         area.sy = sy * vc->vc_font.height;
         area.dx = dx * vc->vc_font.width;
@@ -64,6 +73,12 @@ static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy,
         int bgshift = (vc->vc_hi_font_mask) ? 13 : 12;
         struct fb_fillrect region;
  
+       if (SPLASH_DATA(info)) {
+               splash_clear(vc, info,
+                            sy, sx, height, width);
+               return;
+       }
+
         region.color = attr_bgcol_ec(bgshift, vc, info);
         region.dx = sx * vc->vc_font.width;
         region.dy = sy * vc->vc_font.height;
@@ -161,6 +176,11 @@ static void bit_putcs(struct vc_data *vc, struct fb_info *info,
         image.height = vc->vc_font.height;
         image.depth = 1;
  
+       if (SPLASH_DATA(info)) {
+               splash_putcs(vc, info, s, count, yy, xx);
+               return;
+       }
+
         if (attribute) {
                 buf = kmalloc(cellsize, GFP_KERNEL);
                 if (!buf)
@@ -214,6 +234,11 @@ static void bit_clear_margins(struct vc_data *vc, struct fb_info *info,
         unsigned int bs = info->var.yres - bh;
         struct fb_fillrect region;
  
+       if (SPLASH_DATA(info)) {
+               splash_clear_margins(vc, info, bottom_only);
+               return;
+       }
+
         region.color = attr_bgcol_ec(bgshift, vc, info);
         region.rop = ROP_COPY;
  
@@ -380,6 +405,12 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, int mode,
         cursor.image.depth = 1;
         cursor.rop = ROP_XOR;
  
+       if (SPLASH_DATA(info)) {
+               splash_cursor(info, &cursor);
+               ops->cursor_reset = 0;
+               return;
+       }
+
         if (info->fbops->fb_cursor)
                 err = info->fbops->fb_cursor(info, &cursor);
  
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c

index 2e471c2..8ddbaa2 100644 (file)
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -79,6 +79,7 @@
  #include <asm/irq.h>
  
  #include "fbcon.h"
+#include <linux/bootsplash.h>
  
  #ifdef FBCONDEBUG
  #  define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __func__ , ## args)
@@ -94,7 +95,11 @@ enum {
  
  static struct display fb_display[MAX_NR_CONSOLES];
  
+#ifdef CONFIG_BOOTSPLASH
+signed char con2fb_map[MAX_NR_CONSOLES];
+#else
  static signed char con2fb_map[MAX_NR_CONSOLES];
+#endif
  static signed char con2fb_map_boot[MAX_NR_CONSOLES];
  
  static int logo_lines;
@@ -535,6 +540,8 @@ static int fbcon_takeover(int show_logo)
         for (i = first_fb_vc; i <= last_fb_vc; i++)
                 con2fb_map[i] = info_idx;
  
+       splash_init();
+
         err = take_over_console(&fb_con, first_fb_vc, last_fb_vc,
                                 fbcon_is_default);
  
@@ -1098,6 +1105,21 @@ static void fbcon_init(struct vc_data *vc, int init)
         new_cols /= vc->vc_font.width;
         new_rows /= vc->vc_font.height;
  
+#ifdef CONFIG_BOOTSPLASH
+       if (vc->vc_splash_data && vc->vc_splash_data->splash_state) {
+               new_cols = vc->vc_splash_data->splash_vc_text_wi
+                       / vc->vc_font.width;
+               new_rows = vc->vc_splash_data->splash_vc_text_he
+                       / vc->vc_font.height;
+               logo = 0;
+               con_remap_def_color(vc,
+                                   (vc->vc_splash_data->imgd->splash_color
+                                    << 4) |
+                                   vc->vc_splash_data->imgd->splash_fg_color);
+       }
+#endif
+
+
         /*
          * We must always set the mode. The mode of the previous console
          * driver could be in the same resolution but we are using different
@@ -1799,6 +1821,8 @@ static int fbcon_scroll(struct vc_data *vc, int t, int b, int dir,
                         fbcon_softback_note(vc, t, count);
                 if (logo_shown >= 0)
                         goto redraw_up;
+               if (SPLASH_DATA(info))
+                       goto redraw_up;
                 switch (p->scrollmode) {
                 case SCROLL_MOVE:
                         fbcon_redraw_blit(vc, info, p, t, b - t - count,
@@ -1890,6 +1914,8 @@ static int fbcon_scroll(struct vc_data *vc, int t, int b, int dir,
                         count = vc->vc_rows;
                 if (logo_shown >= 0)
                         goto redraw_down;
+               if (SPLASH_DATA(info))
+                       goto redraw_down;
                 switch (p->scrollmode) {
                 case SCROLL_MOVE:
                         fbcon_redraw_blit(vc, info, p, b - 1, b - t - count,
@@ -2038,6 +2064,12 @@ static void fbcon_bmove_rec(struct vc_data *vc, struct display *p, int sy, int s
                 }
                 return;
         }
+
+       if (SPLASH_DATA(info) && sy == dy && height == 1) {
+               /*must use slower redraw bmove to keep background pic intact*/
+               splash_bmove_redraw(vc, info, sy, sx, dx, width);
+               return;
+       }
         ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx,
                    height, width);
  }
@@ -2146,6 +2178,23 @@ static int fbcon_switch(struct vc_data *vc)
         info = registered_fb[con2fb_map[vc->vc_num]];
         ops = info->fbcon_par;
  
+#ifdef CONFIG_BOOTSPLASH
+       {
+               struct splash_data *prev_sd = vc->vc_splash_data;
+               splash_prepare(vc, info);
+               if (vc->vc_splash_data && vc->vc_splash_data->splash_state &&
+                   vc->vc_splash_data != prev_sd) {
+                       vc_resize(vc, vc->vc_splash_data->splash_vc_text_wi
+                                 / vc->vc_font.width,
+                                 vc->vc_splash_data->splash_vc_text_he
+                                 / vc->vc_font.height);
+                       con_remap_def_color(vc,
+                                           vc->vc_splash_data->imgd->splash_color << 4
+                                           | vc->vc_splash_data->imgd->splash_fg_color);
+               }
+       }
+#endif
+
         if (softback_top) {
                 if (softback_lines)
                         fbcon_set_origin(vc);
@@ -2279,6 +2328,11 @@ static void fbcon_generic_blank(struct vc_data *vc, struct fb_info *info,
  {
         struct fb_event event;
  
+       if (SPLASH_DATA(info)) {
+               splash_blank(vc, info, blank);
+               return;
+       }
+
         if (blank) {
                 unsigned short charmask = vc->vc_hi_font_mask ?
                         0x1ff : 0xff;
@@ -2504,6 +2558,10 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h,
  
                 cols = FBCON_SWAP(ops->rotate, info->var.xres, info->var.yres);
                 rows = FBCON_SWAP(ops->rotate, info->var.yres, info->var.xres);
+               if (SPLASH_DATA(info)) {
+                       cols = TEXT_WIDTH_FROM_SPLASH_DATA(info);
+                       rows = TEXT_HIGHT_FROM_SPLASH_DATA(info);
+               }
                 cols /= w;
                 rows /= h;
                 vc_resize(vc, cols, rows);
diff --git a/drivers/video/console/fbcon.h b/drivers/video/console/fbcon.h

index 6bd2e0c..83285cb 100644 (file)
--- a/drivers/video/console/fbcon.h
+++ b/drivers/video/console/fbcon.h
@@ -25,6 +25,54 @@
      *    low-level frame buffer device
      */
  
+#ifdef CONFIG_BOOTSPLASH
+/*
+ * Description of one splash image: colors, image size, placement of the
+ * text area, box data and the image/palette buffers.
+ * NOTE(review): ref_cnt suggests the object is shared and reference
+ * counted - confirm against the code that allocates and frees it.
+ */
+struct splash_img_data {
+       int ref_cnt;
+       int splash_color;       /* transparent color */
+       int splash_fg_color;    /* foreground color */
+       int splash_width;       /* width of image */
+       int splash_height;      /* height of image */
+       int splash_text_xo;     /* x origin of text area, in pixels */
+       int splash_text_yo;     /* y origin of text area, in pixels */
+       int splash_text_wi;     /* text area size of jpeg */
+       int splash_text_he;
+       int splash_boxcount;
+       int splash_sboxcount;
+       int splash_overpaintok; /* is it ok to overpaint boxes */
+       unsigned char *splash_boxes;
+       unsigned char *splash_jpeg;             /* jpeg */
+       unsigned char *splash_sboxes;
+       unsigned char *splash_silentjpeg;
+       unsigned char *splash_palette;          /* palette for 8-bit */
+};
+
+/*
+ * Splash picture buffer.
+ * NOTE(review): ref_cnt suggests the buffer is shared and reference
+ * counted - confirm against the code that allocates and frees it.
+ */
+struct splash_pic_data {
+       int ref_cnt;
+       unsigned char *splash_pic;      /* picture pixel data */
+       int splash_pic_stride;          /* bytes between picture rows */
+       int splash_pic_size;            /* presumably buffer size - confirm */
+};
+
+/*
+ * Splash state referenced from a console (vc->vc_splash_data) and from a
+ * framebuffer (info->splash_data): the image description, the picture
+ * buffer and the current display settings.
+ */
+struct splash_data {
+       struct splash_data *next;               /* presumably list link - confirm */
+       struct splash_img_data *imgd;           /* image description */
+       struct splash_pic_data *pic;            /* picture buffer */
+       int splash_state;                       /* show splash? */
+       int splash_percent;
+       int splash_dosilent;                    /* show silent jpeg */
+
+       /* pixel sizes: divided by the font size to get columns and rows */
+       int splash_vc_text_wi;                  /* text area size used*/
+       int splash_vc_text_he;
+       int splash_boxes_xoff;
+       int splash_boxes_yoff;
+       int splash_sboxes_xoff;
+       int splash_sboxes_yoff;
+
+       bool color_set;
+       bool need_sync;
+};
+#endif
+
  struct display {
      /* Filled in by the low-level console driver */
      const u_char *fontdata;
diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c

index d449a74..43c8da8 100644 (file)
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -1255,7 +1255,7 @@ static int vgacon_font_set(struct vc_data *c, struct console_font *font, unsigne
         unsigned charcount = font->charcount;
         int rc;
  
-       if (vga_video_type < VIDEO_TYPE_EGAM)
+       if (vga_video_type < VIDEO_TYPE_EGAM || vga_is_gfx)
                 return -EINVAL;
  
         if (font->width != VGA_FONTWIDTH ||
@@ -1273,7 +1273,7 @@ static int vgacon_font_set(struct vc_data *c, struct console_font *font, unsigne
  
  static int vgacon_font_get(struct vc_data *c, struct console_font *font)
  {
-       if (vga_video_type < VIDEO_TYPE_EGAM)
+       if (vga_video_type < VIDEO_TYPE_EGAM || vga_is_gfx)
                 return -EINVAL;
  
         font->width = VGA_FONTWIDTH;
diff --git a/drivers/video/geode/Kconfig b/drivers/video/geode/Kconfig

index c5d8ba4..251b09b 100644 (file)
--- a/drivers/video/geode/Kconfig
+++ b/drivers/video/geode/Kconfig
@@ -3,7 +3,7 @@
  #
  config FB_GEODE
         bool "AMD Geode family framebuffer support (EXPERIMENTAL)"
-       depends on FB && PCI && EXPERIMENTAL && X86
+       depends on FB && PCI && EXPERIMENTAL && X86_32
         ---help---
           Say 'Y' here to allow you to select framebuffer drivers for
           the AMD Geode family of processors.
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig

index 1a61939..9bdda63 100644 (file)
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -8,6 +8,7 @@ config VIRTIO_RING
         depends on VIRTIO
  
  menu "Virtio drivers"
+       depends on !XEN
  
  config VIRTIO_PCI
         tristate "PCI driver for virtio devices (EXPERIMENTAL)"
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig

index 3709624..3fa1993 100644 (file)
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1184,7 +1184,7 @@ config WATCHDOG_RIO
  
  config XEN_WDT
         tristate "Xen Watchdog support"
-       depends on XEN
+       depends on XEN || PARAVIRT_XEN
         help
           Say Y here to support the hypervisor watchdog capability provided
           by Xen 4.0 and newer.  The watchdog timeout period is normally one
diff --git a/drivers/watchdog/xen_wdt.c b/drivers/watchdog/xen_wdt.c

index e4a25b5..d3dc105 100644 (file)
--- a/drivers/watchdog/xen_wdt.c
+++ b/drivers/watchdog/xen_wdt.c
@@ -1,7 +1,8 @@
  /*
   *     Xen Watchdog Driver
   *
- *     (c) Copyright 2010 Novell, Inc.
+ *     (c) Copyright 2010,2011 Novell, Inc.
+ *     (c) Copyright 2011,2012 SuSE
   *
   *     This program is free software; you can redistribute it and/or
   *     modify it under the terms of the GNU General Public License
@@ -28,8 +29,10 @@
  #include <linux/spinlock.h>
  #include <linux/uaccess.h>
  #include <linux/watchdog.h>
+#ifdef CONFIG_PARAVIRT_XEN
  #include <xen/xen.h>
  #include <asm/xen/hypercall.h>
+#endif
  #include <xen/interface/sched.h>
  
  static struct platform_device *platform_device;
@@ -329,17 +332,19 @@ static int __init xen_wdt_init_module(void)
  {
         int err;
  
+#ifdef CONFIG_PARAVIRT_XEN
         if (!xen_domain())
                 return -ENODEV;
+#endif
  
-       pr_info("Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
+       printk(KERN_INFO "Xen WatchDog Timer Driver v%s\n", DRV_VERSION);
  
         err = platform_driver_register(&xen_wdt_driver);
         if (err)
                 return err;
  
         platform_device = platform_device_register_simple(DRV_NAME,
-                                                                 -1, NULL, 0);
+                                                         -1, NULL, 0);
         if (IS_ERR(platform_device)) {
                 err = PTR_ERR(platform_device);
                 platform_driver_unregister(&xen_wdt_driver);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig

index ea20c51..6f79fb1 100644 (file)
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -1,8 +1,375 @@
+#
+# This Kconfig describes Xen options
+#
+
+config XEN
+       bool
+
+if XEN
+config XEN_INTERFACE_VERSION
+       hex
+       default 0x00030207
+
+menu "XEN"
+
+config XEN_PRIVILEGED_GUEST
+       bool "Privileged Guest (domain 0)"
+       help
+         Support for privileged operation (domain 0)
+
+config XEN_UNPRIVILEGED_GUEST
+       def_bool y
+       depends on !XEN_PRIVILEGED_GUEST
+       select PM
+       select SUSPEND
+
+config XEN_DOMCTL
+       tristate
+
+config XEN_XENBUS_DEV
+       def_bool y
+       depends on PROC_FS
+
+config XEN_NETDEV_ACCEL_SFC_UTIL
+       depends on X86
+       tristate
+
+config XEN_BACKEND
+        tristate "Backend driver support"
+        default XEN_PRIVILEGED_GUEST
+        help
+          Support for backend device drivers that provide I/O services
+          to other virtual machines.
+
+config XEN_BLKDEV_BACKEND
+       tristate "Block-device backend driver"
+       depends on BLOCK && XEN_BACKEND
+       default XEN_BACKEND
+       select XEN_DOMCTL
+       help
+         The block-device backend driver allows the kernel to export its
+         block devices to other guests via a high-performance shared-memory
+         interface.
+
+config XEN_BLKDEV_TAP
+       tristate "Block-device tap backend driver"
+       depends on BLOCK && XEN_BACKEND
+       default XEN_BACKEND
+       select XEN_DOMCTL
+       help
+         The block tap driver is an alternative to the block back driver
+         and allows VM block requests to be redirected to userspace through
+         a device interface.  The tap allows user-space development of
+         high-performance block backends, where disk images may be implemented
+         as files, in memory, or on other hosts across the network.  This
+         driver can safely coexist with the existing blockback driver.
+
+config XEN_BLKDEV_TAP2
+       tristate "Block-device tap backend driver 2"
+       depends on BLOCK && XEN_BACKEND
+       default XEN_BACKEND
+       help
+         The block tap driver is an alternative to the block back driver
+         and allows VM block requests to be redirected to userspace through
+         a device interface.  The tap allows user-space development of
+         high-performance block backends, where disk images may be implemented
+         as files, in memory, or on other hosts across the network.  This
+         driver can safely coexist with the existing blockback driver.
+
+choice
+       prompt "Select blktap2 driver"
+       depends on XEN_BLKDEV_TAP2=y
+
+config XEN_BLKDEV_TAP2_LEGACY
+       bool "legacy"
+
+config XEN_BLKDEV_TAP2_NEW
+       bool "'new'"
+
+endchoice
+
+config XEN_NR_TAP2_DEVICES
+       int "Number of devices the version 2 tap backend driver can handle"
+       range 2 1048575
+       default 1024 if 64BIT
+       default 256
+       depends on XEN_BLKDEV_TAP2
+       help
+         This sets the number of backend devices the v2 tap backend driver
+         will be able to handle simultaneously. Note that device 0 is the
+         control device and hence not available to service guests.
+
+config XEN_BLKBACK_PAGEMAP
+       tristate
+       depends on XEN_BLKDEV_BACKEND != n && XEN_BLKDEV_TAP2 != n
+       default XEN_BLKDEV_BACKEND || XEN_BLKDEV_TAP2
+
+config XEN_NETDEV_BACKEND
+       tristate "Network-device backend driver"
+        depends on XEN_BACKEND && NET
+       default XEN_BACKEND
+       help
+         The network-device backend driver allows the kernel to export its
+         network devices to other guests via a high-performance shared-memory
+         interface.
+
+config XEN_NETDEV_TX_SHIFT
+       int "Maximum simultaneous transmit requests (as a power of 2)"
+       depends on XEN_NETDEV_BACKEND
+       range 5 15
+       default 8
+       help
+         The maximum number of transmits the driver can hold pending, expressed
+         as the exponent of a power of 2.
+
+config XEN_NETDEV_PIPELINED_TRANSMITTER
+       bool "Pipelined transmitter (DANGEROUS)"
+       depends on XEN_NETDEV_BACKEND
+       help
+         If the net backend is a dumb domain, such as a transparent Ethernet
+         bridge with no local IP interface, it is safe to say Y here to get
+         slightly lower network overhead.
+         If the backend has a local IP interface; or may be doing smart things
+         like reassembling packets to perform firewall filtering; or if you
+         are unsure; or if you experience network hangs when this option is
+         enabled; then you must say N here.
+
+config XEN_NETDEV_ACCEL_SFC_BACKEND
+       tristate "Network-device backend driver acceleration for Solarflare NICs"
+       depends on XEN_NETDEV_BACKEND && SFC && SFC_RESOURCE && X86
+       select XEN_NETDEV_ACCEL_SFC_UTIL
+       default m
+
+config XEN_NETDEV_LOOPBACK
+       tristate "Network-device loopback driver"
+       depends on XEN_NETDEV_BACKEND
+       help
+         A two-interface loopback device to emulate a local netfront-netback
+         connection. If unsure, it is probably safe to say N here.
+
+config XEN_TPMDEV_BACKEND
+       tristate "TPM-device backend driver"
+        depends on XEN_BACKEND
+       help
+         The TPM-device backend driver
+
+config XEN_SCSI_BACKEND
+       tristate "SCSI backend driver"
+       depends on SCSI && XEN_BACKEND
+       default m
+       help
+         The SCSI backend driver allows the kernel to export its SCSI Devices
+         to other guests via a high-performance shared-memory interface.
+
+config XEN_USB_BACKEND
+       tristate "USB backend driver"
+       depends on USB && XEN_BACKEND
+       default m
+       help
+         The USB backend driver allows the kernel to export its USB Devices
+         to other guests.
+
+config XEN_BLKDEV_FRONTEND
+       tristate "Block-device frontend driver"
+       default y
+       help
+         The block-device frontend driver allows the kernel to access block
+         devices mounted within another guest OS. Unless you are building a
+         dedicated device-driver domain, or your master control domain
+         (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_FRONTEND
+       tristate "Network-device frontend driver"
+       depends on NET
+       default y
+       help
+         The network-device frontend driver allows the kernel to access
+         network interfaces within another guest OS. Unless you are building a
+         dedicated device-driver domain, or your master control domain
+         (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_ACCEL_SFC_FRONTEND
+       tristate "Network-device frontend driver acceleration for Solarflare NICs"
+       depends on XEN_NETDEV_FRONTEND && X86
+       select XEN_NETDEV_ACCEL_SFC_UTIL
+       default m
+
+config XEN_SCSI_FRONTEND
+       tristate "SCSI frontend driver"
+       depends on SCSI
+       default m
+       help
+         The SCSI frontend driver allows the kernel to access SCSI Devices
+         within another guest OS.
+
+config XEN_USB_FRONTEND
+       tristate "USB frontend driver"
+       depends on USB
+       default m
+       help
+         The USB frontend driver allows the kernel to access USB Devices
+         within another guest OS.
+
+config XEN_USB_FRONTEND_HCD_STATS
+       bool "Taking the HCD statistics (for debug)"
+       depends on XEN_USB_FRONTEND
+       default y
+       help
+         Count the transferred urb status and the RING_FULL occurrence.
+
+config XEN_USB_FRONTEND_HCD_PM
+       bool "HCD suspend/resume support (DO NOT USE)"
+       depends on XEN_USB_FRONTEND
+       default n
+       help
+         Experimental bus suspend/resume feature support.
+
+config XEN_GRANT_DEV
+       tristate "User-space granted page access driver"
+       depends on XEN_BACKEND != n
+       default XEN_PRIVILEGED_GUEST
+       help
+         Device for accessing (in user-space) pages that have been granted
+         by other domains.
+
+config XEN_FRAMEBUFFER
+       tristate "Framebuffer-device frontend driver"
+       depends on FB
+       select FB_CFB_FILLRECT
+       select FB_CFB_COPYAREA
+       select FB_CFB_IMAGEBLIT
+       default y
+       help
+         The framebuffer-device frontend driver allows the kernel to create a
+         virtual framebuffer.  This framebuffer can be viewed in another
+         domain.  Unless this domain has access to a real video card, you
+         probably want to say Y here.
+
+config XEN_KEYBOARD
+       tristate "Keyboard-device frontend driver"
+       depends on XEN_FRAMEBUFFER && INPUT
+       default y
+       help
+         The keyboard-device frontend driver allows the kernel to create a
+         virtual keyboard.  This keyboard can then be driven by another
+         domain.  If you've said Y to CONFIG_XEN_FRAMEBUFFER, you probably
+         want to say Y here.
+
+config XEN_DISABLE_SERIAL
+       bool "Disable serial port drivers"
+       default y
+       help
+         Disable serial port drivers, allowing the Xen console driver
+         to provide a serial console at ttyS0.
+
+config XEN_NR_GUEST_DEVICES
+       int "Number of guest devices"
+       range 0 4032 if 64BIT
+       range 0 960
+       default 256 if XEN_BACKEND
+       default 16
+       help
+         Specify the total number of virtual devices (i.e. both frontend
+         and backend) that you want the kernel to be able to service.
+
+choice
+       prompt "Xen version compatibility"
+       default XEN_COMPAT_030002_AND_LATER
+
+       config XEN_COMPAT_030002_AND_LATER
+               bool "3.0.2 and later"
+
+       config XEN_COMPAT_030004_AND_LATER
+               bool "3.0.4 and later"
+
+       config XEN_COMPAT_030100_AND_LATER
+               bool "3.1.0 and later"
+
+       config XEN_COMPAT_030200_AND_LATER
+               bool "3.2.0 and later"
+
+       config XEN_COMPAT_030300_AND_LATER
+               bool "3.3.0 and later"
+
+       config XEN_COMPAT_030400_AND_LATER
+               bool "3.4.0 and later"
+
+       config XEN_COMPAT_040000_AND_LATER
+               bool "4.0.0 and later"
+
+       config XEN_COMPAT_040100_AND_LATER
+               bool "4.1.0 and later"
+
+       config XEN_COMPAT_LATEST_ONLY
+               bool "no compatibility code"
+
+endchoice
+
+config XEN_COMPAT
+       hex
+       default 0xffffff if XEN_COMPAT_LATEST_ONLY
+       default 0x040100 if XEN_COMPAT_040100_AND_LATER
+       default 0x040000 if XEN_COMPAT_040000_AND_LATER
+       default 0x030400 if XEN_COMPAT_030400_AND_LATER
+       default 0x030300 if XEN_COMPAT_030300_AND_LATER
+       default 0x030200 if XEN_COMPAT_030200_AND_LATER
+       default 0x030100 if XEN_COMPAT_030100_AND_LATER
+       default 0x030004 if XEN_COMPAT_030004_AND_LATER
+       default 0x030002 if XEN_COMPAT_030002_AND_LATER
+       default 0
+
+config XEN_VCPU_INFO_PLACEMENT
+       bool "Place shared vCPU info in per-CPU storage"
+#      depends on X86 && (XEN_COMPAT >= 0x00030101)
+       depends on X86
+       depends on !XEN_COMPAT_030002_AND_LATER
+       depends on !XEN_COMPAT_030004_AND_LATER
+       depends on !XEN_COMPAT_030100_AND_LATER
+       default SMP
+       ---help---
+         This allows faster access to the per-vCPU shared info
+         structure.
+
+endmenu
+
+config HAVE_IRQ_IGNORE_UNHANDLED
+       def_bool y
+
+config ARCH_HAS_WALK_MEMORY
+       def_bool y
+       depends on X86
+
+config XEN_SMPBOOT
+       def_bool y
+       depends on SMP && !PPC_XEN
+
+config XEN_SPINLOCK_ACQUIRE_NESTING
+       int "maximum nesting level for acquiring spin locks"
+       depends on SMP
+       # Xen versions prior to 3.2.x have a race condition with HYPERVISOR_poll().
+       depends on !XEN_COMPAT_030002_AND_LATER
+       depends on !XEN_COMPAT_030004_AND_LATER
+       depends on !XEN_COMPAT_030100_AND_LATER
+       range 0 3 if EXPERIMENTAL
+       range 0 1
+       default 0
+       help
+         IRQ-safe spin lock acquire operations can re-enable interrupts
+         before entering polling mode, to reduce interrupt latencies.
+         This option specifies how many times this can be done for each
+         individual spin lock (0 disables this behavior).
+
+config XEN_DEVMEM
+       def_bool y
+
+endif
+
  menu "Xen driver support"
-       depends on XEN
+       depends on XEN || PARAVIRT_XEN
  
  config XEN_BALLOON
-       bool "Xen memory balloon driver"
+       bool "Xen memory balloon driver" if PARAVIRT_XEN
+       depends on PARAVIRT_XEN || !PPC_XEN
         default y
         help
           The balloon driver allows the Xen domain to request more memory from
@@ -11,8 +378,7 @@ config XEN_BALLOON
  
  config XEN_SELFBALLOONING
         bool "Dynamically self-balloon kernel memory to target"
-       depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
-       default n
+       depends on XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
         help
           Self-ballooning dynamically balloons available kernel memory driven
           by the current usage of anonymous memory ("committed AS") and
@@ -29,7 +395,7 @@ config XEN_SELFBALLOONING
  config XEN_BALLOON_MEMORY_HOTPLUG
         bool "Memory hotplug support for Xen balloon driver"
         default n
-       depends on XEN_BALLOON && MEMORY_HOTPLUG
+       depends on PARAVIRT_XEN && XEN_BALLOON && MEMORY_HOTPLUG
         help
           Memory hotplug support for Xen balloon driver allows expanding memory
           available for the system above limit declared at system startup.
@@ -57,26 +423,28 @@ config XEN_BALLOON_MEMORY_HOTPLUG
           In that case step 3 should be omitted.
  
  config XEN_SCRUB_PAGES
-       bool "Scrub pages before returning them to system"
-       depends on XEN_BALLOON
+       bool "Scrub memory before freeing it to Xen"
+       depends on XEN || XEN_BALLOON
         default y
         help
-         Scrub pages before returning them to the system for reuse by
-         other domains.  This makes sure that any confidential data
-         is not accidentally visible to other domains.  Is it more
-         secure, but slightly less efficient.
+         Erase memory contents before freeing it back to Xen's global
+         pool. This ensures that any secrets contained within that
+         memory (e.g., private keys) cannot be found by other guests that
+         may be running on the machine. Most people will want to say Y here.
+         If security is not a concern then you may increase performance by
+         saying N.
           If in doubt, say yes.
  
  config XEN_DEV_EVTCHN
         tristate "Xen /dev/xen/evtchn device"
-       default y
+       default PARAVIRT_XEN || XEN_PRIVILEGED_GUEST || m
         help
           The evtchn driver allows a userspace process to triger event
           channels and to receive notification of an event channel
           firing.
           If in doubt, say yes.
  
-config XEN_BACKEND
+config PARAVIRT_XEN_BACKEND
         bool "Backend driver support"
         depends on XEN_DOM0
         default y
@@ -86,6 +454,7 @@ config XEN_BACKEND
  
  config XENFS
         tristate "Xen filesystem"
+       depends on PARAVIRT_XEN
         select XEN_PRIVCMD
         default y
         help
@@ -123,7 +492,7 @@ config XEN_XENBUS_FRONTEND
  
  config XEN_GNTDEV
         tristate "userspace grant access device driver"
-       depends on XEN
+       depends on PARAVIRT_XEN
         default m
         select MMU_NOTIFIER
         help
@@ -131,7 +500,7 @@ config XEN_GNTDEV
  
  config XEN_GRANT_DEV_ALLOC
         tristate "User-space grant reference allocator driver"
-       depends on XEN
+       depends on PARAVIRT_XEN
         default m
         help
           Allows userspace processes to create pages with access granted
@@ -140,9 +509,12 @@ config XEN_GRANT_DEV_ALLOC
  
  config SWIOTLB_XEN
         def_bool y
-       depends on PCI
+       depends on PARAVIRT_XEN && PCI
         select SWIOTLB
  
+config XEN_XENCOMM
+       bool
+
  config XEN_TMEM
         bool
         default y if (CLEANCACHE || FRONTSWAP)
@@ -152,8 +524,8 @@ config XEN_TMEM
  
  config XEN_PCIDEV_BACKEND
         tristate "Xen PCI-device backend driver"
-       depends on PCI && X86 && XEN
-       depends on XEN_BACKEND
+       depends on PCI && ((X86 && PARAVIRT_XEN_BACKEND) || (XEN_PRIVILEGED_GUEST && XEN_BACKEND))
+       default XEN_BACKEND if XEN
         default m
         help
           The PCI device backend driver allows the kernel to export arbitrary
@@ -161,11 +533,6 @@ config XEN_PCIDEV_BACKEND
           will need to make sure no other driver has bound to the device(s)
           you want to make visible to other guests.
  
-         The parameter "passthrough" allows you specify how you want the PCI
-         devices to appear in the guest. You can choose the default (0) where
-         PCI topology starts at 00.00.0, or (1) for passthrough if you want
-         the PCI devices topology appear the same as in the host.
-
           The "hide" parameter (only applicable if backend driver is compiled
           into the kernel) allows you to bind the PCI devices to this module
           from the default device drivers. The argument is the list of PCI BDFs:
@@ -173,14 +540,106 @@ config XEN_PCIDEV_BACKEND
  
           If in doubt, say m.
  
+menu "PCI Backend Mode"
+       depends on XEN_PCIDEV_BACKEND
+
+choice
+       prompt "Default PCI backend mode"
+       default XEN_PCIDEV_BACKEND_DEFAULT_CONTROLLER if IA64
+       default XEN_PCIDEV_BACKEND_DEFAULT_VPCI
+
+config XEN_PCIDEV_BACKEND_DEFAULT_VPCI
+       bool "Virtual PCI"
+       select XEN_PCIDEV_BACKEND_VPCI
+
+config XEN_PCIDEV_BACKEND_DEFAULT_PASSTHROUGH
+       bool "Passthrough"
+       select XEN_PCIDEV_BACKEND_PASSTHROUGH
+
+config XEN_PCIDEV_BACKEND_DEFAULT_SLOT
+       bool "Slot"
+       select XEN_PCIDEV_BACKEND_SLOT
+
+config XEN_PCIDEV_BACKEND_DEFAULT_CONTROLLER
+       bool "Controller"
+       depends on IA64
+       select XEN_PCIDEV_BACKEND_CONTROLLER
+
+endchoice
+
+config XEN_PCIDEV_BACKEND_DEFAULT
+       string
+       default "vpci" if XEN_PCIDEV_BACKEND_DEFAULT_VPCI
+       default "passthrough" if XEN_PCIDEV_BACKEND_DEFAULT_PASSTHROUGH
+       default "slot" if XEN_PCIDEV_BACKEND_DEFAULT_SLOT
+       default "controller" if XEN_PCIDEV_BACKEND_DEFAULT_CONTROLLER
+
+config XEN_PCIDEV_BACKEND_VPCI
+       bool "Virtual PCI"
+       default X86
+       ---help---
+         This PCI Backend hides the true PCI topology and makes the frontend
+         think there is a single PCI bus with only the exported devices on it.
+         For example, a device at 03:05.0 will be re-assigned to 00:00.0. A
+         second device at 02:1a.1 will be re-assigned to 00:01.1.
+
+         If not the default, the parameter "mode=vpci" allows you to use this
+         mode.
+
+config XEN_PCIDEV_BACKEND_PASSTHROUGH
+       bool "Passthrough"
+       ---help---
+         This PCI Backend provides a real view of the PCI topology to the
+         frontend (for example, a device at 06:01.b will still appear at
+         06:01.b to the frontend). This is similar to how Xen 2.0.x exposed
+         PCI devices to its driver domains. This may be required for drivers
+         which depend on finding their hardware in certain bus/slot
+         locations.
+
+         If not the default, the parameter "mode=passthrough" allows you to
+         use this mode.
+
+config XEN_PCIDEV_BACKEND_SLOT
+       bool "Slot"
+       ---help---
+         This PCI Backend hides the true PCI topology and makes the frontend
+         think there is a single PCI bus with only the exported devices on it.
+         Contrary to the virtual PCI backend, a function becomes a new slot.
+         For example, a device at 03:05.2 will be re-assigned to 00:00.0. A
+         second device at 02:1a.1 will be re-assigned to 00:01.0.
+
+         If not the default, the parameter "mode=slot" allows you to use this
+         mode.
+
+config XEN_PCIDEV_BACKEND_CONTROLLER
+       bool "Controller"
+       depends on IA64
+       ---help---
+         This PCI backend virtualizes the PCI bus topology by providing a
+         virtual bus per PCI root device.  Devices which are physically under
+         the same root bus will appear on the same virtual bus.  For systems
+         with complex I/O addressing, this is the only backend which supports
+         extended I/O port spaces and MMIO translation offsets.  This backend
+         also supports slot virtualization.  For example, a device at
+         0000:01:02.1 will be re-assigned to 0000:00:00.0.  A second device
+         at 0000:02:05.0 (behind a P2P bridge on bus 0000:01) will be
+         re-assigned to 0000:00:01.0.  A third device at 0000:16:05.0 (under
+         a different PCI root bus) will be re-assigned to 0000:01:00.0.
+
+         If not the default, the parameter "mode=controller" allows you to
+         use this mode.
+
+endmenu
+
  config XEN_PRIVCMD
         tristate
-       depends on XEN
+       depends on PARAVIRT_XEN || (XEN && PROC_FS)
+       default y if XEN
         default m
  
  config XEN_ACPI_PROCESSOR
         tristate "Xen ACPI processor"
-       depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ
+       depends on PARAVIRT_XEN && X86 && ACPI_PROCESSOR && CPU_FREQ
         default m
         help
            This ACPI processor uploads Power Management information to the Xen
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile

index 9adc5be..7de472b 100644 (file)
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,27 +1,68 @@
-obj-y  += grant-table.o features.o events.o manage.o balloon.o
-obj-y  += xenbus/
+obj-$(CONFIG_PARAVIRT_XEN)     += grant-table.o features.o events.o manage.o balloon.o
+xen-biomerge-$(CONFIG_PARAVIRT_XEN) := biomerge.o
+xen-hotplug-$(CONFIG_PARAVIRT_XEN) := cpu_hotplug.o
+xen-balloon_$(CONFIG_PARAVIRT_XEN) := xen-balloon.o
+xen-evtchn-name-$(CONFIG_PARAVIRT_XEN) := xen-evtchn
+xen-privcmd_$(CONFIG_PARAVIRT_XEN) := xen-privcmd.o
+
+xen-balloon_$(CONFIG_XEN)      := balloon/
+xen-privcmd_$(CONFIG_XEN)      := privcmd/
+obj-$(CONFIG_XEN)              += core/
+obj-$(CONFIG_XEN)              += console/
+obj-y                          += xenbus/
+obj-$(CONFIG_XEN)              += char/
+
+xen-backend-$(CONFIG_XEN_BACKEND)      := util.o
+xen-evtchn-name-$(CONFIG_XEN)          := evtchn
  
  nostackp := $(call cc-option, -fno-stack-protector)
+ifeq ($(CONFIG_PARAVIRT_XEN),y)
  CFLAGS_features.o                      := $(nostackp)
+endif
+
+priv-$(CONFIG_PCI)                     := pci.o
  
-obj-$(CONFIG_BLOCK)                    += biomerge.o
-obj-$(CONFIG_HOTPLUG_CPU)              += cpu_hotplug.o
+obj-$(CONFIG_XEN)                      += features.o $(xen-backend-y) $(xen-backend-m)
+obj-$(CONFIG_XEN_PRIVILEGED_GUEST)     += $(priv-y)
+obj-$(CONFIG_BLOCK)                    += $(xen-biomerge-y)
+obj-$(CONFIG_HOTPLUG_CPU)              += $(xen-hotplug-y)
  obj-$(CONFIG_XEN_XENCOMM)              += xencomm.o
-obj-$(CONFIG_XEN_BALLOON)              += xen-balloon.o
+obj-$(CONFIG_XEN_BALLOON)              += $(xen-balloon_y)
  obj-$(CONFIG_XEN_SELFBALLOONING)       += xen-selfballoon.o
-obj-$(CONFIG_XEN_DEV_EVTCHN)           += xen-evtchn.o
+obj-$(CONFIG_XEN_DEV_EVTCHN)           += $(xen-evtchn-name-y).o
  obj-$(CONFIG_XEN_GNTDEV)               += xen-gntdev.o
-obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)      += xen-gntalloc.o
  obj-$(CONFIG_XENFS)                    += xenfs/
+obj-$(CONFIG_XEN_GRANT_DEV_ALLOC)      += xen-gntalloc.o
  obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
  obj-$(CONFIG_XEN_PVHVM)                        += platform-pci.o
  obj-$(CONFIG_XEN_TMEM)                 += tmem.o
  obj-$(CONFIG_SWIOTLB_XEN)              += swiotlb-xen.o
  obj-$(CONFIG_XEN_DOM0)                 += pci.o
  obj-$(CONFIG_XEN_PCIDEV_BACKEND)       += xen-pciback/
-obj-$(CONFIG_XEN_PRIVCMD)              += xen-privcmd.o
+obj-$(CONFIG_XEN_PRIVCMD)              += $(xen-privcmd_y)
  obj-$(CONFIG_XEN_ACPI_PROCESSOR)       += xen-acpi-processor.o
  xen-evtchn-y                           := evtchn.o
  xen-gntdev-y                           := gntdev.o
  xen-gntalloc-y                         := gntalloc.o
  xen-privcmd-y                          := privcmd.o
+
+obj-$(CONFIG_XEN_BLKDEV_BACKEND)       += blkback/
+obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
+obj-$(filter m,$(CONFIG_XEN_BLKDEV_TAP2)) += blktap2/ blktap2-new/
+obj-$(CONFIG_XEN_BLKDEV_TAP2_LEGACY)   += blktap2/
+obj-$(CONFIG_XEN_BLKDEV_TAP2_NEW)      += blktap2-new/
+obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
+obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmback/
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
+obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
+obj-$(CONFIG_XEN_PCIDEV_FRONTEND)      += pcifront/
+obj-$(CONFIG_XEN_FRAMEBUFFER)          += fbfront/
+obj-$(CONFIG_XEN_KEYBOARD)             += fbfront/
+obj-$(CONFIG_XEN_SCSI_BACKEND)         += scsiback/
+obj-$(CONFIG_XEN_SCSI_FRONTEND)                += scsifront/
+obj-$(CONFIG_XEN_USB_BACKEND)          += usbback/
+obj-$(CONFIG_XEN_USB_FRONTEND)         += usbfront/
+obj-$(CONFIG_XEN_GRANT_DEV)    += gntdev/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL)                += sfc_netutil/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND)    += sfc_netfront/
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND)     += sfc_netback/
diff --git a/drivers/xen/balloon/Makefile b/drivers/xen/balloon/Makefile

new file mode 100644 (file)

index 0000000..3fc3d0b
--- /dev/null
+++ b/drivers/xen/balloon/Makefile
@@ -0,0 +1,2 @@
+
+obj-y := balloon.o sysfs.o
diff --git a/drivers/xen/balloon/balloon.c b/drivers/xen/balloon/balloon.c

new file mode 100644 (file)

index 0000000..d522460
--- /dev/null
+++ b/drivers/xen/balloon/balloon.c
@@ -0,0 +1,804 @@
+/******************************************************************************
+ * balloon.c
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <xen/xen_proc.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/interface/memory.h>
+#include <asm/maddr.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#ifdef CONFIG_PROC_FS
+static struct proc_dir_entry *balloon_pde;
+#endif
+
+static DEFINE_MUTEX(balloon_mutex);
+
+/*
+ * Protects atomic reservation decrease/increase against concurrent increases.
+ * Also protects non-atomic updates of current_pages and driver_pages, and
+ * balloon lists.
+ */
+DEFINE_SPINLOCK(balloon_lock);
+
+struct balloon_stats balloon_stats;
+
+/* We increase/decrease in batches which fit in a page */
+static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
+
+#ifdef CONFIG_HIGHMEM
+#define inc_totalhigh_pages() (totalhigh_pages++)
+#define dec_totalhigh_pages() (totalhigh_pages--)
+#else
+#define inc_totalhigh_pages() ((void)0)
+#define dec_totalhigh_pages() ((void)0)
+#endif
+
+#ifndef CONFIG_XEN
+/*
+ * In HVM guests accounting here uses the Xen visible values, but the kernel
+ * determined totalram_pages value shouldn't get altered. Since totalram_pages
+ * includes neither the kernel static image nor any memory allocated prior to
+ * or from the bootmem allocator, we have to synchronize the two values.
+ */
+static unsigned long __read_mostly totalram_bias;
+#else
+#define totalram_bias 0
+#endif
+
+/* List of ballooned pages, threaded through the mem_map array. */
+static LIST_HEAD(ballooned_pages);
+
+/* Main work function, always executed in process context. */
+static void balloon_process(struct work_struct *unused);
+static DECLARE_WORK(balloon_worker, balloon_process);
+
+/* When ballooning out (allocating memory to return to Xen) we don't really 
+   want the kernel to try too hard since that can trigger the oom killer. */
+#define GFP_BALLOON (GFP_HIGHUSER|__GFP_NOWARN|__GFP_NORETRY|__GFP_NOMEMALLOC|\
+                    __GFP_NOTRACK|__GFP_COLD)
+
+/* Ballooned pages are chained through the lru member of struct page. */
+#define PAGE_TO_LIST(p) (&(p)->lru)
+/* Inverse of PAGE_TO_LIST(): recover the page that embeds list node l. */
+#define LIST_TO_PAGE(l) list_entry((l), struct page, lru)
+/* Unlink p from its list and reset both of its link pointers to NULL. */
+#define UNLIST_PAGE(p)                         \
+       do {                                    \
+               list_del(PAGE_TO_LIST(p));      \
+               PAGE_TO_LIST(p)->next = NULL;   \
+               PAGE_TO_LIST(p)->prev = NULL;   \
+       } while(0)
+
+/* Informational / warning log helpers; both prefix messages with "xen_mem: ". */
+#define IPRINTK(fmt, args...) pr_info("xen_mem: " fmt, ##args)
+#define WPRINTK(fmt, args...) pr_warning("xen_mem: " fmt, ##args)
+
+/*
+ * balloon_append: add the given page to the balloon.
+ *
+ * @page:    page to put on the ballooned_pages list.
+ * @account: non-zero when the page is being taken away from the running
+ *           system: the page is then marked reserved, its P2M entry is
+ *           invalidated and the zone/highmem page counters are decremented.
+ *           Zero when the caller has already done that (the page must
+ *           already be reserved); only the balloon counters are updated.
+ *
+ * NOTE(review): "bs" is not defined in this file; presumably an alias for
+ * balloon_stats provided by common.h - confirm.
+ */
+static void balloon_append(struct page *page, int account)
+{
+       unsigned long pfn;
+
+       /* Lowmem is re-populated first, so highmem pages go at list tail. */
+       if (PageHighMem(page)) {
+               list_add_tail(PAGE_TO_LIST(page), &ballooned_pages);
+               bs.balloon_high++;
+               if (account)
+                       dec_totalhigh_pages();
+       } else {
+               list_add(PAGE_TO_LIST(page), &ballooned_pages);
+               bs.balloon_low++;
+       }
+
+       pfn = page_to_pfn(page);
+       if (account) {
+               SetPageReserved(page);
+               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+               page_zone(page)->present_pages--;
+       } else {
+               /* Caller must have reserved the page and dropped its mapping. */
+               BUG_ON(!PageReserved(page));
+               WARN_ON_ONCE(phys_to_machine_mapping_valid(pfn));
+       }
+}
+
+/*
+ * balloon_retrieve: take the first page off the balloon list.
+ *
+ * Returns NULL when the balloon is empty. Otherwise the page is unlinked,
+ * the low/high balloon counters and the zone's present_pages are adjusted,
+ * and *was_empty gets bit 0 set if the page's zone was unpopulated before
+ * this page was given back to it.
+ */
+static struct page *balloon_retrieve(int *was_empty)
+{
+       struct page *pg;
+       struct zone *z;
+
+       if (list_empty(&ballooned_pages))
+               return NULL;
+
+       pg = LIST_TO_PAGE(ballooned_pages.next);
+       UNLIST_PAGE(pg);
+       BUG_ON(!PageReserved(pg));
+
+       if (!PageHighMem(pg)) {
+               bs.balloon_low--;
+       } else {
+               bs.balloon_high--;
+               inc_totalhigh_pages();
+       }
+
+       z = page_zone(pg);
+       /* Remember whether this zone goes from empty to populated. */
+       if (!populated_zone(z))
+               *was_empty |= 1;
+       z->present_pages++;
+
+       return pg;
+}
+
+/* Peek at the head of the balloon list without removing it (NULL if empty). */
+static struct page *balloon_first_page(void)
+{
+       return list_empty(&ballooned_pages)
+               ? NULL : LIST_TO_PAGE(ballooned_pages.next);
+}
+
+/* Return the page following @page on the balloon list, or NULL at the end. */
+static struct page *balloon_next_page(struct page *page)
+{
+       struct list_head *succ = PAGE_TO_LIST(page)->next;
+
+       /* Wrapping back to the list head means @page was the last entry. */
+       return (succ != &ballooned_pages) ? LIST_TO_PAGE(succ) : NULL;
+}
+
+/*
+ * Give a page back to the page allocator.
+ *
+ * Built-in code drops the reference and, if it was the last one, frees the
+ * page through free_hot_cold_page(); modular builds fall back to
+ * __free_page() because free_hot_cold_page() is not exported.
+ * NOTE(review): the second argument (1) presumably requests "cold"
+ * placement - confirm against free_hot_cold_page()'s definition.
+ */
+static inline void balloon_free_page(struct page *page)
+{
+#ifndef MODULE
+       if (put_page_testzero(page))
+               free_hot_cold_page(page, 1);
+#else
+       /* free_hot_cold_page() is not being exported. */
+       __free_page(page);
+#endif
+}
+
+/* Timer callback: queue the balloon worker; the timer argument is unused. */
+static void balloon_alarm(unsigned long unused)
+{
+       schedule_work(&balloon_worker);
+}
+/* Retry timer; balloon_process() re-arms it while work remains to be done. */
+static DEFINE_TIMER(balloon_timer, balloon_alarm, 0, 0);
+
+/*
+ * Effective target in pages: the requested target, clamped so that it never
+ * exceeds what is currently held plus everything sitting in the balloon.
+ */
+static unsigned long current_target(void)
+{
+       unsigned long target = bs.target_pages;
+       unsigned long ceiling = bs.current_pages + bs.balloon_low + bs.balloon_high;
+
+       return (target > ceiling) ? ceiling : target;
+}
+
+/*
+ * balloon_minimum_target: lowest target (in pages) the balloon will accept.
+ *
+ * The floor is a piecewise linear function of the maximum memory size
+ * (max_pfn; num_physpages is substituted when CONFIG_XEN is not defined) and
+ * is never larger than the current effective target, so calling this cannot
+ * by itself force the domain to grow.
+ */
+unsigned long balloon_minimum_target(void)
+{
+#ifndef CONFIG_XEN
+#define max_pfn num_physpages
+#endif
+       unsigned long min_pages, curr_pages = current_target();
+
+#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
+       /* Simple continuous piecewise linear function:
+        *  max MiB -> min MiB  gradient
+        *       0         0
+        *      16        16
+        *      32        24
+        *     128        72    (1/2)
+        *     512       168    (1/4)
+        *    2048       360    (1/8)
+        *    8192       552    (1/32)
+        *   32768      1320
+        *  131072      4392
+        */
+       if (max_pfn < MB2PAGES(128))
+               min_pages = MB2PAGES(8) + (max_pfn >> 1);
+       else if (max_pfn < MB2PAGES(512))
+               min_pages = MB2PAGES(40) + (max_pfn >> 2);
+       else if (max_pfn < MB2PAGES(2048))
+               min_pages = MB2PAGES(104) + (max_pfn >> 3);
+       else
+               min_pages = MB2PAGES(296) + (max_pfn >> 5);
+#undef MB2PAGES
+
+       /* Don't enforce growth */
+       return min(min_pages, curr_pages);
+#ifndef CONFIG_XEN
+#undef max_pfn
+#endif
+}
+
+/*
+ * increase_reservation: ask Xen to repopulate up to nr_pages ballooned
+ * frames and hand the corresponding pages back to the page allocator.
+ *
+ * The request is capped at ARRAY_SIZE(frame_list) frames per call.
+ *
+ * Returns a negative value if the XENMEM_populate_physmap hypercall failed,
+ * 0 if every requested (capped) frame was populated, and 1 if Xen populated
+ * fewer frames than requested.
+ */
+static int increase_reservation(unsigned long nr_pages)
+{
+       unsigned long  pfn, i, flags;
+       struct page   *page;
+       long           rc;
+       int            need_zonelists_rebuild = 0;
+       struct xen_memory_reservation reservation = {
+               .address_bits = 0,
+               .extent_order = 0,
+               .domid        = DOMID_SELF
+       };
+
+       if (nr_pages > ARRAY_SIZE(frame_list))
+               nr_pages = ARRAY_SIZE(frame_list);
+
+       balloon_lock(flags);
+
+       /* Collect the PFNs of the first nr_pages ballooned pages. */
+       page = balloon_first_page();
+       for (i = 0; i < nr_pages; i++) {
+               BUG_ON(page == NULL);
+               frame_list[i] = page_to_pfn(page);
+               page = balloon_next_page(page);
+       }
+
+       set_xen_guest_handle(reservation.extent_start, frame_list);
+       reservation.nr_extents = nr_pages;
+       rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+       if (rc < 0)
+               goto out;
+
+       /* Only the first rc entries of frame_list were populated by Xen. */
+       for (i = 0; i < rc; i++) {
+               page = balloon_retrieve(&need_zonelists_rebuild);
+               BUG_ON(page == NULL);
+
+               pfn = page_to_pfn(page);
+               BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
+                      phys_to_machine_mapping_valid(pfn));
+
+               set_phys_to_machine(pfn, frame_list[i]);
+
+#ifdef CONFIG_XEN
+               /* Link back into the page tables if not highmem. */
+               if (pfn < max_low_pfn) {
+                       int ret;
+                       ret = HYPERVISOR_update_va_mapping(
+                               (unsigned long)__va(pfn << PAGE_SHIFT),
+                               pfn_pte_ma(frame_list[i], PAGE_KERNEL),
+                               0);
+                       BUG_ON(ret);
+               }
+#endif
+
+               /* Relinquish the page back to the allocator. */
+               ClearPageReserved(page);
+               init_page_count(page);
+               balloon_free_page(page);
+       }
+
+       bs.current_pages += rc;
+       totalram_pages = bs.current_pages - totalram_bias;
+
+ out:
+       balloon_unlock(flags);
+
+#ifndef MODULE
+       /* These helpers are only used when the driver is built in. */
+       setup_per_zone_wmarks();
+       if (rc > 0)
+               kswapd_run(0);
+       if (need_zonelists_rebuild)
+               build_all_zonelists(NULL);
+       else
+               vm_total_pages = nr_free_pagecache_pages();
+#endif
+
+       return rc < 0 ? rc : rc != nr_pages;
+}
+
+/*
+ * decrease_reservation: allocate up to nr_pages pages from the kernel and
+ * return their machine frames to Xen.
+ *
+ * The request is capped at ARRAY_SIZE(frame_list) frames per call. If a
+ * page allocation fails, only the pages obtained so far are released.
+ *
+ * Returns 1 if an allocation failed (caller should back off), 0 otherwise.
+ */
+static int decrease_reservation(unsigned long nr_pages)
+{
+       unsigned long  pfn, i, flags;
+       struct page   *page;
+       void          *v;
+       int            need_sleep = 0;
+       int ret;
+       struct xen_memory_reservation reservation = {
+               .address_bits = 0,
+               .extent_order = 0,
+               .domid        = DOMID_SELF
+       };
+
+       if (nr_pages > ARRAY_SIZE(frame_list))
+               nr_pages = ARRAY_SIZE(frame_list);
+
+       for (i = 0; i < nr_pages; i++) {
+               if ((page = alloc_page(GFP_BALLOON)) == NULL) {
+                       /* Out of memory: release only what we got so far. */
+                       nr_pages = i;
+                       need_sleep = 1;
+                       break;
+               }
+
+               pfn = page_to_pfn(page);
+               frame_list[i] = pfn_to_mfn(pfn);
+
+               if (!PageHighMem(page)) {
+                       /* Lowmem: scrub, then (CONFIG_XEN) drop the kernel mapping. */
+                       v = phys_to_virt(pfn << PAGE_SHIFT);
+                       xen_scrub_pages(v, 1);
+#ifdef CONFIG_XEN
+                       ret = HYPERVISOR_update_va_mapping(
+                               (unsigned long)v, __pte_ma(0), 0);
+                       BUG_ON(ret);
+#endif
+               }
+#ifdef CONFIG_XEN_SCRUB_PAGES
+               else {
+                       /* Highmem needs a temporary mapping to be scrubbed. */
+                       v = kmap(page);
+                       xen_scrub_pages(v, 1);
+                       kunmap(page);
+               }
+#endif
+       }
+
+#ifdef CONFIG_XEN
+       /* Ensure that ballooned highmem pages don't have kmaps. */
+       kmap_flush_unused();
+       flush_tlb_all();
+#endif
+
+       balloon_lock(flags);
+
+       /* No more mappings: invalidate P2M and add to balloon. */
+       for (i = 0; i < nr_pages; i++) {
+               pfn = mfn_to_pfn(frame_list[i]);
+               balloon_append(pfn_to_page(pfn), 1);
+       }
+
+       set_xen_guest_handle(reservation.extent_start, frame_list);
+       reservation.nr_extents   = nr_pages;
+       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+       /* Xen must accept every frame we offered. */
+       BUG_ON(ret != nr_pages);
+
+       bs.current_pages -= nr_pages;
+       totalram_pages = bs.current_pages - totalram_bias;
+
+       balloon_unlock(flags);
+
+       return need_sleep;
+}
+
+/*
+ * balloon_process: work function that moves the reservation towards the
+ * current target, one batch at a time.
+ *
+ * We avoid multiple worker processes conflicting via the balloon mutex.
+ * We may of course race updates of the target counts (which are protected
+ * by the balloon lock), or with changes to the Xen hard limit, but we will
+ * recover from these in time.
+ */
+static void balloon_process(struct work_struct *unused)
+{
+       int need_sleep = 0;
+       long credit;
+
+       mutex_lock(&balloon_mutex);
+
+       do {
+               /* Positive credit: grow the domain; negative: shrink it. */
+               credit = current_target() - bs.current_pages;
+               if (credit > 0)
+                       need_sleep = (increase_reservation(credit) != 0);
+               if (credit < 0)
+                       need_sleep = (decrease_reservation(-credit) != 0);
+
+#ifndef CONFIG_PREEMPT
+               if (need_resched())
+                       schedule();
+#endif
+       } while ((credit != 0) && !need_sleep);
+
+       /* Schedule more work if there is some still to be done. */
+       if (current_target() != bs.current_pages)
+               mod_timer(&balloon_timer, jiffies + HZ);
+
+       mutex_unlock(&balloon_mutex);
+}
+
+/*
+ * Record a new balloon target (in pages), never going below
+ * balloon_minimum_target(), then queue the worker to act on it.
+ */
+void balloon_set_new_target(unsigned long target)
+{
+       unsigned long lower_bound = balloon_minimum_target();
+
+       /* Plain store rather than a read-modify-write, so no lock is taken. */
+       bs.target_pages = max(target, lower_bound);
+       schedule_work(&balloon_worker);
+}
+
+/* xenbus watch on the "memory/target" node; its callback is set in balloon_init(). */
+static struct xenbus_watch target_watch =
+{
+       .node = "memory/target"
+};
+
+/*
+ * React to a change in the target key: re-read memory/target from xenstore
+ * and, if it parses as a single number, install it as the new balloon
+ * target. The watch, vec and len arguments are not used.
+ */
+static void watch_target(struct xenbus_watch *watch,
+                        const char **vec, unsigned int len)
+{
+       unsigned long long new_target;
+       int err;
+
+       err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target);
+       if (err != 1) {
+               /* This is ok (for domain0 at least) - so just return */
+               return;
+       }
+
+       /* The given memory/target value is in KiB, so it needs converting to
+        * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
+        */
+       balloon_set_new_target(new_target >> (PAGE_SHIFT - 10));
+}
+
+/*
+ * Xenstore notifier callback: register the memory/target watch. A failure
+ * is only logged; NOTIFY_DONE is returned in every case.
+ */
+static int balloon_init_watcher(struct notifier_block *notifier,
+                               unsigned long event,
+                               void *data)
+{
+       if (register_xenbus_watch(&target_watch) != 0)
+               pr_err("Failed to set balloon watcher\n");
+
+       return NOTIFY_DONE;
+}
+
+#ifdef CONFIG_PROC_FS
+/*
+ * balloon_write: write handler installed on the balloon proc entry.
+ *
+ * Parses the written text with memparse() as a size in bytes (size suffixes
+ * as understood by memparse()) and installs it, converted to pages, as the
+ * new balloon target.
+ *
+ * Returns count on success, -EPERM without CAP_SYS_ADMIN, -EBADMSG for
+ * writes of at most one byte, -EFBIG for writes larger than the local
+ * buffer and -EFAULT if the user buffer cannot be read.
+ */
+static int balloon_write(struct file *file, const char __user *buffer,
+                        unsigned long count, void *data)
+{
+       char memstring[64], *endchar;
+       unsigned long long target_bytes;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (count <= 1)
+               return -EBADMSG; /* runt */
+       if (count > sizeof(memstring))
+               return -EFBIG;   /* too long */
+
+       if (copy_from_user(memstring, buffer, count))
+               return -EFAULT;
+       /*
+        * Terminate directly after the copied data so that memparse() never
+        * reads uninitialised stack bytes when the write is shorter than the
+        * buffer. A full-size write still has its last byte overwritten.
+        */
+       memstring[min_t(unsigned long, count, sizeof(memstring) - 1)] = '\0';
+
+       target_bytes = memparse(memstring, &endchar);
+       balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+       return count;
+}
+
+/*
+ * balloon_read: read handler installed on the balloon proc entry.
+ *
+ * Formats the balloon statistics, all in kB, into @page in one go, sets
+ * *eof and returns the number of bytes written. The start, off, count and
+ * data arguments are not used. "Maximum target" reports num_physpages.
+ */
+static int balloon_read(char *page, char **start, off_t off,
+                       int count, int *eof, void *data)
+{
+       int len;
+
+       len = sprintf(
+               page,
+               "Current allocation: %8lu kB\n"
+               "Requested target:   %8lu kB\n"
+               "Minimum target:     %8lu kB\n"
+               "Maximum target:     %8lu kB\n"
+               "Low-mem balloon:    %8lu kB\n"
+               "High-mem balloon:   %8lu kB\n"
+               "Driver pages:       %8lu kB\n",
+               PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), 
+               PAGES2KB(balloon_minimum_target()), PAGES2KB(num_physpages),
+               PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high),
+               PAGES2KB(bs.driver_pages));
+
+
+       *eof = 1;
+       return len;
+}
+#endif
+
+static struct notifier_block xenstore_notifier;
+
+/*
+ * balloon_init: initialise the balloon driver.
+ *
+ * Seeds the balloon statistics, creates the "balloon" proc entry (when
+ * CONFIG_PROC_FS is set) and the sysfs interface, pre-loads the balloon
+ * with the pages above the initial allocation (x86 with CONFIG_XEN) and
+ * registers a xenstore notifier that sets up the memory/target watch.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen and -ENOMEM when
+ * the proc entry cannot be created.
+ */
+static int __init balloon_init(void)
+{
+#if !defined(CONFIG_XEN)
+# ifndef XENMEM_get_pod_target
+#  define XENMEM_get_pod_target 17
+       typedef struct xen_pod_target {
+               uint64_t target_pages;
+               uint64_t tot_pages;
+               uint64_t pod_cache_pages;
+               uint64_t pod_entries;
+               domid_t domid;
+       } xen_pod_target_t;
+# endif
+       xen_pod_target_t pod_target = { .domid = DOMID_SELF };
+       int rc;
+#elif defined(CONFIG_X86)
+       unsigned long pfn;
+       struct page *page;
+#endif
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       IPRINTK("Initialising balloon driver.\n");
+
+#ifdef CONFIG_XEN
+       bs.current_pages = min(xen_start_info->nr_pages, max_pfn);
+       totalram_pages   = bs.current_pages;
+#else
+       rc = HYPERVISOR_memory_op(XENMEM_get_pod_target, &pod_target);
+       /*
+        * Xen prior to 3.4.0 masks the memory_op command to 4 bits, thus
+        * converting XENMEM_get_pod_target to XENMEM_decrease_reservation.
+        * Fortunately this results in a request with all input fields zero,
+        * but (due to the way bit 4 and upwards get interpreted) a starting
+        * extent of 1. When start_extent > nr_extents (>= in newer Xen), we
+        * simply get start_extent returned.
+        */
+       totalram_bias = HYPERVISOR_memory_op(rc != -ENOSYS && rc != 1
+               ? XENMEM_maximum_reservation : XENMEM_current_reservation,
+               &pod_target.domid);
+       if ((long)totalram_bias != -ENOSYS) {
+               /* Keep the difference between Xen's and the kernel's view. */
+               BUG_ON(totalram_bias < totalram_pages);
+               bs.current_pages = totalram_bias;
+               totalram_bias -= totalram_pages;
+       } else {
+               totalram_bias = 0;
+               bs.current_pages = totalram_pages;
+       }
+#endif
+       bs.target_pages  = bs.current_pages;
+       bs.balloon_low   = 0;
+       bs.balloon_high  = 0;
+       bs.driver_pages  = 0UL;
+
+#ifdef CONFIG_PROC_FS
+       if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) {
+               WPRINTK("Unable to create /proc/xen/balloon.\n");
+               /* Report a real errno; a bare -1 would read as -EPERM. */
+               return -ENOMEM;
+       }
+
+       balloon_pde->read_proc  = balloon_read;
+       balloon_pde->write_proc = balloon_write;
+#endif
+       balloon_sysfs_init();
+
+#if defined(CONFIG_X86) && defined(CONFIG_XEN)
+       /* Initialise the balloon with excess memory space. */
+       for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
+               page = pfn_to_page(pfn);
+               if (!PageReserved(page)) {
+                       SetPageReserved(page);
+                       set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+                       balloon_append(page, 0);
+               }
+       }
+#endif
+
+       target_watch.callback = watch_target;
+       xenstore_notifier.notifier_call = balloon_init_watcher;
+
+       register_xenstore_notifier(&xenstore_notifier);
+
+       return 0;
+}
+
+subsys_initcall(balloon_init);
+
+/*
+ * Module exit hook: tears down the sysfs interface through
+ * balloon_sysfs_exit(). Pages still held in the balloon are not released
+ * here (see the XXX note below).
+ */
+static void __exit balloon_exit(void)
+{
+       balloon_sysfs_exit();
+       /* XXX - release balloon here */
+}
+
+module_exit(balloon_exit); 
+
+/*
+ * Add @delta (which may be negative) to bs.driver_pages, the count of pages
+ * drivers have reported as changing the reservation on their own. The
+ * update is done with the balloon lock held.
+ */
+void balloon_update_driver_allowance(long delta)
+{
+       unsigned long irq_state;
+
+       balloon_lock(irq_state);
+       bs.driver_pages = bs.driver_pages + delta;
+       balloon_unlock(irq_state);
+}
+EXPORT_SYMBOL_GPL(balloon_update_driver_allowance);
+
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+
+#ifdef CONFIG_XEN
+/*
+ * Per-PTE callback handed to apply_to_page_range() by
+ * alloc_empty_pages_and_pagevec(): gives the machine frame behind one
+ * kernel virtual address back to Xen.
+ *
+ * @pte:      page table entry mapping @addr
+ * @pmd_page: unused
+ * @addr:     kernel virtual address of the page being emptied
+ * @data:     unused
+ *
+ * Always returns 0; a hypervisor refusal is treated as fatal (BUG).
+ */
+static int dealloc_pte_fn(
+       pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+{
+       unsigned long pfn, mfn = pte_mfn(*pte);
+       int ret;
+       struct xen_memory_reservation reservation = {
+               .nr_extents   = 1,
+               .extent_order = 0,
+               .domid        = DOMID_SELF
+       };
+       set_xen_guest_handle(reservation.extent_start, &mfn);
+       /* Clear the mapping and the p2m entry before the frame is released. */
+       set_pte_at(&init_mm, addr, pte, __pte_ma(0));
+       pfn = __pa(addr) >> PAGE_SHIFT;
+       set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+       SetPageReserved(pfn_to_page(pfn));
+       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+       /* Exactly one extent was requested, so exactly one must be released. */
+       BUG_ON(ret != 1);
+       return 0;
+}
+#endif
+
+/*
+ * Build a vector of @nr_pages struct page pointers.
+ *
+ * Each slot is filled either with a low-memory page taken from the head of
+ * the balloon, or with a freshly allocated page that is scrubbed and whose
+ * backing frame is handed back to Xen with XENMEM_decrease_reservation.
+ *
+ * Returns the kmalloc()ed vector, or NULL on failure. On failure the pages
+ * gathered so far are appended to the balloon and the vector is freed. The
+ * balloon worker is scheduled on both the success and the failure path.
+ */
+struct page **alloc_empty_pages_and_pagevec(int nr_pages)
+{
+       unsigned long flags;
+       void *v;
+       struct page *page, **pagevec;
+       int i, ret;
+
+       /* 'page' is a pointer, so this sizes an array of nr_pages pointers. */
+       pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL);
+       if (pagevec == NULL)
+               return NULL;
+
+       for (i = 0; i < nr_pages; i++) {
+               /* Fast path: reuse the first balloon page if it is lowmem. */
+               balloon_lock(flags);
+               page = balloon_first_page();
+               if (page && !PageHighMem(page)) {
+                       UNLIST_PAGE(page);
+                       bs.balloon_low--;
+                       balloon_unlock(flags);
+                       pagevec[i] = page;
+                       continue;
+               }
+               balloon_unlock(flags);
+
+               /* Slow path: allocate a page and release its frame to Xen. */
+               page = pagevec[i] = alloc_page(GFP_KERNEL|__GFP_NOTRACK|__GFP_COLD);
+               if (page == NULL)
+                       goto err;
+
+               v = page_address(page);
+               xen_scrub_pages(v, 1);
+
+               balloon_lock(flags);
+
+               if (xen_feature(XENFEAT_auto_translated_physmap)) {
+                       unsigned long gmfn = page_to_pfn(page);
+                       struct xen_memory_reservation reservation = {
+                               .nr_extents   = 1,
+                               .extent_order = 0,
+                               .domid        = DOMID_SELF
+                       };
+                       set_xen_guest_handle(reservation.extent_start, &gmfn);
+                       ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                                  &reservation);
+                       if (ret == 1)
+                               ret = 0; /* success */
+               } else {
+#ifdef CONFIG_XEN
+                       ret = apply_to_page_range(&init_mm, (unsigned long)v,
+                                                 PAGE_SIZE, dealloc_pte_fn,
+                                                 NULL);
+#else
+                       /* Cannot handle non-auto translate mode. */
+                       ret = 1;
+#endif
+               }
+
+               if (ret != 0) {
+                       /* This page is freed here; err only handles slots < i. */
+                       balloon_free_page(page);
+                       balloon_unlock(flags);
+                       goto err;
+               }
+
+               /* One page fewer is populated: update the counters. */
+               totalram_pages = --bs.current_pages - totalram_bias;
+               if (PageHighMem(page))
+                       dec_totalhigh_pages();
+               page_zone(page)->present_pages--;
+
+               balloon_unlock(flags);
+       }
+
+ out:
+       schedule_work(&balloon_worker);
+#ifdef CONFIG_XEN
+       flush_tlb_all();
+#endif
+       return pagevec;
+
+ err:
+       /* Put pages 0..i-1 into the balloon, then leave via out with NULL. */
+       balloon_lock(flags);
+       while (--i >= 0)
+               balloon_append(pagevec[i], 0);
+       balloon_unlock(flags);
+       kfree(pagevec);
+       pagevec = NULL;
+       goto out;
+}
+EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec);
+
+#endif /* CONFIG_XEN_BACKEND */
+
+#ifdef CONFIG_XEN
+/*
+ * Append all @nr_pages entries of @pagevec to the balloon and kick the
+ * balloon worker. Each page must have a reference count of exactly one.
+ *
+ * When @account is true the pages are also subtracted from
+ * bs.current_pages and totalram_pages is recomputed.
+ */
+static void _free_empty_pages(struct page **pagevec, int nr_pages,
+                             bool account)
+{
+       unsigned long irq_state;
+       int idx = 0;
+
+       balloon_lock(irq_state);
+
+       while (idx < nr_pages) {
+               struct page *pg = pagevec[idx];
+
+               BUG_ON(page_count(pg) != 1);
+               balloon_append(pg, account);
+               idx++;
+       }
+
+       if (account) {
+               bs.current_pages = bs.current_pages - nr_pages;
+               totalram_pages = bs.current_pages - totalram_bias;
+       }
+
+       balloon_unlock(irq_state);
+
+       schedule_work(&balloon_worker);
+}
+
+/*
+ * Hand @nr_pages pages from @pagevec to the balloon with accounting
+ * enabled (account == true in _free_empty_pages()). The vector itself is
+ * not freed.
+ */
+void free_empty_pages(struct page **pagevec, int nr_pages)
+{
+       _free_empty_pages(pagevec, nr_pages, true);
+}
+#endif
+
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+/*
+ * Return the pages of @pagevec to the balloon without touching the page
+ * accounting, then free the vector. A NULL @pagevec is ignored.
+ */
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages)
+{
+       if (!pagevec)
+               return;
+
+       _free_empty_pages(pagevec, nr_pages, false);
+       kfree(pagevec);
+}
+EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec);
+#endif
+
+/*
+ * Move one driver-owned @page into the balloon: the page is appended with
+ * accounting, bs.current_pages and bs.driver_pages each drop by one,
+ * totalram_pages is recomputed, and the balloon worker is scheduled.
+ */
+void balloon_release_driver_page(struct page *page)
+{
+       unsigned long irq_state;
+
+       balloon_lock(irq_state);
+       balloon_append(page, 1);
+       bs.current_pages--;
+       totalram_pages = bs.current_pages - totalram_bias;
+       bs.driver_pages--;
+       balloon_unlock(irq_state);
+
+       schedule_work(&balloon_worker);
+}
+EXPORT_SYMBOL_GPL(balloon_release_driver_page);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/balloon/common.h b/drivers/xen/balloon/common.h

new file mode 100644 (file)

index 0000000..0a53f7a
--- /dev/null
+++ b/drivers/xen/balloon/common.h
@@ -0,0 +1,57 @@
+/******************************************************************************
+ * balloon/common.h
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BALLOON_COMMON_H__
+#define __XEN_BALLOON_COMMON_H__
+
+#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10))
+
+/*
+ * Balloon accounting, shared through the global 'balloon_stats' (aliased
+ * as 'bs'). Every field is a page count; the reporting code converts to
+ * kB with PAGES2KB().
+ */
+struct balloon_stats {
+       /* We aim for 'current allocation' == 'target allocation'. */
+       unsigned long current_pages;
+       unsigned long target_pages;
+       /*
+        * Drivers may alter the memory reservation independently, but they
+        * must inform the balloon driver so we avoid hitting the hard limit.
+        */
+       unsigned long driver_pages;
+       /* Number of pages in high- and low-memory balloons. */
+       unsigned long balloon_low;
+       unsigned long balloon_high;
+};
+
+extern struct balloon_stats balloon_stats;
+#define bs balloon_stats
+
+int balloon_sysfs_init(void);
+void balloon_sysfs_exit(void);
+
+void balloon_set_new_target(unsigned long target);
+unsigned long balloon_minimum_target(void);
+
+#endif /* __XEN_BALLOON_COMMON_H__ */
diff --git a/drivers/xen/balloon/sysfs.c b/drivers/xen/balloon/sysfs.c

new file mode 100644 (file)

index 0000000..d49dfca
--- /dev/null
+++ b/drivers/xen/balloon/sysfs.c
@@ -0,0 +1,209 @@
+/******************************************************************************
+ * balloon/sysfs.c
+ *
+ * Xen balloon driver - sysfs interfaces.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <xen/balloon.h>
+#include "common.h"
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define BALLOON_CLASS_NAME "xen_memory"
+
+/*
+ * Define a read-only (S_IRUGO) device attribute called 'name'. The
+ * generated show_<name>() callback sprintf()s 'args' into the sysfs buffer
+ * using 'format'; no store callback is provided.
+ */
+#define BALLOON_SHOW(name, format, args...)                    \
+       static ssize_t show_##name(struct device *dev,          \
+                                  struct device_attribute *attr, \
+                                  char *buf)                   \
+       {                                                       \
+               return sprintf(buf, format, ##args);            \
+       }                                                       \
+       static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages));
+BALLOON_SHOW(min_kb, "%lu\n", PAGES2KB(balloon_minimum_target()));
+BALLOON_SHOW(max_kb, "%lu\n", PAGES2KB(num_physpages));
+BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low));
+BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high));
+BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages));
+
+/* sysfs 'target_kb' read: print the balloon target in kB. */
+static ssize_t show_target_kb(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       unsigned long target_kb = PAGES2KB(bs.target_pages);
+
+       return sprintf(buf, "%lu\n", target_kb);
+}
+
+/*
+ * sysfs 'target_kb' write: parse a kB value (base auto-detected by
+ * simple_strtoull()) and pass it on, converted to pages, as the new
+ * balloon target.
+ *
+ * Returns @count on success, -EPERM without CAP_SYS_ADMIN, -EBADMSG for
+ * writes of at most one byte, and -EINVAL when @buf does not start with a
+ * number.
+ */
+static ssize_t store_target_kb(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
+{
+       char *endchar;
+       unsigned long long target_bytes;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (count <= 1)
+               return -EBADMSG; /* runt */
+
+       target_bytes = simple_strtoull(buf, &endchar, 0) << 10;
+       /*
+        * Nothing was consumed, so the input is not a number. Without this
+        * check garbage input would silently request a target of 0 pages.
+        */
+       if (endchar == buf)
+               return -EINVAL;
+
+       balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+       return count;
+}
+
+static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR,
+                  show_target_kb, store_target_kb);
+
+/* sysfs 'target' read: print the balloon target in bytes. */
+static ssize_t show_target(struct device *dev,
+                          struct device_attribute *attr, char *buf)
+{
+       unsigned long long target_bytes =
+               (unsigned long long)balloon_stats.target_pages << PAGE_SHIFT;
+
+       return sprintf(buf, "%llu\n", target_bytes);
+}
+
+/*
+ * sysfs 'target' write: parse a byte count with memparse() and pass it on,
+ * converted to pages, as the new balloon target.
+ *
+ * Returns @count on success, -EPERM without CAP_SYS_ADMIN, -EBADMSG for
+ * writes of at most one byte, and -EINVAL when @buf does not start with a
+ * number.
+ */
+static ssize_t store_target(struct device *dev,
+                           struct device_attribute *attr,
+                           const char *buf,
+                           size_t count)
+{
+       char *endchar;
+       unsigned long long target_bytes;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (count <= 1)
+               return -EBADMSG; /* runt */
+
+       target_bytes = memparse(buf, &endchar);
+       /*
+        * Nothing was consumed, so the input is not a number. Without this
+        * check garbage input would silently request a target of 0 pages.
+        */
+       if (endchar == buf)
+               return -EINVAL;
+
+       balloon_set_new_target(target_bytes >> PAGE_SHIFT);
+
+       return count;
+}
+
+static DEVICE_ATTR(target, S_IRUGO | S_IWUSR,
+                  show_target, store_target);
+
+static struct device_attribute *balloon_attrs[] = {
+       &dev_attr_target_kb,
+       &dev_attr_target,
+};
+
+static struct attribute *balloon_info_attrs[] = {
+       &dev_attr_current_kb.attr,
+       &dev_attr_min_kb.attr,
+       &dev_attr_max_kb.attr,
+       &dev_attr_low_kb.attr,
+       &dev_attr_high_kb.attr,
+       &dev_attr_driver_kb.attr,
+       NULL
+};
+
+static const struct attribute_group balloon_info_group = {
+       .name = "info",
+       .attrs = balloon_info_attrs,
+};
+
+static struct bus_type balloon_subsys = {
+       .name = BALLOON_CLASS_NAME,
+       .dev_name = BALLOON_CLASS_NAME,
+};
+
+static struct device balloon_dev;
+
+/*
+ * Register the balloon subsystem and device @dev, then create the writable
+ * target attributes and the read-only "info" attribute group.
+ *
+ * Returns 0 on success. On any failure everything set up so far is undone
+ * again and the error code of the failing call is returned.
+ */
+static int __init register_balloon(struct device *dev)
+{
+       int created, rc;
+
+       rc = subsys_system_register(&balloon_subsys, NULL);
+       if (rc)
+               return rc;
+
+       dev->id = 0;
+       dev->bus = &balloon_subsys;
+
+       rc = device_register(dev);
+       if (rc) {
+               bus_unregister(&balloon_subsys);
+               return rc;
+       }
+
+       for (created = 0; created < ARRAY_SIZE(balloon_attrs); created++) {
+               rc = device_create_file(dev, balloon_attrs[created]);
+               if (rc)
+                       goto undo;
+       }
+
+       rc = sysfs_create_group(&dev->kobj, &balloon_info_group);
+       if (!rc)
+               return 0;
+
+ undo:
+       /* Remove the attribute files created so far, newest first. */
+       while (created-- > 0)
+               device_remove_file(dev, balloon_attrs[created]);
+       device_unregister(dev);
+       bus_unregister(&balloon_subsys);
+       return rc;
+}
+
+/*
+ * Undo register_balloon(): remove the "info" group and the target
+ * attribute files, then unregister the device and the subsystem.
+ */
+static __exit void unregister_balloon(struct device *dev)
+{
+       int idx = 0;
+
+       sysfs_remove_group(&dev->kobj, &balloon_info_group);
+
+       while (idx < ARRAY_SIZE(balloon_attrs)) {
+               device_remove_file(dev, balloon_attrs[idx]);
+               idx++;
+       }
+
+       device_unregister(dev);
+       bus_unregister(&balloon_subsys);
+}
+
+/*
+ * Create the balloon sysfs interface and, once that has succeeded, let the
+ * self-ballooning code attach to the same device.
+ *
+ * Returns 0 on success or the error reported by register_balloon(). As
+ * before, the result of register_xen_selfballooning() is not propagated.
+ */
+int __init balloon_sysfs_init(void)
+{
+       int rc = register_balloon(&balloon_dev);
+
+       /*
+        * On failure register_balloon() leaves the device unregistered, so
+        * it must not be handed to the self-ballooning code.
+        */
+       if (rc)
+               return rc;
+
+       register_xen_selfballooning(&balloon_dev);
+
+       return 0;
+}
+
+/* Remove the balloon sysfs interface by unregistering balloon_dev. */
+void __exit balloon_sysfs_exit(void)
+{
+       unregister_balloon(&balloon_dev);
+}
diff --git a/drivers/xen/blkback/Makefile b/drivers/xen/blkback/Makefile

new file mode 100644 (file)

index 0000000..599afe4
--- /dev/null
+++ b/drivers/xen/blkback/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
+obj-$(CONFIG_XEN_BLKBACK_PAGEMAP) += blkback-pagemap.o
+
+blkbk-y        := blkback.o xenbus.o interface.o vbd.o cdrom.o
diff --git a/drivers/xen/blkback/blkback-pagemap.c b/drivers/xen/blkback/blkback-pagemap.c

new file mode 100644 (file)

index 0000000..3a52ead
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.c
@@ -0,0 +1,97 @@
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "blkback-pagemap.h"
+
+static int blkback_pagemap_size;
+static struct blkback_pagemap *blkback_pagemap;
+
+/*
+ * Return 1 when every byte of @map is zero, i.e. the slot compares equal
+ * to a zero-initialised entry, and 0 otherwise.
+ */
+static inline int
+blkback_pagemap_entry_clear(struct blkback_pagemap *map)
+{
+       static struct blkback_pagemap unused_entry;
+
+       return memcmp(map, &unused_entry, sizeof(unused_entry)) == 0;
+}
+
+/*
+ * Allocate the zero-filled page map with room for @pages entries.
+ *
+ * Returns 0 on success or -ENOMEM when the table cannot be allocated.
+ */
+int
+blkback_pagemap_init(int pages)
+{
+       /*
+        * kcalloc() checks the count * size multiplication for overflow,
+        * which the open-coded kzalloc(pages * sizeof(...)) did not.
+        */
+       blkback_pagemap = kcalloc(pages, sizeof(struct blkback_pagemap),
+                                 GFP_KERNEL);
+       if (!blkback_pagemap)
+               return -ENOMEM;
+
+       blkback_pagemap_size = pages;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_init);
+
+/*
+ * Record (@domid, @busid, @gref) in slot @idx and tag @page as a blkback
+ * page whose private field holds @idx.
+ *
+ * It is a fatal error (BUG) if the map is not initialised, if @idx lies
+ * outside the table, or if the slot is already in use.
+ */
+void
+blkback_pagemap_set(int idx, struct page *page,
+                   domid_t domid, busid_t busid, grant_ref_t gref)
+{
+       struct blkback_pagemap *entry;
+
+       BUG_ON(!blkback_pagemap);
+       /* idx is signed: a negative value must not index before the table. */
+       BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
+
+       SetPageBlkback(page);
+       set_page_private(page, idx);
+
+       entry = blkback_pagemap + idx;
+       if (!blkback_pagemap_entry_clear(entry)) {
+               pr_emerg("overwriting pagemap %d: d %u b %u g %u\n",
+                        idx, entry->domid, entry->busid, entry->gref);
+               BUG();
+       }
+
+       entry->domid = domid;
+       entry->busid = busid;
+       entry->gref  = gref;
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_set);
+
+/*
+ * Zero the map slot whose index is stored in @page's private field.
+ *
+ * It is a fatal error (BUG) if the map is not initialised, if @page is not
+ * tagged as a blkback page, if the index lies outside the table, or if the
+ * slot is already empty.
+ */
+void
+blkback_pagemap_clear(struct page *page)
+{
+       int idx;
+       struct blkback_pagemap *entry;
+
+       idx = (int)page_private(page);
+
+       BUG_ON(!blkback_pagemap);
+       BUG_ON(!PageBlkback(page));
+       /* idx is signed: a negative value must not index before the table. */
+       BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
+
+       entry = blkback_pagemap + idx;
+       if (blkback_pagemap_entry_clear(entry)) {
+               pr_emerg("clearing empty pagemap %d\n", idx);
+               BUG();
+       }
+
+       memset(entry, 0, sizeof(*entry));
+}
+EXPORT_SYMBOL_GPL(blkback_pagemap_clear);
+
+/*
+ * Return a copy of the map slot whose index is stored in @page's private
+ * field.
+ *
+ * It is a fatal error (BUG) if the map is not initialised, if @page is not
+ * tagged as a blkback page, if the index lies outside the table, or if the
+ * slot is empty.
+ */
+struct blkback_pagemap
+blkback_pagemap_read(struct page *page)
+{
+       int idx;
+       struct blkback_pagemap *entry;
+
+       idx = (int)page_private(page);
+
+       BUG_ON(!blkback_pagemap);
+       BUG_ON(!PageBlkback(page));
+       /* idx is signed: a negative value must not index before the table. */
+       BUG_ON(idx < 0 || idx >= blkback_pagemap_size);
+
+       entry = blkback_pagemap + idx;
+       if (blkback_pagemap_entry_clear(entry)) {
+               pr_emerg("reading empty pagemap %d\n", idx);
+               BUG();
+       }
+
+       return *entry;
+}
+EXPORT_SYMBOL(blkback_pagemap_read);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkback/blkback-pagemap.h b/drivers/xen/blkback/blkback-pagemap.h

new file mode 100644 (file)

index 0000000..0becf22
--- /dev/null
+++ b/drivers/xen/blkback/blkback-pagemap.h
@@ -0,0 +1,37 @@
+#ifndef _BLKBACK_PAGEMAP_H_
+#define _BLKBACK_PAGEMAP_H_
+
+#include <linux/mm.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/grant_table.h>
+
+typedef unsigned int busid_t;
+
+/*
+ * One page map slot. A slot whose bytes are all zero is treated as unused
+ * by the pagemap code.
+ */
+struct blkback_pagemap {
+       domid_t          domid;
+       busid_t          busid;
+       grant_ref_t      gref;
+};
+
+#if defined(CONFIG_XEN_BLKBACK_PAGEMAP) || defined(CONFIG_XEN_BLKBACK_PAGEMAP_MODULE)
+
+int blkback_pagemap_init(int);
+void blkback_pagemap_set(int, struct page *, domid_t, busid_t, grant_ref_t);
+void blkback_pagemap_clear(struct page *);
+struct blkback_pagemap blkback_pagemap_read(struct page *);
+
+#else /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+static inline int blkback_pagemap_init(int pages) { return 0; }
+static inline void blkback_pagemap_set(int idx, struct page *page, domid_t dom,
+                                      busid_t bus, grant_ref_t gnt) {}
+static inline void blkback_pagemap_clear(struct page *page) {}
+static inline struct blkback_pagemap blkback_pagemap_read(struct page *page)
+{
+       BUG();
+       return (struct blkback_pagemap){-1, -1, -1};
+}
+
+#endif /* CONFIG_XEN_BLKBACK_PAGEMAP */
+
+#endif
diff --git a/drivers/xen/blkback/blkback.c b/drivers/xen/blkback/blkback.c

new file mode 100644 (file)

index 0000000..6475b7e
--- /dev/null
+++ b/drivers/xen/blkback/blkback.c
@@ -0,0 +1,771 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/main.c
+ * 
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A 
+ * reference front-end implementation can be found in:
+ *  arch/xen/drivers/blkif/frontend
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+
+/*
+ * These are rather arbitrary. They are fairly large because adjacent requests
+ * pulled from a communication ring are quite likely to end up being part of
+ * the same scatter/gather request at the disc.
+ * 
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
+ * This will increase the chances of being able to write whole tracks.
+ * 64 should be enough to keep us competitive with Linux.
+ */
+static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+/* Both variables are unsigned int, so the parameter type has to be uint. */
+module_param(log_stats, uint, 0644);
+module_param(debug_lvl, uint, 0644);
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements 
+ * the pendcnt towards zero. When it hits zero, the specified domain has a 
+ * response queued for it, with the saved 'id' passed back.
+ */
+typedef struct {
+       blkif_t       *blkif;
+       u64            id;
+       atomic_t       pendcnt;
+       unsigned short nr_pages;
+       unsigned short operation;
+       struct list_head free_list;
+} pending_req_t;
+
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
+/*
+ * Index into pending_pages[] / pending_grant_handles[] for segment @seg of
+ * @req: each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive slots.
+ */
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+       ptrdiff_t req_index = req - pending_reqs;
+
+       return req_index * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+
+/* Kernel virtual address of the page backing segment @seg of @req. */
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+       struct page *page = pending_page(req, seg);
+
+       return (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+}
+
+#define pending_handle(_req, _seg) \
+       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+                         unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+/*
+ * Take the first request off the pending_free list.
+ * Returns NULL when the list is empty.
+ */
+static pending_req_t* alloc_req(void)
+{
+       pending_req_t *found;
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&pending_free_lock, irq_state);
+       if (list_empty(&pending_free)) {
+               found = NULL;
+       } else {
+               found = list_entry(pending_free.next, pending_req_t, free_list);
+               list_del(&found->free_list);
+       }
+       spin_unlock_irqrestore(&pending_free_lock, irq_state);
+
+       return found;
+}
+
+/*
+ * Put @req back on the pending_free list. If the list was empty before,
+ * wake the waiters on pending_free_wq.
+ */
+static void free_req(pending_req_t *req)
+{
+       unsigned long irq_state;
+       int list_was_empty;
+
+       spin_lock_irqsave(&pending_free_lock, irq_state);
+       list_was_empty = list_empty(&pending_free);
+       list_add(&req->free_list, &pending_free);
+       spin_unlock_irqrestore(&pending_free_lock, irq_state);
+
+       if (!list_was_empty)
+               return;
+
+       wake_up(&pending_free_wq);
+}
+
+/* Drop the reference on the queue remembered in blkif->plug, if any. */
+static void unplug_queue(blkif_t *blkif)
+{
+       if (blkif->plug != NULL) {
+               kobject_put(&blkif->plug->kobj);
+               blkif->plug = NULL;
+       }
+}
+
+/*
+ * Remember the request queue of @bdev in blkif->plug and hold a reference
+ * on it, releasing any previously remembered queue first. Nothing is done
+ * when that queue is already the remembered one.
+ */
+static void plug_queue(blkif_t *blkif, struct block_device *bdev)
+{
+       struct request_queue *queue = bdev_get_queue(bdev);
+
+       if (blkif->plug != queue) {
+               unplug_queue(blkif);
+               WARN_ON(test_bit(QUEUE_FLAG_DEAD, &queue->queue_flags));
+               kobject_get(&queue->kobj);
+               blkif->plug = queue;
+       }
+}
+
+/*
+ * Unmap every grant that is still mapped for @req.
+ *
+ * For each segment with a valid handle, the page map entry is cleared, an
+ * unmap operation is queued and the stored handle is invalidated. The
+ * queued operations are then issued as one GNTTABOP_unmap_grant_ref batch.
+ * A failing hypercall is fatal (BUG).
+ */
+static void fast_flush_area(pending_req_t *req)
+{
+       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       unsigned int i, invcount = 0;
+       grant_handle_t handle;
+       int ret;
+
+       for (i = 0; i < req->nr_pages; i++) {
+               handle = pending_handle(req, i);
+               /* Segments without a valid handle have nothing to unmap. */
+               if (handle == BLKBACK_INVALID_HANDLE)
+                       continue;
+               blkback_pagemap_clear(pending_page(req, i));
+               gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+                                   GNTMAP_host_map, handle);
+               pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+               invcount++;
+       }
+
+       /* Note: the hypercall is issued even when invcount is 0. */
+       ret = HYPERVISOR_grant_table_op(
+               GNTTABOP_unmap_grant_ref, unmap, invcount);
+       BUG_ON(ret);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+/*
+ * Log the per-interface request counters at KERN_DEBUG, reset them all to
+ * zero and schedule the next report 10 seconds from now.
+ */
+static void print_stats(blkif_t *blkif)
+{
+       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d  |  br %4d"
+              "  |  fl %4d  |  ds %4d  |  pk %4d\n",
+              current->comm, blkif->st_oo_req,
+              blkif->st_rd_req, blkif->st_wr_req,
+              blkif->st_br_req, blkif->st_fl_req,
+              blkif->st_ds_req, blkif->st_pk_req);
+
+       /* Start a fresh accounting period. */
+       blkif->st_oo_req = 0;
+       blkif->st_rd_req = 0;
+       blkif->st_wr_req = 0;
+       blkif->st_br_req = 0;
+       blkif->st_fl_req = 0;
+       blkif->st_ds_req = 0;
+       blkif->st_pk_req = 0;
+
+       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+}
+
+/*
+ * Main loop of the per-interface kernel thread; @arg is the blkif_t.
+ *
+ * Holds a reference on the interface for its whole lifetime. Each pass
+ * waits until requests are flagged as waiting and a free pending_req is
+ * available, then processes the ring with do_block_io_op(). The loop ends
+ * when kthread_should_stop() becomes true. Always returns 0.
+ */
+int blkif_schedule(void *arg)
+{
+       blkif_t *blkif = arg;
+       struct vbd *vbd = &blkif->vbd;
+
+       blkif_get(blkif);
+
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: started\n", current->comm);
+
+       while (!kthread_should_stop()) {
+               if (try_to_freeze())
+                       continue;
+               /* Pick up a size change of the underlying device. */
+               if (unlikely(vbd->size != vbd_size(vbd)))
+                       vbd_resize(blkif);
+
+               wait_event_interruptible(
+                       blkif->wq,
+                       blkif->waiting_reqs || kthread_should_stop());
+               wait_event_interruptible(
+                       pending_free_wq,
+                       !list_empty(&pending_free) || kthread_should_stop());
+
+               blkif->waiting_reqs = 0;
+               smp_mb(); /* clear flag *before* checking for work */
+
+               /* A non-zero result re-arms the flag so the next pass does not sleep. */
+               if (do_block_io_op(blkif))
+                       blkif->waiting_reqs = 1;
+               unplug_queue(blkif);
+
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+       }
+
+       if (log_stats)
+               print_stats(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+       blkif->xenblkd = NULL;
+       blkif_put(blkif);
+
+       return 0;
+}
+
+/*
+ * Wait for the requests in flight on @blkif to finish.
+ *
+ * Sets blkif->drain, then waits on drain_complete in steps of at most HZ
+ * jiffies until refcnt has dropped to 2 or below, the drain flag has been
+ * cleared, or the thread is asked to stop. The drain flag is cleared again
+ * before returning.
+ */
+static void drain_io(blkif_t *blkif)
+{
+       atomic_set(&blkif->drain, 1);
+       do {
+               /* The initial value is one, and one refcnt taken at the
+                * start of the blkif_schedule thread. */
+               if (atomic_read(&blkif->refcnt) <= 2)
+                       break;
+
+               wait_for_completion_interruptible_timeout(
+                               &blkif->drain_complete, HZ);
+
+               if (!atomic_read(&blkif->drain))
+                       break;
+       } while (!kthread_should_stop());
+       atomic_set(&blkif->drain, 0);
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called as bh->b_end_io()
+ */
+
+/*
+ * Account for the completion of one piece of I/O of @pending_req.
+ *
+ * @error is mapped to a BLKIF_RSP_* status. -EOPNOTSUPP on a barrier or
+ * cache-flush request additionally calls blkback_barrier() or
+ * blkback_flush_diskcache() with a state of 0. When pendcnt reaches zero
+ * the grants are unmapped, the response is queued, the request is freed
+ * and the reference on the interface is dropped.
+ *
+ * NOTE(review): 'status' is local to each call and only the call that
+ * brings pendcnt to zero sends the response, so an error reported by an
+ * earlier bio of the same request appears not to reach the response -
+ * confirm.
+ */
+static void __end_block_io_op(pending_req_t *pending_req, int error)
+{
+       blkif_t *blkif = pending_req->blkif;
+       int status = BLKIF_RSP_OKAY;
+
+       /* An error fails the entire request. */
+       if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+           (error == -EOPNOTSUPP)) {
+               DPRINTK("blkback: write barrier op failed, not supported\n");
+               blkback_barrier(XBT_NIL, blkif->be, 0);
+               status = BLKIF_RSP_EOPNOTSUPP;
+       } else if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+                  (error == -EOPNOTSUPP)) {
+               DPRINTK("blkback: flush diskcache op failed, not supported\n");
+               blkback_flush_diskcache(XBT_NIL, blkif->be, 0);
+               status = BLKIF_RSP_EOPNOTSUPP;
+       } else if (error) {
+               DPRINTK("Buffer not up-to-date at end of operation, "
+                       "error=%d\n", error);
+               status = BLKIF_RSP_ERROR;
+       }
+
+       if (atomic_dec_and_test(&pending_req->pendcnt)) {
+               fast_flush_area(pending_req);
+               make_response(blkif, pending_req->id,
+                             pending_req->operation, status);
+               free_req(pending_req);
+               /* Signal drain_io() once a drain is pending and refcnt <= 2. */
+               if (atomic_read(&blkif->drain)
+                   && atomic_read(&blkif->refcnt) <= 2)
+                       complete(&blkif->drain_complete);
+               blkif_put(blkif);
+       }
+}
+
+/*
+ * bio completion handler: forwards the result to the owning pending request
+ * (stored in bi_private at submission time) and drops the bio reference.
+ */
+static void end_block_io_op(struct bio *bio, int error)
+{
+       pending_req_t *owner = bio->bi_private;
+
+       __end_block_io_op(owner, error);
+       bio_put(bio);
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+/*
+ * Flag that the frontend may have queued requests and wake anything
+ * sleeping on blkif->wq.  The flag is set before the wake-up so that a
+ * woken waiter observes it.
+ */
+static void blkif_notify_work(blkif_t *blkif)
+{
+       blkif->waiting_reqs = 1;
+       wake_up(&blkif->wq);
+}
+
+/*
+ * Interrupt handler for the interface's event channel.  @dev_id is the
+ * blkif_t registered in blkif_map(); the handler only kicks the worker.
+ */
+irqreturn_t blkif_be_int(int irq, void *dev_id)
+{
+       blkif_t *blkif = dev_id;
+
+       blkif_notify_work(blkif);
+       return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+
+/*
+ * Handle a BLKIF_OP_DISCARD request synchronously.
+ *
+ * The requested range is validated with vbd_translate(); on rejection an
+ * error response is sent and the thread backs off briefly.  Otherwise the
+ * discard is issued and its result is mapped onto a BLKIF_RSP_* status.
+ */
+static void dispatch_discard(blkif_t *blkif, struct blkif_request_discard *req)
+{
+       struct phys_req preq;
+       unsigned long discard_flags = 0;
+       int err, status;
+
+       /* Secure discard needs both the request flag and the vbd setting. */
+       if (blkif->vbd.discard_secure && (req->flag & BLKIF_DISCARD_SECURE))
+               discard_flags = BLKDEV_DISCARD_SECURE;
+
+       blkif->st_ds_req++;
+
+       preq.dev           = req->handle;
+       preq.sector_number = req->sector_number;
+       preq.nr_sects      = req->nr_sectors;
+
+       if (vbd_translate(&preq, blkif, REQ_DISCARD) != 0) {
+               DPRINTK("access denied: discard of [%Lu,%Lu) on dev=%04x\n",
+                       preq.sector_number,
+                       preq.sector_number + preq.nr_sects, preq.dev);
+               make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+               msleep(1); /* back off a bit */
+               return;
+       }
+
+       plug_queue(blkif, preq.bdev);
+
+       err = blkdev_issue_discard(preq.bdev, preq.sector_number,
+                                  preq.nr_sects, GFP_KERNEL, discard_flags);
+       if (err == 0) {
+               status = BLKIF_RSP_OKAY;
+       } else if (err == -EOPNOTSUPP) {
+               DPRINTK("discard op failed, not supported\n");
+               status = BLKIF_RSP_EOPNOTSUPP;
+       } else {
+               status = BLKIF_RSP_ERROR;
+       }
+
+       make_response(blkif, req->id, req->operation, status);
+}
+
+/*
+ * Consume requests from the shared ring and dispatch them.
+ *
+ * Returns 1 when processing stopped early with work possibly still queued
+ * (the kthread was asked to stop, or no pending_req was available), and 0
+ * otherwise.
+ */
+static int _do_block_io_op(blkif_t *blkif)
+{
+       blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+       blkif_request_t req;
+       pending_req_t *pending_req;
+       RING_IDX rc, rp;
+
+       rc = blk_rings->common.req_cons;
+       rp = blk_rings->common.sring->req_prod;
+       rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+       while (rc != rp) {
+               if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+                       break;
+
+               if (kthread_should_stop())
+                       return 1;
+
+               /* Copy the request into a private, protocol-neutral
+                * structure before looking at any of its fields. */
+               switch (blkif->blk_protocol) {
+               case BLKIF_PROTOCOL_NATIVE:
+                       req = *RING_GET_REQUEST(&blk_rings->native, rc);
+                       break;
+               case BLKIF_PROTOCOL_X86_32:
+                       blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+                       break;
+               case BLKIF_PROTOCOL_X86_64:
+                       blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+                       break;
+               default:
+                       BUG();
+                       return 0; /* make compiler happy */
+               }
+
+               ++rc;
+
+               switch (req.operation) {
+               case BLKIF_OP_READ:
+               case BLKIF_OP_WRITE:
+               case BLKIF_OP_WRITE_BARRIER:
+               case BLKIF_OP_FLUSH_DISKCACHE:
+                       pending_req = alloc_req();
+                       if (!pending_req) {
+                               /* req_cons has not been advanced yet, so
+                                * this request is seen again on the next
+                                * pass. */
+                               blkif->st_oo_req++;
+                               return 1;
+                       }
+
+                       /* before make_response() */
+                       blk_rings->common.req_cons = rc;
+
+                       /* Apply all sanity checks to /private copy/ of request. */
+                       barrier();
+
+                       dispatch_rw_block_io(blkif, &req, pending_req);
+                       break;
+               case BLKIF_OP_DISCARD:
+                       blk_rings->common.req_cons = rc;
+                       barrier();
+                       /* The private copy is reinterpreted as a discard
+                        * request. */
+                       dispatch_discard(blkif, (void *)&req);
+                       break;
+               case BLKIF_OP_PACKET:
+                       blk_rings->common.req_cons = rc;
+                       barrier();
+                       blkif->st_pk_req++;
+                       DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n");
+                       make_response(blkif, req.id, req.operation,
+                                     BLKIF_RSP_ERROR);
+                       break;
+               default:
+                       /* A good sign something is wrong: sleep for a while to
+                        * avoid excessive CPU consumption by a bad guest. */
+                       msleep(1);
+                       blk_rings->common.req_cons = rc;
+                       barrier();
+                       DPRINTK("error: unknown block io operation [%d]\n",
+                               req.operation);
+                       make_response(blkif, req.id, req.operation,
+                                     BLKIF_RSP_ERROR);
+                       break;
+               }
+
+               /* Yield point for this unbounded loop. */
+               cond_resched();
+       }
+
+       return 0;
+}
+
+/*
+ * Drain the request ring, re-checking for late arrivals.
+ *
+ * Keeps calling _do_block_io_op() until either it stops early (non-zero
+ * return, passed straight back to the caller) or the final ring check
+ * reports that no further requests are queued (returns 0).
+ */
+static int do_block_io_op(blkif_t *blkif)
+{
+       int pending;
+
+       for (;;) {
+               pending = _do_block_io_op(blkif);
+               if (pending)
+                       break;
+
+               RING_FINAL_CHECK_FOR_REQUESTS(&blkif->blk_rings.common,
+                                             pending);
+               if (!pending)
+                       break;
+       }
+
+       return pending;
+}
+
+/*
+ * Map the guest's data pages for a read/write/barrier/flush request and
+ * submit the resulting bio(s) to the backing block device.
+ *
+ * @blkif:       interface the request arrived on
+ * @req:         private copy of the ring request
+ * @pending_req: tracking structure obtained by the caller; released here on
+ *               the early failure paths, otherwise by __end_block_io_op()
+ *
+ * A response is always produced: BLKIF_RSP_ERROR directly on the early
+ * failure paths, otherwise from __end_block_io_op().
+ */
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req)
+{
+       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       struct phys_req preq;
+       struct { 
+               unsigned long buf; unsigned int nsec;
+       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       unsigned int nseg;
+       struct bio *bio = NULL;
+       uint32_t flags;
+       int ret, i;
+       int operation;
+
+       /* Translate the ring opcode into a block-layer operation and bump
+        * the matching statistics counter. */
+       switch (req->operation) {
+       case BLKIF_OP_READ:
+               blkif->st_rd_req++;
+               operation = READ;
+               break;
+       case BLKIF_OP_WRITE:
+               blkif->st_wr_req++;
+               operation = WRITE;
+               break;
+       case BLKIF_OP_WRITE_BARRIER:
+               blkif->st_br_req++;
+               operation = WRITE_FLUSH_FUA;
+               break;
+       case BLKIF_OP_FLUSH_DISKCACHE:
+               blkif->st_fl_req++;
+               operation = WRITE_FLUSH;
+               break;
+       default:
+               operation = 0; /* make gcc happy */
+               BUG();
+       }
+
+       /* Check that number of segments is sane. */
+       nseg = req->nr_segments;
+       if (unlikely(nseg == 0 && !(operation & REQ_FLUSH)) ||
+           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+               DPRINTK("Bad number of segments in request (%d)\n", nseg);
+               goto fail_response;
+       }
+
+       preq.dev           = req->handle;
+       preq.sector_number = req->sector_number;
+       preq.nr_sects      = 0;
+
+       pending_req->blkif     = blkif;
+       pending_req->id        = req->id;
+       pending_req->operation = req->operation;
+       pending_req->nr_pages  = nseg;
+
+       /* Anything other than a READ only reads from the guest pages, so
+        * they are mapped read-only. */
+       flags = GNTMAP_host_map;
+       if (operation != READ)
+               flags |= GNTMAP_readonly;
+
+       /* Validate each segment and prepare its grant-map operation.
+        * Nothing is mapped yet, so failures go to fail_response. */
+       for (i = 0; i < nseg; i++) {
+               seg[i].nsec = req->seg[i].last_sect -
+                       req->seg[i].first_sect + 1;
+
+               if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+                   (req->seg[i].last_sect < req->seg[i].first_sect))
+                       goto fail_response;
+               preq.nr_sects += seg[i].nsec;
+
+               gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+                                 req->seg[i].gref, blkif->domid);
+       }
+
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+       BUG_ON(ret);
+
+       /* From here on 'ret' is a sticky "some mapping failed" flag: once
+        * set, the remaining handles are still recorded (so they can be
+        * unmapped) but no further page setup is done. */
+       for (i = 0; i < nseg; i++) {
+               if (unlikely(map[i].status == GNTST_eagain))
+                       gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map[i])
+               if (unlikely(map[i].status != GNTST_okay)) {
+                       DPRINTK("invalid buffer -- could not remap it\n");
+                       map[i].handle = BLKBACK_INVALID_HANDLE;
+                       ret = 1;
+               } else {
+                       blkback_pagemap_set(vaddr_pagenr(pending_req, i),
+                                           pending_page(pending_req, i),
+                                           blkif->domid, req->handle,
+                                           req->seg[i].gref);
+               }
+
+               pending_handle(pending_req, i) = map[i].handle;
+
+               if (ret)
+                       continue;
+
+               set_phys_to_machine(
+                       page_to_pfn(pending_page(pending_req, i)),
+                       FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+               /* Bus address of the page plus the byte offset of the
+                * first sector within it. */
+               seg[i].buf  = map[i].dev_bus_addr | 
+                       (req->seg[i].first_sect << 9);
+       }
+
+       if (ret)
+               goto fail_flush;
+
+       if (vbd_translate(&preq, blkif, operation) != 0) {
+               DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", 
+                       operation == READ ? "read" : "write",
+                       preq.sector_number,
+                       preq.sector_number + preq.nr_sects, preq.dev);
+               goto fail_flush;
+       }
+
+       /* Wait on all outstanding I/O's and once that has been completed
+        * issue the barrier write (WRITE_FLUSH_FUA).
+        */
+       if (req->operation == BLKIF_OP_WRITE_BARRIER)
+               drain_io(blkif);
+
+       plug_queue(blkif, preq.bdev);
+       /* The initial count covers the final submit_bio() below (or the
+        * direct __end_block_io_op() call on the fail_put_bio path); each
+        * earlier bio adds its own count before it is submitted. */
+       atomic_set(&pending_req->pendcnt, 1);
+       /* Dropped by __end_block_io_op() when pendcnt reaches zero. */
+       blkif_get(blkif);
+
+       for (i = 0; i < nseg; i++) {
+               if (((int)preq.sector_number|(int)seg[i].nsec) &
+                   ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+                       DPRINTK("Misaligned I/O request from domain %d",
+                               blkif->domid);
+                       goto fail_put_bio;
+               }
+
+               /* Start a new bio when there is none yet or the current
+                * one cannot take this page; a full bio is submitted
+                * first. */
+               while ((bio == NULL) ||
+                      (bio_add_page(bio,
+                                    pending_page(pending_req, i),
+                                    seg[i].nsec << 9,
+                                    seg[i].buf & ~PAGE_MASK) == 0)) {
+                       if (bio) {
+                               atomic_inc(&pending_req->pendcnt);
+                               submit_bio(operation, bio);
+                       }
+
+                       bio = bio_alloc(GFP_KERNEL, nseg-i);
+                       if (unlikely(bio == NULL))
+                               goto fail_put_bio;
+
+                       bio->bi_bdev    = preq.bdev;
+                       bio->bi_private = pending_req;
+                       bio->bi_end_io  = end_block_io_op;
+                       bio->bi_sector  = preq.sector_number;
+               }
+
+               preq.sector_number += seg[i].nsec;
+       }
+
+       /* No data segments: only legal for flush/FUA operations, which are
+        * sent down as an empty bio. */
+       if (!bio) {
+               BUG_ON(!(operation & (REQ_FLUSH|REQ_FUA)));
+               bio = bio_alloc(GFP_KERNEL, 0);
+               if (unlikely(bio == NULL))
+                       goto fail_put_bio;
+
+               bio->bi_bdev    = preq.bdev;
+               bio->bi_private = pending_req;
+               bio->bi_end_io  = end_block_io_op;
+               bio->bi_sector  = -1;
+       }
+
+       submit_bio(operation, bio);
+
+       /* Every non-READ operation is accounted as written sectors. */
+       if (operation == READ)
+               blkif->st_rd_sect += preq.nr_sects;
+       else
+               blkif->st_wr_sect += preq.nr_sects;
+
+       return;
+
+ fail_flush:
+       /* Grant mappings exist: tear them down before responding. */
+       fast_flush_area(pending_req);
+ fail_response:
+       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+       free_req(pending_req);
+       msleep(1); /* back off a bit */
+       return;
+
+ fail_put_bio:
+       /* Drop the initial pendcnt; the response is sent once any bios
+        * already submitted have completed as well. */
+       __end_block_io_op(pending_req, -EINVAL);
+       if (bio)
+               bio_put(bio);
+       unplug_queue(blkif);
+       msleep(1); /* back off a bit */
+       return;
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+/*
+ * Queue a response on the shared ring and notify the frontend if required.
+ *
+ * @blkif: interface to respond on
+ * @id:    request id echoed back to the frontend
+ * @op:    operation code echoed back to the frontend
+ * @st:    BLKIF_RSP_* status
+ *
+ * The ring is updated under blk_ring_lock with interrupts disabled; the
+ * event-channel notification is sent after the lock has been released.
+ */
+static void make_response(blkif_t *blkif, u64 id,
+                         unsigned short op, int st)
+{
+       blkif_response_t  resp;
+       unsigned long     flags;
+       blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+       int notify;
+
+       resp.id        = id;
+       resp.operation = op;
+       resp.status    = st;
+
+       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+       /* Place on the response ring for the relevant domain. */
+       /* NOTE(review): the same blkif_response_t image and sizeof(resp)
+        * are copied for every protocol -- confirm that the x86_32 and
+        * x86_64 response layouts are compatible with it. */
+       switch (blkif->blk_protocol) {
+       case BLKIF_PROTOCOL_NATIVE:
+               memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       case BLKIF_PROTOCOL_X86_32:
+               memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       case BLKIF_PROTOCOL_X86_64:
+               memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       default:
+               BUG();
+       }
+       blk_rings->common.rsp_prod_pvt++;
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+
+       if (notify)
+               notify_remote_via_irq(blkif->irq);
+}
+
+/*
+ * Module initialisation: allocate the global pending-request pool, the
+ * grant-handle table and the page vector used for mapping guest pages,
+ * then bring up the interface cache and the xenbus backend.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, or -ENOMEM if any
+ * allocation (or blkback_pagemap_init()) fails.
+ */
+static int __init blkif_init(void)
+{
+       int i, mmap_pages;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       /* One page per segment of every request that can be in flight. */
+       mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       pending_reqs          = kzalloc(sizeof(pending_reqs[0]) *
+                                       blkif_reqs, GFP_KERNEL);
+       pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+                                       mmap_pages, GFP_KERNEL);
+       pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
+
+       /* NOTE(review): the three allocations above are only checked after
+        * blkback_pagemap_init() has run. */
+       if (blkback_pagemap_init(mmap_pages))
+               goto out_of_memory;
+
+       if (!pending_reqs || !pending_grant_handles || !pending_pages)
+               goto out_of_memory;
+
+       for (i = 0; i < mmap_pages; i++)
+               pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+
+       blkif_interface_init();
+
+       INIT_LIST_HEAD(&pending_free);
+
+       /* Initially every pending_req is on the free list. */
+       for (i = 0; i < blkif_reqs; i++)
+               list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+       blkif_xenbus_init();
+
+       return 0;
+
+ out_of_memory:
+       /* Any of these may be NULL here; assumes
+        * free_empty_pages_and_pagevec() tolerates that as kfree() does
+        * -- TODO confirm. */
+       kfree(pending_reqs);
+       kfree(pending_grant_handles);
+       free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+       pr_warning("%s: out of memory\n", __FUNCTION__);
+       return -ENOMEM;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vbd");
diff --git a/drivers/xen/blkback/cdrom.c b/drivers/xen/blkback/cdrom.c

new file mode 100644 (file)

index 0000000..fbbb9a5
--- /dev/null
+++ b/drivers/xen/blkback/cdrom.c
@@ -0,0 +1,154 @@
+/******************************************************************************
+ * blkback/cdrom.c
+ *
+ * Routines for managing cdrom watch and media-present attribute of a
+ * cdrom type virtual block device (VBD).
+ *
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ * Copyright (c) 2007       Pat Campbell
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define MEDIA_PRESENT "media-present"
+
+static void cdrom_media_changed(struct xenbus_watch *, const char **, unsigned int);
+
+/**
+ * Writes the media-present attribute for the given vbd device if it is not
+ * already there.  The value written is 1 when the vbd currently has a
+ * backing block device and 0 otherwise.
+ *
+ * Returns 0 if the node already existed or was written successfully, and
+ * -1 if a xenstore operation failed (the failure is also reported through
+ * xenbus_dev_fatal()).
+ */
+static int cdrom_xenstore_write_media_present(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       struct xenbus_transaction xbt;
+       int err;
+       int media_present;
+
+       /* A positive return means the node was read, so nothing to do. */
+       err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
+                          &media_present);
+       if (0 < err) {
+               DPRINTK("already written err%d", err);
+               return(0);
+       }
+       media_present = !!be->blkif->vbd.bdev;
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               return(-1);
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, MEDIA_PRESENT, "%d", media_present );
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing %s/%s",
+                        dev->nodename, MEDIA_PRESENT);
+               goto abort;
+       }
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err) {
+               /*
+                * The commit failed, so the node was not written.  Report
+                * failure like the other fatal paths instead of falling
+                * through to the success return.
+                */
+               xenbus_dev_fatal(dev, err, "ending transaction");
+               return -1;
+       }
+       return 0;
+ abort:
+       xenbus_transaction_end(xbt, 1);
+       return -1;
+}
+
+/**
+ * Returns non-zero when the vbd's type field has both the VDISK_CDROM bit
+ * and the bit tested by GENHD_FL_REMOVABLE set.
+ *
+ * NOTE(review): vbd.type is documented in common.h as holding VDISK_xxx
+ * flags, yet the second test uses a GENHD_FL_ constant -- confirm whether
+ * VDISK_REMOVABLE was intended.
+ */
+static int cdrom_is_type(struct backend_info *be)
+{
+       DPRINTK("type:%x", be->blkif->vbd.type );
+       return (be->blkif->vbd.type & VDISK_CDROM)
+              && (be->blkif->vbd.type & GENHD_FL_REMOVABLE);
+}
+
+/**
+ * For cdrom-type devices, makes sure the media-present node exists in
+ * xenstore and registers cdrom_media_changed() as a watch on it.  Does
+ * nothing for other device types or when the node could not be written.
+ */
+void cdrom_add_media_watch(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       int err;
+
+       DPRINTK("nodename:%s", dev->nodename);
+       if (!cdrom_is_type(be))
+               return;
+
+       DPRINTK("is a cdrom");
+       if (cdrom_xenstore_write_media_present(be) != 0)
+               return;
+
+       DPRINTK("xenstore wrote OK");
+       err = xenbus_watch_path2(dev, dev->nodename, MEDIA_PRESENT,
+                                &be->cdrom_watch, cdrom_media_changed);
+       if (err)
+               DPRINTK(MEDIA_PRESENT " watch add failed");
+}
+
+/**
+ * Callback received when the MEDIA_PRESENT xenstore node is changed.
+ *
+ * When the node reads as 0 the vbd is released.  When it reads as non-zero
+ * and no block device is currently attached, the vbd is re-created from
+ * the stored major/minor and then resized.
+ */
+static void cdrom_media_changed(struct xenbus_watch *watch,
+                               const char **vec, unsigned int len)
+{
+       int err, media_present;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, cdrom_watch);
+       struct xenbus_device *dev = be->dev;
+
+       if (!cdrom_is_type(be)) {
+               DPRINTK("callback not for a cdrom" );
+               return;
+       }
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename, MEDIA_PRESENT, "%d",
+                          &media_present);
+       if (err <= 0) {
+               DPRINTK("read of " MEDIA_PRESENT " node error:%d", err);
+               return;
+       }
+
+       if (!media_present)
+               vbd_free(&be->blkif->vbd);
+       else if (!be->blkif->vbd.bdev) {
+               /*
+                * The handle is the last path component of the frontend
+                * node.  strrchr() returns NULL when the string holds no
+                * '/', so fall back to parsing the whole string rather
+                * than dereferencing NULL + 1.
+                */
+               const char *slash = strrchr(dev->otherend, '/');
+               const char *p = slash ? slash + 1 : dev->otherend;
+               unsigned long handle = simple_strtoul(p, NULL, 0);
+
+               err = vbd_create(be->blkif, handle, be->major, be->minor,
+                                be->blkif->vbd.mode, true);
+               /* A missing medium is not treated as fatal here. */
+               if (err && err != -ENOMEDIUM) {
+                       be->major = be->minor = 0;
+                       xenbus_dev_fatal(dev, err, "creating vbd structure");
+                       return;
+               }
+               vbd_resize(be->blkif);
+       }
+}
diff --git a/drivers/xen/blkback/common.h b/drivers/xen/blkback/common.h

new file mode 100644 (file)

index 0000000..0b49cbc
--- /dev/null
+++ b/drivers/xen/blkback/common.h
@@ -0,0 +1,155 @@
+/* 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/wait.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/xenbus.h>
+#include <xen/interface/event_channel.h>
+#include "blkback-pagemap.h"
+
+
+/* Debug print via pr_debug(), prefixed with the source file and line. */
+#define DPRINTK(_f, _a...)                     \
+       pr_debug("(file=%s, line=%d) " _f,      \
+                __FILE__ , __LINE__ , ## _a )
+
+/* Per-interface description of the virtual block device being exported. */
+struct vbd {
+       blkif_vdev_t   handle;      /* what the domain refers to this vbd as */
+       fmode_t        mode;        /* FMODE_xxx */
+       unsigned char  type;        /* VDISK_xxx */
+       bool           flush_support;
+       bool           discard_secure; /* gates BLKDEV_DISCARD_SECURE for discards */
+       u32            pdevice;     /* phys device that this vbd maps to */
+       struct block_device *bdev;  /* NULL while no device is attached */
+       sector_t       size;        /* Cached size parameter */
+};
+
+struct backend_info;
+
+/* State of one backend block interface (one frontend connection). */
+typedef struct blkif_st {
+       /* Unique identifier for this interface. */
+       domid_t           domid;
+       unsigned int      handle;
+       /* Physical parameters of the comms window. */
+       unsigned int      irq;      /* non-zero once bound in blkif_map() */
+       /* Comms information. */
+       enum blkif_protocol blk_protocol;
+       blkif_back_rings_t blk_rings;
+       struct vm_struct *blk_ring_area; /* mapping of the shared ring */
+       /* The VBD attached to this interface. */
+       struct vbd        vbd;
+       /* Back pointer to the backend_info. */
+       struct backend_info *be;
+       /* Private fields. */
+       spinlock_t       blk_ring_lock; /* taken while pushing responses */
+       atomic_t         refcnt;   /* see blkif_get() / blkif_put() */
+
+       wait_queue_head_t   wq;     /* woken when the frontend notifies us */
+       /* for barrier (drain) requests */
+       struct completion   drain_complete;
+       atomic_t            drain;
+       struct task_struct  *xenblkd; /* stopped in blkif_disconnect() */
+       unsigned int        waiting_reqs; /* set by the interrupt handler */
+       struct request_queue *plug;
+
+       /* statistics */
+       unsigned long       st_print;   /* set to jiffies at allocation */
+       int                 st_rd_req;  /* BLKIF_OP_READ requests */
+       int                 st_wr_req;  /* BLKIF_OP_WRITE requests */
+       int                 st_oo_req;  /* times no pending_req was free */
+       int                 st_br_req;  /* BLKIF_OP_WRITE_BARRIER requests */
+       int                 st_fl_req;  /* BLKIF_OP_FLUSH_DISKCACHE requests */
+       int                 st_ds_req;  /* BLKIF_OP_DISCARD requests */
+       int                 st_pk_req;  /* BLKIF_OP_PACKET requests */
+       int                 st_rd_sect; /* sectors submitted for READ */
+       int                 st_wr_sect; /* sectors submitted for non-READ ops */
+
+       wait_queue_head_t waiting_to_free; /* woken when refcnt hits zero */
+} blkif_t;
+
+/* Xenbus-side state of a backend device; links to its blkif_t. */
+struct backend_info
+{
+       struct xenbus_device *dev;
+       blkif_t *blkif;
+       struct xenbus_watch backend_watch;
+       struct xenbus_watch cdrom_watch;   /* watch on the media-present node */
+       unsigned major;                    /* passed to vbd_create() */
+       unsigned minor;                    /* passed to vbd_create() */
+       char *mode;
+};
+
+blkif_t *blkif_alloc(domid_t domid);
+void blkif_disconnect(blkif_t *blkif);
+void blkif_free(blkif_t *blkif);
+int blkif_map(blkif_t *blkif, grant_ref_t, evtchn_port_t);
+void vbd_resize(blkif_t *blkif);
+
+/*
+ * Reference counting for blkif_t.  blkif_put() wakes waiting_to_free when
+ * the count reaches zero; it does not free the structure itself.
+ */
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                                  \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       wake_up(&(_b)->waiting_to_free);\
+       } while (0)
+
+/* Create a vbd. */
+int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major,
+              unsigned minor, fmode_t mode, bool cdrom);
+void vbd_free(struct vbd *vbd);
+
+unsigned long long vbd_size(struct vbd *vbd);
+unsigned long vbd_secsize(struct vbd *vbd);
+
+/* Request range handed to vbd_translate() for validation. */
+struct phys_req {
+       unsigned short       dev;           /* handle taken from the ring request */
+       blkif_sector_t       nr_sects;      /* length of the range in sectors */
+       struct block_device *bdev;          /* presumably filled in by vbd_translate() -- verify */
+       blkif_sector_t       sector_number; /* first sector of the range */
+};
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation);
+
+void blkif_interface_init(void);
+
+void blkif_xenbus_init(void);
+
+irqreturn_t blkif_be_int(int irq, void *dev_id);
+int blkif_schedule(void *arg);
+
+void blkback_barrier(struct xenbus_transaction, struct backend_info *,
+                    int state);
+void blkback_flush_diskcache(struct xenbus_transaction,
+                            struct backend_info *, int state);
+
+/* cdrom media change */
+void cdrom_add_media_watch(struct backend_info *be);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/blkback/interface.c b/drivers/xen/blkback/interface.c

new file mode 100644 (file)

index 0000000..215a530
--- /dev/null
+++ b/drivers/xen/blkback/interface.c
@@ -0,0 +1,139 @@
+/******************************************************************************
+ * arch/xen/drivers/blkif/backend/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+static struct kmem_cache *blkif_cachep;
+
+/*
+ * Allocate and initialise a zeroed blkif_t for domain @domid.
+ *
+ * Returns the new interface holding a single reference, or
+ * ERR_PTR(-ENOMEM) if the cache allocation fails.
+ */
+blkif_t *blkif_alloc(domid_t domid)
+{
+       blkif_t *new_if = kmem_cache_zalloc(blkif_cachep, GFP_KERNEL);
+
+       if (new_if == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       new_if->domid = domid;
+
+       /* Locking and reference state. */
+       spin_lock_init(&new_if->blk_ring_lock);
+       atomic_set(&new_if->refcnt, 1);
+       atomic_set(&new_if->drain, 0);
+
+       /* Wait queues and the barrier-drain completion. */
+       init_waitqueue_head(&new_if->wq);
+       init_waitqueue_head(&new_if->waiting_to_free);
+       init_completion(&new_if->drain_complete);
+
+       new_if->st_print = jiffies;
+
+       return new_if;
+}
+
+/*
+ * Connect @blkif to the frontend: map the shared ring identified by
+ * @ring_ref, initialise the back ring for the negotiated protocol and bind
+ * the event channel @evtchn to blkif_be_int().
+ *
+ * Returns 0 on success (or if already connected), otherwise the negative
+ * error from mapping the ring or binding the event channel.
+ */
+int blkif_map(blkif_t *blkif, grant_ref_t ring_ref, evtchn_port_t evtchn)
+{
+       struct vm_struct *area;
+       int err;
+
+       /* Already connected through? */
+       if (blkif->irq)
+               return 0;
+
+       area = xenbus_map_ring_valloc(blkif->be->dev, ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       blkif->blk_ring_area = area;
+
+       switch (blkif->blk_protocol) {
+/* Helper local to this switch: view the mapped page as the protocol's
+ * shared ring and initialise the matching back ring over it. */
+#define BLKBK_RING_INIT(p) ({ \
+               struct blkif_##p##_sring *sring = area->addr; \
+               BACK_RING_INIT(&blkif->blk_rings.p, sring, PAGE_SIZE); \
+       })
+       case BLKIF_PROTOCOL_NATIVE:
+               BLKBK_RING_INIT(native);
+               break;
+       case BLKIF_PROTOCOL_X86_32:
+               BLKBK_RING_INIT(x86_32);
+               break;
+       case BLKIF_PROTOCOL_X86_64:
+               BLKBK_RING_INIT(x86_64);
+               break;
+       default:
+               BUG();
+#undef BLKBK_RING_INIT
+       }
+
+       /* On success the return value is the irq number. */
+       err = bind_interdomain_evtchn_to_irqhandler(
+               blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+       if (err < 0)
+       {
+               /* Undo the ring mapping; a NULL sring marks "not mapped"
+                * for blkif_disconnect(). */
+               xenbus_unmap_ring_vfree(blkif->be->dev, area);
+               blkif->blk_rings.common.sring = NULL;
+               return err;
+       }
+       blkif->irq = err;
+
+       return 0;
+}
+
+/*
+ * Tear down the connection to the frontend: stop the worker thread, wait
+ * for all other references to be dropped, then release the irq and the
+ * ring mapping.  The blkif_t itself stays allocated.
+ */
+void blkif_disconnect(blkif_t *blkif)
+{
+       if (blkif->xenblkd) {
+               kthread_stop(blkif->xenblkd);
+               blkif->xenblkd = NULL;
+       }
+
+       /* Temporarily give up our own reference so that the count can
+        * reach zero once every other holder has called blkif_put(), then
+        * take it back (blkif_free() expects to drop the last one). */
+       atomic_dec(&blkif->refcnt);
+       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+       atomic_inc(&blkif->refcnt);
+
+       if (blkif->irq) {
+               unbind_from_irqhandler(blkif->irq, blkif);
+               blkif->irq = 0;
+       }
+
+       if (blkif->blk_rings.common.sring) {
+               xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring_area);
+               blkif->blk_rings.common.sring = NULL;
+       }
+}
+
+/*
+ * Drop the final reference and return the blkif_t to its cache.  It is a
+ * BUG for any other reference to still be outstanding.
+ */
+void blkif_free(blkif_t *blkif)
+{
+       int was_last_ref = atomic_dec_and_test(&blkif->refcnt);
+
+       if (!was_last_ref)
+               BUG();
+
+       kmem_cache_free(blkif_cachep, blkif);
+}
+
+/*
+ * Create the slab cache that blkif_alloc() allocates from.
+ *
+ * NOTE(review): the result of kmem_cache_create() is not checked here, and
+ * this function cannot report failure to its caller.
+ */
+void __init blkif_interface_init(void)
+{
+       blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
+                                        0, 0, NULL);
+}
diff --git a/drivers/xen/blkback/vbd.c b/drivers/xen/blkback/vbd.c

new file mode 100644 (file)

index 0000000..66fb8d1
--- /dev/null
+++ b/drivers/xen/blkback/vbd.c
@@ -0,0 +1,212 @@
+/******************************************************************************
+ * blkback/vbd.c
+ * 
+ * Routines for managing virtual block devices (VBDs).
+ * 
+ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+
+#define vbd_sz(_v)   ((_v)->bdev->bd_part ?                            \
+       (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
+
+unsigned long long vbd_size(struct vbd *vbd)
+{
+       return vbd->bdev ? vbd_sz(vbd) : 0;
+}
+
+unsigned long vbd_secsize(struct vbd *vbd)
+{
+       return vbd->bdev ? bdev_logical_block_size(vbd->bdev) : 0;
+}
+
+int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major,
+              unsigned minor, fmode_t mode, bool cdrom)
+{
+       struct vbd *vbd;
+       struct block_device *bdev;
+       struct request_queue *q;
+
+       vbd = &blkif->vbd;
+       vbd->handle   = handle; 
+       vbd->size     = 0;
+       vbd->type     = cdrom ? VDISK_CDROM : 0;
+
+       if (!(mode & FMODE_WRITE)) {
+               mode &= ~FMODE_EXCL; /* xend doesn't even allow mode="r!" */
+               vbd->type |= VDISK_READONLY;
+       }
+       vbd->mode = mode;
+
+       vbd->pdevice  = MKDEV(major, minor);
+
+       bdev = blkdev_get_by_dev(vbd->pdevice, mode, blkif);
+
+       if (IS_ERR(bdev)) {
+               if (PTR_ERR(bdev) != -ENOMEDIUM) {
+                       DPRINTK("vbd_creat: device %08x could not be opened\n",
+                               vbd->pdevice);
+                       return -ENOENT;
+               }
+
+               DPRINTK("vbd_creat: device %08x has no medium\n",
+                       vbd->pdevice);
+               if (cdrom)
+                       return -ENOMEDIUM;
+
+               bdev = blkdev_get_by_dev(vbd->pdevice, mode | FMODE_NDELAY,
+                                        blkif);
+               if (IS_ERR(bdev))
+                       return -ENOMEDIUM;
+
+               if (bdev->bd_disk) {
+                       if (bdev->bd_disk->flags & GENHD_FL_CD)
+                               vbd->type |= VDISK_CDROM;
+                       if (bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+                               vbd->type |= VDISK_REMOVABLE;
+               }
+
+               blkdev_put(bdev, mode);
+               return -ENOMEDIUM;
+       }
+
+       vbd->bdev = bdev;
+
+       if (vbd->bdev->bd_disk == NULL) {
+               DPRINTK("vbd_creat: device %08x doesn't exist.\n",
+                       vbd->pdevice);
+               vbd_free(vbd);
+               return -ENOENT;
+       }
+
+       vbd->size = vbd_size(vbd);
+
+       if (bdev->bd_disk->flags & GENHD_FL_CD)
+               vbd->type |= VDISK_CDROM;
+       if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+               vbd->type |= VDISK_REMOVABLE;
+
+       q = bdev_get_queue(bdev);
+       if (q && q->flush_flags)
+               vbd->flush_support = true;
+
+       if (q && blk_queue_secdiscard(q))
+               vbd->discard_secure = true;
+
+       DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+               handle, blkif->domid);
+       return 0;
+}
+
+void vbd_free(struct vbd *vbd)
+{
+       if (vbd->bdev)
+               blkdev_put(vbd->bdev, vbd->mode);
+       vbd->bdev = NULL;
+}
+
+int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
+{
+       struct vbd *vbd = &blkif->vbd;
+       int rc = -EACCES;
+
+       if ((operation != READ) && !(vbd->mode & FMODE_WRITE))
+               goto out;
+
+       if (vbd->bdev == NULL) {
+               rc = -ENOMEDIUM;
+               goto out;
+       }
+
+       if (likely(req->nr_sects)) {
+               blkif_sector_t end = req->sector_number + req->nr_sects;
+
+               if (unlikely(end < req->sector_number))
+                       goto out;
+               if (unlikely(end > vbd_sz(vbd)))
+                       goto out;
+       }
+
+       req->dev  = vbd->pdevice;
+       req->bdev = vbd->bdev;
+       rc = 0;
+
+ out:
+       return rc;
+}
+
+/* Record the device's current size in vbd->size and write "sectors",
+ * "sector-size" and the current "state" under dev->nodename via xenbus. */
+void vbd_resize(blkif_t *blkif)
+{
+       struct vbd *vbd = &blkif->vbd;
+       struct xenbus_transaction xbt;
+       int err;
+       struct xenbus_device *dev = blkif->be->dev;
+       unsigned long long new_size = vbd_size(vbd);
+
+       pr_info("VBD Resize: new size %Lu\n", new_size);
+       vbd->size = new_size;
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               pr_warning("Error %d starting transaction\n", err);
+               return;
+       }
+       err = xenbus_printf(xbt, dev->nodename, "sectors", "%Lu",
+                           vbd_size(vbd));
+       if (err) {
+               pr_warning("Error %d writing new size\n", err);
+               goto abort;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+                           vbd_secsize(vbd));
+       if (err) {
+               pr_warning("Error %d writing new sector size\n", err);
+               goto abort;
+       }
+
+       /* Write the current state; we will use this to synchronize the
+        * front-end. If the current state is "connected" the front-end
+        * will get the new size information online. */
+       err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+       if (err) {
+               pr_warning("Error %d writing the state\n", err);
+               goto abort;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err)
+               pr_warning("Error %d ending transaction\n", err);
+       return;
+abort:
+       xenbus_transaction_end(xbt, 1);
+}
diff --git a/drivers/xen/blkback/xenbus.c b/drivers/xen/blkback/xenbus.c

new file mode 100644 (file)

index 0000000..d7f1c09
--- /dev/null
+++ b/drivers/xen/blkback/xenbus.c
@@ -0,0 +1,622 @@
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/kthread.h>
+#include "common.h"
+#include "../core/domctl.h"
+
+#undef DPRINTK
+#define DPRINTK(fmt, args...)                          \
+       pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
+                __FUNCTION__, __LINE__, ##args)
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+                           unsigned int);
+
+static int blkback_name(blkif_t *blkif, char *buf)
+{
+       char *devpath, *devname;
+       struct xenbus_device *dev = blkif->be->dev;
+
+       devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+       if (IS_ERR(devpath)) 
+               return PTR_ERR(devpath);
+       
+       if ((devname = strstr(devpath, "/dev/")) != NULL)
+               devname += strlen("/dev/");
+       else
+               devname  = devpath;
+
+       snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
+       kfree(devpath);
+       
+       return 0;
+}
+
+static void update_blkif_status(blkif_t *blkif)
+{ 
+       int err;
+       char name[TASK_COMM_LEN];
+
+       /* Not ready to connect? */
+       if (!blkif->irq)
+               return;
+
+       /* Already connected? */
+       if (blkif->be->dev->state == XenbusStateConnected)
+               return;
+
+       /* Attempt to connect: exit if we fail to. */
+       connect(blkif->be);
+       if (blkif->be->dev->state != XenbusStateConnected)
+               return;
+
+       err = blkback_name(blkif, name);
+       if (err) {
+               xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+               return;
+       }
+
+       if (blkif->vbd.bdev) {
+               struct address_space *mapping
+                       = blkif->vbd.bdev->bd_inode->i_mapping;
+
+               err = filemap_write_and_wait(mapping);
+               if (err) {
+                       xenbus_dev_error(blkif->be->dev, err, "block flush");
+                       return;
+               }
+               invalidate_inode_pages2(mapping);
+       }
+
+       blkif->xenblkd = kthread_run(blkif_schedule, blkif, name);
+       if (IS_ERR(blkif->xenblkd)) {
+               err = PTR_ERR(blkif->xenblkd);
+               blkif->xenblkd = NULL;
+               xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+       }
+}
+
+
+/****************************************************************
+ *  sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...)                                        \
+       static ssize_t show_##name(struct device *_dev,                 \
+                                  struct device_attribute *attr,       \
+                                  char *buf)                           \
+       {                                                               \
+               ssize_t ret = -ENODEV;                                  \
+               struct xenbus_device *dev;                              \
+               struct backend_info *be;                                \
+                                                                       \
+               if (!get_device(_dev))                                  \
+                       return ret;                                     \
+               dev = to_xenbus_device(_dev);                           \
+               if ((be = dev_get_drvdata(&dev->dev)) != NULL)          \
+                       ret = sprintf(buf, format, ##args);             \
+               put_device(_dev);                                       \
+               return ret;                                             \
+       }                                                               \
+       static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(br_req,  "%d\n", be->blkif->st_br_req);
+VBD_SHOW(fl_req,  "%d\n", be->blkif->st_fl_req);
+VBD_SHOW(ds_req,  "%d\n", be->blkif->st_ds_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+
+static struct attribute *vbdstat_attrs[] = {
+       &dev_attr_oo_req.attr,
+       &dev_attr_rd_req.attr,
+       &dev_attr_wr_req.attr,
+       &dev_attr_br_req.attr,
+       &dev_attr_fl_req.attr,
+       &dev_attr_ds_req.attr,
+       &dev_attr_rd_sect.attr,
+       &dev_attr_wr_sect.attr,
+       NULL
+};
+
+static const struct attribute_group vbdstat_group = {
+       .name = "statistics",
+       .attrs = vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+/* Create the physical_device and mode attributes plus the statistics group. */
+int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+       int error = device_create_file(&dev->dev, &dev_attr_physical_device);
+
+       if (error)
+               goto fail1;
+
+       error = device_create_file(&dev->dev, &dev_attr_mode);
+       if (error)
+               goto fail2;
+
+       error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group);
+       if (error)
+               goto fail3;
+
+       return 0;
+
+       /* Unwind: each label undoes only the steps that had succeeded. */
+fail3: device_remove_file(&dev->dev, &dev_attr_mode);
+fail2: device_remove_file(&dev->dev, &dev_attr_physical_device);
+fail1: return error;
+}
+
+void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+       sysfs_remove_group(&dev->dev.kobj, &vbdstat_group);
+       device_remove_file(&dev->dev, &dev_attr_mode);
+       device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+/* Tear down a backend device: sysfs entries, watches, blkif, then be itself. */
+static int blkback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("");
+       if (be->major || be->minor)
+               xenvbd_sysfs_delif(dev);
+
+       if (be->backend_watch.node) {
+               unregister_xenbus_watch(&be->backend_watch);
+               kfree(be->backend_watch.node);
+               be->backend_watch.node = NULL;
+       }
+
+       if (be->cdrom_watch.node) {
+               unregister_xenbus_watch(&be->cdrom_watch);
+               kfree(be->cdrom_watch.node);
+               be->cdrom_watch.node = NULL;
+       }
+
+       if (be->blkif) {
+               blkif_disconnect(be->blkif);
+               vbd_free(&be->blkif->vbd);
+               blkif_free(be->blkif);
+               be->blkif = NULL;
+       }
+       kfree(be->mode);        /* xenbus_read() string; NULL if never read */
+       kfree(be);
+       dev_set_drvdata(&dev->dev, NULL);
+       return 0;
+}
+
+void blkback_barrier(struct xenbus_transaction xbt,
+                    struct backend_info *be, int state)
+{
+       struct xenbus_device *dev = be->dev;
+       int err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+                               "%d", state);
+
+       if (err)
+               xenbus_dev_error(dev, err, "writing feature-barrier");
+}
+
+void blkback_flush_diskcache(struct xenbus_transaction xbt,
+                            struct backend_info *be, int state)
+{
+       struct xenbus_device *dev = be->dev;
+       int err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
+                               "%d", state);
+
+       if (err)
+               xenbus_dev_error(dev, err, "writing feature-flush-cache");
+}
+
+/* Publish the discard feature nodes; a vbd with no bdev advertises none. */
+static void blkback_discard(struct xenbus_transaction xbt,
+                           struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       struct vbd *vbd = &be->blkif->vbd;
+       struct request_queue *q = vbd->bdev ? bdev_get_queue(vbd->bdev) : NULL;
+       int err, state = 0;
+
+       if (q && blk_queue_discard(q)) {
+               err = xenbus_printf(xbt, dev->nodename, "discard-granularity",
+                                   "%u", q->limits.discard_granularity);
+               if (!err)
+                       state = 1;
+               else
+                       xenbus_dev_error(dev, err,
+                                        "writing discard-granularity");
+               err = xenbus_printf(xbt, dev->nodename, "discard-alignment",
+                                   "%u", q->limits.discard_alignment);
+               if (err) {
+                       xenbus_dev_error(dev, err, "writing discard-alignment");
+                       state = 0;
+               }
+       }
+
+       /* Optional. */
+       if (state) {
+               err = xenbus_printf(xbt, dev->nodename, "discard-secure",
+                                   "%d", vbd->discard_secure);
+               if (err)
+                       xenbus_dev_error(dev, err, "writing discard-secure");
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+                           "%d", state);
+       if (err)
+               xenbus_dev_error(dev, err, "writing feature-discard");
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers.  Switch to InitWait.
+ */
+static int blkback_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       int err;
+       struct backend_info *be = kzalloc(sizeof(struct backend_info),
+                                         GFP_KERNEL);
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+
+       be->blkif = blkif_alloc(dev->otherend_id);
+       if (IS_ERR(be->blkif)) {
+               err = PTR_ERR(be->blkif);
+               be->blkif = NULL;
+               xenbus_dev_fatal(dev, err, "creating block interface");
+               goto fail;
+       }
+
+       /* setup back pointer */
+       be->blkif->be = be;
+
+       err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
+                                &be->backend_watch, backend_changed);
+       if (err)
+               goto fail;
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       DPRINTK("failed");
+       blkback_remove(dev);
+       return err;
+}
+
+
+/**
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node.  Read it and the mode node, and create a vbd.  If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+                           const char **vec, unsigned int len)
+{
+       int err;
+       unsigned major;
+       unsigned minor;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, backend_watch);
+       struct xenbus_device *dev = be->dev;
+       int cdrom = 0;
+       char *device_type;
+
+       DPRINTK("");
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+                          &major, &minor);
+       if (XENBUS_EXIST_ERR(err)) {
+               /* Since this watch will fire once immediately after it is
+                  registered, we expect this.  Ignore it, and wait for the
+                  hotplug scripts. */
+               return;
+       }
+       if (err != 2) {
+               xenbus_dev_fatal(dev, err, "reading physical-device");
+               return;
+       }
+
+       if ((be->major || be->minor) &&
+           ((be->major != major) || (be->minor != minor))) {
+               pr_warning("blkback: changing physical device (from %x:%x to"
+                          " %x:%x) not supported\n", be->major, be->minor,
+                          major, minor);
+               return;
+       }
+       /* NOTE(review): a repeat firing overwrites be->mode without freeing it. */
+       be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+       if (IS_ERR(be->mode)) {
+               err = PTR_ERR(be->mode);
+               be->mode = NULL;
+               xenbus_dev_fatal(dev, err, "reading mode");
+               return;
+       }
+
+       device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+       if (!IS_ERR(device_type)) {
+               cdrom = strcmp(device_type, "cdrom") == 0;
+               kfree(device_type);
+       }
+
+       if (be->major == 0 && be->minor == 0) {
+               /* Front end dir is a number, which is used as the handle. */
+
+               char *p = strrchr(dev->otherend, '/') + 1;
+               long handle = simple_strtoul(p, NULL, 0);
+
+               be->major = major;
+               be->minor = minor;
+
+               err = vbd_create(be->blkif, handle, major, minor,
+                                FMODE_READ
+                                | (strchr(be->mode, 'w') ? FMODE_WRITE : 0)
+                                | (strchr(be->mode, '!') ? 0 : FMODE_EXCL),
+                                cdrom);
+               switch (err) {
+               case -ENOMEDIUM:        /* accepted only for CD-ROM/removable */
+                       if (be->blkif->vbd.type
+                           & (VDISK_CDROM | VDISK_REMOVABLE))
+               case 0:         /* label inside the if body: jumps to the break */
+                               break;
+               default:        /* also reached when the if above is false */
+                       be->major = be->minor = 0;
+                       xenbus_dev_fatal(dev, err, "creating vbd structure");
+                       return;
+               }
+
+               err = xenvbd_sysfs_addif(dev);
+               if (err) {
+                       vbd_free(&be->blkif->vbd);
+                       be->major = be->minor = 0;
+                       xenbus_dev_fatal(dev, err, "creating sysfs entries");
+                       return;
+               }
+
+               /* We're potentially connected now */
+               update_blkif_status(be->blkif);
+
+               /* Add watch for cdrom media status if necessary */
+               cdrom_add_media_watch(be);
+       }
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       int err;
+
+       DPRINTK("%s", xenbus_strstate(frontend_state));
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosed) {
+                       pr_info("%s: %s: prepare for reconnect\n",
+                               __FUNCTION__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
+       case XenbusStateInitialised:
+       case XenbusStateConnected:
+               /* Ensure we connect even when two watches fire in 
+                  close succession and we miss the intermediate value
+                  of frontend_state. */
+               if (dev->state == XenbusStateConnected)
+                       break;
+
+               /* Enforce precondition before potential leak point.
+                * blkif_disconnect() is idempotent.
+                */
+               blkif_disconnect(be->blkif);
+
+               err = connect_ring(be);
+               if (err)
+                       break;
+               if (be->blkif->vbd.bdev)
+                       update_blkif_status(be->blkif);
+               break;
+
+       case XenbusStateClosing:
+       case XenbusStateClosed:
+               blkif_disconnect(be->blkif);
+               xenbus_switch_state(dev, frontend_state);
+               if (frontend_state != XenbusStateClosed ||
+                   xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               /* implies blkif_disconnect() via blkback_remove() */
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+
+/* ** Connection ** */
+
+
+/**
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+       struct xenbus_transaction xbt;
+       int err;
+       struct xenbus_device *dev = be->dev;
+
+       DPRINTK("%s", dev->otherend);
+
+       /* Supply the information about the device the frontend needs */
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               return;
+       }
+
+       blkback_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+       blkback_barrier(xbt, be, be->blkif->vbd.flush_support);
+       blkback_discard(xbt, be);
+
+       err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+                           vbd_size(&be->blkif->vbd));
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing %s/sectors",
+                                dev->nodename);
+               goto abort;
+       }
+
+       /* FIXME: use a typename instead */
+       err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+                           be->blkif->vbd.type);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing %s/info",
+                                dev->nodename);
+               goto abort;
+       }
+       err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+                           vbd_secsize(&be->blkif->vbd));
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+                                dev->nodename);
+               goto abort;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err)
+               xenbus_dev_fatal(dev, err, "ending transaction");
+
+       err = xenbus_switch_state(dev, XenbusStateConnected);
+       if (err)
+               xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+                                dev->nodename);
+
+       return;
+ abort:
+       xenbus_transaction_end(xbt, 1);
+}
+
+
+/* Fetch the other end's ring-ref, event-channel and protocol; map the ring. */
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned int ring_ref, evtchn;
+       char *protocol;
+       int err;
+
+       DPRINTK("%s", dev->otherend);
+       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%u", &ring_ref,
+                           "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                dev->otherend);
+               return err;
+       }
+
+       be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+       protocol = xenbus_read(XBT_NIL, dev->otherend, "protocol", NULL);
+       if (IS_ERR(protocol)) {
+               protocol = NULL;
+               be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
+#ifndef CONFIG_X86_32
+       } else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) {
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+#endif
+#ifndef CONFIG_X86_64
+       } else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) {
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#endif
+       } else if (0 != strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) {
+               err = -ENOSYS;  /* err was 0 here: xenbus_gather() succeeded */
+               xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+               kfree(protocol);
+               return err;
+       }
+       pr_info("blkback: ring-ref %u, event-channel %u, protocol %d (%s)\n",
+               ring_ref, evtchn, be->blkif->blk_protocol,
+               protocol ?: "unspecified");
+       kfree(protocol);
+
+       /* Map the shared frame, irq etc. */
+       err = blkif_map(be->blkif, ring_ref, evtchn);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "mapping ring-ref %u port %u",
+                                ring_ref, evtchn);
+               return err;
+       }
+       return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+static const struct xenbus_device_id blkback_ids[] = {
+       { "vbd" },
+       { "" }
+};
+
+static DEFINE_XENBUS_DRIVER(blkback, ,
+       .probe = blkback_probe,
+       .remove = blkback_remove,
+       .otherend_changed = frontend_changed
+);
+
+
+void blkif_xenbus_init(void)
+{
+       WARN_ON(xenbus_register_backend(&blkback_driver));
+}
diff --git a/drivers/xen/blkfront/Makefile b/drivers/xen/blkfront/Makefile

new file mode 100644 (file)

index 0000000..1ca0bed
--- /dev/null
+++ b/drivers/xen/blkfront/Makefile
@@ -0,0 +1,5 @@
+
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      := xenblk.o
+
+xenblk-objs := blkfront.o vbd.o vcd.o
+
diff --git a/drivers/xen/blkfront/blkfront.c b/drivers/xen/blkfront/blkfront.c

new file mode 100644 (file)

index 0000000..7e910b4
--- /dev/null
+++ b/drivers/xen/blkfront/blkfront.c
@@ -0,0 +1,1222 @@
+/******************************************************************************
+ * blkfront.c
+ * 
+ * XenLinux virtual block-device driver.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include "block.h"
+#include <linux/cdrom.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/scatterlist.h>
+#include <scsi/scsi.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/protocols.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define BLKIF_STATE_DISCONNECTED 0
+#define BLKIF_STATE_CONNECTED    1
+#define BLKIF_STATE_SUSPENDED    2
+
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+
+static void connect(struct blkfront_info *);
+static void blkfront_closing(struct blkfront_info *);
+static int blkfront_remove(struct xenbus_device *);
+static int talk_to_backend(struct xenbus_device *, struct blkfront_info *);
+static int setup_blkring(struct xenbus_device *, struct blkfront_info *);
+
+static void kick_pending_request_queues(struct blkfront_info *);
+
+static irqreturn_t blkif_int(int irq, void *dev_id);
+static void blkif_restart_queue(struct work_struct *arg);
+static int blkif_recover(struct blkfront_info *);
+static void blkif_completion(struct blk_shadow *);
+static void blkif_free(struct blkfront_info *, int);
+
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those.  Switch to
+ * Initialised state.
+ *
+ * Returns 0 on success.  Fails with -ENXIO when a CDROM is left to the native
+ * driver (builds without CONFIG_XEN only), with -ENOMEM when the info
+ * structure cannot be allocated, and otherwise with whatever xenbus_read(),
+ * xenbus_scanf() or talk_to_backend() reported.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+                         const struct xenbus_device_id *id)
+{
+       int err, vdevice, i;
+       struct blkfront_info *info;
+
+#ifndef CONFIG_XEN /* For HVM guests, do not take over CDROM devices. */
+       char *type;
+
+       type = xenbus_read(XBT_NIL, dev->nodename, "device-type", NULL);
+       if (IS_ERR(type)) {
+               xenbus_dev_fatal(dev, PTR_ERR(type), "reading dev type");
+               return PTR_ERR(type);
+       }
+       if (!strncmp(type, "cdrom", 5)) {
+               /*
+                * We are handed a cdrom device in a hvm guest; let the
+                * native cdrom driver handle this device.
+                */
+               kfree(type);
+               pr_notice("blkfront: ignoring CDROM %s\n", dev->nodename);
+               return -ENXIO;
+       }
+       kfree(type);
+#endif
+
+       /* FIXME: Use dynamic device id if this is not set. */
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                          "virtual-device", "%i", &vdevice);
+       if (err != 1) {
+               /* go looking in the extended area instead */
+               err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
+                       "%i", &vdevice);
+               if (err != 1) {
+                       xenbus_dev_fatal(dev, err, "reading virtual-device");
+                       return err;
+               }
+       }
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+               return -ENOMEM;
+       }
+
+       spin_lock_init(&info->io_lock);
+       mutex_init(&info->mutex);
+       info->xbdev = dev;
+       info->vdevice = vdevice;
+       info->connected = BLKIF_STATE_DISCONNECTED;
+       INIT_WORK(&info->work, blkif_restart_queue);
+
+       for (i = 0; i < BLK_RING_SIZE; i++) /* req.id of a free slot holds the index of the next free slot */
+               info->shadow[i].req.id = i+1;
+       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; /* out-of-range id marks the end of the free list */
+
+       /* Front end dir is a number, which is used as the id. */
+       info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0);
+       dev_set_drvdata(&dev->dev, info);
+
+       err = talk_to_backend(dev, info);
+       if (err) {
+               kfree(info);
+               dev_set_drvdata(&dev->dev, NULL);
+               return err;
+       }
+
+       return 0;
+}
+
+
+/**
+ * Reconnect to the backend after a suspend/resume cycle or a backend driver
+ * restart.  The blkif state is released and set up again through
+ * talk_to_backend(); the device-layer structures are left alone so the rest
+ * of the kernel does not notice.  If the device is in the suspended state
+ * once the backend has been contacted successfully, blkif_recover() is run
+ * and its result returned.
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+       struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+       int was_connected = (info->connected == BLKIF_STATE_CONNECTED);
+       int rc;
+
+       DPRINTK("blkfront_resume: %s\n", dev->nodename);
+
+       blkif_free(info, was_connected);
+
+       rc = talk_to_backend(dev, info);
+       if (rc)
+               return rc;
+
+       if (info->connected != BLKIF_STATE_SUSPENDED)
+               return 0;
+
+       return blkif_recover(info);
+}
+
+
+/*
+ * Common code used when first setting up, and when resuming.
+ *
+ * Creates the ring via setup_blkring(), then writes "ring-ref",
+ * "event-channel" and "protocol" under dev->nodename inside one xenbus
+ * transaction (restarted when ending it yields -EAGAIN) and switches the
+ * device to XenbusStateInitialised.  Returns 0 on success.  If a step after
+ * the ring setup fails, blkif_free(info, 0) is called and the error returned.
+ */
+static int talk_to_backend(struct xenbus_device *dev,
+                          struct blkfront_info *info)
+{
+       const char *message = NULL;
+       struct xenbus_transaction xbt;
+       int err;
+
+       /* Create shared ring, alloc event channel. */
+       err = setup_blkring(dev, info);
+       if (err)
+               goto out;
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               goto destroy_blkring;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename,
+                           "ring-ref","%u", info->ring_ref);
+       if (err) {
+               message = "writing ring-ref";
+               goto abort_transaction;
+       }
+       err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                           irq_to_evtchn_port(info->irq));
+       if (err) {
+               message = "writing event-channel";
+               goto abort_transaction;
+       }
+       err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+                           XEN_IO_PROTO_ABI_NATIVE);
+       if (err) {
+               message = "writing protocol";
+               goto abort_transaction;
+       }
+
+       err = xenbus_transaction_end(xbt, 0); /* second argument 0: presumably "commit" (the abort path passes 1) */
+       if (err) {
+               if (err == -EAGAIN)
+                       goto again;
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto destroy_blkring;
+       }
+
+       xenbus_switch_state(dev, XenbusStateInitialised);
+
+       return 0;
+
+ abort_transaction:
+       xenbus_transaction_end(xbt, 1); /* its result is ignored; err from the failed write is kept */
+       if (message)
+               xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+       blkif_free(info, 0);
+ out:
+       return err;
+}
+
+
+/*
+ * Allocate the shared ring page, grant it to the backend and bind an event
+ * channel whose interrupts are handled by blkif_int().
+ *
+ * On success info->ring, info->ring_ref and info->irq are set and 0 is
+ * returned.  On failure a negative errno is returned; once the page has been
+ * allocated, blkif_free(info, 0) is called before returning.
+ */
+static int setup_blkring(struct xenbus_device *dev,
+                        struct blkfront_info *info)
+{
+       blkif_sring_t *sring;
+       int err;
+
+       info->ring_ref = GRANT_INVALID_REF;
+
+       sring = (blkif_sring_t *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+       if (!sring) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+               return -ENOMEM;
+       }
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+       sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+       if (err < 0) {
+               /* The page was never granted: release it here and forget it. */
+               free_page((unsigned long)sring);
+               info->ring.sring = NULL;
+               goto fail;
+       }
+       info->ring_ref = err;
+
+       err = bind_listening_port_to_irqhandler(
+               dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
+       if (err <= 0) {
+               /*
+                * A return of 0 is treated as a failure as well; make sure
+                * it is not handed back to the caller as "success".
+                */
+               if (!err)
+                       err = -EINVAL;
+               xenbus_dev_fatal(dev, err,
+                                "bind_listening_port_to_irqhandler");
+               goto fail;
+       }
+       info->irq = err;
+
+       return 0;
+fail:
+       blkif_free(info, 0);
+       return err;
+}
+
+
+/**
+ * Callback received when the backend's state changes.
+ *
+ * Only two backend states cause work here: Connected runs connect(), and
+ * Closing tears the device down unless the block device is still open, in
+ * which case an -EBUSY error is reported and this device is switched to
+ * XenbusStateClosing.  Every other state is ignored.
+ */
+static void backend_changed(struct xenbus_device *dev,
+                           enum xenbus_state backend_state)
+{
+       struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+       struct block_device *bd;
+
+       DPRINTK("blkfront:backend_changed.\n");
+
+       switch (backend_state) {
+       case XenbusStateInitialising:
+       case XenbusStateInitWait:
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               break;
+
+       case XenbusStateConnected:
+               connect(info);
+               break;
+
+       case XenbusStateClosing:
+               mutex_lock(&info->mutex);
+               if (dev->state == XenbusStateClosing) { /* we are already in Closing: nothing more to do */
+                       mutex_unlock(&info->mutex);
+                       break;
+               }
+
+               bd = info->gd ? bdget_disk(info->gd, 0) : NULL;
+
+               mutex_unlock(&info->mutex);
+
+               if (bd == NULL) { /* no block device to close */
+                       xenbus_frontend_closed(dev);
+                       break;
+               }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+               down(&bd->bd_sem);
+#else
+               mutex_lock(&bd->bd_mutex);
+#endif
+               if (bd->bd_openers) {
+                       xenbus_dev_error(dev, -EBUSY,
+                                        "Device in use; refusing to close");
+                       xenbus_switch_state(dev, XenbusStateClosing); /* blkif_release() checks for this state on the last close */
+               } else
+                       blkfront_closing(info);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+               up(&bd->bd_sem);
+#else
+               mutex_unlock(&bd->bd_mutex);
+#endif
+               bdput(bd); /* pairs with bdget_disk() above */
+               break;
+       }
+}
+
+
+/* ** Connection ** */
+
+/*
+ * Read the backend's discard parameters and record them in @info.
+ *
+ * The backend "type" node selects the behaviour: for "phy" backends discard
+ * is enabled only if both granularity and alignment can be read, and the
+ * optional "discard-secure" value is stored; for "file" backends discard is
+ * simply enabled.  Nothing is changed if "type" cannot be read.
+ */
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+       unsigned int granularity;
+       unsigned int alignment;
+       int secure;
+       int rc;
+       char *backend_type;
+
+       backend_type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type",
+                                  NULL);
+       if (IS_ERR(backend_type))
+               return;
+
+       info->feature_secdiscard = 0;
+
+       if (!strncmp(backend_type, "phy", 3)) {
+               rc = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+                                  "discard-granularity", "%u", &granularity,
+                                  "discard-alignment", "%u", &alignment,
+                                  NULL);
+               if (rc == 0) {
+                       info->feature_discard = 1;
+                       info->discard_granularity = granularity;
+                       info->discard_alignment = alignment;
+               }
+
+               rc = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                 "discard-secure", "%d", &secure);
+               if (rc == 1)
+                       info->feature_secdiscard = secure;
+       } else if (!strncmp(backend_type, "file", 4)) {
+               info->feature_discard = 1;
+       }
+
+       kfree(backend_type);
+}
+
+/*
+ * Invoked when the backend is finally 'ready' (and has produced
+ * the details about the physical device - #sectors, size, etc).
+ *
+ * If the device is already connected, only the capacity (and, when the
+ * backend publishes one, the logical block size) is refreshed; if it is
+ * suspended nothing is done.  Otherwise the backend's parameters and
+ * feature flags are read, the disk is created via xlvbd_add(), this device
+ * switches to XenbusStateConnected and the disk is added.
+ */
+static void connect(struct blkfront_info *info)
+{
+       unsigned long long sectors;
+       unsigned long sector_size;
+       unsigned int binfo;
+       int err, barrier, flush, discard;
+
+       switch (info->connected) {
+       case BLKIF_STATE_CONNECTED:
+               /*
+                * Potentially, the back-end may be signalling
+                * a capacity change; update the capacity.
+                */
+               err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                  "sectors", "%Lu", &sectors);
+               if (err != 1) /* no size could be read: leave everything as it is */
+                       return;
+               err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                  "sector-size", "%lu", &sector_size);
+               if (err != 1)
+                       sector_size = 0; /* 0 means "keep the current block size" below */
+               if (sector_size)
+                       blk_queue_logical_block_size(info->gd->queue,
+                                                    sector_size);
+               pr_info("Setting capacity to %Lu\n", sectors);
+               set_capacity(info->gd, sectors);
+               revalidate_disk(info->gd);
+
+               /* fall through */
+       case BLKIF_STATE_SUSPENDED:
+               return;
+       }
+
+       DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
+
+       err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+                           "sectors", "%Lu", &sectors,
+                           "info", "%u", &binfo,
+                           "sector-size", "%lu", &sector_size,
+                           NULL);
+       if (err) {
+               xenbus_dev_fatal(info->xbdev, err,
+                                "reading backend fields at %s",
+                                info->xbdev->otherend);
+               return;
+       }
+
+       info->feature_flush = 0;
+       info->flush_op = 0; /* 0 = no flush operation; do_blkif_request() then fails flush/FUA requests */
+
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "feature-barrier", "%d", &barrier);
+       /*
+        * If there's no "feature-barrier" defined, then it means
+        * we're dealing with a very old backend which writes
+        * synchronously; nothing to do.
+        *
+        * If there are barriers, then we use flush.
+        */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+       if (err > 0 && barrier) {
+               info->feature_flush = REQ_FLUSH | REQ_FUA;
+               info->flush_op = BLKIF_OP_WRITE_BARRIER;
+       }
+       /*
+        * And if there is "feature-flush-cache" use that above
+        * barriers.
+        */
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "feature-flush-cache", "%d", &flush);
+       if (err > 0 && flush) {
+               info->feature_flush = REQ_FLUSH;
+               info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
+       }
+#else /* kernels before 2.6.37: feature_flush holds a QUEUE_ORDERED_* mode instead */
+       if (err <= 0)
+               info->feature_flush = QUEUE_ORDERED_DRAIN;
+       else if (barrier)
+               info->feature_flush = QUEUE_ORDERED_TAG;
+       else
+               info->feature_flush = QUEUE_ORDERED_NONE;
+#endif
+
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "feature-discard", "%d", &discard);
+
+       if (err > 0 && discard)
+               blkfront_setup_discard(info);
+
+       err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
+       if (err) {
+               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+                                info->xbdev->otherend);
+               return;
+       }
+
+       err = xlvbd_sysfs_addif(info);
+       if (err) { /* NOTE(review): returns without undoing xlvbd_add() - confirm whether that is cleaned up elsewhere */
+               xenbus_dev_fatal(info->xbdev, err, "xlvbd_sysfs_addif at %s",
+                                info->xbdev->otherend);
+               return;
+       }
+
+       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+       /* Kick pending requests. */
+       spin_lock_irq(&info->io_lock);
+       info->connected = BLKIF_STATE_CONNECTED;
+       kick_pending_request_queues(info);
+       spin_unlock_irq(&info->io_lock);
+
+       add_disk(info->gd);
+
+       info->is_ready = 1;
+
+       register_vcd(info);
+}
+
+/**
+ * Handle the change of state of the backend to Closing.  We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once this is done, we can switch to Closed in
+ * acknowledgement.
+ *
+ * If no request queue was ever set up, only the acknowledgement is sent.
+ */
+static void blkfront_closing(struct blkfront_info *info)
+{
+       unsigned long flags;
+
+       DPRINTK("blkfront_closing: %d removed\n", info->vdevice);
+
+       if (info->rq == NULL)
+               goto out;
+
+       spin_lock_irqsave(&info->io_lock, flags);
+       /* No more blkif_request(). */
+       blk_stop_queue(info->rq);
+       /* No more gnttab callback work. */
+       gnttab_cancel_free_callback(&info->callback);
+       spin_unlock_irqrestore(&info->io_lock, flags);
+
+       /* Flush gnttab callback work. Must be done with no locks held. */
+       flush_work_sync(&info->work);
+
+       xlvbd_sysfs_delif(info);
+
+       unregister_vcd(info);
+
+       xlvbd_del(info);
+
+ out:
+       if (info->xbdev) /* blkfront_remove() sets xbdev to NULL before calling in here */
+               xenbus_frontend_closed(info->xbdev);
+}
+
+/*
+ * Tear-down for a frontend device that is going away.  Calls blkif_free()
+ * and detaches info from the xenbus device.  If there is no block device,
+ * info is freed immediately.  Otherwise the disk is torn down here only
+ * when nobody holds it open; with open handles left, cleanup is deferred
+ * to blkif_release().  Always returns 0.
+ */
+static int blkfront_remove(struct xenbus_device *dev)
+{
+       struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+       struct block_device *bd;
+       struct gendisk *disk;
+
+       DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
+
+       blkif_free(info, 0);
+
+       mutex_lock(&info->mutex);
+
+       disk = info->gd;
+       bd = disk ? bdget_disk(disk, 0) : NULL;
+
+       info->xbdev = NULL; /* blkif_release() and blkfront_closing() test this to detect removal */
+       mutex_unlock(&info->mutex);
+
+       if (!bd) {
+               kfree(info);
+               return 0;
+       }
+
+       /*
+        * The xbdev was removed before we reached the Closed
+        * state. See if it's safe to remove the disk. If the bdev
+        * isn't closed yet, we let release take care of it.
+        */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+       down(&bd->bd_sem);
+#else
+       mutex_lock(&bd->bd_mutex);
+#endif
+       info = disk->private_data; /* re-read under the bdev lock; blkif_release() may have cleared it */
+
+       dev_warn(disk_to_dev(disk),
+                "%s was hot-unplugged, %d stale handles\n",
+                dev->nodename, bd->bd_openers);
+
+       if (info && !bd->bd_openers) {
+               blkfront_closing(info);
+               disk->private_data = NULL;
+               kfree(info);
+       }
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+       up(&bd->bd_sem);
+#else
+       mutex_unlock(&bd->bd_mutex);
+#endif
+       bdput(bd); /* pairs with bdget_disk() above */
+
+       return 0;
+}
+
+
+/*
+ * Pop the first entry off the shadow free list and return its index.
+ * Free entries are chained through their req.id field; an index outside
+ * the ring (exhausted list) triggers BUG_ON.
+ */
+static inline int GET_ID_FROM_FREELIST(struct blkfront_info *info)
+{
+       unsigned long head = info->shadow_free;
+       struct blk_shadow *slot;
+
+       BUG_ON(head >= BLK_RING_SIZE);
+
+       slot = &info->shadow[head];
+       info->shadow_free = slot->req.id;
+       slot->req.id = 0x0fffffee; /* debug */
+
+       return head;
+}
+
+/*
+ * Push shadow entry @id back onto the head of the free list and drop its
+ * request pointer.
+ */
+static inline void ADD_ID_TO_FREELIST(struct blkfront_info *info,
+                                      unsigned long id)
+{
+       struct blk_shadow *slot = &info->shadow[id];
+
+       slot->request = NULL;
+       slot->req.id = info->shadow_free;
+       info->shadow_free = id;
+}
+
+/*
+ * Push the queued requests to the ring and signal the other end through
+ * info->irq when the ring macro says a notification is needed.
+ */
+static inline void flush_requests(struct blkfront_info *info)
+{
+       int must_notify;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, must_notify);
+       if (!must_notify)
+               return;
+
+       notify_remote_via_irq(info->irq);
+}
+
+/*
+ * Restart the request queue and immediately process it, unless the ring is
+ * currently full.
+ */
+static void kick_pending_request_queues(struct blkfront_info *info)
+{
+       if (RING_FULL(&info->ring))
+               return;
+
+       /* Re-enable calldowns, then kick things off right away. */
+       blk_start_queue(info->rq);
+       do_blkif_request(info->rq);
+}
+
+/*
+ * Work handler: under io_lock, restart the request queue if the device is
+ * still in the connected state.
+ */
+static void blkif_restart_queue(struct work_struct *arg)
+{
+       struct blkfront_info *owner;
+
+       owner = container_of(arg, struct blkfront_info, work);
+
+       spin_lock_irq(&owner->io_lock);
+       if (owner->connected == BLKIF_STATE_CONNECTED) {
+               kick_pending_request_queues(owner);
+       }
+       spin_unlock_irq(&owner->io_lock);
+}
+
+/*
+ * Grant-table free callback (registered by blkif_queue_request()): defer
+ * the queue restart to the work item in the blkfront_info passed as @arg.
+ */
+static void blkif_restart_queue_callback(void *arg)
+{
+       struct blkfront_info *owner = arg;
+
+       schedule_work(&owner->work);
+}
+
+/*
+ * Open hook for the block device.  Returns -ERESTARTSYS when the disk's
+ * private data is gone or the info structure no longer has a gendisk,
+ * 0 otherwise.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+int blkif_open(struct inode *inode, struct file *filep)
+{
+       struct block_device *bd = inode->i_bdev;
+#else
+int blkif_open(struct block_device *bd, fmode_t mode)
+{
+#endif
+       struct blkfront_info *info = bd->bd_disk->private_data;
+       int closed;
+
+       /* xbdev gone */
+       if (!info)
+               return -ERESTARTSYS;
+
+       mutex_lock(&info->mutex);
+       closed = !info->gd;     /* xbdev is closed */
+       mutex_unlock(&info->mutex);
+
+       return closed ? -ERESTARTSYS : 0;
+}
+
+/*
+ * Release hook for the block device.  Returns early while bd_openers is
+ * still non-zero.  Otherwise it completes a close that backend_changed()
+ * deferred (xenbus state Closing) or, if the xenbus device has already
+ * been removed, tears down the disk and frees info.  Always returns 0.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+int blkif_release(struct inode *inode, struct file *filep)
+{
+       struct gendisk *disk = inode->i_bdev->bd_disk;
+#else
+int blkif_release(struct gendisk *disk, fmode_t mode)
+{
+#endif
+       struct blkfront_info *info = disk->private_data;
+       struct xenbus_device *xbdev;
+       struct block_device *bd = bdget_disk(disk, 0);
+
+       bdput(bd); /* NOTE(review): bd is read on the next line after this put - presumably safe because the caller still holds the device open; confirm */
+       if (bd->bd_openers)
+               return 0;
+
+       /*
+        * Check if we have been instructed to close. We will have
+        * deferred this request, because the bdev was still open.
+        */
+       mutex_lock(&info->mutex);
+       xbdev = info->xbdev;
+
+       if (xbdev && xbdev->state == XenbusStateClosing) {
+               /* pending switch to state closed */
+               dev_info(disk_to_dev(disk), "releasing disk\n");
+               blkfront_closing(info);
+       }
+
+       mutex_unlock(&info->mutex);
+
+       if (!xbdev) { /* blkfront_remove() cleared info->xbdev while the disk was still open */
+               /* sudden device removal */
+               dev_info(disk_to_dev(disk), "releasing disk\n");
+               blkfront_closing(info);
+               disk->private_data = NULL;
+               kfree(info);
+       }
+
+       return 0;
+}
+
+
+/*
+ * ioctl handler for the virtual block device.
+ *
+ * CDROMMULTISESSION zero-fills the caller's struct cdrom_multisession,
+ * CDROM_GET_CAPABILITY returns 0 only for disks flagged GENHD_FL_CD, and
+ * (on kernels before 2.6.16) HDIO_GETGEO is answered via blkif_getgeo().
+ * Any other command is handed to the SCSI ioctl helper when the device
+ * uses a SCSI disk or CD-ROM major; everything else yields -EINVAL.
+ *
+ * Returns 0 or the helper's result on success, -EFAULT when user memory
+ * cannot be written, -EINVAL for unsupported commands.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+int blkif_ioctl(struct inode *inode, struct file *filep,
+               unsigned command, unsigned long argument)
+{
+       struct block_device *bd = inode->i_bdev;
+#else
+int blkif_ioctl(struct block_device *bd, fmode_t mode,
+               unsigned command, unsigned long argument)
+{
+#endif
+       struct blkfront_info *info = bd->bd_disk->private_data;
+
+       /*
+        * Use bd->bd_dev for the device number: bd exists with both
+        * prototypes, whereas inode is only declared before 2.6.28.
+        */
+       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n",
+                     command, argument, bd->bd_dev);
+
+       switch (command) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+       case HDIO_GETGEO: {
+               struct hd_geometry geo;
+               int ret;
+
+               if (!argument)
+                       return -EINVAL;
+
+               geo.start = get_start_sect(bd);
+               ret = blkif_getgeo(bd, &geo);
+               if (ret)
+                       return ret;
+
+               if (copy_to_user((struct hd_geometry __user *)argument, &geo,
+                                sizeof(geo)))
+                       return -EFAULT;
+
+               return 0;
+       }
+#endif
+       case CDROMMULTISESSION:
+               DPRINTK("FIXME: support multisession CDs later\n");
+               /* Report "no multisession info" by zeroing the whole struct. */
+               if (clear_user((void __user *)argument,
+                              sizeof(struct cdrom_multisession)))
+                       return -EFAULT;
+               return 0;
+
+       case CDROM_GET_CAPABILITY:
+               if (info->gd && (info->gd->flags & GENHD_FL_CD))
+                       return 0;
+               return -EINVAL;
+
+       default:
+               if (info->mi && info->gd && info->rq) {
+                       switch (info->mi->major) {
+                       case SCSI_DISK0_MAJOR:
+                       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
+                       case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
+                       case SCSI_CDROM_MAJOR:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+                               return scsi_cmd_ioctl(filep, info->gd, command,
+                                                     (void __user *)argument);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+                               return scsi_cmd_ioctl(filep, info->rq,
+                                                     info->gd, command,
+                                                     (void __user *)argument);
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0)
+                               return scsi_cmd_ioctl(info->rq, info->gd,
+                                                     mode, command,
+                                                     (void __user *)argument);
+#else
+                               return scsi_cmd_blk_ioctl(bd, mode, command,
+                                                         (void __user *)argument);
+#endif
+                       }
+               }
+
+               return -EINVAL; /* same return as native Linux */
+       }
+
+       /* Not reached: every case above returns. */
+       return 0;
+}
+
+
+/*
+ * Fill @hg with a made-up geometry that is consistent with the size of the
+ * device: 0xff heads, 0x3f sectors per track and the matching cylinder
+ * count.  If the stored cylinder count, rounded up by one, still cannot
+ * cover the device, 0xffff is reported instead.  Always returns 0.
+ */
+int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+       const sector_t total = get_capacity(bd->bd_disk);
+       sector_t cyls = total;
+       sector_t covered;
+
+       hg->heads = 0xff;
+       hg->sectors = 0x3f;
+
+       sector_div(cyls, hg->heads * hg->sectors);
+       hg->cylinders = cyls;
+
+       covered = (sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors;
+       if (covered < total)
+               hg->cylinders = 0xffff;
+
+       return 0;
+}
+
+
+/*
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
+ *
+ * @req: a request struct
+ *
+ * Returns 0 once the request has been placed in the next private ring
+ * slot, or 1 if it could not be queued: either the device is not connected,
+ * or no grant references were available, in which case a callback is
+ * registered to restart the queue later.  The ring is not pushed here;
+ * do_blkif_request() does that through flush_requests().
+ */
+static int blkif_queue_request(struct request *req)
+{
+       struct blkfront_info *info = req->rq_disk->private_data;
+       unsigned long buffer_mfn;
+       blkif_request_t *ring_req;
+       unsigned long id;
+       unsigned int fsect, lsect;
+       int i, ref;
+       grant_ref_t gref_head;
+       struct scatterlist *sg;
+
+       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+               return 1;
+
+       if (gnttab_alloc_grant_references(
+               BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
+               gnttab_request_free_callback(
+                       &info->callback,
+                       blkif_restart_queue_callback,
+                       info,
+                       BLKIF_MAX_SEGMENTS_PER_REQUEST);
+               return 1;
+       }
+
+       /* Fill out a communications ring structure. */
+       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+       id = GET_ID_FROM_FREELIST(info);
+       info->shadow[id].request = req;
+
+       ring_req->id = id;
+       ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
+       ring_req->handle = info->handle;
+
+       ring_req->operation = rq_data_dir(req) ?
+               BLKIF_OP_WRITE : BLKIF_OP_READ;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+       if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+#else
+       if (req->cmd_flags & REQ_HARDBARRIER)
+#endif
+               ring_req->operation = info->flush_op; /* the flush operation chosen in connect() */
+       if (req->cmd_type == REQ_TYPE_BLOCK_PC)
+               ring_req->operation = BLKIF_OP_PACKET;
+
+       if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
+               struct blkif_request_discard *discard = (void *)ring_req; /* same ring slot, accessed through the discard layout */
+
+               /* id, sector_number and handle are set above. */
+               discard->operation = BLKIF_OP_DISCARD;
+               discard->flag = 0;
+               discard->nr_sectors = blk_rq_sectors(req);
+               if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+                       discard->flag = BLKIF_DISCARD_SECURE;
+       } else {
+               ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
+               BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+               for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
+                       buffer_mfn = page_to_phys(sg_page(sg)) >> PAGE_SHIFT;
+                       fsect = sg->offset >> 9; /* first 512-byte sector of this segment */
+                       lsect = fsect + (sg->length >> 9) - 1; /* last 512-byte sector, inclusive */
+                       /* install a grant reference. */
+                       ref = gnttab_claim_grant_reference(&gref_head);
+                       BUG_ON(ref == -ENOSPC);
+
+                       gnttab_grant_foreign_access_ref(
+                               ref,
+                               info->xbdev->otherend_id,
+                               buffer_mfn,
+                               rq_data_dir(req) ? GTF_readonly : 0 ); /* for a write the page is granted read-only */
+
+                       info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
+                       ring_req->seg[i] =
+                               (struct blkif_request_segment) {
+                                       .gref       = ref,
+                                       .first_sect = fsect,
+                                       .last_sect  = lsect };
+               }
+       }
+
+       info->ring.req_prod_pvt++;
+
+       /* Keep a private copy so we can reissue requests when recovering. */
+       info->shadow[id].req = *ring_req;
+
+       gnttab_free_grant_references(gref_head); /* presumably returns the references that were not claimed above */
+
+       return 0;
+}
+
+/*
+ * do_blkif_request
+ *  read a block; request is in a request queue
+ *
+ *  Moves requests from @rq onto the ring until the queue is empty, the
+ *  ring is full, or blkif_queue_request() refuses a request; in the latter
+ *  two cases the queue is stopped.  Requests of an unsupported type, and
+ *  flush/FUA requests when no flush operation is set, are completed with
+ *  -EIO.  The ring is pushed once at the end if anything was queued.
+ */
+void do_blkif_request(struct request_queue *rq)
+{
+       struct blkfront_info *info = NULL;
+       struct request *req;
+       int queued;
+
+       DPRINTK("Entered do_blkif_request\n");
+
+       queued = 0;
+
+       while ((req = blk_peek_request(rq)) != NULL) {
+               info = req->rq_disk->private_data;
+
+               if (RING_FULL(&info->ring))
+                       goto wait;
+
+               blk_start_request(req);
+
+               if ((req->cmd_type != REQ_TYPE_FS &&
+                    (req->cmd_type != REQ_TYPE_BLOCK_PC || req->cmd_len)) ||
+                   ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
+                    !info->flush_op)) {
+                       req->errors = (DID_ERROR << 16) |
+                                     (DRIVER_INVALID << 24);
+                       __blk_end_request_all(req, -EIO);
+                       continue;
+               }
+
+               DPRINTK("do_blk_req %p: cmd %p, sec %llx, "
+                       "(%u/%u) buffer:%p [%s]\n",
+                       req, req->cmd, (long long)blk_rq_pos(req),
+                       blk_rq_cur_sectors(req), blk_rq_sectors(req),
+                       req->buffer, rq_data_dir(req) ? "write" : "read");
+
+               if (blkif_queue_request(req)) {
+                       blk_requeue_request(rq, req); /* not queued: hand the started request back to the block layer */
+               wait:
+                       /* Avoid pointless unplugs. */
+                       blk_stop_queue(rq);
+                       break;
+               }
+
+               queued++;
+       }
+
+       if (queued != 0) /* info is non-NULL here: it is set before any request can be counted */
+               flush_requests(info);
+}
+
+
+static irqreturn_t blkif_int(int irq, void *dev_id)
+{
+       struct request *req;
+       blkif_response_t *bret;
+       RING_IDX i, rp;
+       unsigned long flags;
+       struct blkfront_info *info = (struct blkfront_info *)dev_id;
+
+       spin_lock_irqsave(&info->io_lock, flags);
+
+       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+               spin_unlock_irqrestore(&info->io_lock, flags);
+               return IRQ_HANDLED;
+       }
+
+ again:
+       rp = info->ring.sring->rsp_prod;
+       rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+       for (i = info->ring.rsp_cons; i != rp; i++) {
+               unsigned long id;
+               int ret;
+
+               bret = RING_GET_RESPONSE(&info->ring, i);
+               id   = bret->id;
+               req  = info->shadow[id].request;
+
+               blkif_completion(&info->shadow[id]);
+
+               ADD_ID_TO_FREELIST(info, id);
+
+               ret = bret->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+               switch (bret->operation) {
+                       const char *what;
+
+               case BLKIF_OP_FLUSH_DISKCACHE:
+               case BLKIF_OP_WRITE_BARRIER:
+                       what = bret->operation == BLKIF_OP_WRITE_BARRIER ?
+                              "write barrier" : "flush disk cache";
+                       if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+                               pr_warn("blkfront: %s: %s op failed\n",
+                                       what, info->gd->disk_name);
+                               ret = -EOPNOTSUPP;
+                       }
+                       if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+                                    info->shadow[id].req.nr_segments == 0)) {
+                               pr_warn("blkfront: %s: empty %s op failed\n",
+                                       what, info->gd->disk_name);
+                               ret = -EOPNOTSUPP;
+                       }
+                       if (unlikely(ret)) {
+                               if (ret == -EOPNOTSUPP)
+                                       ret = 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+                               info->feature_flush = 0;
+#else
+                               info->feature_flush = QUEUE_ORDERED_NONE;
+#endif
+                               xlvbd_flush(info);
+                       }
+                       /* fall through */
+               case BLKIF_OP_READ:
+               case BLKIF_OP_WRITE:
+               case BLKIF_OP_PACKET:
+                       if (unlikely(bret->status != BLKIF_RSP_OKAY))
+                               DPRINTK("Bad return from blkdev data "
+                                       "request: %x\n", bret->status);
+
+                       __blk_end_request_all(req, ret);
+                       break;
+               case BLKIF_OP_DISCARD:
+                       if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+                               struct request_queue *rq = info->rq;
+
+                               pr_warn("blkfront: %s: discard op failed\n",
+                                       info->gd->disk_name);
+                               ret = -EOPNOTSUPP;
+                               info->feature_discard = 0;
+                               info->feature_secdiscard = 0;
+                               queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+                               queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+                       }
+                       __blk_end_request_all(req, ret);
+                       break;
+               default:
+                       BUG();
+               }
+       }
+
+       info->ring.rsp_cons = i;
+
+       if (i != info->ring.req_prod_pvt) {
+               int more_to_do;
+               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+               if (more_to_do)
+                       goto again;
+       } else
+               info->ring.sring->rsp_event = i + 1;
+
+       kick_pending_request_queues(info);
+
+       spin_unlock_irqrestore(&info->io_lock, flags);
+
+       return IRQ_HANDLED;
+}
+/* Stop I/O and release the ring grant and irq; @suspend selects the state left behind. */
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+       /* Prevent new requests being issued until we fix things up. */
+       spin_lock_irq(&info->io_lock);
+       info->connected = suspend ?
+               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+       /* No more blkif_request(). */
+       if (info->rq)
+               blk_stop_queue(info->rq);
+       /* No more gnttab callback work. */
+       gnttab_cancel_free_callback(&info->callback);
+       spin_unlock_irq(&info->io_lock);
+
+       /* Flush gnttab callback work. Must be done with no locks held. */
+       flush_work_sync(&info->work);
+
+       /* Free resources associated with old device channel. */
+       if (info->ring_ref != GRANT_INVALID_REF) {
+               gnttab_end_foreign_access(info->ring_ref, 
+                                         (unsigned long)info->ring.sring);
+               info->ring_ref = GRANT_INVALID_REF;
+               info->ring.sring = NULL;
+       }
+       if (info->irq) /* a zero irq means no handler is bound */
+               unbind_from_irqhandler(info->irq, info);
+       info->irq = 0;
+}
+/* End foreign access on every segment grant of a finished request; discards are skipped. */
+static void blkif_completion(struct blk_shadow *s)
+{
+       const blkif_request_t *sreq = &s->req;
+       unsigned int k = 0;
+
+       if (sreq->operation != BLKIF_OP_DISCARD)
+               while (k < sreq->nr_segments)
+                       gnttab_end_foreign_access(sreq->seg[k++].gref, 0UL);
+}
+/* Re-issue every request still recorded in the shadow array, then mark us connected. */
+static int blkif_recover(struct blkfront_info *info)
+{
+       int i, j;
+       blkif_request_t *req;
+       struct blk_shadow *copy;
+
+       /* Stage 1: Make a safe copy of the shadow state. */
+       copy = kmemdup(info->shadow, sizeof(info->shadow),
+                      GFP_NOIO | __GFP_NOFAIL | __GFP_HIGH);
+       if (!copy)
+               return -ENOMEM;
+
+       /* Stage 2: Set up free list. */
+       memset(&info->shadow, 0, sizeof(info->shadow));
+       for (i = 0; i < BLK_RING_SIZE; i++)
+               info->shadow[i].req.id = i+1;
+       info->shadow_free = info->ring.req_prod_pvt;
+       info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+       /* Stage 3: Find pending requests and requeue them. */
+       for (i = 0; i < BLK_RING_SIZE; i++) {
+               /* Not in use? */
+               if (!copy[i].request)
+                       continue;
+
+               /* Grab a request slot and copy shadow state into it. */
+               req = RING_GET_REQUEST(
+                       &info->ring, info->ring.req_prod_pvt);
+               *req = copy[i].req;
+
+               /* We get a new request id, and must reset the shadow state. */
+               req->id = GET_ID_FROM_FREELIST(info);
+               memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+               /* Rewrite any grant references invalidated by susp/resume.
+                * Discards own none (cf. blkif_completion()), so skip them. */
+               if (req->operation != BLKIF_OP_DISCARD)
+                       for (j = 0; j < req->nr_segments; j++)
+                               gnttab_grant_foreign_access_ref(
+                                       req->seg[j].gref,
+                                       info->xbdev->otherend_id,
+                                       pfn_to_mfn(info->shadow[req->id].frame[j]),
+                                       rq_data_dir(info->shadow[req->id].request) ?
+                                       GTF_readonly : 0);
+               info->shadow[req->id].req = *req;
+
+               info->ring.req_prod_pvt++;
+       }
+
+       kfree(copy);
+       (void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+       spin_lock_irq(&info->io_lock);
+
+       /* Now safe for us to use the shared ring */
+       info->connected = BLKIF_STATE_CONNECTED;
+
+       /* Send off requeued requests */
+       flush_requests(info);
+
+       /* Kick any other new requests queued since we resumed */
+       kick_pending_request_queues(info);
+
+       spin_unlock_irq(&info->io_lock);
+
+       return 0;
+}
+/* Non-zero when the device's is_ready flag is set and its xbdev pointer is non-NULL. */
+int blkfront_is_ready(struct xenbus_device *dev)
+{
+       const struct blkfront_info *priv = dev_get_drvdata(&dev->dev);
+
+       return priv->is_ready && priv->xbdev != NULL;
+}
+
+
+/* ** Driver Registration ** */
+
+/* Device types this frontend binds to ("vbd"); the last entry is the empty string. */
+static const struct xenbus_device_id blkfront_ids[] = {
+       { "vbd" },
+       { "" }
+};
+MODULE_ALIAS("xen:vbd");
+/* blkfront_driver: hooks up the probe/remove/resume/state-change/is_ready callbacks. */
+static DEFINE_XENBUS_DRIVER(blkfront, ,
+       .probe = blkfront_probe,
+       .remove = blkfront_remove,
+       .resume = blkfront_resume,
+       .otherend_changed = backend_changed,
+       .is_ready = blkfront_is_ready,
+);
+
+/* Module entry point: register the xenbus frontend driver. */
+static int __init xlblk_init(void)
+{
+       /* When not running on Xen, refuse to load with -ENODEV. */
+       return is_running_on_xen()
+               ? xenbus_register_frontend(&blkfront_driver)
+               : -ENODEV;
+}
+module_init(xlblk_init);
+
+/* Module exit: unregister the xenbus driver first, then release the block majors. */
+static void __exit xlblk_exit(void)
+{
+       xenbus_unregister_driver(&blkfront_driver);
+       xlbd_release_major_info();
+}
+module_exit(xlblk_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/blkfront/block.h b/drivers/xen/blkfront/block.h

new file mode 100644 (file)

index 0000000..09a12dd
--- /dev/null
+++ b/drivers/xen/blkfront/block.h
@@ -0,0 +1,169 @@
+/******************************************************************************
+ * block.h
+ * 
+ * Shared definitions between all levels of XenLinux Virtual block devices.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_DRIVERS_BLOCK_H__
+#define __XEN_DRIVERS_BLOCK_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/atomic.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <linux/mutex.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/ring.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+/* DPRINTK maps to pr_debug(); DPRINTK_IOCTL expands to nothing unless the "#if 0" is flipped. */
+#define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) pr_alert(_f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+/* Bookkeeping for one slot of the major_info[] table kept in vbd.c. */
+struct xlbd_major_info
+{
+       int major;      /* block major number given to register_blkdev() */
+       int index;      /* position of this major within its type's range */
+       int usage;      /* get/put reference count */
+       const struct xlbd_type_info *type;      /* naming/partition parameters */
+       struct xlbd_minor_state *minors;        /* minor-number allocation map */
+};
+/* Per-slot record of a submitted request, kept so it can be completed or reissued. */
+struct blk_shadow {
+       blkif_request_t req;            /* private copy of the ring request */
+       struct request *request;        /* block-layer request; NULL if slot unused */
+       unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* pfn per segment */
+};
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'.  They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+       struct xenbus_device *xbdev;
+       struct gendisk *gd;     /* NULL until xlvbd_add() succeeds */
+       struct mutex mutex;
+       int vdevice;
+       blkif_vdev_t handle;
+       int connected;          /* one of the BLKIF_STATE_* values */
+       int ring_ref;           /* ring grant reference, or GRANT_INVALID_REF */
+       blkif_front_ring_t ring;
+       spinlock_t io_lock;     /* also handed to blk_init_queue() */
+       struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       unsigned int irq;       /* 0 while no handler is bound */
+       struct xlbd_major_info *mi;
+       struct request_queue *rq;
+       struct work_struct work;
+       struct gnttab_free_callback callback;
+       struct blk_shadow shadow[BLK_RING_SIZE];
+       unsigned long shadow_free;
+       unsigned int feature_flush;     /* value passed to blk_queue_flush() */
+       unsigned int flush_op;          /* BLKIF_OP_* used for flushes, 0 if none */
+       bool feature_discard;
+       bool feature_secdiscard;
+       unsigned int discard_granularity;
+       unsigned int discard_alignment;
+       int is_ready;
+};
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+extern int blkif_open(struct inode *inode, struct file *filep);
+extern int blkif_release(struct inode *inode, struct file *filep);
+extern int blkif_ioctl(struct inode *inode, struct file *filep,
+                      unsigned command, unsigned long argument);
+#else
+extern int blkif_open(struct block_device *bdev, fmode_t mode);
+extern int blkif_release(struct gendisk *disk, fmode_t mode);
+extern int blkif_ioctl(struct block_device *bdev, fmode_t mode,
+                      unsigned command, unsigned long argument);
+#endif
+extern int blkif_getgeo(struct block_device *, struct hd_geometry *);
+extern int blkif_check(dev_t dev);
+extern int blkif_revalidate(dev_t dev);
+extern void do_blkif_request (struct request_queue *rq);
+
+/* Virtual block-device subsystem. */
+/* Note that xlvbd_add doesn't call add_disk for you: you're expected
+   to call add_disk on info->gd once the disk is properly connected
+   up. */
+int xlvbd_add(blkif_sector_t capacity, int device,
+             u16 vdisk_info, u16 sector_size, struct blkfront_info *info);
+void xlvbd_del(struct blkfront_info *info);
+void xlvbd_flush(struct blkfront_info *info);
+
+#ifdef CONFIG_SYSFS
+int xlvbd_sysfs_addif(struct blkfront_info *info);
+void xlvbd_sysfs_delif(struct blkfront_info *info);
+#else /* !CONFIG_SYSFS: no-op stand-ins so callers need no #ifdef */
+static inline int xlvbd_sysfs_addif(struct blkfront_info *info)
+{
+       return 0;       /* always reports success */
+}
+
+static inline void xlvbd_sysfs_delif(struct blkfront_info *info)
+{
+       ;
+}
+#endif /* CONFIG_SYSFS */
+
+void xlbd_release_major_info(void);
+
+/* Virtual cdrom block-device */
+#ifdef CONFIG_XEN
+extern void register_vcd(struct blkfront_info *info);
+extern void unregister_vcd(struct blkfront_info *info);
+#else /* !CONFIG_XEN: empty stubs */
+static inline void register_vcd(struct blkfront_info *info) {}
+static inline void unregister_vcd(struct blkfront_info *info) {}
+#endif /* CONFIG_XEN */
+
+#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff --git a/drivers/xen/blkfront/vbd.c b/drivers/xen/blkfront/vbd.c

new file mode 100644 (file)

index 0000000..afddd8c
--- /dev/null
+++ b/drivers/xen/blkfront/vbd.c
@@ -0,0 +1,609 @@
+/******************************************************************************
+ * vbd.c
+ * 
+ * XenLinux virtual block-device driver (xvd).
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004-2005, Christian Limpach
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "block.h"
+#include <linux/bitmap.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#define XENVBD_MAJOR 202
+#endif
+/* A non-extended vdevice packs the major in bits 8 and up and the minor in bits 0-7. */
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+/* Bit 28 flags an extended vdevice; its minor is the value with that bit cleared. */
+#define EXT_SHIFT 28
+#define EXTENDED (1<<EXT_SHIFT)
+#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
+#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+/* Allocation map of the minor numbers in use on one block major. */
+struct xlbd_minor_state {
+       unsigned int nr;        /* number of bits currently in bitmap */
+       unsigned long *bitmap;  /* set bit = minor reserved */
+       spinlock_t lock;        /* taken around bitmap updates and resizing */
+};
+
+/*
+ * For convenience we distinguish between ide, scsi and 'other' (i.e.,
+ * potentially combinations of the two) in the naming scheme and in a few other
+ * places.
+ */
+
+#define NUM_IDE_MAJORS 10
+#define NUM_SD_MAJORS 16
+#define NUM_VBD_MAJORS 2
+/* Naming and minor-layout parameters shared by every major of one device type. */
+struct xlbd_type_info
+{
+       int partn_shift;        /* low minor bits that hold the partition number */
+       int disks_per_major;    /* used to compute a disk's naming offset */
+       const char *devname;    /* name given to (un)register_blkdev() */
+       const char *diskname;   /* prefix of gendisk names */
+};
+/* IDE majors: 2 disks per major, 6 partition bits, names start with "hd". */
+static const struct xlbd_type_info xlbd_ide_type = {
+       .partn_shift = 6,
+       .disks_per_major = 2,
+       .devname = "ide",
+       .diskname = "hd",
+};
+/* SCSI disk majors: 16 disks per major, 4 partition bits, names start with "sd". */
+static const struct xlbd_type_info xlbd_sd_type = {
+       .partn_shift = 4,
+       .disks_per_major = 16,
+       .devname = "sd",
+       .diskname = "sd",
+};
+/* SCSI CDROM major: no partition bits, names start with "sr". */
+static const struct xlbd_type_info xlbd_sr_type = {
+       .partn_shift = 0,
+       .disks_per_major = 256,
+       .devname = "sr",
+       .diskname = "sr",
+};
+/* XENVBD major, non-extended vdevice numbers: 4 partition bits, "xvd" names. */
+static const struct xlbd_type_info xlbd_vbd_type = {
+       .partn_shift = 4,
+       .disks_per_major = 16,
+       .devname = "xvd",
+       .diskname = "xvd",
+};
+/* XENVBD major, extended vdevice numbers: 8 partition bits, "xvd" names. */
+static const struct xlbd_type_info xlbd_vbd_type_ext = {
+       .partn_shift = 8,
+       .disks_per_major = 256,
+       .devname = "xvd",
+       .diskname = "xvd",
+};
+/* One slot per supported major: 10 IDE, 16 SCSI disk, 1 SCSI CDROM, 2 XENVBD. */
+static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SD_MAJORS + 1 +
+                                        NUM_VBD_MAJORS];
+/* First major_info[] slot of each group. */
+#define XLBD_MAJOR_IDE_START   0
+#define XLBD_MAJOR_SD_START    (NUM_IDE_MAJORS)
+#define XLBD_MAJOR_SR_START    (NUM_IDE_MAJORS + NUM_SD_MAJORS)
+#define XLBD_MAJOR_VBD_START   (NUM_IDE_MAJORS + NUM_SD_MAJORS + 1)
+/* Slot ranges, written so they can be used as (GNU range) case labels. */
+#define XLBD_MAJOR_IDE_RANGE   XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SD_START - 1
+#define XLBD_MAJOR_SD_RANGE    XLBD_MAJOR_SD_START ... XLBD_MAJOR_SR_START - 1
+#define XLBD_MAJOR_SR_RANGE    XLBD_MAJOR_SR_START
+#define XLBD_MAJOR_VBD_RANGE   XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
+/* Maps either of the two XENVBD slots to the other one. */
+#define XLBD_MAJOR_VBD_ALT(idx) ((idx) ^ XLBD_MAJOR_VBD_START ^ (XLBD_MAJOR_VBD_START + 1))
+/* Block-device entry points; the handlers are declared in block.h. */
+static const struct block_device_operations xlvbd_block_fops =
+{
+       .owner = THIS_MODULE,
+       .open = blkif_open,
+       .release = blkif_release,
+       .ioctl  = blkif_ioctl,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+       .getgeo = blkif_getgeo
+#endif
+};
+/*
+ * Build the major_info[] entry for slot @index; NULL on allocation or registration failure.
+ */
+static struct xlbd_major_info *
+xlbd_alloc_major_info(int major, int minor, int index)
+{
+       struct xlbd_major_info *mi, *alt;
+       struct xlbd_minor_state *ms;
+       bool shared = false;
+
+       mi = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
+       if (!mi)
+               return NULL;
+       mi->major = major;
+
+       ms = kmalloc(sizeof(*ms), GFP_KERNEL);
+       if (!ms)
+               goto free_mi;
+
+       ms->bitmap = kzalloc(BITS_TO_LONGS(256) * sizeof(*ms->bitmap),
+                            GFP_KERNEL);
+       if (!ms->bitmap)
+               goto free_ms;
+
+       spin_lock_init(&ms->lock);
+       ms->nr = 256;
+
+       switch (index) {
+       case XLBD_MAJOR_IDE_RANGE:
+               mi->type = &xlbd_ide_type;
+               mi->index = index - XLBD_MAJOR_IDE_START;
+               break;
+       case XLBD_MAJOR_SD_RANGE:
+               mi->type = &xlbd_sd_type;
+               mi->index = index - XLBD_MAJOR_SD_START;
+               break;
+       case XLBD_MAJOR_SR_RANGE:
+               mi->type = &xlbd_sr_type;
+               mi->index = index - XLBD_MAJOR_SR_START;
+               break;
+       case XLBD_MAJOR_VBD_RANGE:
+               mi->index = 0;
+               mi->type = index == XLBD_MAJOR_VBD_START ?
+                          &xlbd_vbd_type : &xlbd_vbd_type_ext;
+
+               /*
+                * Both XENVBD slots use one block major: when the sibling
+                * slot exists the major is registered already, so adopt
+                * its minor map instead of registering again.
+                */
+               alt = major_info[XLBD_MAJOR_VBD_ALT(index)];
+               if (alt != NULL) {
+                       kfree(ms->bitmap);
+                       kfree(ms);
+                       ms = alt->minors;
+                       shared = true;
+               }
+               break;
+       }
+
+       if (!shared) {
+               if (register_blkdev(mi->major, mi->type->devname))
+                       goto free_bitmap;
+
+               pr_info("xen-vbd: registered block device major %i\n",
+                       mi->major);
+       }
+
+       mi->minors = ms;
+       major_info[index] = mi;
+       return mi;
+
+free_bitmap:
+       kfree(ms->bitmap);
+free_ms:
+       kfree(ms);
+free_mi:
+       kfree(mi);
+       return NULL;
+}
+/* Find (allocating on first use) the major_info entry for @major and take a reference. */
+static struct xlbd_major_info *
+xlbd_get_major_info(int major, int minor, int vdevice)
+{
+       struct xlbd_major_info *entry;
+       int slot;
+
+       switch (major) {
+       case IDE0_MAJOR: slot = XLBD_MAJOR_IDE_START + 0; break;
+       case IDE1_MAJOR: slot = XLBD_MAJOR_IDE_START + 1; break;
+       case IDE2_MAJOR: slot = XLBD_MAJOR_IDE_START + 2; break;
+       case IDE3_MAJOR: slot = XLBD_MAJOR_IDE_START + 3; break;
+       case IDE4_MAJOR: slot = XLBD_MAJOR_IDE_START + 4; break;
+       case IDE5_MAJOR: slot = XLBD_MAJOR_IDE_START + 5; break;
+       case IDE6_MAJOR: slot = XLBD_MAJOR_IDE_START + 6; break;
+       case IDE7_MAJOR: slot = XLBD_MAJOR_IDE_START + 7; break;
+       case IDE8_MAJOR: slot = XLBD_MAJOR_IDE_START + 8; break;
+       case IDE9_MAJOR: slot = XLBD_MAJOR_IDE_START + 9; break;
+       case SCSI_DISK0_MAJOR: slot = XLBD_MAJOR_SD_START; break;
+       case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
+               slot = XLBD_MAJOR_SD_START + 1 + (major - SCSI_DISK1_MAJOR);
+               break;
+       case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR:
+               slot = XLBD_MAJOR_SD_START + 8 + (major - SCSI_DISK8_MAJOR);
+               break;
+       case SCSI_CDROM_MAJOR: slot = XLBD_MAJOR_SR_START; break;
+       case XENVBD_MAJOR:
+               /* Extended vdevice numbers use the second XENVBD slot. */
+               slot = XLBD_MAJOR_VBD_START + !!VDEV_IS_EXTENDED(vdevice);
+               break;
+       default:
+               return NULL;
+       }
+
+       entry = major_info[slot];
+       if (entry == NULL)
+               entry = xlbd_alloc_major_info(major, minor, slot);
+       if (entry != NULL)
+               entry->usage++;
+       return entry;
+}
+/* Drop one reference taken by xlbd_get_major_info(). */
+static void
+xlbd_put_major_info(struct xlbd_major_info *mi)
+{
+       /* XXX: release major if 0 */
+       mi->usage -= 1;
+}
+/* Exit-time teardown of major_info[]; the two XENVBD slots share one registration. */
+void __exit
+xlbd_release_major_info(void)
+{
+       unsigned int slot = 0;
+       bool vbd_seen = false, is_vbd;
+
+       while (slot < ARRAY_SIZE(major_info)) {
+               struct xlbd_major_info *mi = major_info[slot++];
+
+               if (mi == NULL)
+                       continue;
+               if (mi->usage)
+                       pr_warning("vbd: major %u still in use (%u times)\n",
+                                  mi->major, mi->usage);
+               is_vbd = mi->major == XENVBD_MAJOR;
+               if (!(is_vbd && vbd_seen)) {
+                       unregister_blkdev(mi->major, mi->type->devname);
+                       kfree(mi->minors->bitmap);
+                       kfree(mi->minors);
+               }
+               vbd_seen |= is_vbd;
+               kfree(mi);
+       }
+}
+/* Claim minors [minor, minor + nr_minors): 0 on success, -EBUSY if any is taken, -ENOMEM. */
+static int
+xlbd_reserve_minors(struct xlbd_major_info *mi, unsigned int minor,
+                   unsigned int nr_minors)
+{
+       struct xlbd_minor_state *ms = mi->minors;
+       unsigned int end = minor + nr_minors;
+       int rc;
+
+       if (end > ms->nr) {     /* unlocked peek; re-checked under the lock */
+               unsigned long *bitmap, *old;
+
+               bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
+                                GFP_KERNEL);
+               if (bitmap == NULL)
+                       return -ENOMEM;
+
+               spin_lock(&ms->lock);
+               if (end > ms->nr) {
+                       old = ms->bitmap;
+                       memcpy(bitmap, ms->bitmap,
+                              BITS_TO_LONGS(ms->nr) * sizeof(*bitmap));
+                       ms->bitmap = bitmap;
+                       ms->nr = BITS_TO_LONGS(end) * BITS_PER_LONG;
+               } else
+                       old = bitmap;   /* map is already large enough: discard ours */
+               spin_unlock(&ms->lock);
+               kfree(old);
+       }
+
+       spin_lock(&ms->lock);
+       if (find_next_bit(ms->bitmap, end, minor) >= end) {     /* no bit set in range */
+               bitmap_set(ms->bitmap, minor, nr_minors);
+               rc = 0;
+       } else
+               rc = -EBUSY;
+       spin_unlock(&ms->lock);
+
+       return rc;
+}
+/* Clear the bitmap bits for minors [minor, minor + nr_minors). */
+static void
+xlbd_release_minors(struct xlbd_major_info *mi, unsigned int minor,
+                   unsigned int nr_minors)
+{
+       struct xlbd_minor_state *state = mi->minors;
+
+       BUG_ON(minor + nr_minors > state->nr);
+       spin_lock(&state->lock);
+       bitmap_clear(state->bitmap, minor, nr_minors);
+       spin_unlock(&state->lock);
+}
+/* Write @n as letters at @ptr (0 -> "a", 25 -> "z", 26 -> "aa"); returns the new end. */
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+       char *out = n < 26 ? ptr : encode_disk_name(ptr, n / 26 - 1);
+
+       *out++ = 'a' + n % 26;
+       return out;
+}
+
+static int
+xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+                    struct blkfront_info *info)
+{
+       struct request_queue *rq;
+
+       rq = blk_init_queue(do_blkif_request, &info->io_lock);
+       if (rq == NULL)
+               return -1;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
+       queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+#endif
+
+       if (info->feature_discard) {
+               queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+               blk_queue_max_discard_sectors(rq, get_capacity(gd));
+               rq->limits.discard_granularity = info->discard_granularity;
+               rq->limits.discard_alignment = info->discard_alignment;
+               if (info->feature_secdiscard)
+                       queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+       }
+
+       /* Hard sector size and max sectors impersonate the equiv. hardware. */
+       blk_queue_logical_block_size(rq, sector_size);
+       blk_queue_max_hw_sectors(rq, 512);
+
+       /* Each segment in a request is up to an aligned page in size. */
+       blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+       blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+       /* Ensure a merged request will fit in a single I/O ring slot. */
+       blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       /* Make sure buffer addresses are sector-aligned. */
+       blk_queue_dma_alignment(rq, 511);
+
+       /* Make sure we don't use bounce buffers. */
+       blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
+
+       gd->queue = rq;
+       info->rq = rq;
+
+       return 0;
+}
+/* Prepare (without calling add_disk()) the gendisk and queue for @vdevice; 0 or -errno. */
+int
+xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info,
+         u16 sector_size, struct blkfront_info *info)
+{
+       int major, minor;
+       struct gendisk *gd;
+       struct xlbd_major_info *mi;
+       int nr_minors = 1;
+       int err = -ENODEV;
+       char *ptr;
+       unsigned int offset;
+
+       if ((vdevice>>EXT_SHIFT) > 1) {
+               /* this is above the extended range; something is wrong */
+               pr_warning("blkfront: vdevice %#x is above the extended range;"
+                          " ignoring\n", vdevice);
+               return -ENODEV;
+       }
+
+       if (!VDEV_IS_EXTENDED(vdevice)) {
+               major = BLKIF_MAJOR(vdevice);
+               minor = BLKIF_MINOR(vdevice);
+       }
+       else {
+               major = XENVBD_MAJOR;
+               minor = BLKIF_MINOR_EXT(vdevice);
+               if (minor >> MINORBITS) {
+                       pr_warning("blkfront: %#x's minor (%#x) out of range;"
+                                  " ignoring\n", vdevice, minor);
+                       return -ENODEV;
+               }
+       }
+
+       BUG_ON(info->gd != NULL);
+       BUG_ON(info->mi != NULL);
+       BUG_ON(info->rq != NULL);
+
+       mi = xlbd_get_major_info(major, minor, vdevice);
+       if (mi == NULL)
+               goto out;
+       info->mi = mi;
+       /* A CDROM, or a minor with no partition bits set, claims a whole block of minors. */
+       if ((vdisk_info & VDISK_CDROM) ||
+           !(minor & ((1 << mi->type->partn_shift) - 1)))
+               nr_minors = 1 << mi->type->partn_shift;
+
+       err = xlbd_reserve_minors(mi, minor & ~(nr_minors - 1), nr_minors);
+       if (err)
+               goto out;
+       err = -ENODEV;
+
+       gd = alloc_disk(vdisk_info & VDISK_CDROM ? 1 : nr_minors);
+       if (gd == NULL)
+               goto release;
+       /* Name = type prefix, then drive letters and/or a number. */
+       strcpy(gd->disk_name, mi->type->diskname);
+       ptr = gd->disk_name + strlen(mi->type->diskname);
+       offset = mi->index * mi->type->disks_per_major +
+                (minor >> mi->type->partn_shift);
+       if (mi->type->partn_shift) {
+               ptr = encode_disk_name(ptr, offset);
+               offset = minor & ((1 << mi->type->partn_shift) - 1);
+       } else
+               gd->flags |= GENHD_FL_CD;
+       BUG_ON(ptr >= gd->disk_name + ARRAY_SIZE(gd->disk_name));
+       if (nr_minors > 1)
+               *ptr = 0;
+       else
+               snprintf(ptr, gd->disk_name + ARRAY_SIZE(gd->disk_name) - ptr,
+                        "%u", offset);
+
+       gd->major = mi->major;
+       gd->first_minor = minor;
+       gd->fops = &xlvbd_block_fops;
+       gd->private_data = info;
+       gd->driverfs_dev = &(info->xbdev->dev);
+       set_capacity(gd, capacity);
+       /* gd was never add_disk()ed: on failure just drop the alloc_disk() reference. */
+       if (xlvbd_init_blk_queue(gd, sector_size, info)) {
+               put_disk(gd);
+               goto release;
+       }
+
+       info->gd = gd;
+
+       xlvbd_flush(info);
+
+       if (vdisk_info & VDISK_READONLY)
+               set_disk_ro(gd, 1);
+
+       if (vdisk_info & VDISK_REMOVABLE)
+               gd->flags |= GENHD_FL_REMOVABLE;
+
+       if (vdisk_info & VDISK_CDROM)
+               gd->flags |= GENHD_FL_CD;
+
+       return 0;
+
+ release: /* give back exactly the aligned range reserved above */
+       xlbd_release_minors(mi, minor & ~(nr_minors - 1), nr_minors);
+ out:
+       if (mi)
+               xlbd_put_major_info(mi);
+       info->mi = NULL;
+       return err;
+}
+/* Undo xlvbd_add(): remove the disk, return its minors and major reference, free the queue. */
+void
+xlvbd_del(struct blkfront_info *info)
+{
+       unsigned int minor, nr_minors;
+
+       if (info->mi == NULL)   /* nothing was added */
+               return;
+       /* Recompute the minor range with the same rule xlvbd_add() used to reserve it. */
+       BUG_ON(info->gd == NULL);
+       minor = info->gd->first_minor;
+       nr_minors = (info->gd->flags & GENHD_FL_CD)
+                   || !(minor & ((1 << info->mi->type->partn_shift) - 1))
+                   ? 1 << info->mi->type->partn_shift : 1;
+       del_gendisk(info->gd);
+       put_disk(info->gd);
+       info->gd = NULL;
+
+       xlbd_release_minors(info->mi, minor & ~(nr_minors - 1), nr_minors);
+       xlbd_put_major_info(info->mi);
+       info->mi = NULL;
+
+       BUG_ON(info->rq == NULL);
+       blk_cleanup_queue(info->rq);
+       info->rq = NULL;
+}
+/* Apply info->feature_flush to the request queue and log the resulting setting. */
+void
+xlvbd_flush(struct blkfront_info *info)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37)
+       blk_queue_flush(info->rq, info->feature_flush);
+       pr_info("blkfront: %s: %s: %s\n",
+               info->gd->disk_name,
+               info->flush_op == BLKIF_OP_WRITE_BARRIER ?
+               "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
+                            "flush diskcache" : "barrier or flush"),
+               info->feature_flush ? "enabled" : "disabled");
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+       int err;
+       const char *barrier;
+
+       switch (info->feature_flush) {
+       case QUEUE_ORDERED_DRAIN:       barrier = "enabled (drain)"; break;
+       case QUEUE_ORDERED_TAG:         barrier = "enabled (tag)"; break;
+       case QUEUE_ORDERED_NONE:        barrier = "disabled"; break;
+       default:                        return; /* unknown mode: leave the queue alone */
+       }
+
+       err = blk_queue_ordered(info->rq, info->feature_flush);
+       if (err)
+               return; /* void function: there is no status to hand back */
+       pr_info("blkfront: %s: barriers %s\n",
+               info->gd->disk_name, barrier);
+#else
+       if (info->feature_flush)
+               pr_info("blkfront: %s: barriers disabled\n", info->gd->disk_name);
+#endif
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_media(struct device *dev,
+                                 struct device_attribute *attr, char *buf)
+{
+       /* "cdrom" for disks flagged GENHD_FL_CD, "disk" for everything else. */
+       struct xenbus_device *xbdev = to_xenbus_device(dev);
+       struct blkfront_info *priv = dev_get_drvdata(&xbdev->dev);
+       const bool is_cd = priv->gd->flags & GENHD_FL_CD;
+
+       return is_cd ? sprintf(buf, "cdrom\n") : sprintf(buf, "disk\n");
+}
+/* Attributes that xlvbd_sysfs_addif() creates; currently only the read-only "media". */
+static struct device_attribute xlvbd_attrs[] = {
+       __ATTR(media, S_IRUGO, show_media, NULL),
+};
+/* Create every file in xlvbd_attrs[]; on failure remove those already created. */
+int xlvbd_sysfs_addif(struct blkfront_info *info)
+{
+       struct device *dev = info->gd->driverfs_dev;
+       int n, rc;
+
+       for (n = 0; n < ARRAY_SIZE(xlvbd_attrs); n++) {
+               rc = device_create_file(dev, &xlvbd_attrs[n]);
+               if (!rc)
+                       continue;
+
+               /* Undo the files created so far, newest first. */
+               while (n-- > 0)
+                       device_remove_file(dev, &xlvbd_attrs[n]);
+               return rc;
+       }
+
+       return 0;
+}
+/* Remove every file listed in xlvbd_attrs[] from the disk's driverfs device. */
+void xlvbd_sysfs_delif(struct blkfront_info *info)
+{
+       const struct device_attribute *attr = xlvbd_attrs;
+       const struct device_attribute *end = attr + ARRAY_SIZE(xlvbd_attrs);
+       while (attr != end)
+               device_remove_file(info->gd->driverfs_dev, attr++);
+}
+
+#endif /* CONFIG_SYSFS */
diff --git a/drivers/xen/blkfront/vcd.c b/drivers/xen/blkfront/vcd.c

new file mode 100644 (file)

index 0000000..75fd7a9
--- /dev/null
+++ b/drivers/xen/blkfront/vcd.c
@@ -0,0 +1,494 @@
+/*******************************************************************************
+ * vcd.c
+ *
+ * Implements CDROM cmd packet passing between frontend guest and backend driver.
+ *
+ * Copyright (c) 2008, Pat Campell  plc@novell.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+#include <linux/cdrom.h>
+#include <xen/interface/io/cdromif.h>
+#include "block.h"
+
+/* List of cdrom_device_info, can have as many as blkfront supports */
+struct vcd_disk {
+       struct list_head vcd_entry;     /* link in the vcd_disks list */
+       struct cdrom_device_info vcd_cdrom_info;        /* passed to register_cdrom()/unregister_cdrom() */
+       spinlock_t vcd_cdrom_info_lock; /* taken by xencdrom_get_list_entry() when it returns this entry */
+};
+static LIST_HEAD(vcd_disks);   /* every vcd_disk added by register_vcd() */
+static DEFINE_SPINLOCK(vcd_disks_lock);        /* held while vcd_disks is walked or modified */
+
+/* Find the vcd_disk for @disk.  A match is returned with its
+ * vcd_cdrom_info_lock held (caller unlocks); NULL if not registered. */
+static struct vcd_disk *xencdrom_get_list_entry(struct gendisk *disk)
+{
+       struct vcd_disk *pos, *found = NULL;
+       spin_lock(&vcd_disks_lock);
+       list_for_each_entry(pos, &vcd_disks, vcd_entry) {
+               if (pos->vcd_cdrom_info.disk != disk)
+                       continue;
+               spin_lock(&pos->vcd_cdrom_info_lock);
+               found = pos;
+               break;
+       }
+       spin_unlock(&vcd_disks_lock);
+       return found;
+}
+
+static void submit_message(struct blkfront_info *info, void *sp)
+{      /* Hand the PAGE_SIZE message buffer @sp to info->rq as a BLOCK_PC request; callers read the reply from @sp afterwards. */
+       struct request *req = NULL;
+
+       req = blk_get_request(info->rq, READ, __GFP_WAIT);      /* NOTE(review): result is used without a NULL check -- confirm it cannot fail here */
+       if (blk_rq_map_kern(info->rq, req, sp, PAGE_SIZE, __GFP_WAIT))
+               goto out;       /* mapping failed: nothing is sent and the caller is not told */
+
+       req->rq_disk = info->gd;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+       req->cmd_type = REQ_TYPE_BLOCK_PC;
+       req->cmd_flags |= REQ_NOMERGE;
+#else
+       req->flags |= REQ_BLOCK_PC;
+#endif
+       req->__sector = 0;
+       req->cmd_len = 0;       /* no command bytes: the whole message lives in the data page */
+       req->timeout = 60*HZ;
+
+       blk_execute_rq(req->q, info->gd, req, 1);       /* any result of the execution is not propagated */
+
+out:
+       blk_put_request(req);
+}
+
+static int submit_cdrom_cmd(struct blkfront_info *info,
+                           struct packet_command *cgc)
+{      /* Marshal @cgc into a zeroed page, send it via submit_message() and copy sense/data back into @cgc. */
+       int ret = 0;    /* stays 0 unless the reply sets xcp->ret */
+       struct page *page;
+       union xen_block_packet *sp;
+       struct xen_cdrom_packet *xcp;
+       struct vcd_generic_command *vgc;
+
+       if (cgc->buffer && cgc->buflen > MAX_PACKET_DATA) {     /* data must fit the packet's buffer area */
+               pr_warn("%s() Packet buffer length is to large \n", __func__);
+               return -EIO;
+       }
+
+       page = alloc_page(GFP_NOIO|__GFP_ZERO);
+       if (!page) {
+               pr_crit("%s() Unable to allocate page\n", __func__);
+               return -ENOMEM;
+       }
+
+       sp = page_address(page);
+       xcp = &(sp->xcp);
+       xcp->type = XEN_TYPE_CDROM_PACKET;
+       xcp->payload_offset = PACKET_PAYLOAD_OFFSET;
+
+       vgc = (struct vcd_generic_command *)((char *)sp + xcp->payload_offset); /* command header lives inside the same page */
+       memcpy(vgc->cmd, cgc->cmd, CDROM_PACKET_SIZE);
+       vgc->stat = cgc->stat;
+       vgc->data_direction = cgc->data_direction;
+       vgc->quiet = cgc->quiet;
+       vgc->timeout = cgc->timeout;
+       if (cgc->sense) {       /* sense_offset stays 0 (zeroed page) when no sense buffer is supplied */
+               vgc->sense_offset = PACKET_SENSE_OFFSET;
+               memcpy((char *)sp + vgc->sense_offset, cgc->sense, sizeof(struct request_sense));
+       }
+       if (cgc->buffer) {      /* buffer_offset/buflen stay 0 when there is no data buffer */
+               vgc->buffer_offset = PACKET_BUFFER_OFFSET;
+               memcpy((char *)sp + vgc->buffer_offset, cgc->buffer, cgc->buflen);
+               vgc->buflen = cgc->buflen;
+       }
+
+       submit_message(info,sp);        /* returns nothing: a failed submission leaves the page as it was sent */
+
+       if (xcp->ret)
+               ret = xcp->err;
+
+       if (cgc->sense) /* copied back whether or not the command reported an error */
+               memcpy(cgc->sense, (char *)sp + PACKET_SENSE_OFFSET, sizeof(struct request_sense));
+       if (cgc->buffer && cgc->buflen)
+               memcpy(cgc->buffer, (char *)sp + PACKET_BUFFER_OFFSET, cgc->buflen);
+
+       __free_page(page);
+       return ret;
+}
+
+
+static int xencdrom_open(struct cdrom_device_info *cdi, int purpose)
+{      /* .open hook of xencdrom_dops: send an OPEN message naming the backend and pick up the capacity; @purpose is unused. */
+       int ret = 0;
+       struct page *page;
+       struct blkfront_info *info;
+       union xen_block_packet *sp;
+       struct xen_cdrom_open *xco;
+
+       info = cdi->disk->private_data;
+
+       if (!info->xbdev)
+               return -ENODEV;
+
+       if (strlen(info->xbdev->otherend) > MAX_PACKET_DATA) {  /* the path is strcpy()'d into the message page below */
+               return -EIO;
+       }
+
+       page = alloc_page(GFP_NOIO|__GFP_ZERO);
+       if (!page) {
+               pr_crit("%s() Unable to allocate page\n", __func__);
+               return -ENOMEM;
+       }
+
+       sp = page_address(page);
+       xco = &(sp->xco);
+       xco->type = XEN_TYPE_CDROM_OPEN;
+       xco->payload_offset = sizeof(struct xen_cdrom_open);    /* payload starts right after the header */
+       strcpy((char *)sp + xco->payload_offset, info->xbdev->otherend);
+
+       submit_message(info,sp);
+
+       if (xco->ret) { /* reply flags an error: return xco->err */
+               ret = xco->err;
+               goto out;
+       }
+
+       if (xco->media_present) /* capacity is left untouched when no medium is reported */
+               set_capacity(cdi->disk, xco->sectors);
+
+out:
+       __free_page(page);
+       return ret;
+}
+
+static void xencdrom_release(struct cdrom_device_info *cdi)
+{      /* .release hook of xencdrom_dops: empty, no message is sent */
+}
+
+static int xencdrom_media_changed(struct cdrom_device_info *cdi, int disc_nr)
+{      /* .media_changed hook: ask the backend and return its media_changed field; @disc_nr is unused. */
+       int ret;
+       struct page *page;
+       struct blkfront_info *info;
+       union xen_block_packet *sp;
+       struct xen_cdrom_media_changed *xcmc;
+
+       info = cdi->disk->private_data;
+
+       page = alloc_page(GFP_NOIO|__GFP_ZERO);
+       if (!page) {
+               pr_crit("%s() Unable to allocate page\n", __func__);
+               return -ENOMEM; /* NOTE(review): non-zero, so a caller testing this as a flag would see "changed" -- confirm */
+       }
+
+       sp = page_address(page);
+       xcmc = &(sp->xcmc);
+       xcmc->type = XEN_TYPE_CDROM_MEDIA_CHANGED;
+       submit_message(info,sp);
+       ret = xcmc->media_changed;      /* read the reply before the page is freed */
+
+       __free_page(page);
+
+       return ret;
+}
+
+/*
+ * Send START STOP UNIT to the backend: cmd[4] is 2 when @position is
+ * non-zero (used for CDROMEJECT) and 3 otherwise (CDROMCLOSETRAY).
+ */
+static int xencdrom_tray_move(struct cdrom_device_info *cdi, int position)
+{
+       struct blkfront_info *info = cdi->disk->private_data;
+       struct packet_command cgc;
+
+       init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+       cgc.cmd[0] = GPCMD_START_STOP_UNIT;
+       cgc.cmd[4] = position ? 2 : 3;
+
+       return submit_cdrom_cmd(info, &cgc);
+}
+
+/* Send PREVENT ALLOW MEDIUM REMOVAL with cmd[4] = @lock; returns the
+ * submit_cdrom_cmd() result. */
+static int xencdrom_lock_door(struct cdrom_device_info *cdi, int lock)
+{
+       struct packet_command cgc;
+
+       init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+       cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
+       cgc.cmd[4] = lock;
+
+       return submit_cdrom_cmd(cdi->disk->private_data, &cgc);
+}
+
+static int xencdrom_packet(struct cdrom_device_info *cdi,
+                          struct packet_command *cgc)
+{      /* .generic_packet hook of xencdrom_dops */
+       return cgc->stat = submit_cdrom_cmd(cdi->disk->private_data, cgc);      /* result is stored in cgc->stat and returned */
+}
+
+static int xencdrom_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd,
+               void *arg)
+{      /* .audio_ioctl hook of xencdrom_dops */
+       return -EINVAL; /* every audio ioctl is rejected; no arguments are examined */
+}
+
+/* Query backend to see if CDROM packets are supported: non-zero if so, 0 if not or on allocation failure */
+static int xencdrom_supported(struct blkfront_info *info)
+{
+       struct page *page;
+       union xen_block_packet *sp;
+       int supported;
+
+       page = alloc_page(GFP_NOIO|__GFP_ZERO);
+       if (!page) {
+               pr_crit("%s() Unable to allocate page\n", __func__);
+               return 0;       /* register_vcd() tests the result as a boolean; -ENOMEM would read as "supported" */
+       }
+       sp = page_address(page);
+       sp->xcs.type = XEN_TYPE_CDROM_SUPPORT;
+       submit_message(info,sp);
+       supported = sp->xcs.supported;  /* read the reply before the page is released */
+       __free_page(page);              /* release the message page on every path, like the other senders do */
+       return supported;
+}
+
+/* cdrom-layer callbacks and capability bits used by register_vcd() */
+static struct cdrom_device_ops xencdrom_dops = {
+    .open           = xencdrom_open,
+    .release        = xencdrom_release,
+    .media_changed  = xencdrom_media_changed,
+    .tray_move      = xencdrom_tray_move,
+    .lock_door      = xencdrom_lock_door,
+    .generic_packet = xencdrom_packet,
+    .audio_ioctl    = xencdrom_audio_ioctl,
+    .capability     = CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
+                      CDC_MEDIA_CHANGED | CDC_GENERIC_PACKET | CDC_DVD | CDC_CD_R,
+    .n_minors       = 1,
+};
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+static int xencdrom_block_open(struct inode *inode, struct file *file)
+{
+       struct block_device *bd = inode->i_bdev;
+#else
+static int xencdrom_block_open(struct block_device *bd, fmode_t mode)
+{
+#endif
+       struct blkfront_info *info = bd->bd_disk->private_data;
+       struct vcd_disk *vcd;
+       int ret = 0;    /* returned as-is when the disk has no vcd_disks entry */
+
+       if (!info->xbdev)
+               return -ENODEV;
+
+       if ((vcd = xencdrom_get_list_entry(info->gd))) {        /* comes back with vcd_cdrom_info_lock held */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+               ret = cdrom_open(&vcd->vcd_cdrom_info, inode, file);
+#else
+               ret = cdrom_open(&vcd->vcd_cdrom_info, bd, mode);       /* NOTE(review): presumably reaches xencdrom_open(), which allocates and submits a request, while a spinlock is held -- confirm this cannot sleep */
+#endif
+               spin_unlock(&vcd->vcd_cdrom_info_lock);
+       }
+
+       return ret;
+}
+
+/* Block-device release: cdrom_release() the entry, then blkif_release() once its use_count is 0. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+static int xencdrom_block_release(struct inode *inode, struct file *file)
+{
+       struct gendisk *gd = inode->i_bdev->bd_disk;
+#else
+static int xencdrom_block_release(struct gendisk *gd, fmode_t mode)
+{
+#endif
+       struct blkfront_info *info = gd->private_data;
+       struct vcd_disk *vcd;
+       int ret = 0, last_close;
+
+       if ((vcd = xencdrom_get_list_entry(info->gd))) {        /* comes back with vcd_cdrom_info_lock held */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+               ret = cdrom_release(&vcd->vcd_cdrom_info, file);
+#else
+               cdrom_release(&vcd->vcd_cdrom_info, mode);
+#endif
+               last_close = (vcd->vcd_cdrom_info.use_count == 0);      /* sample under the lock: unregister_vcd() may kfree(vcd) once it is dropped */
+               spin_unlock(&vcd->vcd_cdrom_info_lock);
+               if (last_close)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+                       blkif_release(inode, file);
+#else
+                       blkif_release(gd, mode);
+#endif
+       }
+       return ret;
+}
+
+/* Block-device ioctl: a few CDROM ioctls are answered here, everything else goes to blkif_ioctl(). */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+static int xencdrom_block_ioctl(struct inode *inode, struct file *file,
+                               unsigned cmd, unsigned long arg)
+{
+       struct block_device *bd = inode->i_bdev;
+#else
+static int xencdrom_block_ioctl(struct block_device *bd, fmode_t mode,
+                               unsigned cmd, unsigned long arg)
+{
+#endif
+       struct blkfront_info *info = bd->bd_disk->private_data;
+       struct vcd_disk *vcd;
+       int ret = 0;
+
+       if (!(vcd = xencdrom_get_list_entry(info->gd)))
+               goto out;       /* no vcd_disks entry: no lock is held, hand over to blkif_ioctl() */
+
+       switch (cmd) {
+       case 0x2285: /* SG_IO: the ioctl number is hexadecimal 0x2285, not decimal 2285 */
+               ret = -ENOSYS;
+               break;
+       case CDROMEJECT:
+               ret = xencdrom_tray_move(&vcd->vcd_cdrom_info, 1);
+               break;
+       case CDROMCLOSETRAY:
+               ret = xencdrom_tray_move(&vcd->vcd_cdrom_info, 0);
+               break;
+       case CDROM_GET_CAPABILITY:
+               ret = vcd->vcd_cdrom_info.ops->capability & ~vcd->vcd_cdrom_info.mask;
+               break;
+       case CDROM_SET_OPTIONS: /* only reports the current options; @arg is not applied */
+               ret = vcd->vcd_cdrom_info.options;
+               break;
+       case CDROM_SEND_PACKET: {
+               struct packet_command cgc;      /* NOTE(review): cgc.buffer/cgc.sense are still user-space pointers when submit_cdrom_cmd() memcpy()s through them -- needs bounce buffers */
+               ret = copy_from_user(&cgc, (void __user *)arg, sizeof(cgc))
+                     ? -EFAULT : submit_cdrom_cmd(info, &cgc);
+               break;
+       }
+       default:
+               spin_unlock(&vcd->vcd_cdrom_info_lock); /* drop the entry lock before delegating */
+out:
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+               return blkif_ioctl(inode, file, cmd, arg);
+#else
+               return blkif_ioctl(bd, mode, cmd, arg);
+#endif
+       }
+       spin_unlock(&vcd->vcd_cdrom_info_lock); /* reached by every case that ended in break */
+
+       return ret;
+}
+
+/* Called as result of cdrom_open, vcd_cdrom_info_lock already held */
+static int xencdrom_block_media_changed(struct gendisk *disk)
+{
+       struct vcd_disk *pos, *match = NULL;
+
+       /* plain list walk: unlike xencdrom_get_list_entry(), the entry lock is not taken */
+       spin_lock(&vcd_disks_lock);
+       list_for_each_entry(pos, &vcd_disks, vcd_entry) {
+               if (pos->vcd_cdrom_info.disk != disk)
+                       continue;
+               match = pos;
+               break;
+       }
+       spin_unlock(&vcd_disks_lock);
+
+       return match ? cdrom_media_changed(&match->vcd_cdrom_info) : 0;
+}
+
+static const struct block_device_operations xencdrom_bdops =   /* installed as gd->fops by register_vcd() */
+{
+       .owner          = THIS_MODULE,
+       .open           = xencdrom_block_open,
+       .release        = xencdrom_block_release,
+       .ioctl          = xencdrom_block_ioctl,
+       .media_changed  = xencdrom_block_media_changed,
+};
+
+void register_vcd(struct blkfront_info *info)
+{      /* Register info->gd with the cdrom layer and add it to vcd_disks; every failure is silent to the caller (void). */
+       struct gendisk *gd = info->gd;
+       struct vcd_disk *vcd;
+
+       /* Make sure this is for a CD device */
+       if (!(gd->flags & GENHD_FL_CD))
+               goto out;
+
+       /* Make sure we have backend support */
+       if (!xencdrom_supported(info))  /* result is used purely as a boolean */
+               goto out;
+
+       /* Create new vcd_disk and fill in cdrom_info */
+       vcd = kzalloc(sizeof(*vcd), GFP_KERNEL);
+       if (!vcd) {
+               pr_info("%s(): Unable to allocate vcd struct!\n", __func__);
+               goto out;
+       }
+       spin_lock_init(&vcd->vcd_cdrom_info_lock);
+
+       vcd->vcd_cdrom_info.ops = &xencdrom_dops;
+       vcd->vcd_cdrom_info.speed = 4;
+       vcd->vcd_cdrom_info.capacity = 1;
+       vcd->vcd_cdrom_info.options = 0;
+       strlcpy(vcd->vcd_cdrom_info.name, gd->disk_name,
+               ARRAY_SIZE(vcd->vcd_cdrom_info.name));
+       vcd->vcd_cdrom_info.mask = (CDC_CD_RW | CDC_DVD_R | CDC_DVD_RAM |       /* bits cleared from ops->capability by CDROM_GET_CAPABILITY */
+                       CDC_SELECT_DISC | CDC_SELECT_SPEED |
+                       CDC_MRW | CDC_MRW_W | CDC_RAM);
+
+       if (register_cdrom(&(vcd->vcd_cdrom_info)) != 0) {
+               pr_warn("%s() Cannot register blkdev as a cdrom %d!\n",
+                       __func__, gd->major);
+               goto err_out;
+       }
+       gd->fops = &xencdrom_bdops;     /* from here on the disk's block ops are the xencdrom_block_* wrappers */
+       vcd->vcd_cdrom_info.disk = gd;  /* the key the vcd_disks lookups compare against */
+
+       spin_lock(&vcd_disks_lock);
+       list_add(&(vcd->vcd_entry), &vcd_disks);
+       spin_unlock(&vcd_disks_lock);
+out:
+       return;
+err_out:
+       kfree(vcd);     /* never made it onto vcd_disks */
+}
+
+void unregister_vcd(struct blkfront_info *info) {      /* Undo register_vcd() for info->gd; does nothing if it has no vcd_disks entry. */
+       struct gendisk *gd = info->gd;
+       struct vcd_disk *vcd;
+
+       spin_lock(&vcd_disks_lock);
+       list_for_each_entry(vcd, &vcd_disks, vcd_entry) {
+               if (vcd->vcd_cdrom_info.disk == gd) {
+                       spin_lock(&vcd->vcd_cdrom_info_lock);   /* same lock xencdrom_get_list_entry() hands to its callers */
+                       unregister_cdrom(&vcd->vcd_cdrom_info); /* NOTE(review): called with two spinlocks held -- confirm it never sleeps */
+                       list_del(&vcd->vcd_entry);
+                       spin_unlock(&vcd->vcd_cdrom_info_lock);
+                       kfree(vcd);
+                       break;  /* leave the walk immediately: vcd has just been freed */
+               }
+       }
+       spin_unlock(&vcd_disks_lock);
+}
diff --git a/drivers/xen/blktap/Makefile b/drivers/xen/blktap/Makefile

new file mode 100644 (file)

index 0000000..b1e4a07
--- /dev/null
+++ b/drivers/xen/blktap/Makefile
@@ -0,0 +1,5 @@
+LINUXINCLUDE += -I../xen/include/public/io
+
+obj-$(CONFIG_XEN_BLKDEV_TAP) := blktap.o
+
+blktap-y := xenbus.o interface.o blocktap.o
diff --git a/drivers/xen/blktap/blktap.c b/drivers/xen/blktap/blktap.c

new file mode 100644 (file)

index 0000000..eafd5db
--- /dev/null
+++ b/drivers/xen/blktap/blktap.c
@@ -0,0 +1,1784 @@
+/******************************************************************************
+ * drivers/xen/blktap/blktap.c
+ * 
+ * Back-end driver for user level virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. Requests
+ * are remapped to a user-space memory region.
+ *
+ * Based on the blkback driver code.
+ * 
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Clean ups and fix ups:
+ *    Copyright (c) 2006, Steven Rostedt - Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include "common.h"
+#include <xen/balloon.h>
+#include <xen/driver_util.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/nsproxy.h>
+#include <asm/tlbflush.h>
+
+#define MAX_TAP_DEV 256     /*the maximum number of tapdisk ring devices    */
+#define MAX_DEV_NAME 100    /*the max tapdisk ring device name e.g. blktap0 */
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by 
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE          __CONST_RING_SIZE(blkif, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM                BLK_RING_SIZE    /* first dimension of foreign_pages[] and pending_grant_handles[] */
+#define MAX_PENDING_REQS       BLK_RING_SIZE    /* size of pending_reqs[] */
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) /* one page per segment of every request */
+#define MMAP_VADDR(_start, _req,_seg) /* address of page _seg of request _req, counted from _start */ \
+        ((_start) +                                                     \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+static int mmap_pages = MMAP_PAGES;
+
+#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we
+                     * have a bunch of pages reserved for shared
+                     * memory rings.
+                     */
+
+/*Data struct handed back to userspace for tapdisk device to VBD mapping*/
+typedef struct domid_translate {
+       unsigned short domid;
+       unsigned short busid;   /* 16-bit bus id */
+} domid_translate_t ;
+
+typedef struct domid_translate_ext {   /* same fields, but busid is a u32; this is the form stored in tap_blkif.trans */
+       unsigned short domid;
+       u32 busid;
+} domid_translate_ext_t ;
+
+/*Data struct associated with each of the tapdisk devices*/
+typedef struct tap_blkif {
+       struct mm_struct *mm;         /*User address space                   */
+       unsigned long rings_vstart;   /*Kernel memory mapping                */
+       unsigned long user_vstart;    /*User memory mapping                  */
+       unsigned long dev_inuse;      /*One process opens device at a time.  */
+       unsigned long dev_pending;    /*In process of being opened           */
+       unsigned long ring_ok;        /*make this ring->state                */
+       blkif_front_ring_t ufe_ring;  /*Rings up to user space.              */
+       wait_queue_head_t wait;       /*for poll                             */
+       unsigned long mode;           /*current switching mode               */
+       int minor;                    /*Minor number for tapdisk device      */
+       pid_t pid;                    /*tapdisk process id                   */
+       struct pid_namespace *pid_ns; /*... and its corresponding namespace  */
+       enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace 
+                                                 shutdown                   */
+       spinlock_t map_lock;          /*protects idx_map                     */
+       struct idx_map {
+               u16 mem, req;         /* .mem == INVALID_MIDX marks an unused slot */
+       } *idx_map;                   /*Record the user ring id to kern
+                                       [req id, idx] tuple                  */
+       blkif_t *blkif;               /*Associate blkif with tapdev          */
+       struct domid_translate_ext trans; /*Translation from domid to bus.   */
+       struct vm_foreign_map foreign_map;    /* .map[] is indexed by page offset from rings_vstart */
+} tap_blkif_t;
+
+static struct tap_blkif *tapfds[MAX_TAP_DEV];  /* indexed by minor; get_next_free_dev() treats slot 0 as always NULL */
+static int blktap_next_minor;  /* next never-used minor, bounded by MAX_TAP_DEV */
+
+/* Run-time switchable: /sys/module/blktap/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);    /* NOTE(review): declared "int" while the variable is unsigned int -- confirm whether "uint" was intended */
+module_param(debug_lvl, int, 0644);    /* NOTE(review): same int/unsigned int mismatch as log_stats */
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a 
+ * 'pending_req' allocated to it.
+ */
+typedef struct {
+       blkif_t       *blkif;
+       u64            id;
+       unsigned short mem_idx;
+       unsigned short nr_pages;
+       struct list_head free_list;     /* list linkage; presumably chains the request on pending_free -- confirm */
+} pending_req_t;
+
+static pending_req_t *pending_reqs[MAX_PENDING_REQS];  /* array of pointers; RTN_PEND_IDX() treats each as the base of an array */
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);     /* also taken by get_next_free_dev() around the tapfds[] scan */
+static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq);
+static int alloc_pending_reqs;
+
+static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { /* element index of @req within the array pending_reqs[idx] */
+       return (req - pending_reqs[idx]);       /* pointer difference, so @req must point into that array */
+}
+
+#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) /* NOTE(review): pending_prod/pending_cons are not declared in the visible part of this file -- confirm the macro is still used */
+
+#define BLKBACK_INVALID_HANDLE (~0)    /* all-ones value; grant handles below use INVALID_GRANT_HANDLE instead */
+
+static struct page **foreign_pages[MAX_DYNAMIC_MEM];   /* per mmap_idx: flat page array, BLKIF_MAX_SEGMENTS_PER_REQUEST entries per request */
+static inline struct page *idx_to_page(
+       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
+{      /* page backing segment @sg_idx of request @req_idx in pool @mmap_idx; no bounds are checked */
+       unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx;
+       return foreign_pages[mmap_idx][arr_idx];
+}
+static inline unsigned long idx_to_kaddr(
+       unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx)
+{      /* same lookup as idx_to_page(), converted via the page's pfn to a kernel address */
+       unsigned long pfn = page_to_pfn(idx_to_page(mmap_idx,req_idx,sg_idx));
+       return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+static unsigned short mmap_alloc = 0;
+static unsigned short mmap_lock = 0;
+static unsigned short mmap_inuse = 0;
+
+/******************************************************************
+ * GRANT HANDLES
+ */
+
+/* When using grant tables to map a frame for device access then the
+ * handle returned must be used to unmap the frame. This is needed to
+ * drop the ref count on the frame.
+ */
+struct grant_handle_pair
+{
+        grant_handle_t kernel; /* handle unmapped by kernel address in blktap_clear_pte() */
+        grant_handle_t user;   /* handle unmapped through the user PTE in blktap_clear_pte() */
+};
+#define INVALID_GRANT_HANDLE   0xFFFF  /* marks a handle with nothing to unmap */
+
+static struct grant_handle_pair 
+    pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES];
+#define pending_handle(_id, _idx, _i) /* handle pair of segment _i of request _idx in pool _id */ \
+    (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \
+    + (_i)])
+
+
+static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/
+
+#define BLKTAP_MINOR 0  /*/dev/xen/blktap has a dynamic major */
+#define BLKTAP_DEV_DIR  "/dev/xen"
+
+static int blktap_major;
+
+/* blktap IOCTLs: */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2 /* currently unused */
+#define BLKTAP_IOCTL_SETMODE         3
+#define BLKTAP_IOCTL_SENDPID        4
+#define BLKTAP_IOCTL_NEWINTF        5
+#define BLKTAP_IOCTL_MINOR          6
+#define BLKTAP_IOCTL_MAJOR          7
+#define BLKTAP_QUERY_ALLOC_REQS      8
+#define BLKTAP_IOCTL_FREEINTF        9
+#define BLKTAP_IOCTL_NEWINTF_EXT     50
+#define BLKTAP_IOCTL_PRINT_IDXS      100  
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002  /* unimp.             */
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+
+/* Non-zero iff @arg is PASSTHROUGH, INTERCEPT_FE or INTERPOSE (INTERCEPT_BE alone is rejected). */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+       return arg == BLKTAP_MODE_PASSTHROUGH || arg == BLKTAP_MODE_INTERCEPT_FE ||
+              arg == BLKTAP_MODE_INTERPOSE;
+}
+
+/* Requests passing through the tap to userspace are re-assigned an ID.
+ * We must record a mapping between the BE [IDX,ID] tuple and the userspace
+ * ring ID. 
+ */
+
+#define INVALID_MIDX 0xdead
+
+/* Linear scan for the first idx_map[] slot whose .mem is INVALID_MIDX;
+ * returns MAX_PENDING_REQS when none is found.  TODO: Convert to a free list */
+static inline unsigned int GET_NEXT_REQ(const struct idx_map *idx_map)
+{
+       unsigned int slot = 0;
+
+       while (slot < MAX_PENDING_REQS && idx_map[slot].mem != INVALID_MIDX)
+               slot++;
+
+       return slot;
+}
+
+static inline unsigned int OFFSET_TO_USR_IDX(unsigned long offset)
+{      /* @offset is a page index (blktap_clear_pte() passes a page offset from user_vstart): which request it belongs to */
+       return offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
+}
+
+static inline unsigned int OFFSET_TO_SEG(unsigned long offset)
+{      /* segment number of page index @offset within its request */
+       return offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
+}
+
+
+#define BLKTAP_INVALID_HANDLE(_g) /* true when both handles of the pair *_g are invalid */ \
+    ((((_g)->kernel) == INVALID_GRANT_HANDLE) &&  \
+     (((_g)->user) == INVALID_GRANT_HANDLE))
+
+#define BLKTAP_INVALIDATE_HANDLE(_g) do { /* mark both handles of the pair *_g invalid */ \
+    (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \
+    } while(0)
+
+
+static char *blktap_devnode(struct device *dev, umode_t *mode)
+{      /* builds the node name "xen/blktap<minor>"; @mode is not touched; the string is kasprintf()-allocated (NULL if that fails) */
+       return kasprintf(GFP_KERNEL, "xen/blktap%u", MINOR(dev->devt));
+}
+
+static struct device_type blktap_type = {
+       .devnode = blktap_devnode       /* the only callback set */
+};
+
+/******************************************************************
+ * BLKTAP VM OPS
+ */
+
+static int blktap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       /*
+        * This handler never maps anything: a fault means the page has not
+        * been mapped in by the driver, so VM_FAULT_SIGBUS is always returned.
+        */
+
+       return VM_FAULT_SIGBUS;
+}
+
+static pte_t blktap_clear_pte(struct vm_area_struct *vma,
+                             unsigned long uvaddr,
+                             pte_t *ptep, int is_fullmm)
+{      /* zap_pte hook of blktap_vm_ops: unmap the grant(s) behind @uvaddr and return the PTE value that was there. */
+       pte_t copy;
+       tap_blkif_t *info = NULL;
+       unsigned int seg, usr_idx, pending_idx, mmap_idx, count = 0;    /* count: entries filled in unmap[] */
+       unsigned long offset;
+       struct page *pg;
+       struct grant_handle_pair *khandle;
+       struct gnttab_unmap_grant_ref unmap[2]; /* at most one kernel and one user unmap */
+
+       /*
+        * If the address is before the start of the grant mapped region or
+        * if vm_file is NULL (meaning mmap failed and we have nothing to do)
+        */
+       if (vma->vm_file != NULL)
+               info = vma->vm_file->private_data;
+       if (info == NULL || uvaddr < info->user_vstart)
+               return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+                                                  is_fullmm);
+
+       offset = (uvaddr - info->user_vstart) >> PAGE_SHIFT;    /* page index from user_vstart */
+       usr_idx = OFFSET_TO_USR_IDX(offset);
+       seg = OFFSET_TO_SEG(offset);
+
+       spin_lock(&info->map_lock);
+
+       pending_idx = info->idx_map[usr_idx].req;       /* NOTE(review): usr_idx is not range-checked here -- confirm the vma cannot extend past idx_map */
+       mmap_idx = info->idx_map[usr_idx].mem;
+
+       /* fast_flush_area() may already have cleared this entry */
+       if (mmap_idx == INVALID_MIDX) {
+               spin_unlock(&info->map_lock);
+               return xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+                                                  is_fullmm);
+       }
+
+       pg = idx_to_page(mmap_idx, pending_idx, seg);
+       ClearPageReserved(pg);
+       info->foreign_map.map[offset + RING_PAGES] = NULL;      /* map[] is indexed from rings_vstart, hence the RING_PAGES shift */
+
+       khandle = &pending_handle(mmap_idx, pending_idx, seg);
+
+       if (khandle->kernel != INVALID_GRANT_HANDLE) {
+               unsigned long pfn = page_to_pfn(pg);
+
+               gnttab_set_unmap_op(&unmap[count],
+                                   (unsigned long)pfn_to_kaddr(pfn),
+                                   GNTMAP_host_map, khandle->kernel);
+               count++;
+
+               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+       }
+
+       if (khandle->user != INVALID_GRANT_HANDLE) {
+               BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+
+               copy = *ptep;   /* snapshot the PTE before the unmap operation is issued */
+               gnttab_set_unmap_op(&unmap[count], ptep_to_machine(ptep),
+                                   GNTMAP_host_map 
+                                   | GNTMAP_application_map 
+                                   | GNTMAP_contains_pte,
+                                   khandle->user);
+               count++;
+       } else {
+               BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap));
+
+               /* USING SHADOW PAGE TABLES. */
+               copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+                                                  is_fullmm);
+       }
+
+       if (count) {
+               BLKTAP_INVALIDATE_HANDLE(khandle);      /* safe before the hypercall: the handle values were already copied into unmap[] */
+               if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                             unmap, count))
+                       BUG();
+       }
+
+       spin_unlock(&info->map_lock);
+
+       return copy;
+}
+
+static void blktap_vma_open(struct vm_area_struct *vma)
+{      /* open hook of blktap_vm_ops: point vm_private_data at this vma's slice of foreign_map.map[] */
+       tap_blkif_t *info;
+       if (vma->vm_file == NULL)
+               return;
+
+       info = vma->vm_file->private_data;
+       vma->vm_private_data =
+               &info->foreign_map.map[(vma->vm_start - info->rings_vstart) >> PAGE_SHIFT];     /* slot = page offset of vm_start from rings_vstart */
+}
+
+/* tricky part
+ * On a partial munmap, ->open() is called only for the split-off vma that
+ * is about to be released (see split_vma() and do_munmap() in mm/mmap.c),
+ * so the vm_private_data of the following vma is fixed up here instead.
+ */
+static void blktap_vma_close(struct vm_area_struct *vma)
+{
+       tap_blkif_t *info;
+       struct vm_area_struct *next = vma->vm_next;
+
+       if (next == NULL ||     /* only act when the next vma is the adjacent piece of the same blktap file mapping */
+           vma->vm_ops != next->vm_ops ||
+           vma->vm_end != next->vm_start ||
+           vma->vm_file == NULL ||
+           vma->vm_file != next->vm_file)
+               return;
+
+       info = vma->vm_file->private_data;
+       next->vm_private_data =
+               &info->foreign_map.map[(next->vm_start - info->rings_vstart) >> PAGE_SHIFT];    /* same computation as blktap_vma_open(), applied to next */
+}
+
+static struct vm_operations_struct blktap_vm_ops = {   /* C99 designators, matching blktap_fops in this file */
+       .fault   = blktap_fault,        /* always answers VM_FAULT_SIGBUS */
+       .zap_pte = blktap_clear_pte,    /* unmaps the grant behind a PTE */
+       .open    = blktap_vma_open,     /* sets vm_private_data for the vma */
+       .close   = blktap_vma_close,    /* fixes up vm_private_data of the following vma */
+};
+
+/******************************************************************
+ * BLKTAP FILE OPS
+ */
+ 
+/*
+ * Forward declarations: the device-slot allocator used by the ioctl
+ * handler, and the file_operations callbacks wired into blktap_fops below.
+ */
+static tap_blkif_t *get_next_free_dev(void);
+static int blktap_open(struct inode *inode, struct file *filp);
+static int blktap_release(struct inode *inode, struct file *filp);
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma);
+static long blktap_ioctl(struct file *filp, unsigned int cmd,
+                        unsigned long arg);
+static unsigned int blktap_poll(struct file *file, poll_table *wait);
+
+/* File operations shared by the control device and the tap devices. */
+static const struct file_operations blktap_fops = {
+       .owner          = THIS_MODULE,
+       .open           = blktap_open,
+       .release        = blktap_release,
+       .mmap           = blktap_mmap,
+       .unlocked_ioctl = blktap_ioctl,
+       .poll           = blktap_poll,
+       .llseek         = no_llseek,
+};
+
+
+/*
+ * Reserve a tap device for a new interface.
+ *
+ * First looks for an existing device that is neither open (dev_inuse
+ * bit 0 clear) nor already reserved (dev_pending == 0) and marks it
+ * pending.  If none is found and fewer than MAX_TAP_DEV minors have been
+ * handed out, takes the next minor, allocates a fresh tap_blkif_t for
+ * it, publishes it in tapfds[] and creates the device node.
+ *
+ * Returns the device, or NULL when no device is available or the
+ * allocation failed.
+ */
+static tap_blkif_t *get_next_free_dev(void)
+{
+       tap_blkif_t *info;
+       int minor;
+
+       /*
+        * This is called only from the ioctl, which
+        * means we should always have interrupts enabled.
+        */
+       BUG_ON(irqs_disabled());
+
+       spin_lock_irq(&pending_free_lock);
+
+       /* tapfds[0] is always NULL */
+
+       for (minor = 1; minor < blktap_next_minor; minor++) {
+               info = tapfds[minor];
+               /*
+                * A slot can be NULL: either an earlier allocation failed
+                * and the minor could not be handed back, or another
+                * caller has taken the minor and not yet published its
+                * device.  Such a slot must be skipped; the old
+                * "!info ||" test fell through to a NULL dereference.
+                */
+               if (info &&
+                   !test_bit(0, &info->dev_inuse) &&
+                   info->dev_pending == 0) {
+                       info->dev_pending = 1;
+                       goto found;
+               }
+       }
+       info = NULL;
+       minor = -1;
+
+       /*
+        * We didn't find free device. If we can still allocate
+        * more, then we grab the next device minor that is
+        * available.  This is done while we are still under
+        * the protection of the pending_free_lock.
+        */
+       if (blktap_next_minor < MAX_TAP_DEV)
+               minor = blktap_next_minor++;
+found:
+       spin_unlock_irq(&pending_free_lock);
+
+       if (!info && minor > 0) {
+               info = kzalloc(sizeof(*info), GFP_KERNEL);
+               if (unlikely(!info)) {
+                       /*
+                        * If we failed here, try to put back
+                        * the next minor number. But if one
+                        * was just taken, then we just lose this
+                        * minor.
+                        */
+                       spin_lock_irq(&pending_free_lock);
+                       if (blktap_next_minor == minor+1)
+                               blktap_next_minor--;
+                       spin_unlock_irq(&pending_free_lock);
+                       goto out;
+               }
+
+               /*
+                * NOTE(review): a freshly allocated device is returned with
+                * dev_pending == 0, unlike a reused one - confirm whether
+                * it should be marked pending as well.
+                */
+               info->minor = minor;
+               spin_lock_init(&info->map_lock);
+               /*
+                * Make sure that we have a minor before others can
+                * see us.
+                */
+               wmb();
+               tapfds[minor] = info;
+
+               xen_class_device_create(&blktap_type, NULL,
+                                       MKDEV(blktap_major, minor),
+                                       NULL, "blktap%d", minor);
+       }
+
+out:
+       return info;
+}
+
+/*
+ * Find the tap device registered for (domid, xenbus_id), attach @blkif
+ * to it and mark it RUNNING.  Returns the device's tapfds[] index, or
+ * -1 when no device matches.
+ */
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif)
+{
+       int minor;
+
+       for (minor = 1; minor < blktap_next_minor; minor++) {
+               tap_blkif_t *tap = tapfds[minor];
+
+               if (!tap)
+                       continue;
+               if (tap->trans.domid != domid ||
+                   tap->trans.busid != xenbus_id)
+                       continue;
+
+               tap->blkif = blkif;
+               tap->status = RUNNING;
+               return minor;
+       }
+
+       return -1;
+}
+
+/*
+ * Detach the backend from tap device @idx.
+ *
+ * If the pid recorded for the device still resolves to a task, the
+ * device status becomes CLEANSHUTDOWN.  In all cases the blkif pointer
+ * is cleared and the mm reference held in info->mm (if any) is dropped.
+ * Out-of-range indices and unallocated devices are ignored.
+ */
+void signal_tapdisk(int idx)
+{
+       tap_blkif_t *info;
+       struct mm_struct *mm;
+
+       /*
+        * if the userland tools set things up wrong, this could be negative;
+        * just don't try to signal in this case
+        */
+       if (idx < 0 || idx >= MAX_TAP_DEV)
+               return;
+
+       info = tapfds[idx];
+       if (!info)
+               return;
+
+       if (info->pid > 0) {
+               /*
+                * find_pid_ns() and pid_task() walk RCU-protected lists,
+                * so the lookup must run inside an RCU read-side section.
+                * Only the existence of the task is tested; the pointer
+                * is never used after the section ends.
+                */
+               rcu_read_lock();
+               if (pid_task(find_pid_ns(info->pid, info->pid_ns),
+                            PIDTYPE_PID))
+                       info->status = CLEANSHUTDOWN;
+               rcu_read_unlock();
+       }
+       info->blkif = NULL;
+
+       mm = xchg(&info->mm, NULL);
+       if (mm)
+               mmput(mm);
+}
+
+/*
+ * Open a blktap device node.
+ *
+ * Minor BLKTAP_MINOR is the control device and needs no setup.  Any
+ * other minor must name an allocated tap device, which may be held by
+ * one opener at a time.  On success the front-end ring page and the
+ * idx_map table are allocated and filp->private_data points at the
+ * device.
+ *
+ * Returns 0 on success, -ENODEV for an unknown device, -EBUSY if the
+ * device is already open, -ENOMEM if an allocation fails.  On -ENOMEM
+ * everything acquired so far is released again, including the in-use
+ * bit, so that the device can be opened again later.
+ */
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+       blkif_sring_t *sring;
+       int idx = iminor(inode) - BLKTAP_MINOR;
+       tap_blkif_t *info;
+       int i;
+
+       nonseekable_open(inode, filp);
+
+       /* ctrl device, treat differently */
+       if (!idx)
+               return 0;
+       if (idx < 0 || idx >= MAX_TAP_DEV) {
+               WPRINTK("No device /dev/xen/blktap%d\n", idx);
+               return -ENODEV;
+       }
+
+       info = tapfds[idx];
+       if (!info) {
+               WPRINTK("Unable to open device /dev/xen/blktap%d\n",
+                       idx);
+               return -ENODEV;
+       }
+
+       DPRINTK("Opening device /dev/xen/blktap%d\n",idx);
+
+       /*Only one process can access device at a time*/
+       if (test_and_set_bit(0, &info->dev_inuse))
+               return -EBUSY;
+
+       info->dev_pending = 0;
+
+       /* Allocate the fe ring. */
+       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+       if (sring == NULL)
+               goto fail_unuse;
+
+       SetPageReserved(virt_to_page(sring));
+
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE);
+
+       info->mm = NULL;
+
+       info->idx_map = kmalloc(sizeof(*info->idx_map) * MAX_PENDING_REQS,
+                               GFP_KERNEL);
+       if (info->idx_map == NULL)
+               goto fail_free_ring;
+
+       /* idx is known to be in [1, MAX_TAP_DEV) at this point. */
+       init_waitqueue_head(&info->wait);
+       for (i = 0; i < MAX_PENDING_REQS; i++) {
+               info->idx_map[i].mem = INVALID_MIDX;
+               info->idx_map[i].req = ~0;
+       }
+
+       /* Publish the device on the file only once setup has succeeded. */
+       filp->private_data = info;
+
+       DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx);
+       return 0;
+
+ fail_free_ring:
+       /* Undo the ring setup so no stale pointer to the page remains. */
+       info->ufe_ring.sring = NULL;
+       ClearPageReserved(virt_to_page(sring));
+       free_page((unsigned long)sring);
+ fail_unuse:
+       /* ->release() is not called for a failed open; drop the claim. */
+       clear_bit(0, &info->dev_inuse);
+       return -ENOMEM;
+}
+
+/*
+ * Release a tap device file: invalidate the ring, drop the mm reference,
+ * free the foreign map, the ring page and idx_map, stop the backend
+ * thread unless the device was already shut down cleanly, and finally
+ * mark the device as no longer in use.  Always returns 0.
+ */
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+       tap_blkif_t *tap = filp->private_data;
+       struct mm_struct *old_mm;
+
+       /* The control device carries no private data. */
+       if (tap == NULL)
+               return 0;
+
+       /* Flag the ring unusable before forgetting where it was mapped. */
+       tap->ring_ok = 0;
+       smp_wmb();
+       tap->rings_vstart = 0;
+
+       old_mm = xchg(&tap->mm, NULL);
+       if (old_mm != NULL)
+               mmput(old_mm);
+
+       kfree(tap->foreign_map.map);
+       tap->foreign_map.map = NULL;
+
+       /* Give the ring page back. */
+       ClearPageReserved(virt_to_page(tap->ufe_ring.sring));
+       free_page((unsigned long)tap->ufe_ring.sring);
+
+       /* kfree() accepts NULL, so no guard is needed here. */
+       kfree(tap->idx_map);
+       tap->idx_map = NULL;
+
+       if (tap->status != CLEANSHUTDOWN && tap->blkif != NULL) {
+               if (tap->blkif->xenblkd != NULL) {
+                       kthread_stop(tap->blkif->xenblkd);
+                       tap->blkif->xenblkd = NULL;
+               }
+               tap->status = CLEANSHUTDOWN;
+       }
+
+       clear_bit(0, &tap->dev_inuse);
+       DPRINTK("Freeing device [/dev/xen/blktap%d]\n",tap->minor);
+
+       return 0;
+}
+
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them.  This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a 
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space.  This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
+ * from pages to actual page structs.  There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+/*
+ * mmap handler for a tap device.
+ *
+ * The mapping must cover exactly (mmap_pages + RING_PAGES) pages.  The
+ * ring page is mapped at the start of the region, a zeroed foreign-page
+ * table with one entry per mapped page is allocated, and the VMA is
+ * flagged VM_FOREIGN | VM_DONTCOPY.
+ *
+ * Returns 0 on success; -ENOMEM if the file has no tap device attached
+ * or if mapping the ring or allocating the table fails; -EPERM if the
+ * device is already mapped (rings_vstart is set); -EAGAIN if the
+ * requested size is wrong.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       int size;
+       tap_blkif_t *info = filp->private_data;
+       int ret;
+
+       if (info == NULL) {
+               WPRINTK("mmap: no private data?\n");
+               return -ENOMEM;
+       }
+
+       /* A non-zero rings_vstart means a mapping is already recorded. */
+       if (info->rings_vstart) {
+               WPRINTK("mmap already called on filp %p (minor %d)\n",
+                       filp, info->minor);
+               return -EPERM;
+       }
+
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &blktap_vm_ops;
+
+       size = vma->vm_end - vma->vm_start;
+       if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) {
+               WPRINTK("you _must_ map exactly %d pages!\n",
+                      mmap_pages + RING_PAGES);
+               return -EAGAIN;
+       }
+
+       size >>= PAGE_SHIFT;
+       /* Ring pages come first; the data area follows directly after. */
+       info->rings_vstart = vma->vm_start;
+       info->user_vstart  = info->rings_vstart + (RING_PAGES << PAGE_SHIFT);
+    
+       /* Map the ring pages to the start of the region and reserve it. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               ret = vm_insert_page(vma, vma->vm_start,
+                                    virt_to_page(info->ufe_ring.sring));
+       else
+               ret = remap_pfn_range(vma, vma->vm_start,
+                                     __pa(info->ufe_ring.sring) >> PAGE_SHIFT,
+                                     PAGE_SIZE, vma->vm_page_prot);
+       if (ret) {
+               WPRINTK("Mapping user ring failed!\n");
+               goto fail;
+       }
+
+       /* Mark this VM as containing foreign pages, and set up mappings. */
+       info->foreign_map.map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) *
+                           sizeof(*info->foreign_map.map), GFP_KERNEL);
+       if (info->foreign_map.map == NULL) {
+               WPRINTK("Couldn't alloc VM_FOREIGN map.\n");
+               goto fail;
+       }
+
+       vma->vm_private_data = &info->foreign_map;
+       vma->vm_flags |= VM_FOREIGN;
+       vma->vm_flags |= VM_DONTCOPY;
+
+#ifdef CONFIG_X86
+       vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
+       /* Publish the mm before ring_ok so readers of ring_ok see it. */
+       info->mm = get_task_mm(current);
+       smp_wmb();
+       info->ring_ok = 1;
+       return 0;
+ fail:
+       /* Clear any active mappings. */
+       zap_page_range(vma, vma->vm_start, 
+                      vma->vm_end - vma->vm_start, NULL);
+       /* Allow a later mmap attempt on this device. */
+       info->rings_vstart = 0;
+
+       /* NOTE(review): the error code from the ring mapping is replaced by -ENOMEM. */
+       return -ENOMEM;
+}
+
+
+/*
+ * ioctl handler for the control device (no private data) and for tap
+ * devices (private data is the tap_blkif_t).
+ *
+ * Returns a command-specific value: a minor number for NEWINTF,
+ * NEWINTF_EXT and MINOR, the major for MAJOR, a percentage for
+ * QUERY_ALLOC_REQS, otherwise 0 or a negative error.  Unknown commands
+ * yield -ENOIOCTLCMD.
+ *
+ * NOTE(review): NEWINTF/NEWINTF_EXT report "no device" as a bare -1 and
+ * SETMODE returns 0 even for an invalid mode; both are kept as-is since
+ * user space may depend on them.
+ */
+static long blktap_ioctl(struct file *filp, unsigned int cmd,
+                        unsigned long arg)
+{
+       tap_blkif_t *info = filp->private_data;
+
+       switch(cmd) {
+       case BLKTAP_IOCTL_KICK_FE:
+       {
+               /* There are fe messages to process. */
+               return blktap_read_ufe_ring(info);
+       }
+       case BLKTAP_IOCTL_SETMODE:
+       {
+               if (info) {
+                       if (BLKTAP_MODE_VALID(arg)) {
+                               info->mode = arg;
+                               /* XXX: may need to flush rings here. */
+                               DPRINTK("set mode to %lx\n", arg);
+                               return 0;
+                       }
+               }
+               return 0;
+       }
+       case BLKTAP_IOCTL_PRINT_IDXS:
+       {
+               if (info) {
+                       pr_info("User Rings: \n-----------\n");
+                       pr_info("UF: rsp_cons: %2d, req_prod_prv: %2d "
+                               "| req_prod: %2d, rsp_prod: %2d\n",
+                               info->ufe_ring.rsp_cons,
+                               info->ufe_ring.req_prod_pvt,
+                               info->ufe_ring.sring->req_prod,
+                               info->ufe_ring.sring->rsp_prod);
+               }
+               return 0;
+       }
+       case BLKTAP_IOCTL_SENDPID:
+       {
+               if (info) {
+                       info->pid = (pid_t)arg;
+                       info->pid_ns = current->nsproxy->pid_ns;
+                       DPRINTK("pid received %p:%d\n",
+                               info->pid_ns, info->pid);
+               }
+               return 0;
+       }
+       case BLKTAP_IOCTL_NEWINTF:
+       {
+               /* The domid/busid pair is packed into the argument itself. */
+               uint64_t val = (uint64_t)arg;
+               domid_translate_t *tr = (domid_translate_t *)&val;
+
+               DPRINTK("NEWINTF Req for domid %d and bus id %d\n", 
+                      tr->domid, tr->busid);
+               info = get_next_free_dev();
+               if (!info) {
+                       WPRINTK("Error initialising /dev/xen/blktap - "
+                               "No more devices\n");
+                       return -1;
+               }
+               info->trans.domid = tr->domid;
+               info->trans.busid = tr->busid;
+               return info->minor;
+       }
+       case BLKTAP_IOCTL_NEWINTF_EXT:
+       {
+               /* Extended form: the argument is a user pointer. */
+               void __user *udata = (void __user *) arg;
+               domid_translate_ext_t tr;
+
+               if (copy_from_user(&tr, udata, sizeof(domid_translate_ext_t)))
+                       return -EFAULT;
+
+               DPRINTK("NEWINTF_EXT Req for domid %d and bus id %d\n", 
+                      tr.domid, tr.busid);
+               info = get_next_free_dev();
+               if (!info) {
+                       WPRINTK("Error initialising /dev/xen/blktap - "
+                               "No more devices\n");
+                       return -1;
+               }
+               info->trans.domid = tr.domid;
+               info->trans.busid = tr.busid;
+               return info->minor;
+       }
+       case BLKTAP_IOCTL_FREEINTF:
+       {
+               unsigned long dev = arg;
+               unsigned long flags;
+
+               /* Only accepted on the control device (no private data). */
+               if (info || dev >= MAX_TAP_DEV)
+                       return -EINVAL;
+
+               info = tapfds[dev];
+               if (!info)
+                       return 0; /* should this be an error? */
+
+               spin_lock_irqsave(&pending_free_lock, flags);
+               if (info->dev_pending)
+                       info->dev_pending = 0;
+               spin_unlock_irqrestore(&pending_free_lock, flags);
+
+               return 0;
+       }
+       case BLKTAP_IOCTL_MINOR:
+               /* On the control device the argument selects the device. */
+               if (!info) {
+                       unsigned long dev = arg;
+
+                       if (dev >= MAX_TAP_DEV)
+                               return -EINVAL;
+
+                       info = tapfds[dev];
+                       if (!info)
+                               return -EINVAL;
+               }
+
+               return info->minor;
+
+       case BLKTAP_IOCTL_MAJOR:
+               return blktap_major;
+
+       case BLKTAP_QUERY_ALLOC_REQS:
+               WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%lu\n",
+                       alloc_pending_reqs, MAX_PENDING_REQS);
+               /*
+                * Multiply before dividing: (alloc / MAX) * 100 truncated
+                * to 0 for every count below MAX_PENDING_REQS.
+                */
+               return (alloc_pending_reqs * 100) / MAX_PENDING_REQS;
+       }
+       return -ENOIOCTLCMD;
+}
+
+/*
+ * poll handler: a tap device is readable when the private request
+ * producer index differs from the one in the shared ring; in that case
+ * the pending requests are pushed first.  The control device (no
+ * private data) never reports any event.
+ */
+static unsigned int blktap_poll(struct file *filp, poll_table *wait)
+{
+       tap_blkif_t *tap = filp->private_data;
+       unsigned int mask = 0;
+
+       if (tap) {
+               poll_wait(filp, &tap->wait, wait);
+               if (tap->ufe_ring.req_prod_pvt !=
+                   tap->ufe_ring.sring->req_prod) {
+                       RING_PUSH_REQUESTS(&tap->ufe_ring);
+                       mask = POLLIN | POLLRDNORM;
+               }
+       }
+
+       return mask;
+}
+
+/*
+ * Wake any process sleeping on tap device @idx's wait queue.
+ * Out-of-range indices and unallocated devices are ignored.
+ */
+static void blktap_kick_user(int idx)
+{
+       tap_blkif_t *tap = NULL;
+
+       if (idx >= 0 && idx < MAX_TAP_DEV)
+               tap = tapfds[idx];
+
+       if (tap)
+               wake_up_interruptible(&tap->wait);
+}
+
+/* Forward declarations of request-path helpers used before their definitions. */
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req);
+static void make_response(blkif_t *blkif, u64 id,
+                          unsigned short op, int st);
+
+/******************************************************************
+ * misc small helpers
+ */
+/*
+ * Add one more batch of MAX_PENDING_REQS pending requests together with
+ * a vector of mmap_pages empty pages, stored in slot mmap_alloc of
+ * pending_reqs[] and foreign_pages[].  Every new request is put on the
+ * pending_free list with all its grant handles invalidated.
+ *
+ * Returns 0 on success, -EINVAL if all slots are used or mmap_lock is
+ * set, -ENOMEM if either allocation fails.
+ */
+static int req_increase(void)
+{
+       int i, j;
+
+       if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) 
+               return -EINVAL;
+
+       pending_reqs[mmap_alloc]  = kzalloc(sizeof(pending_req_t)
+                                           * MAX_PENDING_REQS, GFP_KERNEL);
+       foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages);
+
+       if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc])
+               goto out_of_memory;
+
+       DPRINTK("reqs=%lu, pages=%d\n", MAX_PENDING_REQS, mmap_pages);
+
+       for (i = 0; i < MAX_PENDING_REQS; i++) {
+               list_add_tail(&pending_reqs[mmap_alloc][i].free_list, 
+                             &pending_free);
+               pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc;
+               for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++)
+                       BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, 
+                                                                i, j));
+       }
+
+       mmap_alloc++;
+       DPRINTK("# MMAPs increased to %d\n",mmap_alloc);
+       return 0;
+
+ out_of_memory:
+       /*
+        * Release whichever half succeeded and clear both slots, so the
+        * global tables never keep a pointer to freed memory (as
+        * mmap_req_del() also does).
+        */
+       free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages);
+       foreign_pages[mmap_alloc] = NULL;
+       kfree(pending_reqs[mmap_alloc]);
+       pending_reqs[mmap_alloc] = NULL;
+       WPRINTK("%s: out of memory\n", __FUNCTION__);
+       return -ENOMEM;
+}
+
+/*
+ * Tear down request batch @mmap: free its pending_req array and its
+ * page vector, clear both table slots, drop mmap_lock and decrement
+ * mmap_alloc.  The caller must hold pending_free_lock.
+ */
+static void mmap_req_del(int mmap)
+{
+       assert_spin_locked(&pending_free_lock);
+
+       kfree(pending_reqs[mmap]);
+       pending_reqs[mmap] = NULL;
+
+       /*
+        * Free the page vector of the batch being deleted.  Indexing by
+        * mmap_alloc here would hit the slot after it and leak this one.
+        */
+       free_empty_pages_and_pagevec(foreign_pages[mmap], mmap_pages);
+       foreign_pages[mmap] = NULL;
+
+       mmap_lock = 0;
+       /* Decrement first so the message reports the new count. */
+       mmap_alloc--;
+       DPRINTK("# MMAPs decreased to %d\n",mmap_alloc);
+}
+
+/*
+ * Take the first entry off the pending_free list, counting it in
+ * alloc_pending_reqs.  Returns NULL when the list is empty.
+ */
+static pending_req_t* alloc_req(void)
+{
+       pending_req_t *entry;
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&pending_free_lock, irqflags);
+
+       if (list_empty(&pending_free)) {
+               entry = NULL;
+       } else {
+               entry = list_entry(pending_free.next, pending_req_t,
+                                  free_list);
+               list_del(&entry->free_list);
+               alloc_pending_reqs++;
+       }
+
+       spin_unlock_irqrestore(&pending_free_lock, irqflags);
+
+       return entry;
+}
+
+/*
+ * Give a pending request back.
+ *
+ * Normally the request returns to the pending_free list, and waiters on
+ * pending_free_wq are woken if the list was empty before.  While
+ * mmap_lock is set, requests of the last batch (mem_idx == mmap_alloc-1)
+ * are not recycled; they only decrement mmap_inuse, and the batch is
+ * deleted once that count reaches zero.
+ */
+static void free_req(pending_req_t *req)
+{
+       unsigned long irqflags;
+       int wake = 0;
+
+       spin_lock_irqsave(&pending_free_lock, irqflags);
+
+       alloc_pending_reqs--;
+       if (!mmap_lock || req->mem_idx != mmap_alloc - 1) {
+               wake = list_empty(&pending_free);
+               list_add(&req->free_list, &pending_free);
+       } else if (--mmap_inuse == 0) {
+               mmap_req_del(mmap_alloc - 1);
+       }
+
+       spin_unlock_irqrestore(&pending_free_lock, irqflags);
+
+       if (wake)
+               wake_up(&pending_free_wq);
+}
+
+/*
+ * Zap the user mappings in [uvaddr, uvaddr + nr_pages pages) of @mm,
+ * walking every VMA that overlaps the range.
+ *
+ * find_vma() returns the first VMA ending above uvaddr, which may start
+ * at or beyond the end of the range (for instance when part of the
+ * region is no longer mapped); the same holds for each vm_next.  Such a
+ * VMA ends the walk, otherwise "e - s" below would underflow.
+ */
+static void blktap_zap_page_range(struct mm_struct *mm,
+                                 unsigned long uvaddr, int nr_pages)
+{
+       unsigned long end = uvaddr + ((unsigned long)nr_pages << PAGE_SHIFT);
+       struct vm_area_struct *vma;
+
+       vma = find_vma(mm, uvaddr);
+       while (vma && uvaddr < end && vma->vm_start < end) {
+               unsigned long s = max(uvaddr, vma->vm_start);
+               unsigned long e = min(end, vma->vm_end);
+
+               zap_page_range(vma, s, e - s, NULL);
+
+               uvaddr = e;
+               vma = vma->vm_next;
+       }
+}
+
+/*
+ * Remove the grant mappings of all pages of request @req.
+ *
+ * @k_idx indexes the request within its batch (req->mem_idx), @u_idx is
+ * its slot in the user ring / info->idx_map.  For each page, the kernel
+ * handle and - if an mm is attached - the user handle are queued for
+ * GNTTABOP_unmap_grant_ref and the handle pair is invalidated.  The
+ * idx_map slot is marked INVALID_MIDX at the end.
+ *
+ * With XENFEAT_auto_translated_physmap, or when a user pte address
+ * cannot be obtained, the "slow" path zaps the user range instead.
+ *
+ * NOTE(review): when the loop bails out to "slow" part-way through, the
+ * unmap operations queued so far are not submitted although their
+ * handles have already been invalidated - confirm this is intended.
+ */
+static void fast_flush_area(pending_req_t *req, unsigned int k_idx,
+                            unsigned int u_idx, tap_blkif_t *info)
+{
+       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+       unsigned int i, mmap_idx, invcount = 0;
+       struct grant_handle_pair *khandle;
+       uint64_t ptep;
+       int ret;
+       unsigned long uvaddr;
+       struct mm_struct *mm = info->mm;
+
+       if (mm != NULL)
+               down_read(&mm->mmap_sem);
+
+       if (mm != NULL && xen_feature(XENFEAT_auto_translated_physmap)) {
+               /* Also entered by goto from the loop below; mm is non-NULL there too. */
+ slow:
+               blktap_zap_page_range(mm,
+                                     MMAP_VADDR(info->user_vstart, u_idx, 0),
+                                     req->nr_pages);
+               info->idx_map[u_idx].mem = INVALID_MIDX;
+               up_read(&mm->mmap_sem);
+               return;
+       }
+
+       mmap_idx = req->mem_idx;
+
+       spin_lock(&info->map_lock);
+
+       for (i = 0; i < req->nr_pages; i++) {
+               uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i);
+
+               khandle = &pending_handle(mmap_idx, k_idx, i);
+
+               /* Kernel-side mapping: queue the unmap and drop the p2m entry. */
+               if (khandle->kernel != INVALID_GRANT_HANDLE) {
+                       gnttab_set_unmap_op(&unmap[invcount],
+                                           idx_to_kaddr(mmap_idx, k_idx, i),
+                                           GNTMAP_host_map, khandle->kernel);
+                       invcount++;
+
+                       set_phys_to_machine(
+                               page_to_pfn(idx_to_page(mmap_idx, k_idx, i)),
+                               INVALID_P2M_ENTRY);
+               }
+
+               /* User-side mapping: unmapped through the address of its pte. */
+               if (mm != NULL && khandle->user != INVALID_GRANT_HANDLE) {
+                       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+                       if (create_lookup_pte_addr(
+                               mm,
+                               MMAP_VADDR(info->user_vstart, u_idx, i),
+                               &ptep) !=0) {
+                               spin_unlock(&info->map_lock);
+                               WPRINTK("Couldn't get a pte addr!\n");
+                               goto slow;
+                       }
+
+                       gnttab_set_unmap_op(&unmap[invcount], ptep,
+                                           GNTMAP_host_map
+                                           | GNTMAP_application_map
+                                           | GNTMAP_contains_pte,
+                                           khandle->user);
+                       invcount++;
+               }
+
+               BLKTAP_INVALIDATE_HANDLE(khandle);
+       }
+       /* Submit all queued unmaps in one hypercall; failure is fatal. */
+       ret = HYPERVISOR_grant_table_op(
+               GNTTABOP_unmap_grant_ref, unmap, invcount);
+       BUG_ON(ret);
+       
+       info->idx_map[u_idx].mem = INVALID_MIDX;
+
+       spin_unlock(&info->map_lock);
+       if (mm != NULL)
+               up_read(&mm->mmap_sem);
+}
+
+/******************************************************************
+ * SCHEDULER FUNCTIONS
+ */
+
+/*
+ * Log the per-interface request counters, schedule the next report for
+ * ten seconds from now and reset the counters.
+ */
+static void print_stats(blkif_t *blkif)
+{
+       printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d |  pk %4d\n",
+              current->comm,
+              blkif->st_oo_req,
+              blkif->st_rd_req,
+              blkif->st_wr_req,
+              blkif->st_pk_req);
+
+       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+
+       /* Start a fresh accounting period. */
+       blkif->st_oo_req = 0;
+       blkif->st_rd_req = 0;
+       blkif->st_wr_req = 0;
+       blkif->st_pk_req = 0;
+}
+
+/*
+ * Main loop of the per-interface kernel thread.
+ *
+ * @arg is the blkif_t to serve.  The thread sleeps until the interface
+ * has waiting requests and a free pending request exists, then runs
+ * do_block_io_op(); it re-arms waiting_reqs when that reports more work.
+ * On kthread_stop() it clears blkif->xenblkd, drops its blkif reference
+ * and releases the mm reference held by the associated tap device.
+ *
+ * Always returns 0.
+ */
+int tap_blkif_schedule(void *arg)
+{
+       blkif_t *blkif = arg;
+       tap_blkif_t *info = NULL;
+
+       blkif_get(blkif);
+
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: started\n", current->comm);
+
+       while (!kthread_should_stop()) {
+               if (try_to_freeze())
+                       continue;
+
+               wait_event_interruptible(
+                       blkif->wq,
+                       blkif->waiting_reqs || kthread_should_stop());
+               wait_event_interruptible(
+                       pending_free_wq,
+                       !list_empty(&pending_free) || kthread_should_stop());
+
+               blkif->waiting_reqs = 0;
+               smp_mb(); /* clear flag *before* checking for work */
+
+               if (do_block_io_op(blkif))
+                       blkif->waiting_reqs = 1;
+
+               if (log_stats && time_after(jiffies, blkif->st_print))
+                       print_stats(blkif);
+       }
+
+       if (log_stats)
+               print_stats(blkif);
+       if (debug_lvl)
+               printk(KERN_DEBUG "%s: exiting\n", current->comm);
+
+       blkif->xenblkd = NULL;
+       /*
+        * dev_num may be outside the table when no tap device was ever
+        * bound; do_block_io_op() applies the same range check before
+        * indexing tapfds[].
+        */
+       if (blkif->dev_num >= 0 && blkif->dev_num < MAX_TAP_DEV)
+               info = tapfds[blkif->dev_num];
+       blkif_put(blkif);
+
+       if (info) {
+               struct mm_struct *mm = xchg(&info->mm, NULL);
+
+               if (mm)
+                       mmput(mm);
+       }
+
+       return 0;
+}
+
+/******************************************************************
+ * COMPLETION CALLBACK -- Called by user level ioctl()
+ */
+
+/*
+ * Consume the responses user space has placed on the UFE ring.
+ *
+ * Each response id is taken as an index into info->idx_map, which yields
+ * the (batch, request) pair of the pending request.  For every valid
+ * response the request's pages are un-reserved and removed from the
+ * foreign map, its grant mappings are flushed, a response is sent to
+ * the originating blkif and the pending request is freed.  Responses
+ * with an out-of-range id or mapping are logged and skipped.
+ *
+ * Does nothing when @info is NULL or the device is not in
+ * BLKTAP_MODE_INTERCEPT_FE mode.  Always returns 0.
+ */
+static int blktap_read_ufe_ring(tap_blkif_t *info)
+{
+       /* This is called to read responses from the UFE ring. */
+       RING_IDX i, j, rp;
+       blkif_response_t *resp;
+       blkif_t *blkif=NULL;
+       unsigned int pending_idx, usr_idx, mmap_idx;
+       pending_req_t *pending_req;
+       
+       if (!info)
+               return 0;
+
+       /* We currently only forward packets in INTERCEPT_FE mode. */
+       if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE))
+               return 0;
+
+       /* for each outstanding message on the UFEring  */
+       rp = info->ufe_ring.sring->rsp_prod;
+       rmb();
+        
+       for (i = info->ufe_ring.rsp_cons; i != rp; i++) {
+               blkif_response_t res;
+               resp = RING_GET_RESPONSE(&info->ufe_ring, i);
+               /* Work on a private copy; the ring slot is shared with user space. */
+               memcpy(&res, resp, sizeof(res));
+               mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
+               ++info->ufe_ring.rsp_cons;
+
+               /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/
+               if (res.id >= MAX_PENDING_REQS) {
+                       WPRINTK("incorrect req map [%llx]\n",
+                               (unsigned long long)res.id);
+                       continue;
+               }
+
+               usr_idx = (unsigned int)res.id;
+               pending_idx = info->idx_map[usr_idx].req;
+               mmap_idx = info->idx_map[usr_idx].mem;
+
+               /* Reject slots that do not refer to an existing request. */
+               if (mmap_idx >= mmap_alloc ||
+                   pending_idx >= MAX_PENDING_REQS) {
+                       WPRINTK("incorrect req map [%d],"
+                               " internal map [%d,%d]\n",
+                               usr_idx, mmap_idx, pending_idx);
+                       continue;
+               }
+
+               pending_req = &pending_reqs[mmap_idx][pending_idx];
+               blkif = pending_req->blkif;
+
+               /* Drop each page of the request from the foreign map. */
+               for (j = 0; j < pending_req->nr_pages; j++) {
+
+                       unsigned long uvaddr;
+                       struct page *pg;
+                       int offset;
+
+                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j);
+
+                       pg = idx_to_page(mmap_idx, pending_idx, j);
+                       ClearPageReserved(pg);
+                       offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
+                       info->foreign_map.map[offset] = NULL;
+               }
+               fast_flush_area(pending_req, pending_idx, usr_idx, info);
+               make_response(blkif, pending_req->id, res.operation,
+                             res.status);
+               blkif_put(pending_req->blkif);
+               free_req(pending_req);
+       }
+               
+       return 0;
+}
+
+
+/******************************************************************************
+ * NOTIFICATION FROM GUEST OS.
+ */
+
+/* Flag that @blkif has requests waiting and wake its worker thread. */
+static void blkif_notify_work(blkif_t *blkif)
+{
+       wait_queue_head_t *worker_wq = &blkif->wq;
+
+       blkif->waiting_reqs = 1;
+       wake_up(worker_wq);
+}
+
+/*
+ * Interrupt handler for the backend event channel; @dev_id is the
+ * blkif_t to notify.  Always reports the interrupt as handled.
+ */
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
+{
+       blkif_t *blkif = dev_id;
+
+       blkif_notify_work(blkif);
+
+       return IRQ_HANDLED;
+}
+
+
+
+/******************************************************************
+ * DOWNWARD CALLS -- These interface with the block-device layer proper.
+ */
+/* Set while the "no user ring" warnings below have not been printed yet. */
+static int print_dbug = 1;
+/*
+ * Pull requests off @blkif's backend ring and dispatch them.
+ *
+ * Each request is copied out of the shared ring according to the
+ * interface's protocol, the consumer index is advanced, and the private
+ * copy is passed to dispatch_rw_block_io(); unknown operations get an
+ * error response.  The loop stops early when the user ring is full, on
+ * consumer overflow, when the thread is asked to stop, or when no
+ * pending request can be allocated.
+ *
+ * Returns non-zero if work remains and the caller should call again,
+ * 0 otherwise (including when no usable tap device is attached).
+ */
+static int do_block_io_op(blkif_t *blkif)
+{
+       blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+       blkif_request_t req;
+       pending_req_t *pending_req;
+       RING_IDX rc, rp;
+       int more_to_do = 0;
+       tap_blkif_t *info;
+
+       rc = blk_rings->common.req_cons;
+       rp = blk_rings->common.sring->req_prod;
+       rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+       /*Check blkif has corresponding UE ring*/
+       if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV) {
+               /*oops*/
+               if (print_dbug) {
+                       WPRINTK("Corresponding UE " 
+                              "ring does not exist!\n");
+                       print_dbug = 0; /*We only print this message once*/
+               }
+               return 0;
+       }
+
+       info = tapfds[blkif->dev_num];
+
+       /* The device must exist and be held open by user space. */
+       if (!info || !test_bit(0, &info->dev_inuse)) {
+               if (print_dbug) {
+                       WPRINTK("Can't get UE info!\n");
+                       print_dbug = 0;
+               }
+               return 0;
+       }
+
+       while (rc != rp) {
+               
+               if (RING_FULL(&info->ufe_ring)) {
+                       WPRINTK("RING_FULL! More to do\n");
+                       more_to_do = 1;
+                       break;
+               }
+
+               if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) {
+                       WPRINTK("RING_REQUEST_CONS_OVERFLOW!"
+                              " More to do\n");
+                       more_to_do = 1;
+                       break;          
+               }
+
+               if (kthread_should_stop()) {
+                       more_to_do = 1;
+                       break;
+               }
+
+               pending_req = alloc_req();
+               if (NULL == pending_req) {
+                       /* Counted as an "out of requests" event in the stats. */
+                       blkif->st_oo_req++;
+                       more_to_do = 1;
+                       break;
+               }
+
+               /* Copy the request out of the shared ring in its wire format. */
+               switch (blkif->blk_protocol) {
+               case BLKIF_PROTOCOL_NATIVE:
+                       memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc),
+                              sizeof(req));
+                       break;
+               case BLKIF_PROTOCOL_X86_32:
+                       blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+                       break;
+               case BLKIF_PROTOCOL_X86_64:
+                       blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+                       break;
+               default:
+                       BUG();
+               }
+               blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+               /* Apply all sanity checks to /private copy/ of request. */
+               barrier();
+
+               switch (req.operation) {
+               case BLKIF_OP_READ:
+                       blkif->st_rd_req++;
+                       dispatch_rw_block_io(blkif, &req, pending_req);
+                       break;
+
+               case BLKIF_OP_WRITE_BARRIER:
+                       /* TODO Some counter? */
+                       /* Fall through */
+               case BLKIF_OP_WRITE:
+                       blkif->st_wr_req++;
+                       dispatch_rw_block_io(blkif, &req, pending_req);
+                       break;
+
+               case BLKIF_OP_PACKET:
+                       blkif->st_pk_req++;
+                       dispatch_rw_block_io(blkif, &req, pending_req);
+                       break;
+
+               default:
+                       /* A good sign something is wrong: sleep for a while to
+                        * avoid excessive CPU consumption by a bad guest. */
+                       msleep(1);
+                       WPRINTK("unknown operation [%d]\n",
+                               req.operation);
+                       make_response(blkif, req.id, req.operation,
+                                     BLKIF_RSP_ERROR);
+                       /* Nothing was dispatched, so return the request here. */
+                       free_req(pending_req);
+                       break;
+               }
+
+               /* Yield point for this unbounded loop. */
+               cond_resched();
+       }
+               
+       /* Wake user space so it can pick up whatever was queued. */
+       blktap_kick_user(blkif->dev_num);
+
+       return more_to_do;
+}
+/*
+ * dispatch_rw_block_io - pass one guest block request on to the tap process.
+ * @blkif:       backend interface the request was read from.
+ * @req:         the caller's copy of the guest request; only read here.
+ * @pending_req: bookkeeping slot for the request; handed to free_req() on
+ *               every failure path.
+ *
+ * Validates the request, builds one grant-map operation per segment for the
+ * kernel address of the pending slot (plus one per segment for the tap
+ * process's PTE unless the domain is auto-translated), records the grant
+ * handles, and finally copies the request onto the user ring with its id
+ * replaced by the user-ring slot index.  If anything fails, an error
+ * response is sent to the guest and the function sleeps for 1 ms.
+ */
+static void dispatch_rw_block_io(blkif_t *blkif,
+                                blkif_request_t *req,
+                                pending_req_t *pending_req)
+{
+       struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
+       unsigned int nseg;
+       int ret, i, op, nr_sects = 0;
+       tap_blkif_t *info;
+       blkif_request_t *target;
+       unsigned int mmap_idx = pending_req->mem_idx;
+       unsigned int pending_idx = RTN_PEND_IDX(pending_req, mmap_idx);
+       unsigned int usr_idx;
+       uint32_t flags;
+       struct mm_struct *mm;
+       struct vm_area_struct *vma = NULL;
+       /*
+        * map[] holds up to two grant-map operations per segment: one for the
+        * kernel address and one for the user PTE.
+        *
+        * Refuse requests whose tap minor is out of range or has no tapfds[]
+        * entry.
+        */
+       if (blkif->dev_num < 0 || blkif->dev_num >= MAX_TAP_DEV)
+               goto fail_response;
+
+       info = tapfds[blkif->dev_num];
+       if (info == NULL)
+               goto fail_response;
+
+       /*
+        * Check we have space on user ring - should never fail.  The slot
+        * index is looked up under map_lock.
+        */
+       spin_lock(&info->map_lock);
+       usr_idx = GET_NEXT_REQ(info->idx_map);
+       spin_unlock(&info->map_lock);
+       if (usr_idx >= MAX_PENDING_REQS) {
+               WARN_ON(1);
+               goto fail_response;
+       }
+
+       /*
+        * Check that number of segments is sane: only a write barrier may
+        * carry no segments, and nothing may exceed the per-request maximum.
+        */
+       nseg = req->nr_segments;
+       if (unlikely(nseg == 0 && req->operation != BLKIF_OP_WRITE_BARRIER) ||
+           unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) {
+               WPRINTK("Bad number of segments in request (%d)\n", nseg);
+               goto fail_response;
+       }
+
+       /* Make sure userspace is ready. */
+       if (!info->ring_ok) {
+               WPRINTK("ring not ready for requests!\n");
+               goto fail_response;
+       }
+       smp_rmb();
+
+       if (RING_FULL(&info->ufe_ring)) {
+               WPRINTK("fe_ring is full, "
+                       "IO Request will be dropped. %d %d\n",
+                       RING_SIZE(&info->ufe_ring),
+                       RING_SIZE(&blkif->blk_rings.common));
+               goto fail_response;
+       }
+
+       pending_req->blkif     = blkif;
+       pending_req->id        = req->id;
+       pending_req->nr_pages  = nseg;
+       /*
+        * Write and write-barrier requests map the granted pages with
+        * GNTMAP_readonly; other operations do not set that flag.
+        */
+       flags = GNTMAP_host_map;
+       switch (req->operation) {
+       case BLKIF_OP_WRITE:
+       case BLKIF_OP_WRITE_BARRIER:
+               flags |= GNTMAP_readonly;
+               break;
+       }
+       /*
+        * Without auto-translation the loop below looks up user PTEs, so
+        * mmap_sem is taken before it; otherwise it is taken after the loop.
+        */
+       op = 0;
+       mm = info->mm;
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               down_read(&mm->mmap_sem);
+       for (i = 0; i < nseg; i++) {
+               unsigned long uvaddr;
+               unsigned long kvaddr;
+               uint64_t ptep;
+
+               uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+               kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i);
+
+               gnttab_set_map_op(&map[op], kvaddr, flags,
+                                 req->seg[i].gref, blkif->domid);
+               op++;
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Now map it to user. */
+                       ret = create_lookup_pte_addr(mm, uvaddr, &ptep);
+                       if (ret) {
+                               up_read(&mm->mmap_sem);
+                               WPRINTK("Couldn't get a pte addr!\n");
+                               goto fail_response;
+                       }
+
+                       gnttab_set_map_op(&map[op], ptep,
+                                         flags | GNTMAP_application_map
+                                               | GNTMAP_contains_pte,
+                                         req->seg[i].gref, blkif->domid);
+                       op++;
+               }
+               /* Sectors covered by this segment, for the statistics below. */
+               nr_sects += (req->seg[i].last_sect - 
+                            req->seg[i].first_sect + 1);
+       }
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               down_read(&mm->mmap_sem);
+       /*
+        * map_lock is held across the hypercall and all of the handle and
+        * idx_map bookkeeping that follows.
+        */
+       spin_lock(&info->map_lock);
+
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op);
+       BUG_ON(ret);
+       /*
+        * ret is zero here.  The loops below set it on the first failed
+        * mapping; from then on handles are still recorded, but the page
+        * bookkeeping is skipped for that and every remaining segment.
+        */
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               for (i = 0; i < (nseg*2); i+=2) {
+                       unsigned long uvaddr;
+                       unsigned long offset;
+                       struct page *pg;
+                       /* map[i] is the kernel mapping, map[i+1] the user one. */
+                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2);
+
+                       gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
+
+                       if (unlikely(map[i].status != GNTST_okay)) {
+                               WPRINTK("invalid kernel buffer -- could not remap it\n");
+                               ret = 1;
+                               map[i].handle = INVALID_GRANT_HANDLE;
+                       }
+
+                       if (unlikely(map[i+1].status != GNTST_okay)) {
+                               WPRINTK("invalid user buffer -- could not remap it\n");
+                               ret = 1;
+                               map[i+1].handle = INVALID_GRANT_HANDLE;
+                       }
+
+                       pending_handle(mmap_idx, pending_idx, i/2).kernel 
+                               = map[i].handle;
+                       pending_handle(mmap_idx, pending_idx, i/2).user   
+                               = map[i+1].handle;
+
+                       if (ret)
+                               continue;
+                       /*
+                        * Mark the pending page's pfn as a foreign frame and
+                        * remember the page under its user-address offset.
+                        */
+                       pg = idx_to_page(mmap_idx, pending_idx, i/2);
+                       set_phys_to_machine(page_to_pfn(pg),
+                                           FOREIGN_FRAME(map[i].dev_bus_addr
+                                                         >> PAGE_SHIFT));
+                       offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
+                       info->foreign_map.map[offset] = pg;
+               }
+       } else {
+               for (i = 0; i < nseg; i++) {
+                       unsigned long uvaddr;
+                       unsigned long offset;
+                       struct page *pg;
+
+                       uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i);
+
+                       gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
+
+                       if (unlikely(map[i].status != GNTST_okay)) {
+                               WPRINTK("invalid kernel buffer -- could not remap it\n");
+                               ret = 1;
+                               map[i].handle = INVALID_GRANT_HANDLE;
+                       }
+
+                       pending_handle(mmap_idx, pending_idx, i).kernel 
+                               = map[i].handle;
+
+                       if (ret)
+                               continue;
+
+                       offset = (uvaddr - info->rings_vstart) >> PAGE_SHIFT;
+                       pg = idx_to_page(mmap_idx, pending_idx, i);
+                       info->foreign_map.map[offset] = pg;
+               }
+       }
+
+       /* Record which [mmap_idx, pending_idx] pair usr_idx stands for. */
+       info->idx_map[usr_idx].mem = mmap_idx;
+       info->idx_map[usr_idx].req = pending_idx;
+
+       spin_unlock(&info->map_lock);
+
+       if (ret)
+               goto fail_flush;
+       /*
+        * Auto-translated case: insert each pending page into the tap
+        * process's address space.  The vma used for the previous segment is
+        * reused while it still covers uvaddr; otherwise it is looked up.
+        */
+       if (xen_feature(XENFEAT_auto_translated_physmap)) {
+               for (i = 0; i < nseg; i++) {
+                       struct page *pg = idx_to_page(mmap_idx, pending_idx, i);
+                       unsigned long uvaddr = MMAP_VADDR(info->user_vstart,
+                                                         usr_idx, i);
+                       if (vma && uvaddr >= vma->vm_end) {
+                               vma = vma->vm_next;
+                               if (vma &&
+                                   (uvaddr < vma->vm_start ||
+                                    uvaddr >= vma->vm_end))
+                                       vma = NULL;
+                       }
+                       if (vma == NULL) {
+                               vma = find_vma(mm, uvaddr);
+                               /* This virtual area was already munmapped,
+                                  so skip to the next page. */
+                               if (!vma)
+                                       continue;
+                       }
+                       ret = vm_insert_page(vma, uvaddr, pg);
+                       if (ret)
+                               goto fail_flush;
+               }
+       }
+
+       up_read(&mm->mmap_sem);
+
+       blkif_get(blkif);
+       /* Finally, write the request message to the user ring. */
+       target = RING_GET_REQUEST(&info->ufe_ring,
+                                 info->ufe_ring.req_prod_pvt);
+       memcpy(target, req, sizeof(*req));
+       target->id = usr_idx;
+       wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
+       info->ufe_ring.req_prod_pvt++;
+
+       switch (req->operation) {
+       case BLKIF_OP_READ:
+               blkif->st_rd_sect += nr_sects;
+               break;
+       case BLKIF_OP_WRITE:
+       case BLKIF_OP_WRITE_BARRIER:
+               blkif->st_wr_sect += nr_sects;
+               break;
+       }
+
+       return;
+       /*
+        * fail_flush is reached only after the grant-map hypercall, with
+        * mmap_sem still held: release it and let fast_flush_area() (defined
+        * outside this view) clean up before the error response is sent.
+        */
+ fail_flush:
+       up_read(&mm->mmap_sem);
+       WPRINTK("Reached Fail_flush\n");
+       fast_flush_area(pending_req, pending_idx, usr_idx, info);
+ fail_response:
+       make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+       free_req(pending_req);
+       msleep(1); /* back off a bit */
+}
+
+
+
+/******************************************************************
+ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING
+ */
+
+
+/*
+ * make_response - queue a response on the guest's ring and notify it.
+ * @blkif: interface whose back ring receives the response.
+ * @id:    request id echoed back to the frontend.
+ * @op:    operation code echoed back to the frontend.
+ * @st:    status to report (e.g. BLKIF_RSP_ERROR).
+ *
+ * The response is built locally and copied into the ring slot matching the
+ * negotiated protocol while blk_ring_lock is held.  After dropping the lock
+ * the worker is kicked if more requests are waiting, and the frontend is
+ * signalled through the event-channel IRQ if the ring macros ask for it.
+ */
+static void make_response(blkif_t *blkif, u64 id,
+                          unsigned short op, int st)
+{
+       blkif_response_t  resp;
+       unsigned long     flags;
+       blkif_back_rings_t *blk_rings = &blkif->blk_rings;
+       int more_to_do = 0;
+       int notify;
+
+       /*
+        * Zero the whole structure first: it is memcpy()'d into a page shared
+        * with the guest, so padding bytes between and after the members
+        * must not carry stale kernel stack contents.
+        */
+       memset(&resp, 0, sizeof(resp));
+       resp.id        = id;
+       resp.operation = op;
+       resp.status    = st;
+
+       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+       /* Place on the response ring for the relevant domain. */
+       switch (blkif->blk_protocol) {
+       case BLKIF_PROTOCOL_NATIVE:
+               memcpy(RING_GET_RESPONSE(&blk_rings->native,
+                                        blk_rings->native.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       case BLKIF_PROTOCOL_X86_32:
+               memcpy(RING_GET_RESPONSE(&blk_rings->x86_32,
+                                        blk_rings->x86_32.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       case BLKIF_PROTOCOL_X86_64:
+               memcpy(RING_GET_RESPONSE(&blk_rings->x86_64,
+                                        blk_rings->x86_64.rsp_prod_pvt),
+                      &resp, sizeof(resp));
+               break;
+       default:
+               BUG();
+       }
+       blk_rings->common.rsp_prod_pvt++;
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+
+       if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
+               /*
+                * Tail check for pending requests. Allows frontend to avoid
+                * notifications if requests are already in flight (lower
+                * overheads and promotes batching).
+                */
+               RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+       } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
+               more_to_do = 1;
+       }
+
+       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+       if (more_to_do)
+               blkif_notify_work(blkif);
+       if (notify)
+               notify_remote_via_irq(blkif->irq);
+}
+
+/*
+ * blkif_init - module entry point for the blktap backend.
+ *
+ * Calls req_increase() up to twice, initialises the interface cache and the
+ * xenbus glue, registers the "blktap" character device with a dynamically
+ * allocated major, and asks for the blktap0 control node to be created.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, or the error code
+ * of the step that failed.
+ *
+ * NOTE(review): nothing set up earlier is undone when the character device
+ * registration fails; the helpers needed for that are outside this view.
+ */
+static int __init blkif_init(void)
+{
+       int i, ret;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       INIT_LIST_HEAD(&pending_free);
+       /* Carry on as long as at least the first req_increase() succeeded. */
+       for (i = 0; i < 2; i++) {
+               ret = req_increase();
+               if (ret)
+                       break;
+       }
+       if (i == 0)
+               return ret;
+
+       tap_blkif_interface_init();
+
+       alloc_pending_reqs = 0;
+
+       tap_blkif_xenbus_init();
+
+       /* Dynamically allocate a major for this device */
+       ret = __register_chrdev(0, 0, MAX_TAP_DEV, "blktap", &blktap_fops);
+
+       if (ret < 0) {
+               WPRINTK("Couldn't register /dev/xen/blktap\n");
+               /* Report the real cause rather than a blanket -ENOMEM. */
+               return ret;
+       }
+
+       blktap_major = ret;
+
+       /* tapfds[0] is always NULL */
+       blktap_next_minor++;
+
+       DPRINTK("Created misc_dev %d:0 [/dev/xen/blktap0]\n", ret);
+
+       /* Make sure the xen class exists */
+       if (get_xen_class()) {
+               /*
+                * This will allow udev to create the blktap ctrl device.
+                * We only want to create blktap0 first.  We don't want
+                * to flood the sysfs system with needless blktap devices.
+                * We only create the device when a request of a new device is
+                * made.
+                */
+               xen_class_device_create(&blktap_type, NULL,
+                                       MKDEV(blktap_major, 0), NULL,
+                                       "blktap0");
+       } else {
+               /* this is bad, but not fatal */
+               WPRINTK("sysfs xen_class not created\n");
+       }
+
+       DPRINTK("Blktap device successfully created\n");
+
+       return 0;
+}
+
+module_init(blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("devname:xen/blktap0");
+MODULE_ALIAS("xen-backend:tap");
diff --git a/drivers/xen/blktap/blocktap.c b/drivers/xen/blktap/blocktap.c

new file mode 100644 (file)

index 0000000..31973c0
--- /dev/null
+++ b/drivers/xen/blktap/blocktap.c
@@ -0,0 +1 @@
+#include "blktap.c"
diff --git a/drivers/xen/blktap/common.h b/drivers/xen/blktap/common.h

new file mode 100644 (file)

index 0000000..4adeef8
--- /dev/null
+++ b/drivers/xen/blktap/common.h
@@ -0,0 +1,112 @@
+/* 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __BLKIF__BACKEND__COMMON_H__
+#define __BLKIF__BACKEND__COMMON_H__
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+#include <xen/xenbus.h>
+#include <xen/interface/event_channel.h>
+
+#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \
+                                    __FILE__ , __LINE__ , ## _a )
+
+#define WPRINTK(fmt, args...) pr_warning("blktap: " fmt, ##args)
+
+struct backend_info;
+/*
+ * Per-connection state of one blktap backend interface.  Objects are
+ * zero-allocated by tap_alloc_blkif() and reference counted through
+ * blkif_get()/blkif_put().
+ */
+typedef struct blkif_st {
+       /* Unique identifier for this interface. */
+       domid_t           domid;
+       unsigned int      handle;
+       /* Event-channel IRQ; zero until tap_blkif_map() has bound it. */
+       unsigned int      irq;
+       /* Comms information. */
+       enum blkif_protocol blk_protocol;
+       blkif_back_rings_t blk_rings;
+       struct vm_struct *blk_ring_area;
+       /* Back pointer to the backend_info. */
+       struct backend_info *be;
+       /* Private fields. */
+       spinlock_t       blk_ring_lock; /* held while a response is queued */
+       atomic_t         refcnt;
+
+       wait_queue_head_t   wq;
+       struct task_struct  *xenblkd;   /* worker thread; NULL if not running */
+       unsigned int        waiting_reqs;
+       struct request_queue *plug;
+
+       /* Statistics; the request and sector counters feed the sysfs group. */
+       unsigned long       st_print;   /* jiffies value, set at allocation */
+       int                 st_rd_req;  /* read requests dispatched */
+       int                 st_wr_req;  /* write and barrier requests */
+       int                 st_oo_req;  /* times no pending_req was free */
+       int                 st_pk_req;  /* packet requests dispatched */
+       int                 st_rd_sect; /* sectors in forwarded reads */
+       int                 st_wr_sect; /* sectors in forwarded writes */
+
+       wait_queue_head_t waiting_to_free;      /* woken by blkif_put() at zero */
+
+       int             dev_num;        /* index into the tapfds[] table */
+       uint64_t        sectors;        /* must be nonzero before connecting */
+} blkif_t;
+
+blkif_t *tap_alloc_blkif(domid_t domid);
+void tap_blkif_free(blkif_t *, struct xenbus_device *);
+void tap_blkif_kmem_cache_free(blkif_t *blkif);
+int tap_blkif_map(blkif_t *, struct xenbus_device *, grant_ref_t,
+                 evtchn_port_t);
+/*
+ * Reference counting for blkif_t.  blkif_put() wakes the waiting_to_free
+ * queue when the count drops to zero; tap_blkif_free() sleeps on that queue.
+ */
+#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blkif_put(_b)                                  \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       wake_up(&(_b)->waiting_to_free);\
+       } while (0)
+
+/*
+ * NOTE(review): no user of struct phys_req is visible in this part of the
+ * driver; presumably inherited from blkback -- confirm before relying on it.
+ */
+struct phys_req {
+       unsigned short       dev;
+       unsigned short       nr_sects;
+       struct block_device *bdev;
+       blkif_sector_t       sector_number;
+};
+
+void tap_blkif_interface_init(void);
+
+void tap_blkif_xenbus_init(void);
+
+irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
+int tap_blkif_schedule(void *arg);
+
+int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
+void signal_tapdisk(int idx);
+
+#endif /* __BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/blktap/interface.c b/drivers/xen/blktap/interface.c

new file mode 100644 (file)

index 0000000..db8681d
--- /dev/null
+++ b/drivers/xen/blktap/interface.c
@@ -0,0 +1,133 @@
+/******************************************************************************
+ * drivers/xen/blktap/interface.c
+ * 
+ * Block-device interface management.
+ * 
+ * Copyright (c) 2004, Keir Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+#include <linux/vmalloc.h>
+
+static struct kmem_cache *blkif_cachep;
+
+/*
+ * Allocate a zeroed blkif_t for guest domain @domid and initialise its lock,
+ * wait queues and bookkeeping.  The new object holds a single reference.
+ *
+ * Returns the object, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+blkif_t *tap_alloc_blkif(domid_t domid)
+{
+       blkif_t *new_if = kmem_cache_zalloc(blkif_cachep, GFP_KERNEL);
+
+       if (new_if == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       /* Synchronisation primitives. */
+       spin_lock_init(&new_if->blk_ring_lock);
+       init_waitqueue_head(&new_if->wq);
+       init_waitqueue_head(&new_if->waiting_to_free);
+
+       /* Identity and bookkeeping. */
+       new_if->domid = domid;
+       new_if->st_print = jiffies;
+       atomic_set(&new_if->refcnt, 1);
+
+       return new_if;
+}
+/*
+ * tap_blkif_map - map the frontend's shared ring and bind its event channel.
+ * @blkif:    interface to connect; blk_protocol selects the ring layout.
+ * @dev:      xenbus device used to map (and, on error, unmap) the ring.
+ * @ring_ref: grant reference passed to xenbus_map_ring_valloc().
+ * @evtchn:   frontend event-channel port to bind to tap_blkif_be_int().
+ *
+ * Returns 0 on success or when the interface already has an IRQ, otherwise
+ * the negative error from the mapping or the binding step.
+ */
+int tap_blkif_map(blkif_t *blkif, struct xenbus_device *dev,
+                 grant_ref_t ring_ref, evtchn_port_t evtchn)
+{
+       struct vm_struct *area;
+       int err;
+
+       /* Already connected? A nonzero irq means a previous call succeeded. */
+       if (blkif->irq)
+               return 0;
+
+       area = xenbus_map_ring_valloc(dev, ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       blkif->blk_ring_area = area;
+       /* Initialise the back ring variant matching the negotiated protocol. */
+       switch (blkif->blk_protocol) {
+#define BLKTAP_RING_INIT(p) ({ \
+               struct blkif_##p##_sring *sring = area->addr; \
+               BACK_RING_INIT(&blkif->blk_rings.p, sring, PAGE_SIZE); \
+       })
+       case BLKIF_PROTOCOL_NATIVE:
+               BLKTAP_RING_INIT(native);
+               break;
+       case BLKIF_PROTOCOL_X86_32:
+               BLKTAP_RING_INIT(x86_32);
+               break;
+       case BLKIF_PROTOCOL_X86_64:
+               BLKTAP_RING_INIT(x86_64);
+               break;
+       default:
+               BUG();
+#undef BLKTAP_RING_INIT
+       }
+       /*
+        * On success the call returns the IRQ number, which is stored below.
+        * On failure the ring is unmapped again and the sring pointer cleared.
+        */
+       err = bind_interdomain_evtchn_to_irqhandler(
+               blkif->domid, evtchn, tap_blkif_be_int,
+               0, "blkif-backend", blkif);
+       if (err < 0) {
+               xenbus_unmap_ring_vfree(dev, area);
+               blkif->blk_rings.common.sring = NULL;
+               return err;
+       }
+       blkif->irq = err;
+
+       return 0;
+}
+/*
+ * tap_blkif_free - wait for outstanding references, then disconnect.
+ * @blkif: interface to quiesce.
+ * @dev:   xenbus device the ring was mapped through.
+ *
+ * Drops the caller's reference, sleeps until the count reaches zero, and
+ * then takes one reference back, which tap_blkif_kmem_cache_free() expects
+ * to be the last one.  Afterwards the IRQ is unbound and the shared ring is
+ * unmapped, each only if it is currently set up.
+ */
+void tap_blkif_free(blkif_t *blkif, struct xenbus_device *dev)
+{
+       atomic_dec(&blkif->refcnt);
+       wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0);
+       atomic_inc(&blkif->refcnt);
+
+       if (blkif->irq) {
+               unbind_from_irqhandler(blkif->irq, blkif);
+               blkif->irq = 0;
+       }
+
+       if (blkif->blk_rings.common.sring) {
+               xenbus_unmap_ring_vfree(dev, blkif->blk_ring_area);
+               blkif->blk_rings.common.sring = NULL;
+       }
+}
+
+/*
+ * Return @blkif to the slab cache.  The caller must hold the only remaining
+ * reference; any other count is treated as a fatal accounting error.
+ */
+void tap_blkif_kmem_cache_free(blkif_t *blkif)
+{
+       BUG_ON(!atomic_dec_and_test(&blkif->refcnt));
+       kmem_cache_free(blkif_cachep, blkif);
+}
+/*
+ * Create the slab cache that tap_alloc_blkif() allocates blkif_t from.
+ * NOTE(review): the result of kmem_cache_create() is not checked here, and
+ * the void return type gives the caller no way to learn about a failure.
+ */
+void __init tap_blkif_interface_init(void)
+{
+       blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t), 
+                                        0, 0, NULL);
+}
diff --git a/drivers/xen/blktap/xenbus.c b/drivers/xen/blktap/xenbus.c

new file mode 100644 (file)

index 0000000..dda4e2a
--- /dev/null
+++ b/drivers/xen/blktap/xenbus.c
@@ -0,0 +1,517 @@
+/* drivers/xen/blktap/xenbus.c
+ *
+ * Xenbus code for blktap
+ *
+ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield
+ *
+ * Based on the blkback xenbus code:
+ *
+ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ * Copyright (C) 2005 XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <linux/kthread.h>
+#include <xen/xenbus.h>
+#include "common.h"
+#include "../core/domctl.h"
+
+/* Per-device backend state, stored as the xenbus device's driver data. */
+struct backend_info
+{
+       struct xenbus_device *dev;      /* owning xenbus device */
+       blkif_t *blkif;                 /* interface allocated at probe time */
+       struct xenbus_watch backend_watch;      /* watch on the "info" node */
+       int xenbus_id;                  /* result of get_id() on the node name */
+       int group_added;                /* nonzero while the sysfs group exists */
+};
+
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static int blktap_remove(struct xenbus_device *dev);
+static int blktap_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id);
+static void tap_backend_changed(struct xenbus_watch *, const char **,
+                           unsigned int);
+static void tap_frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state);
+
+/*
+ * Return the index in @str of the occurrence of @c that is preceded by
+ * exactly @len earlier occurrences (@len == 0 selects the first one), or
+ * -ERANGE if the string ends before that occurrence is found.
+ */
+static int strsep_len(const char *str, char c, unsigned int len)
+{
+       unsigned int pos = 0;
+
+       while (str[pos] != '\0') {
+               /* Each match either is the one wanted or uses up one skip. */
+               if (str[pos] == c && len-- == 0)
+                       return pos;
+               pos++;
+       }
+
+       return -ERANGE;
+}
+
+/*
+ * Parse the decimal number that follows the third '/' in @str.
+ * Returns -1 when @str contains fewer than three '/' characters.
+ */
+static long get_id(const char *str)
+{
+       char digits[10];
+       int sep = strsep_len(str, '/', 2);
+
+       if (sep < 0)
+               return -1;
+
+       /* Work on a bounded copy of whatever follows the separator. */
+       strlcpy(digits, str + sep + 1, ARRAY_SIZE(digits));
+       DPRINTK("get_id(%s) -> %s\n", str, digits);
+
+       return simple_strtol(digits, NULL, 10);
+}
+
+/*
+ * Format the name "blktap.<domid>.<device>" into @buf, writing at most
+ * TASK_COMM_LEN bytes.  <device> is the "dev" node read from xenstore, with
+ * everything up to and including the first "/dev/" removed if present.
+ *
+ * Returns 0 on success or the error from xenbus_read().
+ */
+static int blktap_name(blkif_t *blkif, char *buf)
+{
+       struct xenbus_device *xbdev = blkif->be->dev;
+       char *path = xenbus_read(XBT_NIL, xbdev->nodename, "dev", NULL);
+       char *leaf;
+
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       leaf = strstr(path, "/dev/");
+       leaf = leaf ? leaf + strlen("/dev/") : path;
+
+       snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, leaf);
+       kfree(path);
+
+       return 0;
+}
+
+/****************************************************************
+ *  sysfs interface for I/O requests of blktap device
+ */
+/*
+ * VBD_SHOW(name, format, args...) defines show_<name>(), a sysfs show
+ * handler that prints @args with @format, and the matching read-only
+ * dev_attr_<name>.  The handler returns -ENODEV when the device reference
+ * cannot be taken or no backend_info is attached.  @args may refer to the
+ * handler's local variable "be".
+ *
+ * NOTE(review): the instantiations dereference be->blkif without a NULL
+ * check -- confirm that the group only exists while blkif is set.
+ */
+#define VBD_SHOW(name, format, args...)                                        \
+       static ssize_t show_##name(struct device *_dev,                 \
+                                  struct device_attribute *attr,       \
+                                  char *buf)                           \
+       {                                                               \
+               ssize_t ret = -ENODEV;                                  \
+               struct xenbus_device *dev;                              \
+               struct backend_info *be;                                \
+                                                                       \
+               if (!get_device(_dev))                                  \
+                       return ret;                                     \
+               dev = to_xenbus_device(_dev);                           \
+               if ((be = dev_get_drvdata(&dev->dev)) != NULL)          \
+                       ret = sprintf(buf, format, ##args);             \
+               put_device(_dev);                                       \
+               return ret;                                             \
+       }                                                               \
+       static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
+VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect);
+/* Attributes exposed in the per-device "statistics" sysfs group. */
+static struct attribute *tapstat_attrs[] = {
+       &dev_attr_oo_req.attr,
+       &dev_attr_rd_req.attr,
+       &dev_attr_wr_req.attr,
+       &dev_attr_rd_sect.attr,
+       &dev_attr_wr_sect.attr,
+       NULL
+};
+
+static const struct attribute_group tapstat_group = {
+       .name = "statistics",
+       .attrs = tapstat_attrs,
+};
+
+/*
+ * Create the "statistics" sysfs group for @dev and, on success, note that
+ * in the backend_info.  Returns 0 or the error from sysfs_create_group().
+ */
+int xentap_sysfs_addif(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       int rc = sysfs_create_group(&dev->dev.kobj, &tapstat_group);
+
+       if (rc)
+               return rc;
+
+       be->group_added = 1;
+       return 0;
+}
+/* Remove the "statistics" sysfs group of @dev and clear group_added. */
+void xentap_sysfs_delif(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       sysfs_remove_group(&dev->dev.kobj, &tapstat_group);
+       be->group_added = 0;
+}
+/*
+ * blktap_remove - release everything attached to @dev's backend_info.
+ *
+ * Also called from the failure path of blktap_probe(), so every resource is
+ * checked before it is released.  Always returns 0.
+ */
+static int blktap_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       /* Undo in order: sysfs group, xenstore watch, then the interface. */
+       if (be->group_added)
+               xentap_sysfs_delif(be->dev);
+       if (be->backend_watch.node) {
+               unregister_xenbus_watch(&be->backend_watch);
+               kfree(be->backend_watch.node);
+               be->backend_watch.node = NULL;
+       }
+       if (be->blkif) {
+               /* Stop the worker thread before the interface is torn down. */
+               if (be->blkif->xenblkd)
+                       kthread_stop(be->blkif->xenblkd);
+               signal_tapdisk(be->blkif->dev_num);
+               tap_blkif_free(be->blkif, dev);
+               tap_blkif_kmem_cache_free(be->blkif);
+               be->blkif = NULL;
+       }
+       kfree(be);
+       dev_set_drvdata(&dev->dev, NULL);
+       return 0;
+}
+/*
+ * tap_update_blkif_status - try to connect the backend and start its worker.
+ *
+ * Does nothing until both the IRQ is bound and the sector count is set, or
+ * when the device is already in XenbusStateConnected.  After a successful
+ * connect() it creates the statistics sysfs group (unless already present)
+ * and runs tap_blkif_schedule() in a kernel thread named by blktap_name().
+ * Failures are reported through xenbus_dev_error()/xenbus_dev_fatal().
+ */
+static void tap_update_blkif_status(blkif_t *blkif)
+{ 
+       int err;
+       char name[TASK_COMM_LEN];
+
+       /* Not ready to connect? */
+       if(!blkif->irq || !blkif->sectors) {
+               return;
+       } 
+
+       /* Already connected? */
+       if (blkif->be->dev->state == XenbusStateConnected)
+               return;
+
+       /* Attempt to connect: exit if we fail to. */
+       connect(blkif->be);
+       if (blkif->be->dev->state != XenbusStateConnected)
+               return;
+
+       err = blktap_name(blkif, name);
+       if (err) {
+               xenbus_dev_error(blkif->be->dev, err, "get blktap dev name");
+               return;
+       }
+
+       if (!blkif->be->group_added) {
+               err = xentap_sysfs_addif(blkif->be->dev);
+               if (err) {
+                       xenbus_dev_fatal(blkif->be->dev, err, 
+                                        "creating sysfs entries");
+                       return;
+               }
+       }
+       /* On failure xenblkd is reset to NULL so teardown does not stop it. */
+       blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name);
+       if (IS_ERR(blkif->xenblkd)) {
+               err = PTR_ERR(blkif->xenblkd);
+               blkif->xenblkd = NULL;
+               xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd");
+               WPRINTK("Error starting thread %s\n", name);
+       } else
+               DPRINTK("Thread started for domid %d, connected disk %d\n",
+                       blkif->domid, blkif->dev_num);
+
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocates the
+ * backend_info and its block interface, sets a watch on the "info" node
+ * under the device's xenstore directory (written by the user-space
+ * program) and switches to InitWait.  Every failure after the allocation
+ * unwinds through blktap_remove().
+ */
+static int blktap_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       struct backend_info *be;
+       blkif_t *blkif;
+       int err;
+
+       be = kzalloc(sizeof(struct backend_info), GFP_KERNEL);
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+       be->xenbus_id = get_id(dev->nodename);
+
+       blkif = tap_alloc_blkif(dev->otherend_id);
+       if (IS_ERR(blkif)) {
+               err = PTR_ERR(blkif);
+               be->blkif = NULL;
+               xenbus_dev_fatal(dev, err, "creating block interface");
+               goto fail;
+       }
+       be->blkif = blkif;
+
+       /* setup back pointer */
+       blkif->be = be;
+       blkif->sectors = 0;
+
+       /* set a watch on disk info, waiting for userspace to update details */
+       err = xenbus_watch_path2(dev, dev->nodename, "info",
+                                &be->backend_watch, tap_backend_changed);
+       if (!err)
+               err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (!err)
+               return 0;
+
+fail:
+       DPRINTK("blktap probe failed\n");
+       blktap_remove(dev);
+       return err;
+}
+
+
+/**
+ * Watch callback for the backend's "info" node: fires once the user space
+ * code has placed the device information in xenstore.
+ */
+static void tap_backend_changed(struct xenbus_watch *watch,
+                           const char **vec, unsigned int len)
+{
+       struct backend_info *be
+               = container_of(watch, struct backend_info, backend_watch);
+       struct xenbus_device *dev = be->dev;
+       blkif_t *blkif = be->blkif;
+       unsigned long info;
+       int err;
+
+       /*
+        * Both "info" and "sectors" must be present; until userspace has
+        * opened the image and written them there is nothing to do.
+        */
+       err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info,
+                           "sectors", "%Lu", &blkif->sectors, NULL);
+       if (XENBUS_EXIST_ERR(err))
+               return;
+       if (err) {
+               xenbus_dev_error(dev, err, "getting info");
+               return;
+       }
+
+       DPRINTK("Userspace update on disk info, %lu\n",info);
+
+       /* Associate tap dev with domid */
+       blkif->dev_num = dom_to_devid(blkif->domid, be->xenbus_id, blkif);
+
+       tap_update_blkif_status(blkif);
+}
+
+
+/*
+ * Stop the worker thread, if one is running, and release the interface
+ * resources through tap_blkif_free().
+ */
+static void blkif_disconnect(blkif_t *blkif)
+{
+       struct task_struct *worker = blkif->xenblkd;
+
+       if (worker) {
+               kthread_stop(worker);
+               blkif->xenblkd = NULL;
+       }
+
+       /* idempotent */
+       tap_blkif_free(blkif, blkif->be->dev);
+}
+
+/**
+ * Callback received when the frontend's state changes; drives the backend
+ * state machine accordingly.
+ */
+static void tap_frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("fe_changed(%s,%d)\n", dev->nodename, frontend_state);
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosed) {
+                       pr_info("%s: %s: prepare for reconnect\n",
+                               __func__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
+       case XenbusStateInitialised:
+       case XenbusStateConnected:
+               /*
+                * Ensure we connect even when two watches fire in close
+                * succession and we miss the intermediate value of
+                * frontend_state.
+                */
+               if (dev->state != XenbusStateConnected) {
+                       /*
+                        * Enforce precondition before potential leak
+                        * point.  blkif_disconnect() is idempotent.
+                        */
+                       blkif_disconnect(be->blkif);
+
+                       if (!connect_ring(be))
+                               tap_update_blkif_status(be->blkif);
+               }
+               break;
+
+       case XenbusStateClosing:
+               blkif_disconnect(be->blkif);
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               /*
+                * Implies the effects of blkif_disconnect() via
+                * blktap_remove().
+                */
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+
+/**
+ * Publish "feature-barrier" in xenstore and switch to Connected state.
+ *
+ * The write happens inside a xenstore transaction that is restarted for as
+ * long as ending it reports -EAGAIN.  Every other failure is reported with
+ * xenbus_dev_fatal() and leaves the device un-Connected; callers detect
+ * that by inspecting dev->state afterwards.
+ */
+static void connect(struct backend_info *be)
+{
+       int err;
+
+       struct xenbus_device *dev = be->dev;
+       struct xenbus_transaction xbt;
+
+       /* Write feature-barrier to xenstore */
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               return;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-barrier",  "1");
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing feature-barrier");
+               xenbus_transaction_end(xbt, 1);
+               return;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err) {
+               /*
+                * The commit failed for good: do not silently go on to
+                * Connected with the feature node unpublished.
+                */
+               xenbus_dev_fatal(dev, err, "ending transaction");
+               return;
+       }
+
+       /* Switch state */
+       err = xenbus_switch_state(dev, XenbusStateConnected);
+       if (err)
+               xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+                                dev->nodename);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned int ring_ref, evtchn;
+       char *protocol;
+       int err;
+
+       DPRINTK("%s\n", dev->otherend);
+
+       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%u",
+                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                dev->otherend);
+               return err;
+       }
+
+       be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+       protocol = xenbus_read(XBT_NIL, dev->otherend, "protocol", NULL);
+       if (IS_ERR(protocol)) {
+               protocol = NULL;
+               be->blkif->blk_protocol = xen_guest_blkif_protocol(be->blkif->domid);
+#ifndef CONFIG_X86_32
+       } else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) {
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+#endif
+#ifndef CONFIG_X86_64
+       } else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) {
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+#endif
+       } else if (0 != strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) {
+               xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+               kfree(protocol);
+               return -1;
+       }
+       pr_info("blktap: ring-ref %u, event-channel %u, protocol %d (%s)\n",
+               ring_ref, evtchn, be->blkif->blk_protocol,
+               protocol ?: "unspecified");
+       kfree(protocol);
+
+       /* Map the shared frame, irq etc. */
+       err = tap_blkif_map(be->blkif, dev, ring_ref, evtchn);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "mapping ring-ref %u port %u",
+                                ring_ref, evtchn);
+               return err;
+       } 
+
+       return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+/*
+ * Xenbus device types served by this driver.  The last entry is the empty
+ * string (presumably the list terminator expected by the xenbus core).
+ */
+static const struct xenbus_device_id blktap_ids[] = {
+       { "tap" },
+       { "" }
+};
+
+/*
+ * Driver object wiring the probe, remove and frontend-state callbacks
+ * defined in this file; registered by tap_blkif_xenbus_init() as
+ * blktap_driver.
+ */
+static DEFINE_XENBUS_DRIVER(blktap, ,
+       .probe = blktap_probe,
+       .remove = blktap_remove,
+       .otherend_changed = tap_frontend_changed
+);
+
+
+/*
+ * Register the blktap backend driver with xenbus.  A registration failure
+ * is not propagated to the caller; it only triggers a WARN.
+ */
+void tap_blkif_xenbus_init(void)
+{
+       int err = xenbus_register_backend(&blktap_driver);
+
+       WARN_ON(err);
+}
diff --git a/drivers/xen/blktap2-new/Makefile b/drivers/xen/blktap2-new/Makefile

new file mode 100644 (file)

index 0000000..20c98aa
--- /dev/null
+++ b/drivers/xen/blktap2-new/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP2) := xen-blktap.o
+
+xen-blktap-y := control.o ring.o device.o request.o
+xen-blktap-$(CONFIG_SYSFS) += sysfs.o
diff --git a/drivers/xen/blktap2-new/blktap.h b/drivers/xen/blktap2-new/blktap.h

new file mode 100644 (file)

index 0000000..05d5bcb
--- /dev/null
+++ b/drivers/xen/blktap2-new/blktap.h
@@ -0,0 +1,218 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+
+extern int blktap_debug_level;
+extern int blktap_ring_major;
+extern int blktap_device_major;
+
+#define BTPRINTK(level, tag, force, _f, _a...)                         \
+       do {                                                            \
+               if (blktap_debug_level > level &&                       \
+                   (force || printk_ratelimit()))                      \
+                       printk(tag "%s: " _f, __func__, ##_a);          \
+       } while (0)
+
+#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define BLKTAP2_DEV_DIR "xen/blktap-2/"
+
+#define BLKTAP_DEVICE                4
+#define BLKTAP_DEVICE_CLOSED         5
+#define BLKTAP_SHUTDOWN_REQUESTED    8
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE        1
+#define BLKTAP2_IOCTL_ALLOC_TAP      200
+#define BLKTAP2_IOCTL_FREE_TAP       201
+#define BLKTAP2_IOCTL_CREATE_DEVICE  202
+#define BLKTAP2_IOCTL_REMOVE_DEVICE  207
+
+#define BLKTAP2_MAX_MESSAGE_LEN      256
+
+#define BLKTAP2_RING_MESSAGE_CLOSE   3
+
+#define BLKTAP_REQUEST_FREE          0
+#define BLKTAP_REQUEST_PENDING       1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE          __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
+#define MAX_PENDING_REQS       BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+/*
+ * Address of segment @_seg of request @_req inside a mapping that starts
+ * at @_start: each request owns BLKIF_MAX_SEGMENTS_PER_REQUEST consecutive
+ * pages.  All arguments are parenthesised so that expression arguments
+ * expand safely.
+ */
+#define MMAP_VADDR(_start, _req, _seg)                                 \
+        ((_start) +                                                     \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+
+/*
+ * Two grant handles kept together, one named for the kernel and one for
+ * the user side.  NOTE(review): no user of this struct is visible in this
+ * part of the patch.
+ */
+struct grant_handle_pair {
+       grant_handle_t                 kernel;
+       grant_handle_t                 user;
+};
+#define INVALID_GRANT_HANDLE           0xFFFF
+
+/*
+ * Result of BLKTAP2_IOCTL_ALLOC_TAP, copied to user space: the ring and
+ * device major numbers plus the minor assigned to the new tap.
+ */
+struct blktap_handle {
+       unsigned int                   ring;
+       unsigned int                   device;
+       unsigned int                   minor;
+};
+
+/*
+ * Parameters handed to blktap_device_create().  blktap_device_configure()
+ * applies @capacity with set_capacity() and @sector_size as the queue's
+ * logical block size.
+ */
+struct blktap_params {
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+       unsigned long long             capacity;
+       unsigned long                  sector_size;
+};
+
+/*
+ * Block-device side of a tap.  @lock is taken (irq-disabling) around
+ * request-queue processing and device configuration; @gd is the gendisk,
+ * NULL while no disk exists (blktap_device_run_queue() checks for that).
+ */
+struct blktap_device {
+       spinlock_t                     lock;
+       struct gendisk                *gd;
+};
+
+/*
+ * Ring (character device) side of a tap.
+ *
+ * @pending has one slot per possible outstanding request
+ * (MAX_PENDING_REQS) and @n_pending presumably counts the slots in use.
+ * @devno is the ring's device number, as printed by
+ * blktap_control_debug().
+ * NOTE(review): the roles of @task, @vma, @ring_vstart and @user_vstart
+ * are defined in ring.c, which is not visible here.
+ */
+struct blktap_ring {
+       struct task_struct            *task;
+
+       struct vm_area_struct         *vma;
+       struct blkif_front_ring        ring;
+       unsigned long                  ring_vstart;
+       unsigned long                  user_vstart;
+
+       int                            n_pending;
+       struct blktap_request         *pending[MAX_PENDING_REQS];
+
+       wait_queue_head_t              poll_wait;
+
+       dev_t                          devno;
+       struct device                 *dev;
+};
+
+/*
+ * Per-tap counters.  @st_oo_req is bumped by blktap_device_make_request()
+ * each time a request has to be deferred with -EBUSY.
+ * NOTE(review): the remaining counters are updated outside this part of
+ * the patch; judging by their names they track read/write/packet request
+ * counts, sector counts and latency sums/maxima in microseconds.
+ */
+struct blktap_statistics {
+       unsigned long                  st_print;
+       int                            st_rd_req;
+       int                            st_wr_req;
+       int                            st_oo_req;
+       int                            st_pk_req;
+       int                            st_rd_sect;
+       int                            st_wr_sect;
+       s64                            st_rd_cnt;
+       s64                            st_rd_sum_usecs;
+       s64                            st_rd_max_usecs;
+       s64                            st_wr_cnt;
+       s64                            st_wr_sum_usecs;
+       s64                            st_wr_max_usecs; 
+};
+
+/*
+ * One in-flight request on a tap.
+ *
+ * @rq is the block-layer request being served and @operation the
+ * BLKIF_OP_* code chosen for it.  @sg_table is filled by blk_rq_map_sg();
+ * @nr_pages bounds iteration in blktap_for_each_sg().
+ */
+struct blktap_request {
+       struct blktap                 *tap;
+       struct request                *rq;
+       int                            usr_idx;
+
+       int                            operation;
+       struct timeval                 time;
+
+       struct scatterlist             sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       struct page                   *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       int                            nr_pages;
+};
+
+/*
+ * Iterate @_sg over the first nr_pages entries of @_req's sg_table, with
+ * @_i as the running index.  @_sg and @_i must be assignable lvalues.
+ */
+#define blktap_for_each_sg(_sg, _req, _i)      \
+       for (_sg = (_req)->sg_table, _i = 0;    \
+            _i < (_req)->nr_pages;             \
+            (_sg)++, (_i)++)
+
+/*
+ * One tap instance, stored in blktaps[] at index @minor.
+ *
+ * @dev_inuse is a bit field of BLKTAP_* flags (e.g. BLKTAP_DEVICE_CLOSED
+ * is set with set_bit() on last close).  @pool holds a counted reference
+ * on the page pool taken at creation and dropped at destruction.
+ */
+struct blktap {
+       int                            minor;
+       unsigned long                  dev_inuse;
+
+       struct blktap_ring             ring;
+       struct blktap_device           device;
+       struct blktap_page_pool       *pool;
+
+       wait_queue_head_t              remove_wait;
+       struct work_struct             remove_work;
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+
+       struct blktap_statistics       stats;
+};
+
+/*
+ * A named page pool.  Lifetime is managed through @kobj: users take and
+ * drop references with kobject_get()/kobject_put(), and the kobject name
+ * is what the default_pool sysfs attribute shows.
+ */
+struct blktap_page_pool {
+       struct mempool_s              *bufs;
+       spinlock_t                     lock;
+       struct kobject                 kobj;
+       wait_queue_head_t              wait;
+};
+
+extern struct mutex blktap_lock;
+extern struct blktap **blktaps;
+extern int blktap_max_minor;
+
+int blktap_control_destroy_tap(struct blktap *);
+size_t blktap_control_debug(struct blktap *, char *, size_t);
+
+int blktap_ring_init(void);
+void blktap_ring_exit(void);
+size_t blktap_ring_debug(struct blktap *, char *, size_t);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *,struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_request_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
+void blktap_ring_kick_user(struct blktap *);
+
+#ifdef CONFIG_SYSFS
+int blktap_sysfs_init(void);
+void blktap_sysfs_exit(void);
+int blktap_sysfs_create(struct blktap *);
+void blktap_sysfs_destroy(struct blktap *);
+#else
+static inline int blktap_sysfs_init(void) { return 0; }
+static inline void blktap_sysfs_exit(void) {}
+static inline int blktap_sysfs_create(struct blktap *tapdev) { return 0; }
+static inline void blktap_sysfs_destroy(struct blktap *tapdev) {}
+#endif
+
+int blktap_device_init(void);
+void blktap_device_exit(void);
+size_t blktap_device_debug(struct blktap *, char *, size_t);
+int blktap_device_create(struct blktap *, struct blktap_params *);
+int blktap_device_destroy(struct blktap *);
+void blktap_device_destroy_sync(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
+void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
+
+int blktap_page_pool_init(struct kobject *);
+void blktap_page_pool_exit(void);
+struct blktap_page_pool *blktap_page_pool_get(const char *);
+
+size_t blktap_request_debug(struct blktap *, char *, size_t);
+struct blktap_request *blktap_request_alloc(struct blktap *);
+int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
+
+
+#endif
diff --git a/drivers/xen/blktap2-new/control.c b/drivers/xen/blktap2-new/control.c

new file mode 100644 (file)

index 0000000..615df74
--- /dev/null
+++ b/drivers/xen/blktap2-new/control.c
@@ -0,0 +1,316 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/miscdevice.h>
+#include <linux/device.h>
+#include <asm/uaccess.h>
+
+#include "blktap.h"
+
+DEFINE_MUTEX(blktap_lock);
+
+struct blktap **blktaps;
+int blktap_max_minor;
+static struct blktap_page_pool *default_pool;
+
+/*
+ * Allocate a zeroed struct blktap and bind it to the lowest free minor.
+ *
+ * The blktaps[] table is grown (doubled, capped at
+ * CONFIG_XEN_NR_TAP2_DEVICES) when every current slot is taken.  On
+ * success a module reference is taken and the new tap is returned; NULL
+ * is returned when memory is exhausted or all minors are in use.
+ *
+ * blktap_lock is held while the table is searched and modified and is
+ * released exactly once on every path.
+ */
+static struct blktap *
+blktap_control_get_minor(void)
+{
+       int minor;
+       struct blktap *tap;
+
+       tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+       if (unlikely(!tap))
+               return NULL;
+
+       mutex_lock(&blktap_lock);
+
+       for (minor = 0; minor < blktap_max_minor; minor++)
+               if (!blktaps[minor])
+                       break;
+
+       if (minor == CONFIG_XEN_NR_TAP2_DEVICES)
+               goto fail;
+
+       if (minor == blktap_max_minor) {
+               void *p;
+               int n;
+
+               n = min(2 * blktap_max_minor, CONFIG_XEN_NR_TAP2_DEVICES);
+               /* Keep the old table valid if krealloc() fails. */
+               p = krealloc(blktaps, n * sizeof(blktaps[0]), GFP_KERNEL);
+               if (!p)
+                       goto fail;
+
+               blktaps          = p;
+               minor            = blktap_max_minor;
+               blktap_max_minor = n;
+
+               memset(&blktaps[minor], 0, (n - minor) * sizeof(blktaps[0]));
+       }
+
+       tap->minor = minor;
+       blktaps[minor] = tap;
+
+       __module_get(THIS_MODULE);
+
+       mutex_unlock(&blktap_lock);
+       return tap;
+
+fail:
+       mutex_unlock(&blktap_lock);
+       kfree(tap);
+       return NULL;
+}
+
+/*
+ * Release a tap's minor: clear its blktaps[] slot, free the tap and drop
+ * the module reference taken by blktap_control_get_minor().
+ */
+static void
+blktap_control_put_minor(struct blktap* tap)
+{
+       int slot = tap->minor;
+
+       blktaps[slot] = NULL;
+       kfree(tap);
+
+       module_put(THIS_MODULE);
+}
+
+/*
+ * Create a new tap: reserve a minor, attach it to the current default
+ * page pool (taking a pool reference), then create its ring and sysfs
+ * entries.
+ *
+ * Returns the new tap, or NULL on failure.  On failure everything set up
+ * so far is undone, including the pool reference, which must be dropped
+ * before blktap_control_put_minor() frees the tap.
+ */
+static struct blktap*
+blktap_control_create_tap(void)
+{
+       struct blktap *tap;
+       int err;
+
+       tap = blktap_control_get_minor();
+       if (!tap)
+               return NULL;
+
+       kobject_get(&default_pool->kobj);
+       tap->pool = default_pool;
+
+       err = blktap_ring_create(tap);
+       if (err)
+               goto fail_tap;
+
+       err = blktap_sysfs_create(tap);
+       if (err)
+               goto fail_ring;
+
+       return tap;
+
+fail_ring:
+       blktap_ring_destroy(tap);
+fail_tap:
+       /* Balance the kobject_get() above; mirrors destroy_tap(). */
+       kobject_put(&tap->pool->kobj);
+       blktap_control_put_minor(tap);
+
+       return NULL;
+}
+
+/*
+ * Destroy a tap.  If the ring cannot be destroyed its error is returned
+ * and the tap is left intact; otherwise the pool reference, the sysfs
+ * entries and the minor are released and 0 is returned.
+ */
+int
+blktap_control_destroy_tap(struct blktap *tap)
+{
+       int rc = blktap_ring_destroy(tap);
+
+       if (rc)
+               return rc;
+
+       kobject_put(&tap->pool->kobj);
+       blktap_sysfs_destroy(tap);
+       blktap_control_put_minor(tap);
+
+       return 0;
+}
+
+/*
+ * ioctl handler of the blktap control device.
+ *
+ * BLKTAP2_IOCTL_ALLOC_TAP: create a tap and copy a struct blktap_handle
+ *     (ring major, device major, minor) to the user pointer in @arg.
+ *     Returns -ENOMEM if the tap cannot be created and -EFAULT (after
+ *     destroying the tap again) if the copy fails.
+ * BLKTAP2_IOCTL_FREE_TAP: destroy the tap whose minor is @arg.  Returns
+ *     -EINVAL for a minor outside the blktaps[] table and -ENODEV for an
+ *     unused one.
+ * Anything else yields -ENOIOCTLCMD.
+ */
+static long
+blktap_control_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       struct blktap *tap;
+
+       switch (cmd) {
+       case BLKTAP2_IOCTL_ALLOC_TAP: {
+               struct blktap_handle h;
+               void __user *ptr = (void __user*)arg;
+
+               tap = blktap_control_create_tap();
+               if (!tap)
+                       return -ENOMEM;
+
+               h.ring   = blktap_ring_major;
+               h.device = blktap_device_major;
+               h.minor  = tap->minor;
+
+               if (copy_to_user(ptr, &h, sizeof(h))) {
+                       blktap_control_destroy_tap(tap);
+                       return -EFAULT;
+               }
+
+               return 0;
+       }
+
+       case BLKTAP2_IOCTL_FREE_TAP: {
+               int minor = arg;
+
+               /*
+                * The value comes straight from user space: reject
+                * negative minors and anything beyond the currently
+                * allocated table, which may be smaller than
+                * CONFIG_XEN_NR_TAP2_DEVICES.
+                */
+               if (minor < 0 || minor >= blktap_max_minor)
+                       return -EINVAL;
+
+               tap = blktaps[minor];
+               if (!tap)
+                       return -ENODEV;
+
+               return blktap_control_destroy_tap(tap);
+       }
+       }
+
+       return -ENOIOCTLCMD;
+}
+
+/* File operations of the control device: only ioctl is implemented. */
+static const struct file_operations blktap_control_file_operations = {
+       .owner    = THIS_MODULE,
+       .unlocked_ioctl = blktap_control_ioctl,
+};
+
+/*
+ * The control node, registered as a misc device with a dynamically
+ * assigned minor; its device node name is BLKTAP2_DEV_DIR "control".
+ */
+static struct miscdevice blktap_control = {
+       .minor    = MISC_DYNAMIC_MINOR,
+       .name     = "blktap-control",
+       .nodename = BLKTAP2_DEV_DIR "control",
+       .fops     = &blktap_control_file_operations,
+};
+
+static struct device *control_device;
+
+/* sysfs show handler: writes the name of the current default page pool. */
+static ssize_t
+blktap_control_show_default_pool(struct device *device,
+                                struct device_attribute *attr,
+                                char *buf)
+{
+       const char *pool_name = kobject_name(&default_pool->kobj);
+
+       return sprintf(buf, "%s", pool_name);
+}
+
+/*
+ * sysfs store handler: make the pool named by @buf the default pool and
+ * drop the reference held on the previous default.  Returns @size, or the
+ * error encoded by blktap_page_pool_get().
+ */
+static ssize_t
+blktap_control_store_default_pool(struct device *device,
+                                 struct device_attribute *attr,
+                                 const char *buf, size_t size)
+{
+       struct blktap_page_pool *previous = default_pool;
+       struct blktap_page_pool *replacement;
+
+       replacement = blktap_page_pool_get(buf);
+       if (IS_ERR(replacement))
+               return PTR_ERR(replacement);
+
+       default_pool = replacement;
+       kobject_put(&previous->kobj);
+
+       return size;
+}
+
+/* "default_pool" attribute: world-readable, writable by the owner only. */
+static DEVICE_ATTR(default_pool, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+                  blktap_control_show_default_pool,
+                  blktap_control_store_default_pool);
+
+/*
+ * Format a one-line summary of @tap (ring device number, name and
+ * dev_inuse flags) into @buf, which holds @size bytes.
+ *
+ * Returns the number of characters actually stored, excluding the
+ * terminating NUL.  scnprintf() is used instead of snprintf() so that the
+ * result can never exceed @size when the output is truncated.
+ */
+size_t
+blktap_control_debug(struct blktap *tap, char *buf, size_t size)
+{
+       char *s = buf, *end = buf + size;
+
+       s += scnprintf(s, end - s,
+                      "tap %u:%u name:'%s' flags:%#08lx\n",
+                      MAJOR(tap->ring.devno), MINOR(tap->ring.devno),
+                      tap->name, tap->dev_inuse);
+
+       return s - buf;
+}
+
+/*
+ * Set up the control device: register the misc device, allocate the
+ * initial minor table (at most 64 entries), initialise the page pools
+ * under the control device's kobject, take the "default" pool and create
+ * the default_pool attribute.
+ *
+ * Returns 0 or a negative errno.  Nothing is undone here on failure; the
+ * caller unwinds through blktap_exit(), and blktap_control_exit() tests
+ * each piece of state before releasing it.
+ */
+static int __init
+blktap_control_init(void)
+{
+       int err;
+
+       err = misc_register(&blktap_control);
+       if (err)
+               return err;
+
+       control_device = blktap_control.this_device;
+
+       blktap_max_minor = min(64, CONFIG_XEN_NR_TAP2_DEVICES);
+       blktaps = kzalloc(blktap_max_minor * sizeof(blktaps[0]), GFP_KERNEL);
+       if (!blktaps) {
+               BTERR("failed to allocate blktap minor map\n");
+               return -ENOMEM;
+       }
+
+       err = blktap_page_pool_init(&control_device->kobj);
+       if (err)
+               return err;
+
+       default_pool = blktap_page_pool_get("default");
+       if (!default_pool)
+               return -ENOMEM;
+       if (IS_ERR(default_pool)) {
+               /*
+                * The store handler treats the return value as an
+                * ERR_PTR.  Never leave one in default_pool, since
+                * blktap_control_exit() would pass it to kobject_put().
+                */
+               err = PTR_ERR(default_pool);
+               default_pool = NULL;
+               return err;
+       }
+
+       err = device_create_file(control_device, &dev_attr_default_pool);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/*
+ * Undo blktap_control_init().  Each piece of state is tested (or is safe
+ * to release when unset), so this also works after a partial init.
+ */
+static void
+blktap_control_exit(void)
+{
+       struct blktap_page_pool *pool = default_pool;
+
+       if (pool) {
+               kobject_put(&pool->kobj);
+               default_pool = NULL;
+       }
+
+       blktap_page_pool_exit();
+
+       /* kfree(NULL) is a no-op, so no guard is needed. */
+       kfree(blktaps);
+       blktaps = NULL;
+
+       if (control_device) {
+               misc_deregister(&blktap_control);
+               control_device = NULL;
+       }
+}
+
+/*
+ * Module exit, also used by blktap_init() to unwind a failed start-up:
+ * shuts the subsystems down in the reverse of their init order.
+ */
+static void
+blktap_exit(void)
+{
+       blktap_control_exit();
+       blktap_ring_exit();
+       blktap_sysfs_exit();
+       blktap_device_exit();
+}
+
+/*
+ * Module init: bring up the device, ring, sysfs and control subsystems in
+ * that order.  The first failure stops the sequence, unwinds everything
+ * through blktap_exit() and is returned to the caller.
+ */
+static int __init
+blktap_init(void)
+{
+       int err;
+
+       err = blktap_device_init();
+       if (!err)
+               err = blktap_ring_init();
+       if (!err)
+               err = blktap_sysfs_init();
+       if (!err)
+               err = blktap_control_init();
+
+       if (err)
+               blktap_exit();
+
+       return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("devname:" BLKTAP2_DEV_DIR "control");
diff --git a/drivers/xen/blktap2-new/device.c b/drivers/xen/blktap2-new/device.c

new file mode 100644 (file)

index 0000000..77f5028
--- /dev/null
+++ b/drivers/xen/blktap2-new/device.c
@@ -0,0 +1,572 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include "blktap.h"
+
+int blktap_device_major;
+
+#define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
+
+/*
+ * Block device open: succeeds whenever the gendisk still carries its
+ * private data, and fails with -ENXIO otherwise.
+ */
+static int
+blktap_device_open(struct block_device *bdev, fmode_t mode)
+{
+       struct blktap_device *tapdev = bdev->bd_disk->private_data;
+
+       /* NB. we might have bounced a bd trylock by tapdisk. when
+        * failing for reasons not !tapdev, make sure to kick tapdisk
+        * out of destroy wait state again. */
+
+       return tapdev ? 0 : -ENXIO;
+}
+
+/*
+ * Block device release.  When the whole-disk block device has no openers
+ * left, flag the tap as BLKTAP_DEVICE_CLOSED and wake the ring's user.
+ *
+ * The opener count is sampled while the reference obtained from
+ * bdget_disk() is still held, and a failed lookup is tolerated instead of
+ * being passed to bdput().
+ */
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+       struct blktap_device *tapdev = disk->private_data;
+       struct blktap *tap = dev_to_blktap(tapdev);
+       struct block_device *bdev = bdget_disk(disk, 0);
+       int openers;
+
+       /* Lookup failed: the opener count cannot be determined. */
+       if (!bdev)
+               return 0;
+
+       openers = bdev->bd_openers;
+       bdput(bdev);
+
+       if (!openers) {
+               set_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse);
+               blktap_ring_kick_user(tap);
+       }
+
+       return 0;
+}
+
+/*
+ * Report a synthetic geometry: 255 heads and 63 sectors per track, with
+ * the cylinder count derived from the disk capacity.  If the resulting
+ * geometry still describes fewer sectors than the disk holds, the
+ * cylinder count is pinned to 0xffff.
+ */
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+       sector_t capacity = get_capacity(bd->bd_disk);
+       sector_t cyls = capacity;
+
+       hg->heads = 0xff;
+       hg->sectors = 0x3f;
+
+       /* sector_div() divides cyls in place. */
+       sector_div(cyls, hg->heads * hg->sectors);
+       hg->cylinders = cyls;
+
+       if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < capacity)
+               hg->cylinders = 0xffff;
+
+       return 0;
+}
+
+/*
+ * Block device ioctl.  Two commands are answered with zero-filled data:
+ * CDROMMULTISESSION (a zeroed struct cdrom_multisession) and
+ * SCSI_IOCTL_GET_IDLUN (dev_id and host_unique_id set to 0).  Everything
+ * else returns -EINVAL; a faulting user pointer returns -EFAULT.
+ */
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+                   unsigned command, unsigned long argument)
+{
+       struct scsi_idlun __user *idlun;
+       int off;
+
+       switch (command) {
+       case CDROMMULTISESSION:
+               BTDBG("FIXME: support multisession CDs later\n");
+               for (off = 0; off < sizeof(struct cdrom_multisession); off++) {
+                       if (put_user(0, (char __user *)(argument + off)))
+                               return -EFAULT;
+               }
+               return 0;
+
+       case SCSI_IOCTL_GET_IDLUN:
+               if (!access_ok(VERIFY_WRITE, argument,
+                       sizeof(struct scsi_idlun)))
+                       return -EFAULT;
+
+               /* return 0 for now. */
+               idlun = (struct scsi_idlun __user *)argument;
+               __put_user(0, &idlun->dev_id);
+               __put_user(0, &idlun->host_unique_id);
+               return 0;
+
+       default:
+               /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+                 command);*/
+               return -EINVAL; /* same return as native Linux */
+       }
+
+       return 0;
+}
+
+/* Block device operations of the tap disk, all implemented above. */
+static const struct block_device_operations blktap_device_file_operations = {
+       .owner     = THIS_MODULE,
+       .open      = blktap_device_open,
+       .release   = blktap_device_release,
+       .ioctl     = blktap_device_ioctl,
+       .getgeo    = blktap_device_getgeo
+};
+
+/* NB. __blktap_*() helpers expect the queue lock to be held by the caller;
+ * the blktap_*() variants are called unlocked and take it themselves. */
+
+/* Peek at the request at the head of @q without dequeuing it. */
+static inline struct request*
+__blktap_next_queued_rq(struct request_queue *q)
+{
+       struct request *head = blk_peek_request(q);
+
+       return head;
+}
+
+/* Take @rq off its queue by starting it. */
+static inline void
+__blktap_dequeue_rq(struct request *rq)
+{
+       struct request *started = rq;
+
+       blk_start_request(started);
+}
+
+/* NB. err == 0 indicates success, failures < 0 */
+
+/* Dequeue a still-queued request and complete all of it with @err. */
+static inline void
+__blktap_end_queued_rq(struct request *rq, int err)
+{
+       unsigned int nbytes;
+
+       blk_start_request(rq);
+       nbytes = blk_rq_bytes(rq);
+       __blk_end_request(rq, err, nbytes);
+}
+
+/* Complete all bytes of an already started request with @err. */
+static inline void
+__blktap_end_rq(struct request *rq, int err)
+{
+       unsigned int nbytes = blk_rq_bytes(rq);
+
+       __blk_end_request(rq, err, nbytes);
+}
+
+/* Unlocked variant of __blktap_end_rq(): takes the queue lock itself. */
+static inline void
+blktap_end_rq(struct request *rq, int err)
+{
+       spinlock_t *lock = rq->q->queue_lock;
+
+       spin_lock_irq(lock);
+       __blktap_end_rq(rq, err);
+       spin_unlock_irq(lock);
+}
+
+/*
+ * Finish a tap request: unmap it from the ring, free the ring request
+ * and complete the underlying block request with @error.
+ */
+void
+blktap_device_end_request(struct blktap *tap,
+                         struct blktap_request *request,
+                         int error)
+{
+       /* Saved first: @request is released before rq is completed. */
+       struct request *rq = request->rq;
+       struct gendisk *gd = tap->device.gd;
+
+       blktap_ring_unmap_request(tap, request);
+       blktap_ring_free_request(tap, request);
+
+       dev_dbg(disk_to_dev(gd),
+               "end_request: op=%d error=%d bytes=%d\n",
+               rq_data_dir(rq), error, blk_rq_bytes(rq));
+
+       blktap_end_rq(rq, error);
+}
+
+/*
+ * Turn block request @rq into a ring request and submit it.
+ *
+ * Returns 0 once the request has been submitted.  Returns -EBUSY, and
+ * bumps the st_oo_req counter, when no ring request (-ENOSPC/-ENOMEM) or
+ * no pages are available.  Any other error is returned as is after a
+ * rate-limited warning.  A ring request that was obtained is freed again
+ * on every failure path.
+ */
+int
+blktap_device_make_request(struct blktap *tap, struct request *rq)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct blktap_request *request;
+       int is_write, nsegs;
+       int err;
+
+       request = blktap_ring_make_request(tap);
+       if (IS_ERR(request)) {
+               err = PTR_ERR(request);
+               request = NULL;
+
+               if (err == -ENOSPC || err == -ENOMEM)
+                       goto busy;
+
+               goto warn;
+       }
+
+       is_write = rq_data_dir(rq) == WRITE;
+       nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
+
+       dev_dbg(disk_to_dev(tapdev->gd),
+               "make_request: op=%c bytes=%d nsegs=%d\n",
+               is_write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
+
+       request->rq = rq;
+       if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+               request->operation = BLKIF_OP_PACKET;
+       else
+               request->operation = is_write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
+
+       err = blktap_request_get_pages(tap, request, nsegs);
+       if (err)
+               goto busy;
+
+       err = blktap_ring_map_request(tap, request);
+       if (err)
+               goto warn;
+
+       blktap_ring_submit_request(tap, request);
+
+       return 0;
+
+warn:
+       if (printk_ratelimit())
+               dev_warn(disk_to_dev(tapdev->gd),
+                        "make request: %d, failing\n", err);
+       goto release;
+
+busy:
+       tap->stats.st_oo_req++;
+       err = -EBUSY;
+
+release:
+       if (request)
+               blktap_ring_free_request(tap, request);
+
+       return err;
+}
+
+/*
+ * called from tapdisk context
+ *
+ * Drain the disk's request queue.  Non-filesystem requests are failed
+ * with -EOPNOTSUPP; every other request goes to
+ * blktap_device_make_request() with the device lock dropped.  -EBUSY
+ * stops the queue and leaves the request queued for a later run.
+ */
+void
+blktap_device_run_queue(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct request_queue *q;
+       struct request *rq;
+       int err;
+
+       if (!tapdev->gd)
+               return;
+
+       q = tapdev->gd->queue;
+
+       spin_lock_irq(&tapdev->lock);
+       queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+       for (;;) {
+               rq = __blktap_next_queued_rq(q);
+               if (!rq)
+                       break;
+
+               if (rq->cmd_type != REQ_TYPE_FS) {
+                       rq->errors = (DID_ERROR << 16) |
+                                    (DRIVER_INVALID << 24);
+                       __blktap_end_queued_rq(rq, -EOPNOTSUPP);
+                       continue;
+               }
+
+               /* The lock is dropped while the request is built. */
+               spin_unlock_irq(&tapdev->lock);
+               err = blktap_device_make_request(tap, rq);
+               spin_lock_irq(&tapdev->lock);
+
+               if (err == -EBUSY) {
+                       blk_stop_queue(q);
+                       break;
+               }
+
+               __blktap_dequeue_rq(rq);
+
+               if (unlikely(err))
+                       __blktap_end_rq(rq, err);
+       }
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Request function registered with blk_init_queue().  It performs no
+ * I/O itself; it only wakes the ring's poll waiters via
+ * blktap_ring_kick_user().
+ */
+static void
+blktap_device_do_request(struct request_queue *rq)
+{
+       struct blktap_device *tapdev = rq->queuedata;
+
+       blktap_ring_kick_user(dev_to_blktap(tapdev));
+}
+
+/*
+ * Apply @params (capacity, sector size) and the fixed blktap queue
+ * limits to the tap's gendisk and request queue, under the device lock.
+ */
+static void
+blktap_device_configure(struct blktap *tap,
+                       struct blktap_params *params)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct request_queue *q = tapdev->gd->queue;
+
+       spin_lock_irq(&tapdev->lock);
+
+       set_capacity(tapdev->gd, params->capacity);
+
+       /* Hard sector size and max sectors impersonate the equiv. hardware. */
+       blk_queue_logical_block_size(q, params->sector_size);
+       blk_queue_max_hw_sectors(q, 512);
+
+       /* Each segment in a request is up to an aligned page in size. */
+       blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+       blk_queue_max_segment_size(q, PAGE_SIZE);
+
+       /* Ensure a merged request will fit in a single I/O ring slot. */
+       blk_queue_max_segments(q, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       /* Make sure buffer addresses are sector-aligned. */
+       blk_queue_dma_alignment(q, 511);
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Sanity-check user-supplied device parameters.
+ *
+ * Accepts a power-of-two sector size between 512 and 4096 bytes, a
+ * non-zero capacity whose byte size fits in 64 bits, and a name that is
+ * NUL-terminated within the smaller of params->name and tap->name.
+ *
+ * Returns 0 if valid.  Otherwise logs the offending values and returns
+ * -EINVAL; params->name is forcibly terminated before it is printed.
+ */
+static int
+blktap_device_validate_params(struct blktap *tap,
+                             struct blktap_params *params)
+{
+       struct device *dev = tap->ring.dev;
+       /*
+        * Computed up front: the fail path below needs it no matter which
+        * check rejected the parameters.
+        */
+       const size_t name_sz = min(sizeof(params->name), sizeof(tap->name));
+       int sector_order;
+
+       sector_order = ffs(params->sector_size) - 1;
+
+       if (sector_order <  9 ||
+           sector_order > 12 ||
+           params->sector_size != 1U<<sector_order)
+               goto fail;
+
+       if (!params->capacity ||
+           (params->capacity > ULLONG_MAX >> sector_order))
+               goto fail;
+
+       if (strnlen(params->name, name_sz) >= name_sz)
+               goto fail;
+
+       return 0;
+
+fail:
+       /* the name may be unterminated; make it safe for %s */
+       params->name[name_sz - 1] = 0;
+       dev_err(dev, "capacity: %llu, sector-size: %lu, name: %s\n",
+               params->capacity, params->sector_size, params->name);
+       return -EINVAL;
+}
+
+/*
+ * Tear down the tap's gendisk and request queue.
+ *
+ * Returns 0 when there is no disk or the disk was removed, -EBUSY when
+ * bd_mutex could not be taken without blocking or the block device still
+ * has openers.
+ *
+ * NOTE(review): the result of bdget_disk() is dereferenced without a
+ * NULL check here, whereas blktap_device_debug() does check it --
+ * confirm whether NULL is possible on this path.
+ */
+int
+blktap_device_destroy(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct block_device *bdev;
+       struct gendisk *gd;
+       int err;
+
+       gd = tapdev->gd;
+       if (!gd)
+               return 0;
+
+       bdev = bdget_disk(gd, 0);
+
+       err = !mutex_trylock(&bdev->bd_mutex);
+       if (err) {
+               /* NB. avoid a deadlock. the last opener syncs the
+                * bdev holding bd_mutex. */
+               err = -EBUSY;
+               goto out_nolock;
+       }
+
+       if (bdev->bd_openers) {
+               err = -EBUSY;
+               goto out;
+       }
+
+       del_gendisk(gd);
+       gd->private_data = NULL;
+
+       blk_cleanup_queue(gd->queue);
+
+       put_disk(gd);
+       tapdev->gd = NULL;
+
+       clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+       err = 0;
+out:
+       mutex_unlock(&bdev->bd_mutex);
+out_nolock:
+       /* drop the reference taken by bdget_disk() */
+       bdput(bdev);
+
+       return err;
+}
+
+/*
+ * Restart the queue and complete every request still queued on it with
+ * -EIO, all under the device lock.
+ */
+static void
+blktap_device_fail_queue(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct request_queue *q = tapdev->gd->queue;
+       struct request *rq;
+
+       spin_lock_irq(&tapdev->lock);
+       queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+
+       while ((rq = __blktap_next_queued_rq(q)) != NULL)
+               __blktap_end_queued_rq(rq, -EIO);
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Attempt to destroy the device; if that is refused, fail whatever is
+ * currently queued.  Returns the result of blktap_device_destroy().
+ */
+static int
+blktap_device_try_destroy(struct blktap *tap)
+{
+       int err = blktap_device_destroy(tap);
+
+       if (!err)
+               return 0;
+
+       blktap_device_fail_queue(tap);
+       return err;
+}
+
+/*
+ * Sleep on the ring's poll_wait queue until blktap_device_try_destroy()
+ * succeeds.  Every failed attempt fails the queued requests as a side
+ * effect of the wait condition.
+ */
+void
+blktap_device_destroy_sync(struct blktap *tap)
+{
+       wait_event(tap->ring.poll_wait,
+                  !blktap_device_try_destroy(tap));
+}
+
+/*
+ * gendisk devnode callback: returns a kasprintf()-allocated node name
+ * BLKTAP2_DEV_DIR "tapdev<first_minor>".  @mode is left untouched.
+ */
+static char *blktap_devnode(struct gendisk *gd, umode_t *mode)
+{
+       return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
+                        gd->first_minor);
+}
+
+/*
+ * Create and register the block device ("td<letters>") backing @tap.
+ *
+ * Returns 0 on success, -EEXIST if the tap already has a device,
+ * -EINVAL if @params fail validation, or -ENOMEM if the gendisk or the
+ * request queue cannot be allocated.
+ */
+int
+blktap_device_create(struct blktap *tap, struct blktap_params *params)
+{
+       int minor, err;
+       struct gendisk *gd;
+       struct request_queue *rq;
+       struct blktap_device *tapdev;
+
+       gd     = NULL;
+       rq     = NULL;
+       tapdev = &tap->device;
+       minor  = tap->minor;
+
+       if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+               return -EEXIST;
+
+       if (blktap_device_validate_params(tap, params))
+               return -EINVAL;
+
+       gd = alloc_disk(1);
+       if (!gd) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       /* tda..tdz, then tdaa..tdzz, then three-letter suffixes */
+       if (minor < 26) {
+               sprintf(gd->disk_name, "td%c", 'a' + minor % 26);
+       } else if (minor < (26 + 1) * 26) {
+               sprintf(gd->disk_name, "td%c%c",
+                       'a' + minor / 26 - 1, 'a' + minor % 26);
+       } else {
+               const unsigned int m1 = (minor / 26 - 1) / 26 - 1;
+               const unsigned int m2 = (minor / 26 - 1) % 26;
+               const unsigned int m3 =  minor % 26;
+               sprintf(gd->disk_name, "td%c%c%c",
+                       'a' + m1, 'a' + m2, 'a' + m3);
+       }
+
+       gd->major = blktap_device_major;
+       gd->first_minor = minor;
+       gd->devnode = blktap_devnode;
+       gd->fops = &blktap_device_file_operations;
+       gd->private_data = tapdev;
+
+       spin_lock_init(&tapdev->lock);
+       rq = blk_init_queue(blktap_device_do_request, &tapdev->lock);
+       if (!rq) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       elevator_init(rq, "noop");
+
+       gd->queue     = rq;
+       rq->queuedata = tapdev;
+       tapdev->gd    = gd;
+
+       blktap_device_configure(tap, params);
+       add_disk(gd);
+
+       strlcpy(tap->name, params->name, ARRAY_SIZE(tap->name));
+
+       set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+
+       dev_info(disk_to_dev(gd), "sector-size: %u capacity: %llu\n",
+                queue_logical_block_size(rq),
+                (unsigned long long)get_capacity(gd));
+
+       return 0;
+
+fail:
+       if (rq)
+               blk_cleanup_queue(rq);
+       /*
+        * Every jump here happens before add_disk(), so the disk was
+        * never registered: drop the alloc_disk() reference with
+        * put_disk() instead of calling del_gendisk().
+        */
+       if (gd)
+               put_disk(gd);
+
+       return err;
+}
+
+/*
+ * Format disk/queue/bdev state of @tap into @buf (at most @size bytes).
+ *
+ * Returns the number of bytes written, or 0 if no gendisk is attached.
+ * scnprintf() is used so the cursor can never run past the end of @buf
+ * when the output is truncated.
+ */
+size_t
+blktap_device_debug(struct blktap *tap, char *buf, size_t size)
+{
+       struct gendisk *disk = tap->device.gd;
+       struct request_queue *q;
+       struct block_device *bdev;
+       char *s = buf, *end = buf + size;
+
+       if (!disk)
+               return 0;
+
+       q = disk->queue;
+
+       s += scnprintf(s, end - s,
+                      "disk capacity:%llu sector size:%u\n",
+                      (unsigned long long)get_capacity(disk),
+                      queue_logical_block_size(q));
+
+       s += scnprintf(s, end - s,
+                      "queue flags:%#lx stopped:%d\n",
+                      q->queue_flags,
+                      blk_queue_stopped(q));
+
+       bdev = bdget_disk(disk, 0);
+       if (bdev) {
+               s += scnprintf(s, end - s,
+                              "bdev openers:%d closed:%d\n",
+                              bdev->bd_openers,
+                              test_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse));
+               bdput(bdev);
+       }
+
+       return s - buf;
+}
+
+/*
+ * Register the "tapdev" block major (dynamically allocated) and record
+ * it in blktap_device_major.
+ *
+ * Returns 0 on success or the negative errno from register_blkdev().
+ */
+int __init
+blktap_device_init(void)
+{
+       int major;
+
+       /* Dynamically allocate a major for this device */
+       major = register_blkdev(0, "tapdev");
+       if (major < 0) {
+               BTERR("Couldn't register blktap device\n");
+               /* pass on the real reason rather than a fixed -ENOMEM */
+               return major;
+       }
+
+       blktap_device_major = major;
+       BTINFO("blktap device major %d\n", major);
+
+       return 0;
+}
+
+/* Unregister the "tapdev" block major, if one was registered. */
+void
+blktap_device_exit(void)
+{
+       if (!blktap_device_major)
+               return;
+
+       unregister_blkdev(blktap_device_major, "tapdev");
+}
diff --git a/drivers/xen/blktap2-new/request.c b/drivers/xen/blktap2-new/request.c

new file mode 100644 (file)

index 0000000..9bef48c
--- /dev/null
+++ b/drivers/xen/blktap2-new/request.c
@@ -0,0 +1,418 @@
+#include <linux/mempool.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/device.h>
+
+#include "blktap.h"
+
+/* Upper bound on pages per shared pool; only there to prevent an
+ * accidental DoS. */
+#define POOL_MAX_PAGES           (256*BLKIF_MAX_SEGMENTS_PER_REQUEST)
+
+/* Default page pool size. When considering shrinking a shared pool,
+ * note that paused tapdisks may hold on to a lot of pages for a long
+ * time. */
+#define POOL_DEFAULT_PAGES       (2 * MMAP_PAGES)
+
+/* Maximum number of pages a single request may allocate. */
+#define POOL_MAX_REQUEST_PAGES   BLKIF_MAX_SEGMENTS_PER_REQUEST
+
+/* Minimum number of request structs in request_pool. These grow
+ * dynamically. */
+#define POOL_MIN_REQS            BLK_RING_SIZE
+
+/* kset ("pools") that every page pool kobject is added to. */
+static struct kset *pool_set;
+
+/* Map a pool's embedded kobject back to its blktap_page_pool. */
+#define kobj_to_pool(_kobj) \
+       container_of(_kobj, struct blktap_page_pool, kobj)
+
+/* Slab cache and mempool that struct blktap_request is allocated from. */
+static struct kmem_cache *request_cache;
+static mempool_t *request_pool;
+
+/*
+ * Wake waiters on @pool once it holds at least a full segment set
+ * (POOL_MAX_REQUEST_PAGES) of free pages.
+ */
+static void
+__page_pool_wake(struct blktap_page_pool *pool)
+{
+       /*
+        * NB. always waiting for a full segment set is slightly
+        * wasteful, but it ensures the next disk makes progress.
+        * Otherwise the repeated request struct alloc/release cycles
+        * would presently keep everyone spinning.
+        */
+       if (pool->bufs->curr_nr < POOL_MAX_REQUEST_PAGES)
+               return;
+
+       wake_up(&pool->wait);
+}
+
+/*
+ * Take @nr_pages pages from the tap's page pool and attach them to
+ * @request, all or nothing.
+ *
+ * Returns 0 on success or -ENOMEM when the pool currently holds fewer
+ * than @nr_pages free pages.  BUGs if @request already owns pages or if
+ * @nr_pages exceeds POOL_MAX_REQUEST_PAGES.
+ */
+int
+blktap_request_get_pages(struct blktap *tap,
+                        struct blktap_request *request, int nr_pages)
+{
+       struct blktap_page_pool *pool = tap->pool;
+       mempool_t *mem = pool->bufs;
+       struct page *page;
+
+       BUG_ON(request->nr_pages != 0);
+       BUG_ON(nr_pages > POOL_MAX_REQUEST_PAGES);
+
+       /* cheap unlocked check first; repeated below under the lock */
+       if (mem->curr_nr < nr_pages)
+               return -ENOMEM;
+
+       /* NB. avoid thundering herds of tapdisks colliding. */
+       spin_lock(&pool->lock);
+
+       if (mem->curr_nr < nr_pages) {
+               spin_unlock(&pool->lock);
+               return -ENOMEM;
+       }
+
+       while (request->nr_pages < nr_pages) {
+               page = mempool_alloc(mem, GFP_NOWAIT);
+               BUG_ON(!page);
+               request->pages[request->nr_pages++] = page;
+       }
+
+       spin_unlock(&pool->lock);
+
+       return 0;
+}
+
+/*
+ * Return every page held by @request to the tap's page pool, last page
+ * first, leaving request->nr_pages at zero.
+ */
+static void
+blktap_request_put_pages(struct blktap *tap,
+                        struct blktap_request *request)
+{
+       struct blktap_page_pool *pool = tap->pool;
+
+       while (request->nr_pages > 0) {
+               request->nr_pages--;
+               mempool_free(request->pages[request->nr_pages], pool->bufs);
+       }
+}
+
+/*
+ * Format the page pool state of @tap (pool name, min_nr, curr_nr) into
+ * @buf, at most @size bytes.  Returns the number of bytes written.
+ */
+size_t
+blktap_request_debug(struct blktap *tap, char *buf, size_t size)
+{
+       struct blktap_page_pool *pool = tap->pool;
+       mempool_t *mem = pool->bufs;
+       char *s = buf, *end = buf + size;
+
+       /*
+        * Write at the cursor (not at buf) and use scnprintf() so the
+        * returned length never exceeds what was actually stored.
+        */
+       s += scnprintf(s, end - s,
+                      "pool:%s pages:%d free:%d\n",
+                      kobject_name(&pool->kobj),
+                      mem->min_nr, mem->curr_nr);
+
+       return s - buf;
+}
+
+/*
+ * Allocate a request struct from request_pool without sleeping and bind
+ * it to @tap.  Returns NULL if the pool has nothing to give.
+ */
+struct blktap_request*
+blktap_request_alloc(struct blktap *tap)
+{
+       struct blktap_request *request;
+
+       request = mempool_alloc(request_pool, GFP_NOWAIT);
+       if (!request)
+               return NULL;
+
+       request->tap = tap;
+       return request;
+}
+
+/*
+ * Release @request: return its pages to the tap's page pool, return the
+ * struct to request_pool, then wake pool waiters if enough pages are
+ * free again.
+ */
+void
+blktap_request_free(struct blktap *tap,
+                   struct blktap_request *request)
+{
+       blktap_request_put_pages(tap, request);
+
+       mempool_free(request, request_pool);
+
+       __page_pool_wake(tap->pool);
+}
+
+/*
+ * Copy segment @seg between the request's scatterlist buffer and its
+ * pool page, at the segment's offset within the page.  @write non-zero
+ * copies scatterlist -> pool page, zero copies pool page -> scatterlist.
+ */
+void
+blktap_request_bounce(struct blktap *tap,
+                     struct blktap_request *request,
+                     int seg, int write)
+{
+       struct scatterlist *sg = &request->sg_table[seg];
+       void *sg_buf, *pool_buf;
+
+       BUG_ON(seg >= request->nr_pages);
+
+       sg_buf   = sg_virt(sg);
+       pool_buf = page_address(request->pages[seg]) + sg->offset;
+
+       if (write) {
+               memcpy(pool_buf, sg_buf, sg->length);
+               return;
+       }
+
+       memcpy(sg_buf, pool_buf, sg->length);
+}
+
+/*
+ * Slab constructor for struct blktap_request: zero the object and
+ * initialise its scatterlist table.
+ */
+static void
+blktap_request_ctor(void *obj)
+{
+       struct blktap_request *request = obj;
+
+       memset(request, 0, sizeof(*request));
+       sg_init_table(request->sg_table, ARRAY_SIZE(request->sg_table));
+}
+
+/*
+ * Resize @pool to @target pages (at least 1) and wake waiters on
+ * success.  Returns 0 or the error from mempool_resize().
+ */
+static int
+blktap_page_pool_resize(struct blktap_page_pool *pool, int target)
+{
+       int err;
+
+       /* NB. mempool asserts min_nr >= 1 */
+       if (target < 1)
+               target = 1;
+
+       err = mempool_resize(pool->bufs, target, GFP_KERNEL);
+       if (!err)
+               __page_pool_wake(pool);
+
+       return err;
+}
+
+/*
+ * sysfs attribute of a page pool: a generic attribute plus typed
+ * show/store callbacks that receive the pool itself.  Either callback
+ * may be NULL.
+ */
+struct pool_attribute {
+       struct attribute attr;
+
+       ssize_t (*show)(struct blktap_page_pool *pool,
+                       char *buf);
+
+       ssize_t (*store)(struct blktap_page_pool *pool,
+                        const char *buf, size_t count);
+};
+
+/* Map an embedded struct attribute back to its pool_attribute. */
+#define kattr_to_pool_attr(_kattr) \
+       container_of(_kattr, struct pool_attribute, attr)
+
+/* sysfs "size" show: print the pool's configured page count (min_nr). */
+static ssize_t
+blktap_page_pool_show_size(struct blktap_page_pool *pool,
+                          char *buf)
+{
+       return sprintf(buf, "%d", pool->bufs->min_nr);
+}
+
+/*
+ * sysfs "size" store: parse the requested page count, clamp it to
+ * [POOL_MAX_REQUEST_PAGES, POOL_MAX_PAGES] and resize the pool.
+ * Returns @size on success or the error from the resize.
+ */
+static ssize_t
+blktap_page_pool_store_size(struct blktap_page_pool *pool,
+                           const char *buf, size_t size)
+{
+       int target;
+
+       /*
+        * NB. target fixup to avoid undesired results. less than a
+        * full segment set can wedge the disk. much more than a
+        * couple times the physical queue depth is rarely useful.
+        */
+
+       target = simple_strtoul(buf, NULL, 0);
+       target = max(POOL_MAX_REQUEST_PAGES, target);
+       target = min(target, POOL_MAX_PAGES);
+
+       return blktap_page_pool_resize(pool, target) ? : size;
+}
+
+/* "size": world-readable, owner-writable pool size attribute. */
+static struct pool_attribute blktap_page_pool_attr_size =
+       __ATTR(size, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH,
+              blktap_page_pool_show_size,
+              blktap_page_pool_store_size);
+
+/* sysfs "free" show: print the number of pages currently in the pool. */
+static ssize_t
+blktap_page_pool_show_free(struct blktap_page_pool *pool,
+                          char *buf)
+{
+       return sprintf(buf, "%d", pool->bufs->curr_nr);
+}
+
+/* "free": read-only attribute, hence no store callback. */
+static struct pool_attribute blktap_page_pool_attr_free =
+       __ATTR(free, S_IRUSR|S_IRGRP|S_IROTH,
+              blktap_page_pool_show_free,
+              NULL);
+
+/* NULL-terminated default attribute list for blktap_page_pool_ktype. */
+static struct attribute *blktap_page_pool_attrs[] = {
+       &blktap_page_pool_attr_size.attr,
+       &blktap_page_pool_attr_free.attr,
+       NULL,
+};
+
+/*
+ * Look up the kobject called @name in @kset.  Returns it with an extra
+ * reference taken via kobject_get(), or NULL if there is no match.
+ */
+static inline struct kobject*
+__blktap_kset_find_obj(struct kset *kset, const char *name)
+{
+       struct kobject *k, *found = NULL;
+
+       spin_lock(&kset->list_lock);
+       list_for_each_entry(k, &kset->list, entry) {
+               const char *kname = kobject_name(k);
+
+               if (!kname || strcmp(kname, name) != 0)
+                       continue;
+
+               found = kobject_get(k);
+               break;
+       }
+       spin_unlock(&kset->list_lock);
+
+       return found;
+}
+
+/*
+ * sysfs_ops show: dispatch to the pool attribute's show callback, or
+ * return -EIO when the attribute has none.
+ */
+static ssize_t
+blktap_page_pool_show_attr(struct kobject *kobj, struct attribute *kattr,
+                          char *buf)
+{
+       struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+       if (!attr->show)
+               return -EIO;
+
+       return attr->show(kobj_to_pool(kobj), buf);
+}
+
+/*
+ * sysfs_ops store: dispatch to the pool attribute's store callback, or
+ * return -EIO when the attribute has none (e.g. the read-only "free").
+ */
+static ssize_t
+blktap_page_pool_store_attr(struct kobject *kobj, struct attribute *kattr,
+                           const char *buf, size_t size)
+{
+       struct blktap_page_pool *pool = kobj_to_pool(kobj);
+       struct pool_attribute *attr = kattr_to_pool_attr(kattr);
+
+       /* test the callback that is about to be called, not ->show */
+       if (attr->store)
+               return attr->store(pool, buf, size);
+
+       return -EIO;
+}
+
+/* sysfs show/store entry points shared by all page pool attributes. */
+static struct sysfs_ops blktap_page_pool_sysfs_ops = {
+       .show           = blktap_page_pool_show_attr,
+       .store          = blktap_page_pool_store_attr,
+};
+
+/*
+ * kobj_type release callback: destroy the pool's mempool and free the
+ * pool structure itself.
+ */
+static void
+blktap_page_pool_release(struct kobject *kobj)
+{
+       struct blktap_page_pool *pool = kobj_to_pool(kobj);
+       mempool_destroy(pool->bufs);
+       kfree(pool);
+}
+
+/* kobject type of a page pool: release hook, sysfs ops, attributes. */
+struct kobj_type blktap_page_pool_ktype = {
+       .release       = blktap_page_pool_release,
+       .sysfs_ops     = &blktap_page_pool_sysfs_ops,
+       .default_attrs = blktap_page_pool_attrs,
+};
+
+/*
+ * mempool element allocator: allocate one page and mark it reserved.
+ * Returns NULL without trying when @gfp_mask does not include
+ * __GFP_WAIT, so such allocations never reach the page allocator here.
+ * @pool_data is unused.
+ */
+static void*
+__mempool_page_alloc(gfp_t gfp_mask, void *pool_data)
+{
+       struct page *page;
+
+       if (!(gfp_mask & __GFP_WAIT))
+               return NULL;
+
+       page = alloc_page(gfp_mask);
+       if (page)
+               SetPageReserved(page);
+
+       return page;
+}
+
+/*
+ * mempool element destructor: clear the reserved flag set by
+ * __mempool_page_alloc() and drop the page reference.  @pool_data is
+ * unused.
+ */
+static void
+__mempool_page_free(void *element, void *pool_data)
+{
+       struct page *page = element;
+
+       ClearPageReserved(page);
+       put_page(page);
+}
+
+/*
+ * Allocate a page pool of @nr_pages pages and register it as kobject
+ * @name in pool_set.
+ *
+ * Returns the pool's kobject, or NULL on allocation or registration
+ * failure.
+ */
+static struct kobject*
+blktap_page_pool_create(const char *name, int nr_pages)
+{
+       struct blktap_page_pool *pool;
+       int err;
+
+       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       if (!pool)
+               return NULL;
+
+       spin_lock_init(&pool->lock);
+       init_waitqueue_head(&pool->wait);
+
+       pool->bufs = mempool_create(nr_pages,
+                                   __mempool_page_alloc, __mempool_page_free,
+                                   pool);
+       if (!pool->bufs) {
+               kfree(pool);
+               return NULL;
+       }
+
+       kobject_init(&pool->kobj, &blktap_page_pool_ktype);
+       pool->kobj.kset = pool_set;
+       err = kobject_add(&pool->kobj, &pool_set->kobj, "%s", name);
+       if (err) {
+               /*
+                * Once kobject_init() has run, the object must be
+                * disposed of with kobject_put(): that frees the name
+                * and runs blktap_page_pool_release(), which destroys
+                * the mempool and frees the pool.
+                */
+               kobject_put(&pool->kobj);
+               return NULL;
+       }
+
+       return &pool->kobj;
+}
+
+/*
+ * Return the page pool called @name, creating it with
+ * POOL_DEFAULT_PAGES pages if it does not exist yet.  An existing pool
+ * is returned with the reference taken by __blktap_kset_find_obj().
+ * Returns ERR_PTR(-ENOMEM) if the pool can be neither found nor
+ * created.
+ *
+ * NOTE(review): lookup and creation are not done under a common lock
+ * in this function -- confirm callers serialise.
+ */
+struct blktap_page_pool*
+blktap_page_pool_get(const char *name)
+{
+       struct kobject *kobj;
+
+       kobj = __blktap_kset_find_obj(pool_set, name);
+       if (!kobj)
+               kobj = blktap_page_pool_create(name,
+                                              POOL_DEFAULT_PAGES);
+       if (!kobj)
+               return ERR_PTR(-ENOMEM);
+
+       return kobj_to_pool(kobj);
+}
+
+/*
+ * Set up the request slab cache, the request mempool and the "pools"
+ * kset under @parent.  Returns 0 or -ENOMEM.
+ *
+ * NOTE(review): a failure part-way through leaves the earlier objects
+ * allocated; presumably the caller runs blktap_page_pool_exit() on
+ * error -- confirm.
+ */
+int __init
+blktap_page_pool_init(struct kobject *parent)
+{
+       request_cache =
+               kmem_cache_create("blktap-request",
+                                 sizeof(struct blktap_request), 0,
+                                 0, blktap_request_ctor);
+       if (!request_cache)
+               return -ENOMEM;
+
+       request_pool =
+               mempool_create_slab_pool(POOL_MIN_REQS, request_cache);
+       if (!request_pool)
+               return -ENOMEM;
+
+       pool_set = kset_create_and_add("pools", NULL, parent);
+       if (!pool_set)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Undo blktap_page_pool_init() in reverse order.  Each object is only
+ * torn down if it exists, and its pointer is reset afterwards.  BUGs if
+ * any page pool is still registered in pool_set.
+ */
+void
+blktap_page_pool_exit(void)
+{
+       if (pool_set) {
+               BUG_ON(!list_empty(&pool_set->list));
+               kset_unregister(pool_set);
+               pool_set = NULL;
+       }
+
+       if (request_pool) {
+               mempool_destroy(request_pool);
+               request_pool = NULL;
+       }
+
+       if (request_cache) {
+               kmem_cache_destroy(request_cache);
+               request_cache = NULL;
+       }
+}
diff --git a/drivers/xen/blktap2-new/ring.c b/drivers/xen/blktap2-new/ring.c

new file mode 100644 (file)

index 0000000..24ce5c9
--- /dev/null
+++ b/drivers/xen/blktap2-new/ring.c
@@ -0,0 +1,547 @@
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/poll.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+/* Character-device major of the ring devices; 0 while unregistered. */
+int blktap_ring_major;
+
+ /* 
+  * BLKTAP - immediately before the mmap area,
+  * we have a bunch of pages reserved for shared memory rings.
+  */
+#define RING_PAGES 1
+
+/*
+ * Process one response copied off the shared ring.
+ *
+ * rsp->id selects the pending request.  A response with an out-of-range
+ * id (-ERANGE) or with no matching pending request (-ESRCH) is logged
+ * and dropped.  A response whose operation does not match the request
+ * is logged and the request is completed with -EINVAL.  Otherwise the
+ * request is completed with 0 or -EIO according to rsp->status.
+ */
+static void
+blktap_ring_read_response(struct blktap *tap,
+                    const struct blkif_response *rsp)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct blktap_request *request = NULL;
+       int usr_idx, err;
+
+       usr_idx = rsp->id;
+       if (usr_idx < 0 || usr_idx >= MAX_PENDING_REQS) {
+               err = -ERANGE;
+               goto invalid;
+       }
+
+       request = ring->pending[usr_idx];
+
+       if (!request) {
+               err = -ESRCH;
+               goto invalid;
+       }
+
+       if (rsp->operation != request->operation) {
+               err = -EINVAL;
+               goto invalid;
+       }
+
+       dev_dbg(ring->dev,
+               "request %d [%p] response: %d\n",
+               request->usr_idx, request, rsp->status);
+
+       err = rsp->status == BLKIF_RSP_OKAY ? 0 : -EIO;
+       blktap_device_end_request(tap, request, err);
+       return;
+
+invalid:
+       /*
+        * request is still NULL for -ERANGE and -ESRCH, so it must not
+        * be dereferenced for the log message; print -1 in that case.
+        */
+       dev_warn(ring->dev,
+                "invalid response, idx:%d status:%d op:%d/%d: err %d\n",
+                usr_idx, rsp->status,
+                rsp->operation, request ? (int)request->operation : -1,
+                err);
+       if (request)
+               blktap_device_end_request(tap, request, err);
+}
+
+/*
+ * Consume all responses currently published on the shared ring, with
+ * the caller's mmap_sem held for reading.  Does nothing if the ring is
+ * not mapped.
+ */
+static void
+blktap_read_ring(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct blkif_response rsp;
+       RING_IDX rc, rp;
+
+       down_read(&current->mm->mmap_sem);
+       if (!ring->vma) {
+               up_read(&current->mm->mmap_sem);
+               return;
+       }
+
+       /* for each outstanding message on the ring  */
+       rp = ring->ring.sring->rsp_prod;
+       rmb();
+
+       for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
+               /* work on a private copy of the shared-ring slot */
+               memcpy(&rsp, RING_GET_RESPONSE(&ring->ring, rc), sizeof(rsp));
+               blktap_ring_read_response(tap, &rsp);
+       }
+
+       ring->ring.rsp_cons = rc;
+
+       up_read(&current->mm->mmap_sem);
+}
+
+/* vm_ops fault handler: every fault in the ring VMA yields SIGBUS. */
+static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       return VM_FAULT_SIGBUS;
+}
+
+/* Complete every request still pending on the ring with -EIO. */
+static void
+blktap_ring_fail_pending(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+       int usr_idx;
+
+       for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+               struct blktap_request *request = ring->pending[usr_idx];
+
+               if (request)
+                       blktap_device_end_request(tap, request, -EIO);
+       }
+}
+
+/*
+ * vm_ops close handler: fail all pending requests, unmap and free the
+ * shared ring page, and mark the ring unmapped.  Destroys the tap if a
+ * shutdown has been requested.
+ */
+static void
+blktap_ring_vm_close(struct vm_area_struct *vma)
+{
+       struct blktap *tap = vma->vm_private_data;
+       struct blktap_ring *ring = &tap->ring;
+       struct page *page = virt_to_page(ring->ring.sring);
+
+       blktap_ring_fail_pending(tap);
+
+       zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+       ClearPageReserved(page);
+       __free_page(page);
+
+       ring->vma = NULL;
+
+       if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               blktap_control_destroy_tap(tap);
+}
+
+/* VMA operations installed on the ring mapping by blktap_ring_mmap(). */
+static struct vm_operations_struct blktap_ring_vm_operations = {
+       .close    = blktap_ring_vm_close,
+       .fault    = blktap_ring_fault,
+};
+
+/*
+ * Insert the pool page of segment @seg into the ring VMA, at the user
+ * address MMAP_VADDR() yields for the request's slot.  Returns the
+ * vm_insert_page() result.
+ */
+int
+blktap_ring_map_segment(struct blktap *tap,
+                       struct blktap_request *request,
+                       int seg)
+{
+       struct blktap_ring *ring = &tap->ring;
+       unsigned long uaddr = MMAP_VADDR(ring->user_vstart,
+                                        request->usr_idx, seg);
+
+       return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
+}
+
+/*
+ * Map every page of @request into the ring VMA.  For anything other
+ * than a read, each segment's data is first bounced into its pool
+ * page.  On a mapping error the request is unmapped again and the
+ * error returned; returns 0 on success.
+ */
+int
+blktap_ring_map_request(struct blktap *tap,
+                       struct blktap_request *request)
+{
+       const int write = request->operation != BLKIF_OP_READ;
+       int seg;
+
+       for (seg = 0; seg < request->nr_pages; seg++) {
+               int err;
+
+               if (write)
+                       blktap_request_bounce(tap, request, seg, 1);
+
+               err = blktap_ring_map_segment(tap, request, seg);
+               if (err) {
+                       blktap_ring_unmap_request(tap, request);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Remove @request's pages from the ring VMA.  For anything other than
+ * a write, every segment is first bounced from its pool page back into
+ * the request's scatterlist buffer.
+ */
+void
+blktap_ring_unmap_request(struct blktap *tap,
+                         struct blktap_request *request)
+{
+       struct blktap_ring *ring = &tap->ring;
+       unsigned long uaddr;
+       unsigned size;
+       int seg, read;
+
+       uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
+       size  = request->nr_pages << PAGE_SHIFT;
+       read  = request->operation != BLKIF_OP_WRITE;
+
+       if (read)
+               for (seg = 0; seg < request->nr_pages; seg++)
+                       blktap_request_bounce(tap, request, seg, 0);
+
+       zap_page_range(ring->vma, uaddr, size, NULL);
+}
+
+/*
+ * Release the pending slot held by @request, decrement the pending
+ * count and free the request.
+ */
+void
+blktap_ring_free_request(struct blktap *tap,
+                        struct blktap_request *request)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       ring->pending[request->usr_idx] = NULL;
+       ring->n_pending--;
+
+       blktap_request_free(tap, request);
+}
+
+/*
+ * Allocate a request and assign it the first free pending slot.
+ *
+ * Returns the request, ERR_PTR(-ENOSPC) if the ring is full, or
+ * ERR_PTR(-ENOMEM) if no request struct could be allocated.
+ *
+ * NOTE(review): the slot search is bounded by BLK_RING_SIZE while
+ * other loops over ring->pending in this file use MAX_PENDING_REQS --
+ * confirm the two are equal.
+ */
+struct blktap_request*
+blktap_ring_make_request(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct blktap_request *request;
+       int usr_idx;
+
+       if (RING_FULL(&ring->ring))
+               return ERR_PTR(-ENOSPC);
+
+       request = blktap_request_alloc(tap);
+       if (!request)
+               return ERR_PTR(-ENOMEM);
+
+       for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
+               if (!ring->pending[usr_idx])
+                       break;
+
+       /* a free slot is expected whenever the ring is not full */
+       BUG_ON(usr_idx >= BLK_RING_SIZE);
+
+       request->tap     = tap;
+       request->usr_idx = usr_idx;
+
+       ring->pending[usr_idx] = request;
+       ring->n_pending++;
+
+       return request;
+}
+
+/*
+ * Fill the next private ring slot from @request (id, start sector,
+ * operation, per-segment sector ranges), advance req_prod_pvt, stamp
+ * the request time and update the per-operation statistics.  The slot
+ * is not pushed to the shared ring here.
+ */
+void
+blktap_ring_submit_request(struct blktap *tap,
+                          struct blktap_request *request)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct blkif_request *breq;
+       struct scatterlist *sg;
+       int i, nsecs = 0;
+
+       dev_dbg(ring->dev,
+               "request %d [%p] submit\n", request->usr_idx, request);
+
+       breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+
+       breq->id            = request->usr_idx;
+       breq->sector_number = blk_rq_pos(request->rq);
+       breq->handle        = 0;
+       breq->operation     = request->operation;
+       breq->nr_segments   = request->nr_pages;
+
+       blktap_for_each_sg(sg, request, i) {
+               struct blkif_request_segment *seg = &breq->seg[i];
+               int first, count;
+
+               /* byte length/offset converted to 512-byte sectors */
+               count = sg->length >> 9;
+               first = sg->offset >> 9;
+
+               seg->first_sect = first;
+               seg->last_sect  = first + count - 1;
+
+               nsecs += count;
+       }
+
+       ring->ring.req_prod_pvt++;
+
+       do_gettimeofday(&request->time);
+
+
+       switch (request->operation) {
+       case BLKIF_OP_WRITE:
+               tap->stats.st_wr_sect += nsecs;
+               tap->stats.st_wr_req++;
+               break;
+
+       case BLKIF_OP_READ:
+               tap->stats.st_rd_sect += nsecs;
+               tap->stats.st_rd_req++;
+               break;
+
+       case BLKIF_OP_PACKET:
+               tap->stats.st_pk_req++;
+               break;
+       }
+}
+
+/*
+ * open() on a ring device: bind the opening task to the tap selected
+ * by the inode's minor.
+ *
+ * Returns -ENXIO if there is no such tap or it is shutting down,
+ * -EBUSY if the ring already has an owner task, 0 otherwise.
+ */
+static int
+blktap_ring_open(struct inode *inode, struct file *filp)
+{
+       struct blktap *tap = NULL;
+       int minor;
+
+       minor = iminor(inode);
+
+       if (minor < blktap_max_minor)
+               tap = blktaps[minor];
+
+       if (!tap)
+               return -ENXIO;
+
+       if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               return -ENXIO;
+
+       if (tap->ring.task)
+               return -EBUSY;
+
+       filp->private_data = tap;
+       tap->ring.task = current;
+
+       return 0;
+}
+
+/*
+ * release() on a ring device: wait until the block device has been
+ * destroyed, drop the owner task, and destroy the tap if a shutdown
+ * was requested.  Always returns 0.
+ */
+static int
+blktap_ring_release(struct inode *inode, struct file *filp)
+{
+       struct blktap *tap = filp->private_data;
+
+       blktap_device_destroy_sync(tap);
+
+       tap->ring.task = NULL;
+
+       if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               blktap_control_destroy_tap(tap);
+
+       return 0;
+}
+
+/*
+ * mmap() on a ring device: allocate a zeroed page for the shared ring,
+ * insert it at the start of @vma, initialise the shared and front
+ * rings on it and record the mapping.  Request data is mapped from
+ * ring->user_vstart, one page past the ring page.
+ *
+ * Returns 0, -EBUSY if the ring is already mapped, -ENOMEM if no page
+ * is available, or the vm_insert_page() error.
+ */
+static int
+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       struct blktap *tap = filp->private_data;
+       struct blktap_ring *ring = &tap->ring;
+       struct blkif_sring *sring;
+       struct page *page = NULL;
+       int err;
+
+       if (ring->vma)
+               return -EBUSY;
+
+       page = alloc_page(GFP_KERNEL|__GFP_ZERO);
+       if (!page)
+               return -ENOMEM;
+
+       SetPageReserved(page);
+
+       err = vm_insert_page(vma, vma->vm_start, page);
+       if (err)
+               goto fail;
+
+       sring = page_address(page);
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
+       ring->ring_vstart = vma->vm_start;
+       ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
+
+       vma->vm_private_data = tap;
+
+       vma->vm_flags |= VM_DONTCOPY;
+       vma->vm_flags |= VM_RESERVED;
+
+       vma->vm_ops = &blktap_ring_vm_operations;
+
+       ring->vma = vma;
+       return 0;
+
+fail:
+       /* undo the page setup done above */
+       if (page) {
+               zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
+               ClearPageReserved(page);
+               __free_page(page);
+       }
+
+       return err;
+}
+
+/*
+ * ioctl() on a ring device.  Only honoured when the ring is mapped and
+ * the mapping belongs to the calling process (-EACCES otherwise).
+ *
+ *   BLKTAP2_IOCTL_KICK_FE        consume responses from the ring
+ *   BLKTAP2_IOCTL_CREATE_DEVICE  create the block device from the
+ *                                struct blktap_params at @arg
+ *   BLKTAP2_IOCTL_REMOVE_DEVICE  destroy the block device
+ *
+ * Unknown commands return -ENOIOCTLCMD.
+ */
+static long
+blktap_ring_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       struct blktap *tap = filp->private_data;
+       struct blktap_ring *ring = &tap->ring;
+
+       BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
+       if (!ring->vma || ring->vma->vm_mm != current->mm)
+               return -EACCES;
+
+       switch(cmd) {
+       case BLKTAP2_IOCTL_KICK_FE:
+
+               blktap_read_ring(tap);
+               return 0;
+
+       case BLKTAP2_IOCTL_CREATE_DEVICE: {
+               struct blktap_params params;
+               void __user *ptr = (void *)arg;
+
+               if (!arg)
+                       return -EINVAL;
+
+               if (copy_from_user(&params, ptr, sizeof(params)))
+                       return -EFAULT;
+
+               return blktap_device_create(tap, &params);
+       }
+
+       case BLKTAP2_IOCTL_REMOVE_DEVICE:
+
+               return blktap_device_destroy(tap);
+       }
+
+       return -ENOIOCTLCMD;
+}
+
+/*
+ * poll() on a ring device.  Registers on the page pool and ring wait
+ * queues, runs the device queue if the ring is mapped and a disk
+ * exists, then pushes newly produced requests to the shared ring.
+ *
+ * Reports POLLIN | POLLRDNORM when requests were pushed, a ring
+ * message is set, or the BLKTAP_DEVICE_CLOSED bit was set (it is
+ * cleared by the test).
+ *
+ * NOTE(review): ring->ring.sring is dereferenced below without the
+ * ring->vma check applied to the queue run above; verify poll cannot
+ * be reached before mmap or after blktap_ring_vm_close() has freed the
+ * ring page.
+ */
+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
+{
+       struct blktap *tap = filp->private_data;
+       struct blktap_ring *ring = &tap->ring;
+       int work;
+
+       poll_wait(filp, &tap->pool->wait, wait);
+       poll_wait(filp, &ring->poll_wait, wait);
+
+       down_read(&current->mm->mmap_sem);
+       if (ring->vma && tap->device.gd)
+               blktap_device_run_queue(tap);
+       up_read(&current->mm->mmap_sem);
+
+       /* requests produced privately but not yet visible to user space */
+       work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
+       RING_PUSH_REQUESTS(&ring->ring);
+
+       if (work ||
+           ring->ring.sring->private.tapif_user.msg ||
+           test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
+               return POLLIN | POLLRDNORM;
+
+       return 0;
+}
+
+/* File operations of the ring character devices. */
+static const struct file_operations blktap_ring_file_operations = {
+       .owner    = THIS_MODULE,
+       .open     = blktap_ring_open,
+       .release  = blktap_ring_release,
+       .unlocked_ioctl = blktap_ring_ioctl,
+       .mmap     = blktap_ring_mmap,
+       .poll     = blktap_ring_poll,
+};
+
+/* Wake everything sleeping on the ring's poll wait queue. */
+void
+blktap_ring_kick_user(struct blktap *tap)
+{
+       wake_up(&tap->ring.poll_wait);
+}
+
+/*
+ * Report whether the ring can be destroyed: -EBUSY while it still has
+ * an owner task or a mapping, 0 otherwise.  Nothing is freed here.
+ */
+int
+blktap_ring_destroy(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       return (ring->task || ring->vma) ? -EBUSY : 0;
+}
+
+/*
+ * Initialise the ring's poll wait queue and compute its device number
+ * from blktap_ring_major and the tap's minor.  Always returns 0.
+ */
+int
+blktap_ring_create(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       init_waitqueue_head(&ring->poll_wait);
+       ring->devno = MKDEV(blktap_ring_major, tap->minor);
+
+       return 0;
+}
+
+/*
+ * Format the list of pending ring requests of @tap into @buf (at most
+ * @size bytes).  Returns the number of bytes written.
+ *
+ * scnprintf() keeps the cursor inside @buf when the output is
+ * truncated.  The timestamp fraction is tv_usec, so it is printed as
+ * six zero-padded digits.
+ */
+size_t
+blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
+{
+       struct blktap_ring *ring = &tap->ring;
+       char *s = buf, *end = buf + size;
+       int usr_idx;
+
+       s += scnprintf(s, end - s,
+                      "begin pending:%d\n", ring->n_pending);
+
+       for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
+               struct blktap_request *request;
+               struct timeval *time;
+               char op = '?';
+
+               request = ring->pending[usr_idx];
+               if (!request)
+                       continue;
+
+               switch (request->operation) {
+               case BLKIF_OP_WRITE:  op = 'W'; break;
+               case BLKIF_OP_READ:   op = 'R'; break;
+               case BLKIF_OP_PACKET: op = 'P'; break;
+               }
+               time  = &request->time;
+
+               s += scnprintf(s, end - s,
+                              "%02d: usr_idx:%02d "
+                              "op:%c nr_pages:%02d time:%lu.%06lu\n",
+                              usr_idx, request->usr_idx,
+                              op, request->nr_pages,
+                              time->tv_sec, time->tv_usec);
+       }
+
+       s += scnprintf(s, end - s, "end pending\n");
+
+       return s - buf;
+}
+
+
+/*
+ * Register the "blktap2" character device range with a dynamically
+ * allocated major and record it in blktap_ring_major.  Returns 0 or
+ * the negative registration error.
+ */
+int __init
+blktap_ring_init(void)
+{
+       int major;
+
+       major = __register_chrdev(0, 0, CONFIG_XEN_NR_TAP2_DEVICES, "blktap2",
+                                 &blktap_ring_file_operations);
+       if (major < 0) {
+               BTERR("error registering ring devices: %d\n", major);
+               return major;
+       }
+
+       blktap_ring_major = major;
+       BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+       return 0;
+}
+
+/*
+ * Unregister the "blktap2" character device range, if registered, and
+ * reset blktap_ring_major to 0.
+ */
+void
+blktap_ring_exit(void)
+{
+       if (!blktap_ring_major)
+               return;
+
+       __unregister_chrdev(blktap_ring_major, 0, CONFIG_XEN_NR_TAP2_DEVICES,
+                           "blktap2");
+
+       blktap_ring_major = 0;
+}
diff --git a/drivers/xen/blktap2-new/sysfs.c b/drivers/xen/blktap2-new/sysfs.c

new file mode 100644 (file)

index 0000000..fd8b27b
--- /dev/null
+++ b/drivers/xen/blktap2-new/sysfs.c
@@ -0,0 +1,299 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+
+/*
+ * sysfs store handler for the "name" attribute: copy @buf into tap->name.
+ *
+ * Returns @size on success, 0 when the device has no tap attached,
+ * -ENAMETOOLONG when @size exceeds BLKTAP2_MAX_MESSAGE_LEN and -EINVAL
+ * when @buf holds no NUL byte within its first @size bytes.
+ */
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr, const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       if (!tap)
+               return 0;
+
+       if (size > BLKTAP2_MAX_MESSAGE_LEN)
+               return -ENAMETOOLONG;
+
+       /* The string must terminate inside the supplied buffer. */
+       if (strnlen(buf, size) >= size)
+               return -EINVAL;
+
+       strlcpy(tap->name, buf, size);
+       return size;
+}
+
+/*
+ * sysfs show handler for the "name" attribute.  Prints tap->name, or the
+ * minor number while the name is still empty.  Returns the number of
+ * bytes written, or 0 when the device has no tap attached.
+ */
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       if (!tap)
+               return 0;
+
+       if (!tap->name[0])
+               return sprintf(buf, "%d\n", tap->minor);
+
+       return sprintf(buf, "%s\n", tap->name);
+}
+static DEVICE_ATTR(name, S_IRUGO|S_IWUSR,
+                  blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+/* Work item body: destroy the tap that embeds @work as its remove_work. */
+static void
+blktap_sysfs_remove_work(struct work_struct *work)
+{
+       struct blktap *tap;
+
+       tap = container_of(work, struct blktap, remove_work);
+       blktap_control_destroy_tap(tap);
+}
+
+/*
+ * sysfs store handler for the "remove" attribute.
+ *
+ * The first writer sets BLKTAP_SHUTDOWN_REQUESTED and starts the
+ * shutdown; every writer then sleeps until the device's drvdata has been
+ * cleared.  Returns @size, or the error from an interrupted wait.
+ */
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev,
+                          struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       int rc;
+
+       if (!tap)
+               return size;
+
+       if (!test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
+               if (!tap->ring.vma) {
+                       /* No ring vma: destroy the tap from a work item. */
+                       INIT_WORK(&tap->remove_work, blktap_sysfs_remove_work);
+                       schedule_work(&tap->remove_work);
+               } else {
+                       struct blkif_sring *sring = tap->ring.ring.sring;
+
+                       /* Post a CLOSE message on the ring and kick its user. */
+                       sring->private.tapif_user.msg = BLKTAP2_RING_MESSAGE_CLOSE;
+                       blktap_ring_kick_user(tap);
+               }
+       }
+
+       rc = wait_event_interruptible(tap->remove_wait,
+                                     !dev_get_drvdata(dev));
+       if (rc)
+               return rc;
+
+       return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+/*
+ * sysfs show handler for the "debug" attribute: concatenate the control,
+ * request, device and ring debug dumps into @buf, giving each dumper the
+ * space left of PAGE_SIZE.  Returns the accumulated length, or 0 when
+ * the device has no tap attached.
+ */
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       ssize_t len = 0;
+
+       if (!tap)
+               return 0;
+
+       len += blktap_control_debug(tap, buf + len, PAGE_SIZE - len);
+       len += blktap_request_debug(tap, buf + len, PAGE_SIZE - len);
+       len += blktap_device_debug(tap, buf + len, PAGE_SIZE - len);
+       len += blktap_ring_debug(tap, buf + len, PAGE_SIZE - len);
+
+       return len;
+}
+static DEVICE_ATTR(debug, S_IRUGO, blktap_sysfs_debug_device, NULL);
+
+/*
+ * sysfs show handler for the "task" attribute: print the pid of
+ * tap->ring.task.  Produces no output (returns 0) when there is no tap
+ * or no ring task.
+ */
+static ssize_t
+blktap_sysfs_show_task(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       if (!tap || !tap->ring.task)
+               return 0;
+
+       return sprintf(buf, "%d\n", tap->ring.task->pid);
+}
+static DEVICE_ATTR(task, S_IRUGO, blktap_sysfs_show_task, NULL);
+
+/*
+ * sysfs show handler for the "pool" attribute: print the kobject name of
+ * the tap's page pool.
+ *
+ * Returns the number of bytes written, or 0 when the device no longer
+ * has a tap attached (blktap_sysfs_destroy() clears drvdata before the
+ * device is unregistered, so a NULL tap must be tolerated here just as
+ * in the other attribute handlers).
+ */
+static ssize_t
+blktap_sysfs_show_pool(struct device *dev,
+                      struct device_attribute *attr,
+                      char *buf)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       if (!tap)
+               return 0;
+
+       return sprintf(buf, "%s", kobject_name(&tap->pool->kobj));
+}
+
+/*
+ * sysfs store handler for the "pool" attribute: switch the tap over to
+ * the page pool returned by blktap_page_pool_get(@buf) and drop the
+ * kobject reference on the pool it used before.
+ *
+ * Returns @size on success, -ENODEV when the device no longer has a tap
+ * attached, -EBUSY once the tap has a gendisk, or the error encoded in
+ * the pointer returned by blktap_page_pool_get().
+ */
+static ssize_t
+blktap_sysfs_store_pool(struct device *dev,
+                       struct device_attribute *attr,
+                       const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       struct blktap_page_pool *pool, *old;
+
+       /*
+        * blktap_sysfs_destroy() clears drvdata before unregistering the
+        * device.  Report an error rather than 0 so that a writer does
+        * not keep retrying the same bytes.
+        */
+       if (!tap)
+               return -ENODEV;
+
+       if (tap->device.gd)
+               return -EBUSY;
+
+       pool = blktap_page_pool_get(buf);
+       if (IS_ERR(pool))
+               return PTR_ERR(pool);
+
+       old = tap->pool;
+       tap->pool = pool;
+       kobject_put(&old->kobj);
+
+       return size;
+}
+static DEVICE_ATTR(pool, S_IRUSR|S_IWUSR,
+                  blktap_sysfs_show_pool, blktap_sysfs_store_pool);
+
+/*
+ * Create the sysfs device for @tap together with its attribute files
+ * (name, remove, debug, task, pool).
+ *
+ * On success the device is recorded in tap->ring.dev and 0 is returned.
+ * On failure a negative errno is returned; a device that had already
+ * been created is unregistered again.
+ */
+int
+blktap_sysfs_create(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct device *dev;
+       int err;
+
+       init_waitqueue_head(&tap->remove_wait);
+
+       dev = device_create(class, NULL, ring->devno,
+                           tap, "blktap%d", tap->minor);
+       /*
+        * If creation itself failed, @dev is an ERR_PTR: there is nothing
+        * to undo and it must not be passed to device_unregister().
+        */
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
+
+       err = device_create_file(dev, &dev_attr_name);
+       if (!err)
+               err = device_create_file(dev, &dev_attr_remove);
+       if (!err)
+               err = device_create_file(dev, &dev_attr_debug);
+       if (!err)
+               err = device_create_file(dev, &dev_attr_task);
+       if (!err)
+               err = device_create_file(dev, &dev_attr_pool);
+       if (err) {
+               device_unregister(dev);
+               return err;
+       }
+
+       ring->dev = dev;
+       return 0;
+}
+
+/*
+ * Detach @tap from its sysfs device and unregister the device.
+ *
+ * drvdata is cleared and remove_wait is woken before the device goes
+ * away; writers of the "remove" attribute sleep on exactly that
+ * condition.  Does nothing when no device was recorded.
+ */
+void
+blktap_sysfs_destroy(struct blktap *tap)
+{
+       struct device *dev = tap->ring.dev;
+
+       if (!dev)
+               return;
+
+       dev_set_drvdata(dev, NULL);
+       wake_up(&tap->remove_wait);
+
+       device_unregister(dev);
+       tap->ring.dev = NULL;
+}
+
+/* Class attribute "verbosity" (read side): print blktap_debug_level. */
+static ssize_t
+blktap_sysfs_show_verbosity(struct class *class, struct class_attribute *attr,
+                           char *buf)
+{
+       int level = blktap_debug_level;
+
+       return sprintf(buf, "%d\n", level);
+}
+
+/*
+ * Class attribute "verbosity" (write side): parse a decimal integer from
+ * @buf into blktap_debug_level.  Returns @size, or -EINVAL when no
+ * integer could be parsed.
+ */
+static ssize_t
+blktap_sysfs_set_verbosity(struct class *class, struct class_attribute *attr,
+                          const char *buf, size_t size)
+{
+       int level;
+
+       if (sscanf(buf, "%d", &level) != 1)
+               return -EINVAL;
+
+       blktap_debug_level = level;
+       return size;
+}
+static CLASS_ATTR(verbosity, S_IRUGO|S_IWUSR,
+                 blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
+/*
+ * Class attribute "devices": list "<minor> <name>" for every tap that
+ * has BLKTAP_DEVICE set, one per line, under blktap_lock.
+ *
+ * Output is bounded to the PAGE_SIZE buffer sysfs hands to show
+ * handlers; entries that no longer fit are truncated rather than
+ * written past the end of @buf.  Returns the number of bytes written.
+ */
+static ssize_t
+blktap_sysfs_show_devices(struct class *class, struct class_attribute *attr,
+                         char *buf)
+{
+       ssize_t len = 0;
+       struct blktap *tap;
+       int i;
+
+       mutex_lock(&blktap_lock);
+
+       for (i = 0; i < blktap_max_minor; i++) {
+               tap = blktaps[i];
+               if (!tap || !test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+                       continue;
+
+               /* scnprintf() returns what was actually stored, so len < PAGE_SIZE. */
+               len += scnprintf(buf + len, PAGE_SIZE - len, "%d %s\n",
+                                tap->minor, tap->name);
+       }
+
+       mutex_unlock(&blktap_lock);
+
+       return len;
+}
+static CLASS_ATTR(devices, S_IRUGO, blktap_sysfs_show_devices, NULL);
+
+/*
+ * Class devnode callback: build the node name
+ * BLKTAP2_DEV_DIR "blktap<minor>" in a kasprintf()-allocated string.
+ */
+static char *blktap_devnode(struct device *dev, umode_t *mode)
+{
+       unsigned int minor = MINOR(dev->devt);
+
+       return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u", minor);
+}
+
+/* Destroy the blktap2 class, if blktap_sysfs_init() managed to set it up. */
+void
+blktap_sysfs_exit(void)
+{
+       if (!class)
+               return;
+
+       class_destroy(class);
+}
+
+/*
+ * Create the "blktap2" class with its devnode callback and the
+ * "verbosity" and "devices" class attributes.
+ *
+ * The file-scope class pointer is only published once everything
+ * succeeded; on any failure the local class is handed to class_destroy()
+ * and the error is returned.
+ */
+int __init
+blktap_sysfs_init(void)
+{
+       struct class *cls;
+       int err;
+
+       cls = class_create(THIS_MODULE, "blktap2");
+       if (IS_ERR(cls)) {
+               err = PTR_ERR(cls);
+               goto fail;
+       }
+
+       cls->devnode = blktap_devnode;
+
+       err = class_create_file(cls, &class_attr_verbosity);
+       if (err)
+               goto fail;
+
+       err = class_create_file(cls, &class_attr_devices);
+       if (err)
+               goto fail;
+
+       class = cls;
+       return 0;
+
+fail:
+       class_destroy(cls);
+       return err;
+}
diff --git a/drivers/xen/blktap2/Makefile b/drivers/xen/blktap2/Makefile

new file mode 100644 (file)

index 0000000..8bb330c
--- /dev/null
+++ b/drivers/xen/blktap2/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_BLKDEV_TAP2) := blktap2.o
+
+blktap2-y := control.o ring.o wait_queue.o device.o request.o
+blktap2-$(CONFIG_SYSFS) += sysfs.o
diff --git a/drivers/xen/blktap2/blktap.h b/drivers/xen/blktap2/blktap.h

new file mode 100644 (file)

index 0000000..4726348
--- /dev/null
+++ b/drivers/xen/blktap2/blktap.h
@@ -0,0 +1,264 @@
+#ifndef _BLKTAP_H_
+#define _BLKTAP_H_
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/init.h>
+#include <linux/scatterlist.h>
+#include <xen/blkif.h>
+#include <xen/gnttab.h>
+
+//#define ENABLE_PASSTHROUGH
+
+extern int blktap_debug_level;
+
+#define BTPRINTK(level, tag, force, _f, _a...)                         \
+       do {                                                            \
+               if (blktap_debug_level > level &&                       \
+                   (force || printk_ratelimit()))                      \
+                       printk(tag "%s: " _f, __func__, ##_a);          \
+       } while (0)
+
+#define BTDBG(_f, _a...)             BTPRINTK(8, KERN_DEBUG, 1, _f, ##_a)
+#define BTINFO(_f, _a...)            BTPRINTK(0, KERN_INFO, 0, _f, ##_a)
+#define BTWARN(_f, _a...)            BTPRINTK(0, KERN_WARNING, 0, _f, ##_a)
+#define BTERR(_f, _a...)             BTPRINTK(0, KERN_ERR, 0, _f, ##_a)
+
+#define BLKTAP2_DEV_DIR "xen/blktap-2/"
+
+#define BLKTAP_CONTROL               1
+#define BLKTAP_RING_FD               2
+#define BLKTAP_RING_VMA              3
+#define BLKTAP_DEVICE                4
+#define BLKTAP_SYSFS                 5
+#define BLKTAP_PAUSE_REQUESTED       6
+#define BLKTAP_PAUSED                7
+#define BLKTAP_SHUTDOWN_REQUESTED    8
+#define BLKTAP_PASSTHROUGH           9
+#define BLKTAP_DEFERRED              10
+
+/* blktap IOCTLs: */
+#define BLKTAP2_IOCTL_KICK_FE        1
+#define BLKTAP2_IOCTL_ALLOC_TAP             200
+#define BLKTAP2_IOCTL_FREE_TAP       201
+#define BLKTAP2_IOCTL_CREATE_DEVICE  202
+#define BLKTAP2_IOCTL_SET_PARAMS     203
+#define BLKTAP2_IOCTL_PAUSE          204
+#define BLKTAP2_IOCTL_REOPEN         205
+#define BLKTAP2_IOCTL_RESUME         206
+
+#define BLKTAP2_MAX_MESSAGE_LEN      256
+
+#define BLKTAP2_RING_MESSAGE_PAUSE   1
+#define BLKTAP2_RING_MESSAGE_RESUME  2
+#define BLKTAP2_RING_MESSAGE_CLOSE   3
+
+#define BLKTAP_REQUEST_FREE          0
+#define BLKTAP_REQUEST_PENDING       1
+
+/*
+ * The maximum number of requests that can be outstanding at any time
+ * is determined by
+ *
+ *   [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] 
+ *
+ * where mmap_alloc < MAX_DYNAMIC_MEM.
+ *
+ * TODO:
+ * mmap_alloc is initialised to 2 and should be adjustable on the fly via
+ * sysfs.
+ */
+#define BLK_RING_SIZE          __CONST_RING_SIZE(blkif, PAGE_SIZE)
+#define MAX_DYNAMIC_MEM                BLK_RING_SIZE
+#define MAX_PENDING_REQS       BLK_RING_SIZE
+#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
+#define MMAP_VADDR(_start, _req, _seg)                                 \
+        (_start +                                                       \
+         ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +        \
+         ((_seg) * PAGE_SIZE))
+
+#define blktap_get(_b) (atomic_inc(&(_b)->refcnt))
+#define blktap_put(_b)                                 \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       wake_up(&(_b)->wq);             \
+       } while (0)
+
+struct blktap;
+
+struct grant_handle_pair {
+       grant_handle_t                 kernel;
+       grant_handle_t                 user;
+};
+#define INVALID_GRANT_HANDLE           0xFFFF
+
+struct blktap_handle {
+       unsigned int                   ring;
+       unsigned int                   device;
+       unsigned int                   minor;
+};
+
+struct blktap_params {
+       char                           name[BLKTAP2_MAX_MESSAGE_LEN];
+       unsigned long long             capacity;
+       unsigned long                  sector_size;
+};
+
+struct blktap_device {
+       int                            users;
+       spinlock_t                     lock;
+       struct gendisk                *gd;
+
+#ifdef ENABLE_PASSTHROUGH
+       struct block_device           *bdev;
+#endif
+};
+
+struct blktap_ring {
+       struct vm_area_struct         *vma;
+       blkif_front_ring_t             ring;
+       struct vm_foreign_map          foreign_map;
+       unsigned long                  ring_vstart;
+       unsigned long                  user_vstart;
+
+       int                            response;
+
+       wait_queue_head_t              poll_wait;
+
+       dev_t                          devno;
+       struct device                 *dev;
+       atomic_t                       sysfs_refcnt;
+       struct mutex                   sysfs_mutex;
+};
+
+struct blktap_statistics {
+       unsigned long                  st_print;
+       int                            st_rd_req;
+       int                            st_wr_req;
+       int                            st_oo_req;
+       int                            st_pk_req;
+       int                            st_rd_sect;
+       int                            st_wr_sect;
+       s64                            st_rd_cnt;
+       s64                            st_rd_sum_usecs;
+       s64                            st_rd_max_usecs;
+       s64                            st_wr_cnt;
+       s64                            st_wr_sum_usecs;
+       s64                            st_wr_max_usecs; 
+};
+
+struct blktap_request {
+       uint64_t                       id;
+       uint16_t                       usr_idx;
+
+       uint8_t                        status;
+       atomic_t                       pendcnt;
+       uint8_t                        nr_pages;
+       unsigned short                 operation;
+
+       struct timeval                 time;
+       struct grant_handle_pair       handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       struct list_head               free_list;
+};
+
+struct blktap {
+       int                            minor;
+       pid_t                          pid;
+       atomic_t                       refcnt;
+       unsigned long                  dev_inuse;
+
+       struct blktap_params           params;
+
+       struct rw_semaphore            tap_sem;
+
+       struct blktap_ring             ring;
+       struct blktap_device           device;
+
+       int                            pending_cnt;
+       struct blktap_request         *pending_requests[MAX_PENDING_REQS];
+       struct scatterlist             sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+
+       wait_queue_head_t              wq;
+       struct list_head               deferred_queue;
+
+       struct blktap_statistics       stats;
+};
+
+extern struct blktap *blktaps[];
+
+static inline int
+blktap_active(struct blktap *tap)
+{
+       return test_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
+}
+
+/*
+ * Validate user-supplied device parameters.  For now this only forces
+ * NUL termination of the name and logs the values; it always returns 0.
+ */
+static inline int
+blktap_validate_params(struct blktap *tap, struct blktap_params *params)
+{
+       char *name = params->name;
+
+       /* TODO: sanity check */
+       name[sizeof(params->name) - 1] = '\0';
+       BTINFO("%s: capacity: %llu, sector-size: %lu\n",
+              name, params->capacity, params->sector_size);
+       return 0;
+}
+
+int blktap_control_destroy_device(struct blktap *);
+int blktap_control_finish_destroy(struct blktap *);
+
+int blktap_ring_init(int *);
+int blktap_ring_free(void);
+int blktap_ring_create(struct blktap *);
+int blktap_ring_destroy(struct blktap *);
+int blktap_ring_pause(struct blktap *);
+int blktap_ring_resume(struct blktap *);
+void blktap_ring_kick_user(struct blktap *);
+
+/*
+ * sysfs interface.  sysfs.o is only built with CONFIG_SYSFS; without it
+ * the calls collapse to the no-op stubs below.  The stub names must
+ * match the real prototypes (blktap_exit() calls blktap_sysfs_free()),
+ * otherwise a !CONFIG_SYSFS build has no such function.
+ */
+#ifdef CONFIG_SYSFS
+int blktap_sysfs_init(void);
+void blktap_sysfs_free(void);
+int blktap_sysfs_create(struct blktap *);
+int blktap_sysfs_destroy(struct blktap *);
+#else
+static inline int blktap_sysfs_init(void) { return 0; }
+static inline void blktap_sysfs_free(void) {}
+static inline int blktap_sysfs_create(struct blktap *tapdev) { return 0; }
+static inline int blktap_sysfs_destroy(struct blktap *tapdev) { return 0; }
+#endif
+
+int blktap_device_init(int *);
+void blktap_device_free(void);
+int blktap_device_create(struct blktap *);
+int blktap_device_destroy(struct blktap *);
+int blktap_device_pause(struct blktap *);
+int blktap_device_resume(struct blktap *);
+void blktap_device_restart(struct blktap *);
+void blktap_device_finish_request(struct blktap *,
+                                 blkif_response_t *,
+                                 struct blktap_request *);
+void blktap_device_fail_pending_requests(struct blktap *);
+#ifdef ENABLE_PASSTHROUGH
+int blktap_device_enable_passthrough(struct blktap *,
+                                    unsigned, unsigned);
+#endif
+
+void blktap_defer(struct blktap *);
+void blktap_run_deferred(void);
+
+int blktap_request_pool_init(void);
+void blktap_request_pool_free(void);
+int blktap_request_pool_grow(void);
+int blktap_request_pool_shrink(void);
+struct blktap_request *blktap_request_allocate(struct blktap *);
+void blktap_request_free(struct blktap *, struct blktap_request *);
+struct page *request_to_page(struct blktap_request *, int);
+
+/*
+ * Kernel virtual address of segment @seg of @req, obtained from the page
+ * returned by request_to_page() via its pfn.
+ */
+static inline unsigned long
+request_to_kaddr(struct blktap_request *req, int seg)
+{
+       struct page *page = request_to_page(req, seg);
+
+       return (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+}
+
+#endif
diff --git a/drivers/xen/blktap2/control.c b/drivers/xen/blktap2/control.c

new file mode 100644 (file)

index 0000000..f447143
--- /dev/null
+++ b/drivers/xen/blktap2/control.c
@@ -0,0 +1,285 @@
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+#include "blktap.h"
+
+static DEFINE_SPINLOCK(blktap_control_lock);
+struct blktap *blktaps[CONFIG_XEN_NR_TAP2_DEVICES];
+
+static int ring_major;
+static int device_major;
+static int blktap_control_registered;
+
+/*
+ * Reset @tap to its freshly allocated state: everything is zeroed except
+ * the minor number, which is carried across the memset.  The tap is left
+ * with only BLKTAP_CONTROL set in dev_inuse and a zero refcount.
+ */
+static void
+blktap_control_initialize_tap(struct blktap *tap)
+{
+       const int saved_minor = tap->minor;
+
+       memset(tap, 0, sizeof(*tap));
+       tap->minor = saved_minor;
+
+       set_bit(BLKTAP_CONTROL, &tap->dev_inuse);
+       init_rwsem(&tap->tap_sem);
+       sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+       init_waitqueue_head(&tap->wq);
+       atomic_set(&tap->refcnt, 0);
+}
+
+/*
+ * Allocate a new tap and install it in the first free blktaps[] slot.
+ *
+ * Returns the new tap with tap->minor set to its slot index, or NULL
+ * when the allocation fails or every slot is taken.
+ */
+static struct blktap *
+blktap_control_create_tap(void)
+{
+       int minor;
+       struct blktap *tap;
+
+       /*
+        * Zeroed allocation: blktap_control_initialize_tap() reads
+        * tap->minor before resetting the structure, so the memory must
+        * not be uninitialised when it gets there.
+        */
+       tap = kzalloc(sizeof(*tap), GFP_KERNEL);
+       if (unlikely(!tap))
+               return NULL;
+
+       blktap_control_initialize_tap(tap);
+
+       spin_lock_irq(&blktap_control_lock);
+       for (minor = 0; minor < CONFIG_XEN_NR_TAP2_DEVICES; minor++)
+               if (!blktaps[minor])
+                       break;
+
+       if (minor == CONFIG_XEN_NR_TAP2_DEVICES) {
+               /* No free slot left. */
+               kfree(tap);
+               tap = NULL;
+               goto out;
+       }
+
+       tap->minor = minor;
+       blktaps[minor] = tap;
+
+out:
+       spin_unlock_irq(&blktap_control_lock);
+       return tap;
+}
+
+/*
+ * Hand out a tap with a freshly created ring.
+ *
+ * The slot table is scanned in order: an existing tap whose dev_inuse is
+ * zero is re-initialised and reused; otherwise (empty slot reached, or
+ * table exhausted) a new tap is requested from
+ * blktap_control_create_tap().  Returns NULL when no tap could be
+ * obtained or ring creation failed.
+ */
+static struct blktap *
+blktap_control_allocate_tap(void)
+{
+       struct blktap *tap = NULL;
+       int minor, err;
+
+       /*
+        * Only reached from the ioctl path, so interrupts are expected
+        * to be enabled here.
+        */
+       BUG_ON(irqs_disabled());
+
+       spin_lock_irq(&blktap_control_lock);
+       for (minor = 0; minor < CONFIG_XEN_NR_TAP2_DEVICES; minor++) {
+               struct blktap *slot = blktaps[minor];
+
+               if (!slot)
+                       break;
+
+               if (!slot->dev_inuse) {
+                       blktap_control_initialize_tap(slot);
+                       tap = slot;
+                       break;
+               }
+       }
+       spin_unlock_irq(&blktap_control_lock);
+
+       if (!tap) {
+               tap = blktap_control_create_tap();
+               if (!tap)
+                       return NULL;
+       }
+
+       err = blktap_ring_create(tap);
+       if (err) {
+               BTERR("ring creation failed: %d\n", err);
+               clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
+               return NULL;
+       }
+
+       BTINFO("allocated tap %p\n", tap);
+       return tap;
+}
+
+/*
+ * ioctl handler of the control device.
+ *
+ * BLKTAP2_IOCTL_ALLOC_TAP allocates a tap and copies a struct
+ * blktap_handle (ring major, device major, minor) to the user pointer in
+ * @arg.  BLKTAP2_IOCTL_FREE_TAP destroys the tap whose minor is @arg.
+ * Anything else yields -ENOIOCTLCMD.
+ */
+static long
+blktap_control_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       if (cmd == BLKTAP2_IOCTL_ALLOC_TAP) {
+               struct blktap_handle handle;
+               struct blktap *tap = blktap_control_allocate_tap();
+
+               if (!tap) {
+                       BTERR("error allocating device\n");
+                       return -ENOMEM;
+               }
+
+               handle.ring   = ring_major;
+               handle.device = device_major;
+               handle.minor  = tap->minor;
+
+               if (copy_to_user((struct blktap_handle __user *)arg,
+                                &handle, sizeof(handle))) {
+                       /* The caller never learns the minor: undo the allocation. */
+                       blktap_control_destroy_device(tap);
+                       return -EFAULT;
+               }
+
+               return 0;
+       }
+
+       if (cmd == BLKTAP2_IOCTL_FREE_TAP) {
+               if (arg >= CONFIG_XEN_NR_TAP2_DEVICES || !blktaps[arg])
+                       return -EINVAL;
+
+               blktap_control_destroy_device(blktaps[arg]);
+               return 0;
+       }
+
+       return -ENOIOCTLCMD;
+}
+
+static const struct file_operations blktap_control_file_operations = {
+       .owner    = THIS_MODULE,
+       .unlocked_ioctl = blktap_control_ioctl,
+};
+
+static struct miscdevice blktap_misc = {
+       .minor    = MISC_DYNAMIC_MINOR,
+       .name     = "blktap-control",
+       .nodename = BLKTAP2_DEV_DIR "control",
+       .fops     = &blktap_control_file_operations,
+};
+
+/*
+ * Tear down @tap: device, then ring, then sysfs.
+ *
+ * Each pass stops at the first stage that reports an error, sleeps until
+ * dev_inuse differs from the snapshot taken just before that stage, and
+ * starts over.  A signal during the sleep ends the loop early.  Returns
+ * 0 for a NULL @tap or when blktap_control_finish_destroy() reports the
+ * tap fully released, otherwise the last stage error.
+ */
+int
+blktap_control_destroy_device(struct blktap *tap)
+{
+       unsigned long snapshot;
+       int err;
+
+       if (!tap)
+               return 0;
+
+       set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
+
+       for (;;) {
+               snapshot = tap->dev_inuse;
+               err = blktap_device_destroy(tap);
+
+               if (!err) {
+                       snapshot = tap->dev_inuse;
+                       err = blktap_ring_destroy(tap);
+               }
+
+               if (!err) {
+                       snapshot = tap->dev_inuse;
+                       err = blktap_sysfs_destroy(tap);
+               }
+
+               if (!err)
+                       break;
+
+               BTDBG("inuse: 0x%lx, dev_inuse: 0x%lx\n",
+                     snapshot, tap->dev_inuse);
+               if (wait_event_interruptible(tap->wq,
+                                            tap->dev_inuse != snapshot))
+                       break;
+       }
+
+       clear_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse);
+
+       if (blktap_control_finish_destroy(tap))
+               err = 0;
+
+       return err;
+}
+
+/*
+ * Drop BLKTAP_CONTROL if it is the only bit left in dev_inuse.
+ * Returns non-zero when dev_inuse is zero afterwards.
+ */
+int
+blktap_control_finish_destroy(struct blktap *tap)
+{
+       const unsigned long control_only = 1UL << BLKTAP_CONTROL;
+
+       if (tap->dev_inuse == control_only)
+               clear_bit(BLKTAP_CONTROL, &tap->dev_inuse);
+
+       return tap->dev_inuse == 0;
+}
+
+/*
+ * Register the blktap control misc device.
+ * Returns 0 on success or the error from misc_register().
+ */
+static int __init
+blktap_control_init(void)
+{
+       int err;
+
+       err = misc_register(&blktap_misc);
+       if (err) {
+               /* Terminate the message with a newline like every other BTERR. */
+               BTERR("misc_register failed for control device\n");
+               return err;
+       }
+
+       /* Remembered so that blktap_control_free() knows to deregister. */
+       blktap_control_registered = 1;
+       return 0;
+}
+
+/*
+ * Destroy every tap in blktaps[] (empty slots are skipped by
+ * blktap_control_destroy_device()) and deregister the control misc
+ * device if blktap_control_init() registered it.
+ */
+static void
+blktap_control_free(void)
+{
+       int i;
+
+       for (i = 0; i < CONFIG_XEN_NR_TAP2_DEVICES; i++)
+               blktap_control_destroy_device(blktaps[i]);
+
+       if (!blktap_control_registered)
+               return;
+
+       /* Terminate the message with a newline like every other BTERR. */
+       if (misc_deregister(&blktap_misc) < 0)
+               BTERR("misc_deregister failed for control device\n");
+}
+
+/*
+ * Module teardown.  Also used as the failure path of blktap_init(), which
+ * may call it after any initialisation step has failed, so it can run
+ * while some of the subsystems below were never set up.
+ */
+static void
+blktap_exit(void)
+{
+       blktap_control_free();
+       blktap_ring_free();
+       blktap_sysfs_free();
+       blktap_device_free();
+       blktap_request_pool_free();
+}
+
+/*
+ * Module initialisation: request pool, block device major, ring major,
+ * sysfs and finally the control device, in that order.
+ *
+ * A request pool failure is returned directly; a failure in any later
+ * step runs blktap_exit() before the error is returned.
+ */
+static int __init
+blktap_init(void)
+{
+       int err;
+
+       err = blktap_request_pool_init();
+       if (err)
+               return err;
+
+       err = blktap_device_init(&device_major);
+       if (!err)
+               err = blktap_ring_init(&ring_major);
+       if (!err)
+               err = blktap_sysfs_init();
+       if (!err)
+               err = blktap_control_init();
+
+       if (err)
+               blktap_exit();
+
+       return err;
+}
+
+module_init(blktap_init);
+module_exit(blktap_exit);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("devname:" BLKTAP2_DEV_DIR "control");
diff --git a/drivers/xen/blktap2/device.c b/drivers/xen/blktap2/device.c

new file mode 100644 (file)

index 0000000..4b8adaa
--- /dev/null
+++ b/drivers/xen/blktap2/device.c
@@ -0,0 +1,1176 @@
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/cdrom.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <scsi/scsi.h>
+#include <asm/tlbflush.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/io/blkif.h>
+
+#include "blktap.h"
+
+#include "../blkback/blkback-pagemap.h"
+
+#if 0
+#define DPRINTK_IOCTL(_f, _a...) pr_alert(_f, ## _a)
+#else
+#define DPRINTK_IOCTL(_f, _a...) ((void)0)
+#endif
+
+struct blktap_grant_table {
+       int cnt;
+       struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
+};
+
+static int blktap_device_major;
+
+static inline struct blktap *
+dev_to_blktap(struct blktap_device *dev)
+{
+       return container_of(dev, struct blktap, device);
+}
+
+/*
+ * Block device open: count one more user of the tap device.
+ *
+ * Fails with -ENOENT when the disk has no private data, when the tap is
+ * not active (BLKTAP_RING_VMA clear) or when a shutdown has been
+ * requested.
+ */
+static int
+blktap_device_open(struct block_device *bd, fmode_t mode)
+{
+       struct blktap_device *tapdev = bd->bd_disk->private_data;
+       struct blktap *tap;
+
+       if (!tapdev)
+               return -ENOENT;
+
+       tap = dev_to_blktap(tapdev);
+
+       if (!blktap_active(tap))
+               return -ENOENT;
+
+       if (test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               return -ENOENT;
+
+       tapdev->users++;
+       return 0;
+}
+
+/*
+ * Block device release: drop one user and, if a shutdown was requested
+ * in the meantime, attempt to destroy the device now.  Always returns 0.
+ */
+static int
+blktap_device_release(struct gendisk *disk, fmode_t mode)
+{
+       struct blktap_device *tapdev = disk->private_data;
+       struct blktap *tap = dev_to_blktap(tapdev);
+
+       tapdev->users--;
+
+       if (!test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               return 0;
+
+       blktap_device_destroy(tap);
+       return 0;
+}
+
+/*
+ * Report a synthetic geometry (255 heads, 63 sectors) whose cylinder
+ * count is derived from the disk capacity.  There is no real geometry;
+ * the values are merely consistent with the device size.
+ */
+static int
+blktap_device_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+       sector_t capacity = get_capacity(bd->bd_disk);
+       sector_t cyls = capacity;
+
+       hg->heads = 0xff;
+       hg->sectors = 0x3f;
+
+       sector_div(cyls, hg->heads * hg->sectors);
+       hg->cylinders = cyls;
+
+       /* Stored count too small to cover the disk: report the maximum. */
+       if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < capacity)
+               hg->cylinders = 0xffff;
+
+       return 0;
+}
+
+/*
+ * Block device ioctl.  Only two commands are answered, both with
+ * all-zero data: CDROMMULTISESSION (struct cdrom_multisession zeroed
+ * byte by byte) and SCSI_IOCTL_GET_IDLUN (dev_id and host_unique_id set
+ * to 0).  Everything else returns -EINVAL.
+ */
+static int
+blktap_device_ioctl(struct block_device *bd, fmode_t mode,
+                   unsigned command, unsigned long argument)
+{
+       struct scsi_idlun __user *idlun;
+       int i;
+
+       DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx\n",
+                     command, (long)argument);
+
+       switch (command) {
+       case CDROMMULTISESSION:
+               BTDBG("FIXME: support multisession CDs later\n");
+               for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+                       if (put_user(0, (char __user *)(argument + i)))
+                               return -EFAULT;
+               return 0;
+
+       case SCSI_IOCTL_GET_IDLUN:
+               if (!access_ok(VERIFY_WRITE, argument,
+                              sizeof(struct scsi_idlun)))
+                       return -EFAULT;
+
+               /* return 0 for now. */
+               idlun = (struct scsi_idlun __user *)argument;
+               __put_user(0, &idlun->dev_id);
+               __put_user(0, &idlun->host_unique_id);
+               return 0;
+       }
+
+       return -EINVAL; /* same return as native Linux */
+}
+
+static const struct block_device_operations blktap_device_file_operations = {
+       .owner     = THIS_MODULE,
+       .open      = blktap_device_open,
+       .release   = blktap_device_release,
+       .ioctl     = blktap_device_ioctl,
+       .getgeo    = blktap_device_getgeo
+};
+
+static int
+blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
+                   unsigned long addr, void *data)
+{
+       pte_t *pte = (pte_t *)data;
+
+       BTDBG("ptep %p -> %012llx\n", ptep, (unsigned long long)pte_val(*pte));
+       set_pte(ptep, *pte);
+       return 0;
+}
+
+/*
+ * Install @pte for the single page at @address via
+ * apply_to_page_range(), using the vma's mm, or a NULL mm when @vma is
+ * NULL.
+ */
+static int
+blktap_map_uaddr(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+{
+       struct mm_struct *mm = vma ? vma->vm_mm : NULL;
+
+       return apply_to_page_range(mm, address, PAGE_SIZE,
+                                  blktap_map_uaddr_fn, &pte);
+}
+
+static int
+blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
+                    unsigned long addr, void *data)
+{
+       struct vm_area_struct *vma = data;
+
+       BTDBG("ptep %p\n", ptep);
+       xen_ptep_get_and_clear_full(vma, addr, ptep, 1);
+       return 0;
+}
+
+/*
+ * Clear the mapping of the single page at @address.
+ *
+ * With a vma the pte is cleared through apply_to_page_range() on the
+ * vma's mm.  Without one, x86 clears the mapping with a hypercall and
+ * returns 1; other architectures fall back to apply_to_page_range() with
+ * a NULL mm.
+ */
+static int
+blktap_umap_uaddr(struct vm_area_struct *vma, unsigned long address)
+{
+       if (vma)
+               return apply_to_page_range(vma->vm_mm, address,
+                                          PAGE_SIZE, blktap_umap_uaddr_fn, vma);
+
+#ifdef CONFIG_X86
+       if (HYPERVISOR_update_va_mapping(address, __pte(0),
+                                        UVMF_INVLPG|UVMF_ALL))
+               BUG();
+       return 1;
+#else
+       return apply_to_page_range(NULL, address,
+                                  PAGE_SIZE, blktap_umap_uaddr_fn, vma);
+#endif
+}
+
+static inline void
+flush_tlb_kernel_page(unsigned long kvaddr)
+{
+#ifdef CONFIG_X86
+       xen_invlpg_all(kvaddr);
+#else
+       flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
+#endif
+}
+
+/*
+ * Undo the mappings set up for @request: queue grant unmap operations for
+ * the kernel and user mappings of each page, clear the matching entries in
+ * ring->foreign_map.map[], reset the per-page grant handles, and submit the
+ * queued unmaps in a single GNTTABOP_unmap_grant_ref hypercall.
+ *
+ * Does nothing if the ring has no vma.  The user address range of the
+ * request is zapped before the loop when XENFEAT_auto_translated_physmap is
+ * set, and after the hypercall otherwise.
+ *
+ * NOTE(review): if create_lookup_pte_addr() fails the function returns with
+ * the unmap operations queued so far never submitted -- confirm this is
+ * acceptable.
+ *
+ * tap->tap_sem held on entry
+ */
+static void
+blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
+{
+       uint64_t ptep;
+       int ret, usr_idx;
+       unsigned int i, cnt;
+       struct page **map, *page;
+       struct blktap_ring *ring;
+       struct grant_handle_pair *khandle;
+       unsigned long kvaddr, uvaddr, offset;
+       /* At most one kernel and one user unmap op is queued per segment. */
+       struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
+       grant_handle_t self_gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       int self_gref_nr = 0;
+
+       cnt     = 0;
+       ring    = &tap->ring;
+       usr_idx = request->usr_idx;
+       map     = ring->foreign_map.map;
+
+       if (!ring->vma)
+               return;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               zap_page_range(ring->vma, 
+                              MMAP_VADDR(ring->user_vstart, usr_idx, 0),
+                              request->nr_pages << PAGE_SHIFT, NULL);
+
+       for (i = 0; i < request->nr_pages; i++) {
+               kvaddr = request_to_kaddr(request, i);
+               uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
+
+               khandle = request->handles + i;
+
+               /* Kernel-side grant mapping: queue unmap, drop p2m entry. */
+               if (khandle->kernel != INVALID_GRANT_HANDLE) {
+                       gnttab_set_unmap_op(&unmap[cnt], kvaddr,
+                                           GNTMAP_host_map, khandle->kernel);
+                       cnt++;
+                       set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+                                           INVALID_P2M_ENTRY);
+               }
+
+               /* User-side grant mapping, addressed through its pte. */
+               if (khandle->user != INVALID_GRANT_HANDLE) {
+                       BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+                       if (create_lookup_pte_addr(ring->vma->vm_mm,
+                                                  uvaddr, &ptep) != 0) {
+                               BTERR("Couldn't get a pte addr!\n");
+                               return;
+                       }
+
+                       gnttab_set_unmap_op(&unmap[cnt], ptep,
+                                           GNTMAP_host_map
+                                           | GNTMAP_application_map
+                                           | GNTMAP_contains_pte,
+                                           khandle->user);
+                       cnt++;
+               }
+
+               /* Page index of this segment within the ring vma. */
+               offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
+
+               BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
+                     "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
+                     "0x%08lx, handle: %u\n", offset, map[offset], request,
+                     usr_idx, i, kvaddr, khandle->kernel, uvaddr,
+                     khandle->user);
+
+               page = map[offset];
+               if (page) {
+                       if (PageBlkback(page)) {
+                               ClearPageBlkback(page);
+                               set_page_private(page, 0);
+                       } else if (
+                               xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /*
+                                * Remember the kernel handle so foreign
+                                * access can be ended after the unmap.
+                                */
+                               self_gref[self_gref_nr] = khandle->kernel;
+                               self_gref_nr++;
+                       }
+               }
+               map[offset] = NULL;
+
+               khandle->kernel = INVALID_GRANT_HANDLE;
+               khandle->user   = INVALID_GRANT_HANDLE;
+       }
+
+       if (cnt) {
+               ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                               unmap, cnt);
+               BUG_ON(ret);
+       }
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               zap_page_range(ring->vma, 
+                              MMAP_VADDR(ring->user_vstart, usr_idx, 0), 
+                              request->nr_pages << PAGE_SHIFT, NULL);
+       else {
+               for (i = 0; i < self_gref_nr; i++) {
+                       gnttab_end_foreign_access_ref(self_gref[i]);
+               }
+       }
+}
+
+/*
+ * Release the mappings of @request while holding, for writing, the mmap_sem
+ * of the mm that owns the ring vma.
+ *
+ * When XENFEAT_auto_translated_physmap is not set, pages whose kernel handle
+ * is INVALID_GRANT_HANDLE have their kernel address unmapped through
+ * blktap_umap_uaddr() and their p2m entry invalidated here; the remaining
+ * teardown is done by blktap_device_fast_flush().
+ *
+ * NOTE(review): tap->ring.vma is dereferenced without a NULL check, whereas
+ * blktap_device_fast_flush() tolerates a NULL vma -- confirm that callers
+ * guarantee the vma exists.
+ *
+ * tap->tap_sem held on entry
+ */
+static void
+blktap_unmap(struct blktap *tap, struct blktap_request *request)
+{
+       int i, usr_idx;
+       unsigned long kvaddr;
+
+       usr_idx = request->usr_idx;
+       down_write(&tap->ring.vma->vm_mm->mmap_sem);
+
+       for (i = 0; i < request->nr_pages; i++) {
+               kvaddr = request_to_kaddr(request, i);
+               BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
+                     "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
+                     kvaddr, request->handles[i].kernel,
+                     MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
+                     request->handles[i].user);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap) &&
+                   request->handles[i].kernel == INVALID_GRANT_HANDLE) {
+                       /* Only flush the TLB if the unmap succeeded. */
+                       if (blktap_umap_uaddr(NULL, kvaddr) == 0)
+                               flush_tlb_kernel_page(kvaddr);
+                       set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
+                                           INVALID_P2M_ENTRY);
+               }
+       }
+
+       blktap_device_fast_flush(tap, request);
+       up_write(&tap->ring.vma->vm_mm->mmap_sem);
+}
+
+/*
+ * Invoked when the tapdisk process goes away unexpectedly.  Every request
+ * still in BLKTAP_REQUEST_PENDING state is unmapped, completed with -ENODEV
+ * and freed; afterwards the queue's queuedata is cleared and the queue is
+ * started so that later requests are failed.
+ */
+void
+blktap_device_fail_pending_requests(struct blktap *tap)
+{
+       struct blktap_device *tapdev;
+       int slot;
+
+       if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+               return;
+
+       down_write(&tap->tap_sem);
+
+       tapdev = &tap->device;
+       for (slot = 0; slot < MAX_PENDING_REQS; slot++) {
+               struct blktap_request *pending = tap->pending_requests[slot];
+               struct request *blk_req;
+               const char *what;
+
+               /* Skip empty slots and requests that are not in flight. */
+               if (!pending)
+                       continue;
+               if (pending->status != BLKTAP_REQUEST_PENDING)
+                       continue;
+
+               if (pending->operation == BLKIF_OP_PACKET)
+                       what = "packet";
+               else if (pending->operation == BLKIF_OP_READ)
+                       what = "read";
+               else
+                       what = "write";
+
+               BTERR("%u:%u: failing pending %s of %d pages\n",
+                     blktap_device_major, tap->minor, what,
+                     pending->nr_pages);
+
+               blktap_unmap(tap, pending);
+               blk_req = (struct request *)(unsigned long)pending->id;
+               blk_end_request_all(blk_req, -ENODEV);
+               blktap_request_free(tap, pending);
+       }
+
+       up_write(&tap->tap_sem);
+
+       spin_lock_irq(&tapdev->lock);
+
+       /* fail any future requests */
+       tapdev->gd->queue->queuedata = NULL;
+       blk_start_queue(tapdev->gd->queue);
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Complete @request using the response @res: unmap its pages, end the
+ * originating block request (0 for BLKIF_RSP_OKAY, -EIO otherwise) and hand
+ * the blktap request back to the allocator.  An operation other than
+ * read, write or packet is treated as a fatal bug.
+ *
+ * tap->tap_sem held on entry
+ */
+void
+blktap_device_finish_request(struct blktap *tap,
+                            blkif_response_t *res,
+                            struct blktap_request *request)
+{
+       struct request *blk_req;
+       int error;
+
+       blktap_unmap(tap, request);
+
+       blk_req = (struct request *)(unsigned long)request->id;
+
+       BTDBG("req %p res status %d operation %d/%d id %lld\n", blk_req,
+             res->status, res->operation, request->operation,
+             (unsigned long long)res->id);
+
+       if (request->operation != BLKIF_OP_READ &&
+           request->operation != BLKIF_OP_WRITE &&
+           request->operation != BLKIF_OP_PACKET)
+               BUG();
+
+       if (unlikely(res->status != BLKIF_RSP_OKAY))
+               BTERR("Bad return from device data "
+                       "request: %x\n", res->status);
+
+       if (res->status == BLKIF_RSP_OKAY)
+               error = 0;
+       else
+               error = -EIO;
+       blk_end_request_all(blk_req, error);
+
+       blktap_request_free(tap, request);
+}
+
+/*
+ * Queue the grant-map operations needed to map foreign page @page as
+ * segment @seg of @request.
+ *
+ * The grant reference read from the page via blkback_pagemap_read() is
+ * stored in @blkif_req and a kernel mapping op is appended to @table.  The
+ * tap page of the segment inherits the private data of @page and is marked
+ * PageBlkback.  Unless XENFEAT_auto_translated_physmap is set, a second op
+ * mapping the same grant at the user pte is appended as well.
+ *
+ * Nothing is mapped here; the ops in @table are submitted later.
+ * Returns 0 on success, -1 if the user pte address cannot be obtained (the
+ * kernel op has already been queued at that point).
+ */
+static int
+blktap_prep_foreign(struct blktap *tap,
+                   struct blktap_request *request,
+                   blkif_request_t *blkif_req,
+                   unsigned int seg, struct page *page,
+                   struct blktap_grant_table *table)
+{
+       uint64_t ptep;
+       uint32_t flags;
+       struct page *tap_page;
+       struct blktap_ring *ring;
+       struct blkback_pagemap map;
+       unsigned long uvaddr, kvaddr;
+
+       ring = &tap->ring;
+       map  = blkback_pagemap_read(page);
+       blkif_req->seg[seg].gref = map.gref;
+
+       uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
+       kvaddr = request_to_kaddr(request, seg);
+       /* Data for a write is only read, so map it read-only. */
+       flags  = GNTMAP_host_map |
+               (request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
+
+       gnttab_set_map_op(&table->grants[table->cnt],
+                         kvaddr, flags, map.gref, map.domid);
+       table->cnt++;
+
+       /* enable chained tap devices */
+       tap_page = request_to_page(request, seg);
+       set_page_private(tap_page, page_private(page));
+       SetPageBlkback(tap_page);
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 0;
+
+       if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
+               BTERR("couldn't get a pte addr!\n");
+               return -1;
+       }
+
+       flags |= GNTMAP_application_map | GNTMAP_contains_pte;
+       gnttab_set_map_op(&table->grants[table->cnt],
+                         ptep, flags, map.gref, map.domid);
+       table->cnt++;
+
+       return 0;
+}
+
+/*
+ * Submit the grant-map operations collected in @table and record the
+ * resulting handles in @request.
+ *
+ * Returns 0 immediately if @table is empty.  Otherwise one
+ * GNTTABOP_map_grant_ref hypercall is issued (its failure is a BUG) and the
+ * results are consumed in segment order: one entry per segment with a
+ * non-zero gref, plus a second (user) entry per such segment unless
+ * XENFEAT_auto_translated_physmap is set.
+ *
+ * Returns non-zero if any grant failed to map or vm_insert_page() failed.
+ * Once an error has been seen, the p2m update / vm_insert_page() step is
+ * skipped for all remaining segments, but their handles are still recorded.
+ */
+static int
+blktap_map_foreign(struct blktap *tap,
+                  struct blktap_request *request,
+                  blkif_request_t *blkif_req,
+                  struct blktap_grant_table *table)
+{
+       struct page *page;
+       int i, grant, err, usr_idx;
+       struct blktap_ring *ring;
+       unsigned long uvaddr, foreign_mfn;
+
+       if (!table->cnt)
+               return 0;
+
+       err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+                                       table->grants, table->cnt);
+       BUG_ON(err);
+
+       grant   = 0;
+       usr_idx = request->usr_idx;
+       ring    = &tap->ring;
+
+       for (i = 0; i < request->nr_pages; i++) {
+               /* Segments without a gref have no entries in the table. */
+               if (!blkif_req->seg[i].gref)
+                       continue;
+
+               uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
+
+               if (unlikely(table->grants[grant].status != GNTST_okay)) {
+                       BTERR("invalid kernel buffer: could not remap it\n");
+                       /* This should never happen: blkback should handle eagain first */
+                       BUG_ON(table->grants[grant].status == GNTST_eagain);
+                       err |= 1;
+                       table->grants[grant].handle = INVALID_GRANT_HANDLE;
+               }
+
+               request->handles[i].kernel = table->grants[grant].handle;
+               foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
+               grant++;
+
+               /* The user mapping result follows the kernel one. */
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       if (unlikely(table->grants[grant].status != GNTST_okay)) {
+                               /* This should never happen: blkback should handle eagain first */
+                               WARN_ON(table->grants[grant].status == GNTST_eagain);
+                               BTERR("invalid user buffer: could not remap it\n");
+                               err |= 1;
+                               table->grants[grant].handle = INVALID_GRANT_HANDLE;
+                       }
+                       request->handles[i].user = table->grants[grant].handle;
+                       grant++;
+               }
+
+               /* err is sticky: after a failure only handles are recorded. */
+               if (err)
+                       continue;
+
+               page = request_to_page(request, i);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap))
+                       set_phys_to_machine(page_to_pfn(page),
+                                           FOREIGN_FRAME(foreign_mfn));
+               else if (vm_insert_page(ring->vma, uvaddr, page))
+                       err |= 1;
+
+               BTDBG("pending_req: %p, seg: %d, page: %p, "
+                     "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
+                     "uhandle: %u\n", request, i, page,
+                     pfn_to_kaddr(page_to_pfn(page)),
+                     request->handles[i].kernel,
+                     uvaddr, request->handles[i].user);
+       }
+
+       return err;
+}
+
+/*
+ * Map local page @page as segment @seg of @request, at both the segment's
+ * kernel address and its user address in the ring vma.
+ *
+ * Without XENFEAT_auto_translated_physmap the ptes are installed directly
+ * and the kernel handle is set to INVALID_GRANT_HANDLE.  With it, the page
+ * is granted to domain 0, the grant is mapped at the kernel address and the
+ * tap page is inserted into the vma; on success the grant reference is kept
+ * in handles[seg].kernel.
+ *
+ * The user handle is always set to INVALID_GRANT_HANDLE.
+ * Returns 0 on success, or the vm_insert_page() error.
+ *
+ * NOTE(review): the return value of gnttab_grant_foreign_access() is not
+ * checked, and on vm_insert_page() failure the grant is unmapped but
+ * foreign access is not ended here -- confirm whether that is intended.
+ */
+static int
+blktap_map(struct blktap *tap,
+          struct blktap_request *request,
+          unsigned int seg, struct page *page)
+{
+       pte_t pte;
+       int usr_idx;
+       struct blktap_ring *ring;
+       unsigned long uvaddr, kvaddr;
+       int err = 0;
+
+       ring    = &tap->ring;
+       usr_idx = request->usr_idx;
+       uvaddr  = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
+       kvaddr  = request_to_kaddr(request, seg);
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               pte = mk_pte(page, ring->vma->vm_page_prot);
+               blktap_map_uaddr(ring->vma, uvaddr,
+                                pte_mkspecial(pte_mkwrite(pte)));
+               flush_tlb_page(ring->vma, uvaddr);
+               blktap_map_uaddr(NULL, kvaddr, mk_pte(page, PAGE_KERNEL));
+               flush_tlb_kernel_page(kvaddr);
+
+               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
+               request->handles[seg].kernel = INVALID_GRANT_HANDLE;
+       } else {
+               /* grant this page access to self domain and map it. */
+               domid_t domid = 0; /* XXX my domain id: grant table hypercall
+                                     doesn't understand DOMID_SELF */
+               int gref;
+               uint32_t flags;
+               struct gnttab_map_grant_ref map;
+               struct page *tap_page;
+
+               gref = gnttab_grant_foreign_access(
+                       domid, page_to_pfn(page),
+                       (request->operation == BLKIF_OP_WRITE)?
+                       GTF_readonly: 0);
+
+               flags  = GNTMAP_host_map |
+                       (request->operation == BLKIF_OP_WRITE ?
+                        GNTMAP_readonly : 0);
+
+               gnttab_set_map_op(&map, kvaddr, flags, gref, domid);
+
+               /* enable chained tap devices */
+               tap_page = request_to_page(request, seg);
+               set_page_private(tap_page, page_private(page));
+               SetPageBlkback(tap_page);
+
+               gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map);
+
+               /* We are not expecting the grant op to fail */
+               BUG_ON(map.status != GNTST_okay);
+
+               err = vm_insert_page(ring->vma, uvaddr, tap_page);
+               if (err) {
+                       /* Roll back the kernel mapping established above. */
+                       struct gnttab_unmap_grant_ref unmap;
+                       gnttab_set_unmap_op(&unmap, kvaddr,
+                                           GNTMAP_host_map, gref);
+                       VOID(HYPERVISOR_grant_table_op(
+                               GNTTABOP_unmap_grant_ref, &unmap, 1));
+               } else
+                       request->handles[seg].kernel = gref;
+       }
+       request->handles[seg].user = INVALID_GRANT_HANDLE;
+
+       BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
+             "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
+             uvaddr);
+
+       return err;
+}
+
+/*
+ * Map the data pages of block request @req for the tapdisk process and post
+ * a matching blkif request on the user ring.
+ *
+ * @tap:     device the request was issued against
+ * @request: blktap request slot that carries the mapping state
+ * @req:     block-layer request being serviced
+ *
+ * Each scatterlist segment is either prepared for grant mapping (pages that
+ * are PageBlkback; the ops are batched in a local table and submitted by
+ * blktap_map_foreign()) or mapped directly via blktap_map().
+ *
+ * Returns 0 on success and -1 on failure.  On failure the mappings recorded
+ * for @request so far are torn down through blktap_device_fast_flush(); the
+ * caller remains responsible for ending @req and freeing @request.
+ */
+static int
+blktap_device_process_request(struct blktap *tap,
+                             struct blktap_request *request,
+                             struct request *req)
+{
+       struct page *page;
+       int i, usr_idx, err;
+       struct blktap_ring *ring;
+       struct scatterlist *sg;
+       struct blktap_grant_table table;
+       unsigned int fsect, lsect, nr_sects;
+       unsigned long offset, uvaddr;
+       struct blkif_request blkif_req, *target;
+
+       err = -1;
+       memset(&table, 0, sizeof(table));
+       /*
+        * blkif_req is copied wholesale into the ring shared with user space
+        * below.  Clear it first so that unused segment slots and padding do
+        * not expose stale kernel stack contents.
+        */
+       memset(&blkif_req, 0, sizeof(blkif_req));
+
+       if (!blktap_active(tap))
+               goto out;
+
+       ring    = &tap->ring;
+       usr_idx = request->usr_idx;
+       blkif_req.id = usr_idx;
+       blkif_req.sector_number = (blkif_sector_t)blk_rq_pos(req);
+       blkif_req.handle = 0;
+       blkif_req.operation = rq_data_dir(req) ?
+               BLKIF_OP_WRITE : BLKIF_OP_READ;
+       if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
+               blkif_req.operation = BLKIF_OP_PACKET;
+
+       /* The block request pointer doubles as the blktap request id. */
+       request->id        = (unsigned long)req;
+       request->operation = blkif_req.operation;
+       request->status    = BLKTAP_REQUEST_PENDING;
+       do_gettimeofday(&request->time);
+
+       nr_sects = 0;
+       request->nr_pages = 0;
+       blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
+       BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+       for_each_sg(tap->sg, sg, blkif_req.nr_segments, i) {
+               /* Segment bounds in 512-byte sectors within the page. */
+               fsect = sg->offset >> 9;
+               lsect = fsect + (sg->length >> 9) - 1;
+               nr_sects += sg->length >> 9;
+
+               blkif_req.seg[i] =
+                       (struct blkif_request_segment) {
+                       .gref       = 0,
+                       .first_sect = fsect,
+                       .last_sect  = lsect };
+
+               if (PageBlkback(sg_page(sg))) {
+                       /* foreign page -- use xen */
+                       if (blktap_prep_foreign(tap,
+                                               request,
+                                               &blkif_req,
+                                               i,
+                                               sg_page(sg),
+                                               &table))
+                               goto out;
+               } else {
+                       /* do it the old fashioned way */
+                       if (blktap_map(tap,
+                                      request,
+                                      i,
+                                      sg_page(sg)))
+                               goto out;
+               }
+
+               /* Record the tap page in the ring's foreign map. */
+               uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
+               offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
+               page   = request_to_page(request, i);
+               ring->foreign_map.map[offset] = page;
+               SetPageReserved(page);
+
+               BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
+                     uvaddr, page, page_to_pfn(page));
+               BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
+                     "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
+                     offset, request, i,
+                     page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
+
+               request->nr_pages++;
+       }
+
+       if (blktap_map_foreign(tap, request, &blkif_req, &table))
+               goto out;
+
+       /* Finally, write the request message to the user ring. */
+       target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+       memcpy(target, &blkif_req, sizeof(blkif_req));
+       target->id = request->usr_idx;
+       wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
+       ring->ring.req_prod_pvt++;
+
+       if (unlikely(req->cmd_type == REQ_TYPE_BLOCK_PC))
+               tap->stats.st_pk_req++;
+       else if (rq_data_dir(req)) {
+               tap->stats.st_wr_sect += nr_sects;
+               tap->stats.st_wr_req++;
+       } else {
+               tap->stats.st_rd_sect += nr_sects;
+               tap->stats.st_rd_req++;
+       }
+
+       err = 0;
+
+out:
+       if (err)
+               blktap_device_fast_flush(tap, request);
+       return err;
+}
+
+#ifdef ENABLE_PASSTHROUGH
+/*
+ * Walk the bios chained on request _req.  The successor of each bio is
+ * saved in _tmp before the loop body runs, so the body may change
+ * _bio->bi_next.  The "|| 1" keeps the loop condition true when the saved
+ * successor is NULL (last bio).
+ *
+ * The expansion begins with a bare "if", so take care when using the macro
+ * next to an "else".
+ */
+#define rq_for_each_bio_safe(_bio, _tmp, _req)                         \
+       if ((_req)->bio)                                                \
+               for (_bio = (_req)->bio;                                \
+                    _bio && ((_tmp = _bio->bi_next) || 1);             \
+                    _bio = _tmp)
+
+/*
+ * Passthrough mode: retarget every bio of @req at the device's backing
+ * block device and submit it.  The next bio is fetched before the current
+ * one is submitted.
+ */
+static void
+blktap_device_forward_request(struct blktap *tap, struct request *req)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct bio *cur, *next;
+
+       for (cur = req->bio; cur; cur = next) {
+               next = cur->bi_next;
+               cur->bi_bdev = tapdev->bdev;
+               submit_bio(cur->bi_rw, cur);
+       }
+}
+
+/*
+ * Leave passthrough mode: release the backing block device, if one is
+ * held, and clear the BLKTAP_PASSTHROUGH flag.
+ */
+static void
+blktap_device_close_bdev(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct block_device *held = tapdev->bdev;
+
+       if (held)
+               blkdev_put(held, FMODE_WRITE|FMODE_EXCL);
+       tapdev->bdev = NULL;
+
+       clear_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
+}
+
+/*
+ * Enter passthrough mode: open block device @pdev exclusively for writing
+ * with @tap as holder, remember it in the tap device and set the
+ * BLKTAP_PASSTHROUGH flag.
+ *
+ * Returns 0 on success, the blkdev_get_by_dev() error if the open fails, or
+ * -ENOENT if the opened device has no disk attached.
+ */
+static int
+blktap_device_open_bdev(struct blktap *tap, u32 pdev)
+{
+       struct block_device *opened;
+
+       opened = blkdev_get_by_dev(pdev, FMODE_WRITE|FMODE_EXCL, tap);
+       if (IS_ERR(opened)) {
+               BTERR("opening device %x:%x failed: %ld\n",
+                     MAJOR(pdev), MINOR(pdev), PTR_ERR(opened));
+               return PTR_ERR(opened);
+       }
+
+       if (!opened->bd_disk) {
+               BTERR("device %x:%x doesn't exist\n",
+                     MAJOR(pdev), MINOR(pdev));
+               blkdev_put(opened, FMODE_WRITE|FMODE_EXCL);
+               return -ENOENT;
+       }
+
+       tap->device.bdev = opened;
+       set_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse);
+
+       /* TODO: readjust queue parameters */
+
+       BTINFO("set device %d to passthrough on %x:%x\n",
+              tap->minor, MAJOR(pdev), MINOR(pdev));
+
+       return 0;
+}
+
+/*
+ * Switch passthrough on or off for a paused tap.
+ *
+ * With no backing device open, device @major:@minor is opened.  With one
+ * already open, only a zero device number is accepted, and it closes the
+ * backing device.  Returns -EINVAL if the tap is not paused or a non-zero
+ * device number is given while a backing device is open.
+ */
+int
+blktap_device_enable_passthrough(struct blktap *tap,
+                                unsigned major, unsigned minor)
+{
+       u32 devt = MKDEV(major, minor);
+
+       if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+               return -EINVAL;
+
+       if (!tap->device.bdev)
+               return blktap_device_open_bdev(tap, devt);
+
+       if (devt)
+               return -EINVAL;
+
+       blktap_device_close_bdev(tap);
+       return 0;
+}
+#endif
+
+/*
+ * Drain the block queue of @tap.  Each peeked request is either completed
+ * with an error right away (requests that are not REQ_TYPE_FS, and
+ * REQ_FLUSH/REQ_FUA requests), forwarded to the backing device when
+ * ENABLE_PASSTHROUGH is built in and passthrough is active, or handed to
+ * blktap_device_process_request().
+ *
+ * The loop ends early, stopping the queue and deferring the tap, when the
+ * ring is full or no blktap request can be allocated.  User space is kicked
+ * once at the end if at least one request was queued.
+ *
+ * dev->lock held on entry; it is dropped around
+ * blktap_device_process_request() while tap->tap_sem is held for writing,
+ * and re-acquired before the next iteration.
+ */
+static void
+blktap_device_run_queue(struct blktap *tap)
+{
+       int queued, err;
+       struct request_queue *rq;
+       struct request *req;
+       struct blktap_ring *ring;
+       struct blktap_device *dev;
+       struct blktap_request *request;
+
+       queued = 0;
+       ring   = &tap->ring;
+       dev    = &tap->device;
+       rq     = dev->gd->queue;
+
+       BTDBG("running queue for %d\n", tap->minor);
+
+       while ((req = blk_peek_request(rq)) != NULL) {
+               if (req->cmd_type != REQ_TYPE_FS) {
+                       blk_start_request(req);
+                       req->errors = (DID_ERROR << 16) |
+                                     (DRIVER_INVALID << 24);
+                       __blk_end_request_all(req, -EIO);
+                       continue;
+               }
+
+               if (req->cmd_flags & (REQ_FLUSH|REQ_FUA)) {
+                       blk_start_request(req);
+                       __blk_end_request_all(req, -EOPNOTSUPP);
+                       continue;
+               }
+
+#ifdef ENABLE_PASSTHROUGH
+               if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
+                       blk_start_request(req);
+                       blktap_device_forward_request(tap, req);
+                       continue;
+               }
+#endif
+
+               if (RING_FULL(&ring->ring)) {
+               wait:   /* also reached by goto when allocation fails */
+                       /* Avoid pointless unplugs. */
+                       blk_stop_queue(rq);
+                       blktap_defer(tap);
+                       break;
+               }
+
+               request = blktap_request_allocate(tap);
+               if (!request) {
+                       tap->stats.st_oo_req++;
+                       goto wait;
+               }
+
+               BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%x) "
+                     "buffer:%p [%s], pending: %p\n", req, tap->minor,
+                     req->cmd, (unsigned long long)blk_rq_pos(req),
+                     blk_rq_cur_sectors(req), blk_rq_sectors(req), req->buffer,
+                     rq_data_dir(req) ? "write" : "read", request);
+
+               blk_start_request(req);
+
+               /* Swap the queue spinlock for tap_sem while mapping. */
+               spin_unlock_irq(&dev->lock);
+               down_write(&tap->tap_sem);
+
+               err = blktap_device_process_request(tap, request, req);
+               if (!err)
+                       queued++;
+               else {
+                       blk_end_request_all(req, err);
+                       blktap_request_free(tap, request);
+               }
+
+               up_write(&tap->tap_sem);
+               spin_lock_irq(&dev->lock);
+       }
+
+       if (queued)
+               blktap_ring_kick_user(tap);
+}
+
+/*
+ * Request function installed on the blktap queue.
+ *
+ * Runs the queue through blktap_device_run_queue() when the tap is active
+ * and not paused; defers the tap while a pause is in effect or has been
+ * requested.  If the queue has been detached from its device
+ * (rq->queuedata is NULL) or the tap is no longer active, every outstanding
+ * request is failed with -EIO.
+ *
+ * dev->lock held on entry
+ */
+static void
+blktap_device_do_request(struct request_queue *rq)
+{
+       struct request *req;
+       struct blktap *tap;
+       struct blktap_device *dev;
+
+       dev = rq->queuedata;
+       if (!dev)
+               goto fail;
+
+       tap = dev_to_blktap(dev);
+       if (!blktap_active(tap))
+               goto fail;
+
+       if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
+           test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
+               blktap_defer(tap);
+               return;
+       }
+
+       blktap_device_run_queue(tap);
+       return;
+
+fail:
+       while ((req = blk_fetch_request(rq))) {
+               /*
+                * Filesystem requests carry a meaningful sector range, so
+                * log it; anything else gets a driver error status, matching
+                * what blktap_device_run_queue() does for non-FS requests.
+                */
+               if (req->cmd_type == REQ_TYPE_FS) {
+                       unsigned long long sec = blk_rq_pos(req);
+
+                       BTERR("device closed: failing secs %#Lx-%#Lx\n",
+                             sec, sec + blk_rq_sectors(req) - 1);
+               } else
+                       req->errors = (DID_ERROR << 16)
+                                     | (DRIVER_INVALID << 24);
+               __blk_end_request_all(req, -EIO);
+       }
+}
+
+/*
+ * Resume request processing on @tap.  The restart is deferred instead when
+ * the tap is active with a full ring, or when the tap is paused or a pause
+ * has been requested.  Otherwise a stopped queue is started again and the
+ * request function is invoked directly, all under dev->lock.
+ */
+void
+blktap_device_restart(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct request_queue *queue;
+
+       if ((blktap_active(tap) && RING_FULL(&tap->ring.ring)) ||
+           test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ||
+           test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
+               blktap_defer(tap);
+               return;
+       }
+
+       spin_lock_irq(&tapdev->lock);
+
+       if (tapdev->gd) {
+               queue = tapdev->gd->queue;
+
+               /* Re-enable calldowns. */
+               if (blk_queue_stopped(queue))
+                       blk_start_queue(queue);
+
+               /* Kick things off immediately. */
+               blktap_device_do_request(queue);
+       }
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Apply the parameters in tap->params to the disk and its request queue.
+ * Does nothing unless the tap has a block device with a gendisk attached.
+ */
+static void
+blktap_device_configure(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       struct request_queue *queue;
+
+       if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+               return;
+       if (!tapdev->gd)
+               return;
+
+       queue = tapdev->gd->queue;
+
+       spin_lock_irq(&tapdev->lock);
+
+       set_capacity(tapdev->gd, tap->params.capacity);
+
+       /* Hard sector size and max sectors impersonate the equiv. hardware. */
+       blk_queue_logical_block_size(queue, tap->params.sector_size);
+       blk_queue_max_hw_sectors(queue, 512);
+
+       /* Each segment in a request is up to an aligned page in size. */
+       blk_queue_segment_boundary(queue, PAGE_SIZE - 1);
+       blk_queue_max_segment_size(queue, PAGE_SIZE);
+
+       /* Ensure a merged request will fit in a single I/O ring slot. */
+       blk_queue_max_segments(queue, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+       /* Make sure buffer addresses are sector-aligned. */
+       blk_queue_dma_alignment(queue, 511);
+
+       spin_unlock_irq(&tapdev->lock);
+}
+
+/*
+ * Resume a paused tap device.
+ *
+ * Returns -ENODEV if the tap has no block device or is not active, 0 if it
+ * is not paused, the blktap_ring_resume() error if resuming the ring fails,
+ * and 0 after the device has been reconfigured and restarted.
+ */
+int
+blktap_device_resume(struct blktap *tap)
+{
+       int rc;
+
+       if (!(test_bit(BLKTAP_DEVICE, &tap->dev_inuse) && blktap_active(tap)))
+               return -ENODEV;
+
+       if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+               return 0;
+
+       rc = blktap_ring_resume(tap);
+       if (!rc) {
+               /* device size may have changed */
+               blktap_device_configure(tap);
+
+               BTDBG("restarting device\n");
+               blktap_device_restart(tap);
+       }
+
+       return rc;
+}
+
+/*
+ * Begin pausing a tap device: stop its queue and flag the pause request
+ * under dev->lock, then pause the ring.
+ *
+ * Returns -ENODEV if the tap has no block device or is not active, 0 if it
+ * is already paused, otherwise the result of blktap_ring_pause().
+ */
+int
+blktap_device_pause(struct blktap *tap)
+{
+       struct blktap_device *tapdev = &tap->device;
+       unsigned long irqflags;
+
+       if (!(test_bit(BLKTAP_DEVICE, &tap->dev_inuse) && blktap_active(tap)))
+               return -ENODEV;
+
+       if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+               return 0;
+
+       spin_lock_irqsave(&tapdev->lock, irqflags);
+       blk_stop_queue(tapdev->gd->queue);
+       set_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+       spin_unlock_irqrestore(&tapdev->lock, irqflags);
+
+       return blktap_ring_pause(tap);
+}
+
+/*
+ * Tear down the block device of @tap.
+ *
+ * Returns 0 if the tap has no block device, -EBUSY while dev->users is
+ * non-zero, and 0 after the device has been destroyed.  Destruction stops
+ * the queue and detaches the gendisk under dev->lock, closes the
+ * passthrough device if ENABLE_PASSTHROUGH is built in and one is open,
+ * then unregisters the disk, cleans up its queue, drops the disk reference
+ * and wakes waiters on tap->wq.
+ */
+int
+blktap_device_destroy(struct blktap *tap)
+{
+       struct blktap_device *dev = &tap->device;
+       struct gendisk *gd = dev->gd;
+
+       if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+               return 0;
+
+       BTINFO("destroy device %d users %d\n", tap->minor, dev->users);
+
+       if (dev->users)
+               return -EBUSY;
+
+       spin_lock_irq(&dev->lock);
+       /* No more blktap_device_do_request(). */
+       blk_stop_queue(gd->queue);
+       clear_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+       dev->gd = NULL;
+       spin_unlock_irq(&dev->lock);
+
+#ifdef ENABLE_PASSTHROUGH
+       if (dev->bdev)
+               blktap_device_close_bdev(tap);
+#endif
+
+       /* gd was captured above; dev->gd is already NULL at this point. */
+       del_gendisk(gd);
+       blk_cleanup_queue(gd->queue);
+       put_disk(gd);
+
+       wake_up(&tap->wq);
+
+       return 0;
+}
+
+/*
+ * devnode callback for blktap disks: returns a kasprintf()-allocated name,
+ * BLKTAP2_DEV_DIR followed by "tapdev<first_minor>".  @mode is left
+ * untouched.
+ */
+static char *blktap_devnode(struct gendisk *gd, umode_t *mode)
+{
+       char *node;
+
+       node = kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "tapdev%u",
+                        gd->first_minor);
+       return node;
+}
+
+/*
+ * Create and register the block device for @tap.
+ *
+ * Allocates a single-minor gendisk named "tapdev<letters>", sets up a
+ * request queue served by blktap_device_do_request() with the "noop"
+ * elevator, applies tap->params and adds the disk.
+ *
+ * Returns 0 on success, -EEXIST if the tap already has a device, -EINVAL if
+ * the parameters do not validate, and -ENODEV if the disk or the queue
+ * cannot be allocated.
+ */
+int
+blktap_device_create(struct blktap *tap)
+{
+       int minor, err;
+       struct gendisk *gd;
+       struct request_queue *rq;
+       struct blktap_device *dev;
+
+       gd    = NULL;
+       rq    = NULL;
+       dev   = &tap->device;
+       minor = tap->minor;
+
+       if (test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+               return -EEXIST;
+
+       if (blktap_validate_params(tap, &tap->params))
+               return -EINVAL;
+
+       BTINFO("minor %d sectors %Lu sector-size %lu\n",
+              minor, tap->params.capacity, tap->params.sector_size);
+
+       err = -ENODEV;
+
+       gd = alloc_disk(1);
+       if (!gd)
+               goto error;
+
+       /* One letter for the first 26 minors, two letters beyond that. */
+       if (minor < 26)
+               sprintf(gd->disk_name, "tapdev%c", 'a' + minor);
+       else
+               sprintf(gd->disk_name, "tapdev%c%c",
+                       'a' + ((minor / 26) - 1), 'a' + (minor % 26));
+
+       gd->major = blktap_device_major;
+       gd->first_minor = minor;
+       gd->devnode = blktap_devnode;
+       gd->fops = &blktap_device_file_operations;
+       gd->private_data = dev;
+
+       spin_lock_init(&dev->lock);
+       rq = blk_init_queue(blktap_device_do_request, &dev->lock);
+       if (!rq)
+               goto error;
+
+       elevator_init(rq, "noop");
+
+       gd->queue     = rq;
+       rq->queuedata = dev;
+       dev->gd       = gd;
+
+       set_bit(BLKTAP_DEVICE, &tap->dev_inuse);
+       blktap_device_configure(tap);
+
+       add_disk(gd);
+
+       err = 0;
+       goto out;
+
+ error:
+       if (rq)
+               blk_cleanup_queue(rq);
+       /*
+        * No error path is reached after add_disk(), so the disk was never
+        * registered: drop the alloc_disk() reference with put_disk()
+        * rather than calling del_gendisk() on an unregistered disk.
+        */
+       if (gd)
+               put_disk(gd);
+
+ out:
+       BTINFO("creation of %u:%u: %d\n", blktap_device_major, tap->minor, err);
+       return err;
+}
+
+/*
+ * Register the "tapdev" block major.
+ *
+ * @maj: receives the dynamically allocated major number on success
+ *
+ * Returns 0 on success, or the negative error code reported by
+ * register_blkdev(); *maj is left untouched on failure.
+ */
+int __init
+blktap_device_init(int *maj)
+{
+       int major;
+
+       /* Dynamically allocate a major for this device */
+       major = register_blkdev(0, "tapdev");
+       if (major < 0) {
+               BTERR("Couldn't register blktap device\n");
+               /* Report the real cause instead of a blanket -ENOMEM. */
+               return major;
+       }
+
+       blktap_device_major = *maj = major;
+       BTINFO("blktap device major %d\n", major);
+
+       return 0;
+}
+
+/*
+ * Give back the "tapdev" block major, if one was registered.
+ */
+void
+blktap_device_free(void)
+{
+       if (!blktap_device_major)
+               return;
+
+       unregister_blkdev(blktap_device_major, "tapdev");
+}
diff --git a/drivers/xen/blktap2/request.c b/drivers/xen/blktap2/request.c

new file mode 100644 (file)

index 0000000..a27cf8a
--- /dev/null
+++ b/drivers/xen/blktap2/request.c
@@ -0,0 +1,296 @@
+#include <linux/spinlock.h>
+#include <xen/balloon.h>
+
+#include "blktap.h"
+
+#define MAX_BUCKETS                      8
+#define BUCKET_SIZE                      MAX_PENDING_REQS
+
+#define BLKTAP_POOL_CLOSING              1
+
+struct blktap_request_bucket;
+
+/*
+ * Pool-side wrapper around one blktap_request.  blktap_request_to_handle()
+ * recovers the wrapper from a pointer to the embedded request.
+ */
+struct blktap_request_handle {
+       /* index of this handle within bucket->handles[] */
+       int                              slot;
+       /* 1 while the request is handed out, 0 while it is free */
+       uint8_t                          inuse;
+       struct blktap_request            request;
+       /* bucket this handle lives in */
+       struct blktap_request_bucket    *bucket;
+};
+
+/* One allocation unit of the pool: BUCKET_SIZE handles plus their pages. */
+struct blktap_request_bucket {
+       /* number of handles of this bucket that are currently handed out */
+       atomic_t                         reqs_in_use;
+       struct blktap_request_handle     handles[BUCKET_SIZE];
+       /* page vector of MMAP_PAGES entries; indexed in request_to_page() */
+       struct page                    **foreign_pages;
+};
+
+/* The single, file-global request pool shared by all taps. */
+struct blktap_request_pool {
+       /* taken around free-list, bucket-table and status updates */
+       spinlock_t                       lock;
+       /* BLKTAP_POOL_CLOSING once teardown has started */
+       uint8_t                          status;
+       /* requests available for allocation, linked via request->free_list */
+       struct list_head                 free_list;
+       /* number of requests handed out across all buckets */
+       atomic_t                         reqs_in_use;
+       /* woken when reqs_in_use drops to zero */
+       wait_queue_head_t                wait_queue;
+       /* NULL entries are vacant slots */
+       struct blktap_request_bucket    *buckets[MAX_BUCKETS];
+};
+
+static struct blktap_request_pool pool;
+
+/* Map a request back to the handle structure it is embedded in. */
+static inline struct blktap_request_handle *
+blktap_request_to_handle(struct blktap_request *req)
+{
+       struct blktap_request_handle *handle;
+
+       handle = container_of(req, struct blktap_request_handle, request);
+       return handle;
+}
+
+/*
+ * Put a request into its pristine "free" state: no user index, no pages,
+ * an empty free_list link and every grant handle marked invalid.
+ */
+static void
+blktap_request_pool_init_request(struct blktap_request *request)
+{
+       int seg;
+
+       for (seg = 0; seg < ARRAY_SIZE(request->handles); seg++) {
+               request->handles[seg].kernel = INVALID_GRANT_HANDLE;
+               request->handles[seg].user   = INVALID_GRANT_HANDLE;
+       }
+
+       INIT_LIST_HEAD(&request->free_list);
+       request->status   = BLKTAP_REQUEST_FREE;
+       request->nr_pages = 0;
+       request->usr_idx  = -1;
+}
+
+/*
+ * Allocate one bucket (BUCKET_SIZE request handles plus MMAP_PAGES empty
+ * pages), install it in the first vacant pool.buckets[] slot and queue all
+ * of its requests on pool.free_list.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails or every slot is
+ * already occupied.
+ */
+static int
+blktap_request_pool_allocate_bucket(void)
+{
+       int n, slot;
+       unsigned long flags;
+       struct blktap_request_handle *handle;
+       struct blktap_request_bucket *bucket;
+
+       bucket = kzalloc(sizeof(struct blktap_request_bucket), GFP_KERNEL);
+       if (!bucket)
+               goto fail;
+
+       bucket->foreign_pages = alloc_empty_pages_and_pagevec(MMAP_PAGES);
+       if (!bucket->foreign_pages)
+               goto fail;
+
+       spin_lock_irqsave(&pool.lock, flags);
+
+       /* look for the first vacant bucket slot */
+       for (slot = 0; slot < MAX_BUCKETS; slot++)
+               if (!pool.buckets[slot])
+                       break;
+
+       if (slot == MAX_BUCKETS) {
+               spin_unlock_irqrestore(&pool.lock, flags);
+               goto fail;
+       }
+
+       pool.buckets[slot] = bucket;
+
+       for (n = 0; n < BUCKET_SIZE; n++) {
+               handle = &bucket->handles[n];
+
+               handle->bucket = bucket;
+               handle->inuse  = 0;
+               handle->slot   = n;
+
+               blktap_request_pool_init_request(&handle->request);
+               list_add_tail(&handle->request.free_list, &pool.free_list);
+       }
+
+       spin_unlock_irqrestore(&pool.lock, flags);
+
+       return 0;
+
+fail:
+       if (bucket && bucket->foreign_pages)
+               free_empty_pages_and_pagevec(bucket->foreign_pages, MMAP_PAGES);
+       kfree(bucket);
+       return -ENOMEM;
+}
+
+/* Release a bucket's page vector and the bucket itself; NULL is a no-op. */
+static void
+blktap_request_pool_free_bucket(struct blktap_request_bucket *bucket)
+{
+       if (bucket) {
+               BTDBG("freeing bucket %p\n", bucket);
+
+               free_empty_pages_and_pagevec(bucket->foreign_pages,
+                                            MMAP_PAGES);
+               kfree(bucket);
+       }
+}
+
+/*
+ * Return the page backing segment @seg of @req.  The page is looked up in
+ * the owning bucket's foreign_pages vector at index
+ * slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg.
+ */
+struct page *
+request_to_page(struct blktap_request *req, int seg)
+{
+       struct blktap_request_handle *handle = blktap_request_to_handle(req);
+       struct page **pages = handle->bucket->foreign_pages;
+
+       return pages[handle->slot * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg];
+}
+
+/*
+ * Release one idle bucket (no requests handed out) back to the system.
+ * Bucket 0 is never released, so the pool always keeps at least one.
+ *
+ * Returns 0 if a bucket was freed, -EAGAIN if no idle bucket was found.
+ */
+int
+blktap_request_pool_shrink(void)
+{
+       int i, j, err;
+       unsigned long flags;
+       struct blktap_request_bucket *bucket;
+
+       err = -EAGAIN;
+
+       spin_lock_irqsave(&pool.lock, flags);
+
+       /* always keep at least one bucket */
+       for (i = 1; i < MAX_BUCKETS; i++) {
+               bucket = pool.buckets[i];
+               if (!bucket)
+                       continue;
+
+               if (atomic_read(&bucket->reqs_in_use))
+                       continue;
+
+               /*
+                * An idle bucket has every one of its requests queued on
+                * pool.free_list.  Unlink them before the bucket is freed;
+                * otherwise the free list keeps pointing into released
+                * memory and a later allocation would hand it out.
+                */
+               for (j = 0; j < BUCKET_SIZE; j++)
+                       list_del(&bucket->handles[j].request.free_list);
+
+               blktap_request_pool_free_bucket(bucket);
+               pool.buckets[i] = NULL;
+               err = 0;
+               break;
+       }
+
+       spin_unlock_irqrestore(&pool.lock, flags);
+
+       return err;
+}
+
+/* Add one more bucket of requests to the pool. */
+int
+blktap_request_pool_grow(void)
+{
+       int err = blktap_request_pool_allocate_bucket();
+
+       return err;
+}
+
+/*
+ * Take a request from the pool and bind it to a vacant slot of @tap.
+ *
+ * Returns the request, or NULL when the pool is closing, @tap has no vacant
+ * pending_requests[] slot, or the free list is empty.
+ */
+struct blktap_request *
+blktap_request_allocate(struct blktap *tap)
+{
+       int i;
+       unsigned long flags;
+       uint16_t usr_idx = (uint16_t)-1;
+       struct blktap_request *request = NULL;
+       struct blktap_request_handle *handle;
+
+       spin_lock_irqsave(&pool.lock, flags);
+
+       if (pool.status == BLKTAP_POOL_CLOSING)
+               goto out;
+
+       /* the lowest vacant slot of this tap becomes the request's usr_idx */
+       for (i = 0; i < ARRAY_SIZE(tap->pending_requests); i++) {
+               if (tap->pending_requests[i])
+                       continue;
+               usr_idx = i;
+               break;
+       }
+
+       if (usr_idx == (uint16_t)-1 || list_empty(&pool.free_list))
+               goto out;
+
+       request = list_entry(pool.free_list.next,
+                            struct blktap_request, free_list);
+       list_del(&request->free_list);
+
+       /* account for the request in its bucket and in the pool */
+       handle = blktap_request_to_handle(request);
+       handle->inuse = 1;
+       atomic_inc(&handle->bucket->reqs_in_use);
+       atomic_inc(&pool.reqs_in_use);
+
+       request->usr_idx = usr_idx;
+       tap->pending_requests[usr_idx] = request;
+       tap->pending_cnt++;
+
+out:
+       spin_unlock_irqrestore(&pool.lock, flags);
+       return request;
+}
+
+/*
+ * Return @request to the pool and clear @tap's pending slot for it.
+ *
+ * Wakes tap->wq when @tap has no pending requests left, and
+ * pool.wait_queue when no request of the whole pool is in use any more.
+ */
+void
+blktap_request_free(struct blktap *tap, struct blktap_request *request)
+{
+       int free, idle;
+       unsigned long flags;
+       struct blktap_request_handle *handle;
+
+       BUG_ON(request->usr_idx >= ARRAY_SIZE(tap->pending_requests));
+       handle = blktap_request_to_handle(request);
+
+       spin_lock_irqsave(&pool.lock, flags);
+
+       handle->inuse = 0;
+       tap->pending_requests[request->usr_idx] = NULL;
+       blktap_request_pool_init_request(request);
+       list_add(&request->free_list, &pool.free_list);
+       atomic_dec(&handle->bucket->reqs_in_use);
+       free = atomic_dec_and_test(&pool.reqs_in_use);
+
+       /*
+        * pending_cnt is incremented under pool.lock by
+        * blktap_request_allocate(); decrement it under the same lock so
+        * the two read-modify-write updates cannot race.
+        */
+       idle = (--tap->pending_cnt == 0);
+
+       spin_unlock_irqrestore(&pool.lock, flags);
+
+       if (idle)
+               wake_up_interruptible(&tap->wq);
+
+       if (free)
+               wake_up(&pool.wait_queue);
+}
+
+/*
+ * Tear the pool down: mark it closing so no new requests are handed out,
+ * wait until every outstanding request has been returned, then free all
+ * buckets.
+ */
+void
+blktap_request_pool_free(void)
+{
+       int i;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pool.lock, flags);
+
+       pool.status = BLKTAP_POOL_CLOSING;
+       while (atomic_read(&pool.reqs_in_use)) {
+               /* cannot sleep with the spinlock held */
+               spin_unlock_irqrestore(&pool.lock, flags);
+               wait_event(pool.wait_queue, !atomic_read(&pool.reqs_in_use));
+               spin_lock_irqsave(&pool.lock, flags);
+       }
+
+       /*
+        * Every entry on the free list lives inside one of the buckets that
+        * are about to be freed; empty the list first so it never points
+        * into released memory.
+        */
+       INIT_LIST_HEAD(&pool.free_list);
+
+       for (i = 0; i < MAX_BUCKETS; i++) {
+               blktap_request_pool_free_bucket(pool.buckets[i]);
+               pool.buckets[i] = NULL;
+       }
+
+       spin_unlock_irqrestore(&pool.lock, flags);
+}
+
+/*
+ * Initialise the global request pool and pre-allocate two buckets.
+ *
+ * Returns 0 on success; on failure the pool is torn down again and the
+ * bucket allocator's error is returned.
+ */
+int __init
+blktap_request_pool_init(void)
+{
+       int n, err = 0;
+
+       memset(&pool, 0, sizeof(pool));
+
+       init_waitqueue_head(&pool.wait_queue);
+       atomic_set(&pool.reqs_in_use, 0);
+       INIT_LIST_HEAD(&pool.free_list);
+       spin_lock_init(&pool.lock);
+
+       for (n = 0; n < 2 && !err; n++)
+               err = blktap_request_pool_allocate_bucket();
+
+       if (err)
+               blktap_request_pool_free();
+
+       return err;
+}
diff --git a/drivers/xen/blktap2/ring.c b/drivers/xen/blktap2/ring.c

new file mode 100644 (file)

index 0000000..28de657
--- /dev/null
+++ b/drivers/xen/blktap2/ring.c
@@ -0,0 +1,610 @@
+#include <linux/module.h>
+#include <linux/signal.h>
+
+#include "blktap.h"
+
+static int blktap_ring_major;
+
+/*
+ * Recover the tap from a ring VMA: vm_private_data points at the
+ * foreign_map embedded in the tap's blktap_ring.
+ */
+static inline struct blktap *
+vma_to_blktap(struct vm_area_struct *vma)
+{
+       struct vm_foreign_map *fmap = vma->vm_private_data;
+       struct blktap_ring *owner;
+
+       owner = container_of(fmap, struct blktap_ring, foreign_map);
+       return container_of(owner, struct blktap, ring);
+}
+
+ /* 
+  * BLKTAP - immediately before the mmap area,
+  * we have a bunch of pages reserved for shared memory rings.
+  */
+#define RING_PAGES 1
+
+/*
+ * Consume every response currently on the shared ring and complete the
+ * matching pending request of @tap.  Responses whose id does not name a
+ * pending request are logged and dropped.
+ *
+ * Always returns 0; it does nothing when the ring is not mapped.
+ */
+static int
+blktap_read_ring(struct blktap *tap)
+{
+       /* This is called to read responses from the ring. */
+       int usr_idx;
+       RING_IDX rc, rp;
+       blkif_response_t res;
+       struct blktap_ring *ring;
+       struct blktap_request *request;
+
+       down_read(&tap->tap_sem);
+
+       ring = &tap->ring;
+       if (!ring->vma) {
+               up_read(&tap->tap_sem);
+               return 0;
+       }
+
+       /* for each outstanding message on the ring  */
+       rp = ring->ring.sring->rsp_prod;
+       rmb();
+
+       for (rc = ring->ring.rsp_cons; rc != rp; rc++) {
+               memcpy(&res, RING_GET_RESPONSE(&ring->ring, rc), sizeof(res));
+               mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). */
+               ++ring->ring.rsp_cons;
+
+               /*
+                * res.id was copied from the ring page that is mapped into
+                * user space, so it cannot be trusted.  Range-check the
+                * full-width id as well as the narrowed index: a value that
+                * is negative or out of range after the cast must never be
+                * used to index pending_requests[].
+                */
+               usr_idx = (int)res.id;
+               if (res.id >= MAX_PENDING_REQS ||
+                   usr_idx < 0 ||
+                   !tap->pending_requests[usr_idx]) {
+                       BTWARN("Request %d/%d invalid [%x], tapdisk %d%p\n",
+                              rc, rp, usr_idx, tap->pid, ring->vma);
+                       continue;
+               }
+
+               request = tap->pending_requests[usr_idx];
+               BTDBG("request %p response #%d id %x\n", request, rc, usr_idx);
+               blktap_device_finish_request(tap, &res, request);
+       }
+
+       up_read(&tap->tap_sem);
+
+       blktap_run_deferred();
+
+       return 0;
+}
+
+/*
+ * Fault handler for the ring VMA.  Nothing is ever faulted in on demand
+ * here: the handler unconditionally reports VM_FAULT_SIGBUS.
+ */
+static int
+blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       /*
+        * if the page has not been mapped in by the driver then return
+        * VM_FAULT_SIGBUS to the domain.
+        */
+
+       return VM_FAULT_SIGBUS;
+}
+
+/*
+ * zap_pte hook of the ring VMA: tear down one user PTE at @uvaddr.
+ *
+ * Addresses below ring->user_vstart are cleared with the plain
+ * xen_ptep_get_and_clear_full().  For addresses at or above it, the
+ * foreign-map entry is dropped and any valid kernel/user grant handles
+ * recorded for the corresponding request segment are unmapped with a
+ * single GNTTABOP_unmap_grant_ref call.
+ *
+ * Returns the PTE value (read before the unmap when a user grant handle is
+ * present, otherwise the value returned by xen_ptep_get_and_clear_full()).
+ */
+static pte_t
+blktap_ring_clear_pte(struct vm_area_struct *vma,
+                     unsigned long uvaddr,
+                     pte_t *ptep, int is_fullmm)
+{
+       pte_t copy;
+       struct blktap *tap;
+       unsigned long kvaddr;
+       struct page **map, *page;
+       struct blktap_ring *ring;
+       struct blktap_request *request;
+       struct grant_handle_pair *khandle;
+       struct gnttab_unmap_grant_ref unmap[2];
+       int offset, seg, usr_idx, count = 0;
+
+       tap  = vma_to_blktap(vma);
+       ring = &tap->ring;
+       map  = ring->foreign_map.map;
+       BUG_ON(!map);   /* TODO Should this be changed to if statement? */
+
+       /*
+        * Zap entry if the address is before the start of the grant
+        * mapped region.
+        */
+       if (uvaddr < ring->user_vstart)
+               return xen_ptep_get_and_clear_full(vma, uvaddr,
+                                                  ptep, is_fullmm);
+
+       /* page offset into the data area selects request slot and segment */
+       offset  = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
+       usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       seg     = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       /* the foreign map is indexed from the start of the whole VMA */
+       offset  = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
+       page    = map[offset];
+       if (page && PageBlkback(page)) {
+               ClearPageBlkback(page);
+               set_page_private(page, 0);
+       }
+       map[offset] = NULL;
+
+       /*
+        * NOTE(review): request is dereferenced without a NULL check;
+        * presumably a PTE can only be present here while the slot holds a
+        * pending request -- confirm against the mapping code.
+        */
+       request = tap->pending_requests[usr_idx];
+       kvaddr  = request_to_kaddr(request, seg);
+       khandle = request->handles + seg;
+
+       if (khandle->kernel != INVALID_GRANT_HANDLE) {
+               gnttab_set_unmap_op(&unmap[count], kvaddr, 
+                                   GNTMAP_host_map, khandle->kernel);
+               count++;
+
+               set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, 
+                                   INVALID_P2M_ENTRY);
+       }
+
+
+       if (khandle->user != INVALID_GRANT_HANDLE) {
+               BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+
+               /* the unmap op below acts on the PTE itself (contains_pte) */
+               copy = *ptep;
+               gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep), 
+                                   GNTMAP_host_map 
+                                   | GNTMAP_application_map 
+                                   | GNTMAP_contains_pte,
+                                   khandle->user);
+               count++;
+       } else
+               copy = xen_ptep_get_and_clear_full(vma, uvaddr, ptep,
+                                                  is_fullmm);
+
+       if (count)
+               if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                             unmap, count))
+                       BUG();
+
+       khandle->kernel = INVALID_GRANT_HANDLE;
+       khandle->user   = INVALID_GRANT_HANDLE;
+
+       return copy;
+}
+
+/*
+ * unmap hook of the ring VMA: with tap_sem held for writing, clear the
+ * RING_VMA, PAUSED and PAUSE_REQUESTED bits of the owning tap.
+ */
+static void
+blktap_ring_vm_unmap(struct vm_area_struct *vma)
+{
+       struct blktap *tap = vma_to_blktap(vma);
+
+       down_write(&tap->tap_sem);
+       clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
+       clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
+       clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+       up_write(&tap->tap_sem);
+}
+
+/*
+ * close hook of the ring VMA.  Fails outstanding work first, then, with
+ * tap_sem held for writing, zaps the whole mapping, frees the foreign map
+ * and the shared ring page and clears ring->ring.sring / ring->vma.
+ * Waiters on tap->wq are woken once the semaphore has been released.
+ */
+static void
+blktap_ring_vm_close(struct vm_area_struct *vma)
+{
+       struct blktap *tap = vma_to_blktap(vma);
+       struct blktap_ring *ring = &tap->ring;
+
+       blktap_ring_vm_unmap(vma);                 /* fail future requests */
+       blktap_device_fail_pending_requests(tap);  /* fail pending requests */
+       blktap_device_restart(tap);                /* fail deferred requests */
+
+       down_write(&tap->tap_sem);
+
+       zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+
+       kfree(ring->foreign_map.map);
+       ring->foreign_map.map = NULL;
+
+       /* Free the ring page. */
+       ClearPageReserved(virt_to_page(ring->ring.sring));
+       free_page((unsigned long)ring->ring.sring);
+
+       BTINFO("unmapping ring %d\n", tap->minor);
+       /* readers test these pointers under tap_sem to detect an unmapped ring */
+       ring->ring.sring = NULL;
+       ring->vma = NULL;
+
+       up_write(&tap->tap_sem);
+
+       wake_up(&tap->wq);
+}
+
+/* VMA callbacks installed on the ring mapping by blktap_ring_mmap(). */
+static struct vm_operations_struct blktap_ring_vm_operations = {
+       .close    = blktap_ring_vm_close,
+       .unmap    = blktap_ring_vm_unmap,
+       .fault    = blktap_ring_fault,
+       .zap_pte  = blktap_ring_clear_pte,
+};
+
+/*
+ * Open the ring character device for the tap selected by the inode's minor.
+ *
+ * Returns 0 on success, -ENODEV if the minor does not name a tap or the tap
+ * has no control owner, -EBUSY if the ring is already open.
+ */
+static int
+blktap_ring_open(struct inode *inode, struct file *filp)
+{
+       struct blktap *tap;
+       int idx = iminor(inode);
+
+       if (idx < 0 || idx >= CONFIG_XEN_NR_TAP2_DEVICES || !blktaps[idx]) {
+               BTERR("unable to open device blktap%d\n", idx);
+               return -ENODEV;
+       }
+
+       tap = blktaps[idx];
+       BTINFO("opening device blktap%d\n", idx);
+
+       if (!test_bit(BLKTAP_CONTROL, &tap->dev_inuse))
+               return -ENODEV;
+
+       /* the ring may have a single opener at any time */
+       if (test_and_set_bit(BLKTAP_RING_FD, &tap->dev_inuse))
+               return -EBUSY;
+
+       filp->private_data = tap;
+       BTINFO("opened device %d\n", tap->minor);
+
+       return 0;
+}
+
+/*
+ * Last close of the ring device: drop the RING_FD claim, detach the tap
+ * from the file and wake anyone sleeping on tap->wq.  Always returns 0.
+ */
+static int
+blktap_ring_release(struct inode *inode, struct file *filp)
+{
+       struct blktap *tap;
+
+       tap = filp->private_data;
+       BTINFO("freeing device %d\n", tap->minor);
+
+       clear_bit(BLKTAP_RING_FD, &tap->dev_inuse);
+       filp->private_data = NULL;
+
+       wake_up(&tap->wq);
+       return 0;
+}
+
+/* Note on mmap:
+ * We need to map pages to user space in a way that will allow the block
+ * subsystem set up direct IO to them.  This couldn't be done before, because
+ * there isn't really a sane way to translate a user virtual address down to a 
+ * physical address when the page belongs to another domain.
+ *
+ * My first approach was to map the page in to kernel memory, add an entry
+ * for it in the physical frame list (using alloc_lomem_region as in blkback)
+ * and then attempt to map that page up to user space.  This is disallowed
+ * by xen though, which realizes that we don't really own the machine frame
+ * underlying the physical page.
+ *
+ * The new approach is to provide explicit support for this in xen linux.
+ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
+ * mapped from other vms.  vma->vm_private_data is set up as a mapping 
+ * from pages to actual page structs.  There is a new clause in get_user_pages
+ * that does the right thing for this sort of mapping.
+ */
+/*
+ * Map the ring device: one shared ring page followed by MMAP_PAGES of
+ * foreign-page slots.  The caller must map exactly
+ * MMAP_PAGES + RING_PAGES pages.
+ *
+ * Returns 0 on success, -EAGAIN for a wrongly sized mapping and -ENOMEM
+ * when the ring is already mapped, an allocation fails or the ring page
+ * cannot be inserted.  Every failure path releases the BLKTAP_RING_VMA
+ * claim taken on entry, so a later mmap attempt can succeed.
+ */
+static int
+blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+       int size, err;
+       struct page **map;
+       struct blktap *tap;
+       blkif_sring_t *sring;
+       struct blktap_ring *ring;
+
+       tap   = filp->private_data;
+       map   = NULL;
+       sring = NULL;
+
+       if (!tap || test_and_set_bit(BLKTAP_RING_VMA, &tap->dev_inuse))
+               return -ENOMEM;
+
+       /* only form the ring pointer once tap is known to be valid */
+       ring = &tap->ring;
+
+       size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       if (size != (MMAP_PAGES + RING_PAGES)) {
+               BTERR("you _must_ map exactly %lu pages!\n",
+                     MMAP_PAGES + RING_PAGES);
+               err = -EAGAIN;
+               goto fail_mem;
+       }
+
+       err = -ENOMEM;
+
+       /* Allocate the fe ring. */
+       sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
+       if (!sring) {
+               BTERR("Couldn't alloc sring.\n");
+               goto fail_mem;
+       }
+
+       map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
+       if (!map) {
+               BTERR("Couldn't alloc VM_FOREIGN map.\n");
+               goto fail_mem;
+       }
+
+       SetPageReserved(virt_to_page(sring));
+
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&ring->ring, sring, PAGE_SIZE);
+
+       ring->ring_vstart = vma->vm_start;
+       ring->user_vstart = ring->ring_vstart + (RING_PAGES << PAGE_SHIFT);
+
+       /* Map the ring pages to the start of the region and reserve it. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               err = vm_insert_page(vma, vma->vm_start,
+                                    virt_to_page(ring->ring.sring));
+       else
+               err = remap_pfn_range(vma, vma->vm_start,
+                                     __pa(ring->ring.sring) >> PAGE_SHIFT,
+                                     PAGE_SIZE, vma->vm_page_prot);
+       if (err) {
+               BTERR("Mapping user ring failed: %d\n", err);
+               /* callers have always seen -ENOMEM for this failure */
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       /* Mark this VM as containing foreign pages, and set up mappings. */
+       ring->foreign_map.map = map;
+       vma->vm_private_data = &ring->foreign_map;
+       vma->vm_flags |= VM_FOREIGN;
+       vma->vm_flags |= VM_DONTCOPY;
+       vma->vm_flags |= VM_RESERVED;
+       vma->vm_ops = &blktap_ring_vm_operations;
+
+#ifdef CONFIG_X86
+       vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
+       tap->pid = current->pid;
+       BTINFO("blktap: mapping pid is %d\n", tap->pid);
+
+       ring->vma = vma;
+       return 0;
+
+ fail:
+       /* Clear any active mappings. */
+       zap_page_range(vma, vma->vm_start,
+                      vma->vm_end - vma->vm_start, NULL);
+       ClearPageReserved(virt_to_page(sring));
+       /*
+        * FRONT_RING_INIT() published the page in ring->ring.sring; do not
+        * leave that pointer dangling once the page is freed below.
+        */
+       ring->ring.sring = NULL;
+ fail_mem:
+       free_page((unsigned long)sring);
+       kfree(map);
+       /*
+        * vm_ops were never installed, so the close/unmap hooks will not
+        * run for this VMA; release the RING_VMA claim here.
+        */
+       clear_bit(BLKTAP_RING_VMA, &tap->dev_inuse);
+
+       return err;
+}
+
+/*
+ * Store @msg in the message field of the shared ring page.  Done under
+ * tap_sem (read side) and skipped when no ring page is set up.
+ */
+static inline void
+blktap_ring_set_message(struct blktap *tap, int msg)
+{
+       down_read(&tap->tap_sem);
+
+       if (tap->ring.ring.sring)
+               tap->ring.ring.sring->private.tapif_user.msg = msg;
+
+       up_read(&tap->tap_sem);
+}
+
+/*
+ * ioctl entry point of the ring device.
+ *
+ *   KICK_FE        process responses queued on the ring
+ *   CREATE_DEVICE  validate user-supplied params and create the device
+ *   SET_PARAMS     replace the params of a paused tap
+ *   PAUSE          acknowledge a requested pause
+ *   REOPEN         copy the tap's name out to user space (paused taps only)
+ *   RESUME         report the resume result passed in @arg
+ *
+ * Returns the command's result, -EINVAL/-EFAULT on bad state or arguments,
+ * and -ENOIOCTLCMD for an unrecognised command.
+ */
+static long
+blktap_ring_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+       struct blktap_params params;
+       struct blktap *tap = filp->private_data;
+
+       BTDBG("%d: cmd: %u, arg: %lu\n", tap->minor, cmd, arg);
+
+       switch(cmd) {
+       case BLKTAP2_IOCTL_KICK_FE:
+               /* There are fe messages to process. */
+               return blktap_read_ring(tap);
+
+       case BLKTAP2_IOCTL_CREATE_DEVICE:
+               if (!arg)
+                       return -EINVAL;
+
+               if (copy_from_user(&params, (struct blktap_params __user *)arg,
+                                  sizeof(params))) {
+                       BTERR("failed to get params\n");
+                       return -EFAULT;
+               }
+
+               if (blktap_validate_params(tap, &params)) {
+                       BTERR("invalid params\n");
+                       return -EINVAL;
+               }
+
+               tap->params = params;
+               return blktap_device_create(tap);
+
+       case BLKTAP2_IOCTL_SET_PARAMS:
+               if (!arg)
+                       return -EINVAL;
+
+               /* params may only be replaced while the tap is paused */
+               if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+                       return -EINVAL;
+
+               if (copy_from_user(&params, (struct blktap_params __user *)arg,
+                                  sizeof(params))) {
+                       BTERR("failed to get params\n");
+                       return -EFAULT;
+               }
+
+               if (blktap_validate_params(tap, &params)) {
+                       BTERR("invalid params\n");
+                       return -EINVAL;
+               }
+
+               tap->params = params;
+               return 0;
+
+       case BLKTAP2_IOCTL_PAUSE:
+               /* only valid as the answer to a pending pause request */
+               if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
+                       return -EINVAL;
+
+               set_bit(BLKTAP_PAUSED, &tap->dev_inuse);
+               clear_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse);
+
+               blktap_ring_set_message(tap, 0);
+               wake_up_interruptible(&tap->wq);
+
+               return 0;
+
+
+       case BLKTAP2_IOCTL_REOPEN:
+               if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+                       return -EINVAL;
+
+               if (!arg)
+                       return -EINVAL;
+
+               /* copies the name including its terminating NUL */
+               if (copy_to_user((char __user *)arg,
+                                tap->params.name,
+                                strlen(tap->params.name) + 1))
+                       return -EFAULT;
+
+               blktap_ring_set_message(tap, 0);
+               wake_up_interruptible(&tap->wq);
+
+               return 0;
+
+       case BLKTAP2_IOCTL_RESUME:
+               if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+                       return -EINVAL;
+
+               /* a zero response clears the paused state; non-zero keeps it */
+               tap->ring.response = (int)arg;
+               if (!tap->ring.response)
+                       clear_bit(BLKTAP_PAUSED, &tap->dev_inuse);
+
+               blktap_ring_set_message(tap, 0);
+               wake_up_interruptible(&tap->wq);
+
+               return 0;
+       }
+
+       return -ENOIOCTLCMD;
+}
+
+/*
+ * poll entry point of the ring device.  Reports POLLIN | POLLRDNORM when a
+ * message is posted in the shared ring page or when privately produced
+ * requests have not been pushed yet (they are pushed here).
+ *
+ * The ring page exists only between mmap and the VMA's close, so a file
+ * that is open but not (or no longer) mapped simply reports no events
+ * instead of dereferencing a NULL ring pointer.
+ */
+static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
+{
+       struct blktap *tap = filp->private_data;
+       struct blktap_ring *ring = &tap->ring;
+       unsigned int mask = 0;
+
+       poll_wait(filp, &ring->poll_wait, wait);
+
+       /* tap_sem keeps the ring page from being freed while it is read */
+       down_read(&tap->tap_sem);
+       if (ring->ring.sring &&
+           (ring->ring.sring->private.tapif_user.msg ||
+            ring->ring.req_prod_pvt != ring->ring.sring->req_prod)) {
+               RING_PUSH_REQUESTS(&ring->ring);
+               mask = POLLIN | POLLRDNORM;
+       }
+       up_read(&tap->tap_sem);
+
+       return mask;
+}
+
+/* File operations of the "blktap2" ring character device. */
+static const struct file_operations blktap_ring_file_operations = {
+       .owner    = THIS_MODULE,
+       .open     = blktap_ring_open,
+       .release  = blktap_ring_release,
+       .unlocked_ioctl = blktap_ring_ioctl,
+       .mmap     = blktap_ring_mmap,
+       .poll     = blktap_ring_poll,
+};
+
+/* Wake whoever is sleeping on the ring's poll wait queue. */
+void
+blktap_ring_kick_user(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       wake_up_interruptible(&ring->poll_wait);
+}
+
+/*
+ * Post a RESUME message on the ring of a paused tap and wait for the
+ * answer.
+ *
+ * Returns 0 once the tap is no longer paused, the non-zero response value
+ * if one was recorded, -EAGAIN if the wait ended with the tap still paused
+ * and no response, -ENODEV for an inactive tap and -EINVAL if the tap is
+ * not paused.
+ */
+int
+blktap_ring_resume(struct blktap *tap)
+{
+       int response;
+       struct blktap_ring *ring = &tap->ring;
+
+       if (!blktap_active(tap))
+               return -ENODEV;
+
+       if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse))
+               return -EINVAL;
+
+       /* start from a clean response before posting the message */
+       ring->response = 0;
+
+       blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_RESUME);
+       blktap_ring_kick_user(tap);
+
+       wait_event_interruptible(tap->wq, ring->response ||
+                                !test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
+
+       response = ring->response;
+       ring->response = 0;
+
+       BTDBG("err: %d\n", response);
+
+       if (response)
+               return response;
+
+       return test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ? -EAGAIN : 0;
+}
+
+/*
+ * Carry out a requested pause: wait for the tap's pending requests to
+ * drain, post a PAUSE message on the ring and wait for the PAUSED bit.
+ *
+ * Returns 0 when the tap is paused, -EAGAIN if either wait ended early,
+ * -ENODEV for an inactive tap and -EINVAL if no pause was requested.
+ */
+int
+blktap_ring_pause(struct blktap *tap)
+{
+       if (!blktap_active(tap))
+               return -ENODEV;
+
+       if (!test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse))
+               return -EINVAL;
+
+       BTDBG("draining queue\n");
+       wait_event_interruptible(tap->wq, !tap->pending_cnt);
+       if (tap->pending_cnt)
+               return -EAGAIN;
+
+       blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_PAUSE);
+       blktap_ring_kick_user(tap);
+
+       BTDBG("waiting for tapdisk response\n");
+       wait_event_interruptible(tap->wq, test_bit(BLKTAP_PAUSED, &tap->dev_inuse));
+
+       return test_bit(BLKTAP_PAUSED, &tap->dev_inuse) ? 0 : -EAGAIN;
+}
+
+/*
+ * Ask user space to let go of the ring.  Returns 0 when the ring is
+ * neither open nor mapped; otherwise posts a CLOSE message and returns
+ * -EAGAIN.
+ */
+int
+blktap_ring_destroy(struct blktap *tap)
+{
+       if (test_bit(BLKTAP_RING_FD, &tap->dev_inuse) ||
+           test_bit(BLKTAP_RING_VMA, &tap->dev_inuse)) {
+               BTDBG("sending tapdisk close message\n");
+               blktap_ring_set_message(tap, BLKTAP2_RING_MESSAGE_CLOSE);
+               blktap_ring_kick_user(tap);
+
+               return -EAGAIN;
+       }
+
+       return 0;
+}
+
+/* Zero @ring, set up its poll wait queue and record its device number. */
+static void
+blktap_ring_initialize(struct blktap_ring *ring, int minor)
+{
+       memset(ring, 0, sizeof(*ring));
+
+       ring->devno = MKDEV(blktap_ring_major, minor);
+       init_waitqueue_head(&ring->poll_wait);
+}
+
+/*
+ * Initialise the ring of @tap and create its sysfs entries; returns the
+ * result of blktap_sysfs_create().
+ */
+int
+blktap_ring_create(struct blktap *tap)
+{
+       blktap_ring_initialize(&tap->ring, tap->minor);
+
+       return blktap_sysfs_create(tap);
+}
+
+/*
+ * Register the "blktap2" character device region with a dynamically
+ * assigned major, published in blktap_ring_major and through *major.
+ *
+ * Returns 0 on success or the negative value from __register_chrdev().
+ */
+int __init
+blktap_ring_init(int *major)
+{
+       int ret = __register_chrdev(0, 0, CONFIG_XEN_NR_TAP2_DEVICES,
+                                   "blktap2", &blktap_ring_file_operations);
+
+       if (ret < 0) {
+               BTERR("error registering blktap ring device: %d\n", ret);
+               return ret;
+       }
+
+       *major = ret;
+       blktap_ring_major = ret;
+       BTINFO("blktap ring major: %d\n", blktap_ring_major);
+
+       return 0;
+}
+
+/*
+ * Unregister the "blktap2" character device region if a major was ever
+ * registered.  Always returns 0.
+ */
+int
+blktap_ring_free(void)
+{
+       if (!blktap_ring_major)
+               return 0;
+
+       __unregister_chrdev(blktap_ring_major, 0,
+                           CONFIG_XEN_NR_TAP2_DEVICES, "blktap2");
+       return 0;
+}
diff --git a/drivers/xen/blktap2/sysfs.c b/drivers/xen/blktap2/sysfs.c

new file mode 100644 (file)

index 0000000..26a3d93
--- /dev/null
+++ b/drivers/xen/blktap2/sysfs.c
@@ -0,0 +1,475 @@
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include "blktap.h"
+
+int blktap_debug_level = 1;
+
+static struct class *class;
+static DECLARE_WAIT_QUEUE_HEAD(sysfs_wq);
+
+/* Take one reference on the tap's sysfs usage counter. */
+static inline void
+blktap_sysfs_get(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       atomic_inc(&ring->sysfs_refcnt);
+}
+
+/*
+ * Drop one reference on the tap's sysfs usage counter and wake anybody
+ * sleeping on sysfs_wq once the counter reaches zero.
+ */
+static inline void
+blktap_sysfs_put(struct blktap *tap)
+{
+       if (!atomic_dec_and_test(&tap->ring.sysfs_refcnt))
+               return;
+
+       wake_up(&sysfs_wq);
+}
+
+/*
+ * Begin a sysfs operation on @tap: take a sysfs reference first (pins
+ * the sysfs device), then take sysfs_mutex so that sysfs operations on
+ * this tap run one at a time.  Pair with blktap_sysfs_exit().
+ */
+static inline void
+blktap_sysfs_enter(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       blktap_sysfs_get(tap);
+       mutex_lock(&ring->sysfs_mutex);
+}
+
+/*
+ * End a sysfs operation started with blktap_sysfs_enter(): release
+ * sysfs_mutex, then drop the sysfs reference.
+ */
+static inline void
+blktap_sysfs_exit(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+
+       mutex_unlock(&ring->sysfs_mutex);
+       blktap_sysfs_put(tap);
+}
+
+/*
+ * The pause and resume handlers are declared ahead of their definitions
+ * because each handler swaps its own attribute for the other one:
+ * pausing removes dev_attr_pause and creates dev_attr_resume, resuming
+ * does the reverse.  Both attributes are write-only (S_IWUSR, no show
+ * method).
+ */
+static ssize_t blktap_sysfs_pause_device(struct device *,
+                                        struct device_attribute *,
+                                        const char *, size_t);
+static DEVICE_ATTR(pause, S_IWUSR, NULL, blktap_sysfs_pause_device);
+static ssize_t blktap_sysfs_resume_device(struct device *,
+                                         struct device_attribute *,
+                                         const char *, size_t);
+static DEVICE_ATTR(resume, S_IWUSR, NULL, blktap_sysfs_resume_device);
+
+/*
+ * sysfs "name" write handler: store a new name in tap->params.name.
+ *
+ * Runs between blktap_sysfs_enter()/blktap_sysfs_exit().  Fails with
+ *   -ENODEV        if the sysfs device is gone or shutdown was requested,
+ *   -EPERM         if the tap is not paused,
+ *   -ENAMETOOLONG  if @size exceeds BLKTAP2_MAX_MESSAGE_LEN,
+ *   -EINVAL        if @buf holds no NUL within its first @size bytes.
+ * On success at most @size - 1 characters are copied and @size is
+ * returned.
+ */
+static ssize_t
+blktap_sysfs_set_name(struct device *dev, struct device_attribute *attr,
+                     const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       int err;
+
+       blktap_sysfs_enter(tap);
+
+       if (!tap->ring.dev ||
+           test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
+               err = -ENODEV;
+       } else if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
+               err = -EPERM;
+       } else if (size > BLKTAP2_MAX_MESSAGE_LEN) {
+               err = -ENAMETOOLONG;
+       } else if (strnlen(buf, size) >= size) {
+               err = -EINVAL;
+       } else {
+               strlcpy(tap->params.name, buf, size);
+               err = size;
+       }
+
+       blktap_sysfs_exit(tap);
+       return err;
+}
+
+/*
+ * sysfs "name" read handler.
+ *
+ * Prints tap->params.name followed by a newline, or the tap's minor
+ * number when no name has been set (first byte is NUL).  Returns the
+ * number of characters printed, or -ENODEV when the sysfs device is
+ * gone.
+ */
+static ssize_t
+blktap_sysfs_get_name(struct device *dev, struct device_attribute *attr,
+                     char *buf)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       ssize_t len = -ENODEV;
+
+       blktap_sysfs_enter(tap);
+
+       if (tap->ring.dev) {
+               if (tap->params.name[0] != '\0')
+                       len = sprintf(buf, "%s\n", tap->params.name);
+               else
+                       len = sprintf(buf, "%d\n", tap->minor);
+       }
+
+       blktap_sysfs_exit(tap);
+
+       return len;
+}
+static DEVICE_ATTR(name, S_IRUSR | S_IWUSR,
+                  blktap_sysfs_get_name, blktap_sysfs_set_name);
+
+/*
+ * sysfs "remove" write handler: request destruction of the tap.
+ *
+ * A write is accepted as a no-op (returns @size) when the sysfs device
+ * is already gone.  Only the first writer gets to start the teardown:
+ * if BLKTAP_SHUTDOWN_REQUESTED was already set, -EBUSY is returned.
+ * Otherwise the result of blktap_control_destroy_device() is returned,
+ * or @size when that result is 0.
+ */
+static ssize_t
+blktap_sysfs_remove_device(struct device *dev, struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       int err;
+
+       if (!tap->ring.dev)
+               return size;
+
+       if (test_and_set_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse))
+               return -EBUSY;
+
+       err = blktap_control_destroy_device(tap);
+       if (err)
+               return err;
+
+       return size;
+}
+static DEVICE_ATTR(remove, S_IWUSR, NULL, blktap_sysfs_remove_device);
+
+/*
+ * sysfs "pause" write handler.
+ *
+ * Runs between blktap_sysfs_enter()/blktap_sysfs_exit().  Outcomes:
+ *   -ENODEV  sysfs device gone or shutdown requested,
+ *   -EBUSY   a pause request is already outstanding,
+ *   @size    tap already paused (nothing to do),
+ *   else     the result of blktap_device_pause(); when that succeeds the
+ *            "pause" attribute is replaced by "resume" and the result of
+ *            creating the latter is reported instead.
+ * A zero result is reported to the writer as @size.
+ */
+static ssize_t
+blktap_sysfs_pause_device(struct device *dev, struct device_attribute *attr,
+                         const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       int err;
+
+       blktap_sysfs_enter(tap);
+
+       BTDBG("pausing %u:%u: dev_inuse: %lu\n",
+             MAJOR(tap->ring.devno), MINOR(tap->ring.devno), tap->dev_inuse);
+
+       if (!tap->ring.dev ||
+           test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
+               err = -ENODEV;
+       } else if (test_bit(BLKTAP_PAUSE_REQUESTED, &tap->dev_inuse)) {
+               err = -EBUSY;
+       } else if (test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
+               err = 0;
+       } else {
+               err = blktap_device_pause(tap);
+               if (!err) {
+                       device_remove_file(dev, &dev_attr_pause);
+                       err = device_create_file(dev, &dev_attr_resume);
+               }
+       }
+
+       blktap_sysfs_exit(tap);
+
+       return (err ? err : size);
+}
+
+/*
+ * sysfs "resume" write handler.
+ *
+ * Runs between blktap_sysfs_enter()/blktap_sysfs_exit().  Outcomes:
+ *   -ENODEV  sysfs device gone or shutdown requested,
+ *   -EINVAL  tap is not paused,
+ *   else     the result of blktap_device_resume(); when that succeeds
+ *            the "resume" attribute is replaced by "pause" and the
+ *            result of creating the latter is reported instead.
+ * A zero result is reported to the writer as @size.
+ */
+static ssize_t
+blktap_sysfs_resume_device(struct device *dev, struct device_attribute *attr,
+                          const char *buf, size_t size)
+{
+       struct blktap *tap = dev_get_drvdata(dev);
+       int err;
+
+       blktap_sysfs_enter(tap);
+
+       if (!tap->ring.dev ||
+           test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
+               err = -ENODEV;
+       } else if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
+               err = -EINVAL;
+       } else {
+               err = blktap_device_resume(tap);
+               if (!err) {
+                       device_remove_file(dev, &dev_attr_resume);
+                       err = device_create_file(dev, &dev_attr_pause);
+               }
+       }
+
+       blktap_sysfs_exit(tap);
+
+       BTDBG("returning %zd\n", (err ? err : size));
+       return (err ? err : size);
+}
+
+#ifdef ENABLE_PASSTHROUGH
+/*
+ * sysfs write handler enabling passthrough to another block device.
+ *
+ * @buf must hold "<major>:<minor>", both in hexadecimal.  Runs between
+ * blktap_sysfs_enter()/blktap_sysfs_exit().  Fails with
+ *   -ENODEV  sysfs device gone or shutdown requested,
+ *   -EINVAL  tap not paused, passthrough already enabled, or @buf does
+ *            not parse as two hex numbers,
+ * otherwise returns the result of blktap_device_enable_passthrough(),
+ * with 0 reported to the writer as @size.
+ */
+static ssize_t
+blktap_sysfs_enable_passthrough(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t size)
+{
+       int err;
+       ssize_t ret;
+       unsigned major, minor;
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       BTINFO("passthrough request enabled\n");
+
+       blktap_sysfs_enter(tap);
+
+       if (!tap->ring.dev ||
+           test_bit(BLKTAP_SHUTDOWN_REQUESTED, &tap->dev_inuse)) {
+               err = -ENODEV;
+               goto out;
+       }
+
+       if (!test_bit(BLKTAP_PAUSED, &tap->dev_inuse)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (test_bit(BLKTAP_PASSTHROUGH, &tap->dev_inuse)) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       /* sscanf() returns the number of fields converted; both are needed */
+       err = sscanf(buf, "%x:%x", &major, &minor);
+       if (err != 2) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = blktap_device_enable_passthrough(tap, major, minor);
+
+out:
+       blktap_sysfs_exit(tap);
+
+       /*
+        * "err ? err : size" has type size_t, which must not be handed to
+        * a %d conversion; compute the signed result once and print it
+        * with the matching %zd.
+        */
+       ret = err ? err : size;
+       BTDBG("returning %zd\n", ret);
+       return ret;
+}
+#endif
+
+/*
+ * sysfs "debug" read handler: dump the tap's state and every pending
+ * request into @buf.
+ *
+ * sysfs hands show() methods a single PAGE_SIZE buffer and the request
+ * loop emits up to MAX_PENDING_REQS lines, so every write goes through
+ * scnprintf() bounded by what is left of the page; output that does not
+ * fit is truncated instead of running past the buffer.
+ *
+ * Only a sysfs reference is taken here (sysfs_mutex is not); the pending
+ * request table is walked with tap->tap_sem held for reading.
+ *
+ * Returns the number of bytes produced.  On the normal path the count
+ * includes the terminating NUL, as it always has; the write limit is set
+ * one byte short of the page so the count stays below PAGE_SIZE.
+ */
+static ssize_t
+blktap_sysfs_debug_device(struct device *dev, struct device_attribute *attr,
+                         char *buf)
+{
+       char *tmp, *end;
+       int i, ret;
+       struct blktap *tap = dev_get_drvdata(dev);
+
+       tmp = buf;
+       end = buf + PAGE_SIZE - 1;
+       blktap_sysfs_get(tap);
+
+       if (!tap->ring.dev) {
+               ret = sprintf(tmp, "no device\n");
+               goto out;
+       }
+
+       tmp += scnprintf(tmp, end - tmp,
+                        "%s (%u:%u), refcnt: %d, dev_inuse: 0x%08lx\n",
+                        tap->params.name, MAJOR(tap->ring.devno),
+                        MINOR(tap->ring.devno), atomic_read(&tap->refcnt),
+                        tap->dev_inuse);
+       tmp += scnprintf(tmp, end - tmp,
+                        "capacity: 0x%llx, sector size: 0x%lx, "
+                        "device users: %d\n", tap->params.capacity,
+                        tap->params.sector_size, tap->device.users);
+
+       down_read(&tap->tap_sem);
+
+       tmp += scnprintf(tmp, end - tmp, "pending requests: %d\n",
+                        tap->pending_cnt);
+       for (i = 0; i < MAX_PENDING_REQS; i++) {
+               struct blktap_request *req = tap->pending_requests[i];
+               if (!req)
+                       continue;
+
+               tmp += scnprintf(tmp, end - tmp,
+                                "req %d: id: %llu, usr_idx: %d, "
+                                "status: 0x%02x, pendcnt: %d, "
+                                "nr_pages: %u, op: %d, time: %lu:%lu\n",
+                                i, (unsigned long long)req->id, req->usr_idx,
+                                req->status, atomic_read(&req->pendcnt),
+                                req->nr_pages, req->operation,
+                                req->time.tv_sec, req->time.tv_usec);
+       }
+
+       up_read(&tap->tap_sem);
+       ret = (tmp - buf) + 1;
+
+out:
+       blktap_sysfs_put(tap);
+       BTDBG("%s\n", buf);
+
+       return ret;
+}
+static DEVICE_ATTR(debug, S_IRUSR, blktap_sysfs_debug_device, NULL);
+
+/*
+ * Create the sysfs device for @tap ("blktap<minor>" in the blktap2
+ * class) together with its "name", "remove", "pause" and "debug"
+ * attribute files.
+ *
+ * Returns -ENODEV when the class has not been set up, the error from
+ * device_create() when that fails, otherwise 0 or the error from the
+ * first device_create_file() call that failed.
+ *
+ * NOTE(review): when an attribute file cannot be created, the files
+ * created so far are removed again, but the device itself, ring->dev and
+ * the BLKTAP_SYSFS bit are left in place -- presumably the caller tears
+ * those down through blktap_sysfs_destroy(); confirm against the caller.
+ */
+int
+blktap_sysfs_create(struct blktap *tap)
+{
+       struct blktap_ring *ring;
+       struct device *dev;
+       int err, state = 0;
+
+       if (!class)
+               return -ENODEV;
+
+       ring = &tap->ring;
+
+       dev = device_create(class, NULL, ring->devno, tap,
+                           "blktap%d", tap->minor);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
+
+       ring->dev = dev;
+
+       mutex_init(&ring->sysfs_mutex);
+       atomic_set(&ring->sysfs_refcnt, 0);
+       set_bit(BLKTAP_SYSFS, &tap->dev_inuse);
+
+       /* state counts how many attribute files were created successfully */
+       err = device_create_file(dev, &dev_attr_name);
+       if (!err) {
+               ++state;
+               err = device_create_file(dev, &dev_attr_remove);
+       }
+       if (!err) {
+               ++state;
+               err = device_create_file(dev, &dev_attr_pause);
+       }
+       if (!err) {
+               ++state;
+               err = device_create_file(dev, &dev_attr_debug);
+       }
+
+       /*
+        * On success the selector is 0 and no case matches.  On failure it
+        * equals the number of files created, and the cases deliberately
+        * fall through so those files are removed in reverse order.
+        */
+       switch (state * !!err) {
+       case 3: device_remove_file(dev, &dev_attr_pause);
+       case 2: device_remove_file(dev, &dev_attr_remove);
+       case 1: device_remove_file(dev, &dev_attr_name);
+       }
+
+       return err;
+}
+
+/*
+ * Deferred half of the sysfs teardown, scheduled from
+ * blktap_sysfs_destroy() via device_schedule_callback().
+ *
+ * Removes every attribute file this driver may have created on @dev
+ * (both "pause" and "resume", whichever currently exists), unregisters
+ * the device, clears BLKTAP_SYSFS and lets the control code finish the
+ * destruction of the tap.
+ */
+static void
+_blktap_sysfs_destroy(struct device *dev)
+{
+       static struct device_attribute *const attrs[] = {
+               &dev_attr_name,
+               &dev_attr_remove,
+               &dev_attr_pause,
+               &dev_attr_resume,
+               &dev_attr_debug,
+       };
+       /* fetch the tap before the device is unregistered */
+       struct blktap *tap = dev_get_drvdata(dev);
+       unsigned int i;
+
+       for (i = 0; i < ARRAY_SIZE(attrs); i++)
+               device_remove_file(dev, attrs[i]);
+
+       device_unregister(dev);
+
+       clear_bit(BLKTAP_SYSFS, &tap->dev_inuse);
+
+       blktap_control_finish_destroy(tap);
+}
+
+/*
+ * Start tearing down the sysfs device of @tap.
+ *
+ * Returns 0 when there is no class or no device.  Otherwise ring->dev is
+ * cleared first (the attribute handlers test it and back out), then the
+ * caller sleeps until every sysfs reference has been dropped and finally
+ * schedules _blktap_sysfs_destroy() for the device.  Returns -EAGAIN
+ * when the sleep is interrupted, else the result of
+ * device_schedule_callback().
+ *
+ * NOTE(review): ring->dev stays NULL after an interrupted wait, so a
+ * later call returns 0 from the first check without scheduling the
+ * teardown -- confirm how the caller retries.
+ */
+int
+blktap_sysfs_destroy(struct blktap *tap)
+{
+       struct blktap_ring *ring = &tap->ring;
+       struct device *dev = ring->dev;
+
+       if (!class || !dev)
+               return 0;
+
+       ring->dev = NULL;
+
+       if (wait_event_interruptible(sysfs_wq,
+                                    atomic_read(&ring->sysfs_refcnt) == 0))
+               return -EAGAIN;
+
+       return device_schedule_callback(dev, _blktap_sysfs_destroy);
+}
+
+/* Class attribute "verbosity", read side: print blktap_debug_level. */
+static ssize_t
+blktap_sysfs_show_verbosity(struct class *class, struct class_attribute *attr,
+                           char *buf)
+{
+       int level = blktap_debug_level;
+
+       return sprintf(buf, "%d\n", level);
+}
+
+/*
+ * Class attribute "verbosity", write side: parse a decimal integer from
+ * @buf and store it in blktap_debug_level.  Returns @size on success and
+ * -EINVAL when no integer could be parsed.
+ */
+static ssize_t
+blktap_sysfs_set_verbosity(struct class *class, struct class_attribute *attr,
+                          const char *buf, size_t size)
+{
+       int level;
+
+       if (sscanf(buf, "%d", &level) != 1)
+               return -EINVAL;
+
+       blktap_debug_level = level;
+       return size;
+}
+static CLASS_ATTR(verbosity, S_IRUSR | S_IWUSR,
+                 blktap_sysfs_show_verbosity, blktap_sysfs_set_verbosity);
+
+/*
+ * Class attribute "devices": list every tap that has BLKTAP_DEVICE set,
+ * one "<minor> <name>" line per tap.
+ *
+ * The name can be set from userspace through the per-device "name"
+ * attribute, so it is printed through "%s" and never used as a format
+ * string itself.  scnprintf() bounds each write to what is left of the
+ * PAGE_SIZE sysfs buffer and returns the number of characters actually
+ * stored, so @ret always matches the contents of @buf; entries that no
+ * longer fit are truncated.
+ *
+ * Returns the number of bytes placed in @buf.
+ */
+static ssize_t
+blktap_sysfs_show_devices(struct class *class, struct class_attribute *attr,
+                         char *buf)
+{
+       int i, ret;
+       struct blktap *tap;
+
+       ret = 0;
+       for (i = 0; i < CONFIG_XEN_NR_TAP2_DEVICES; i++) {
+               tap = blktaps[i];
+               if (!tap)
+                       continue;
+
+               if (!test_bit(BLKTAP_DEVICE, &tap->dev_inuse))
+                       continue;
+
+               ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%d %s\n",
+                                tap->minor, tap->params.name);
+       }
+
+       return ret;
+}
+static CLASS_ATTR(devices, S_IRUSR, blktap_sysfs_show_devices, NULL);
+
+/*
+ * Undo blktap_sysfs_init(): remove the two class attribute files and
+ * destroy the class.  Does nothing when the class was never created.
+ */
+void
+blktap_sysfs_free(void)
+{
+       if (class) {
+               class_remove_file(class, &class_attr_verbosity);
+               class_remove_file(class, &class_attr_devices);
+
+               class_destroy(class);
+       }
+}
+
+/*
+ * Class devnode callback: build the device node name
+ * BLKTAP2_DEV_DIR "blktap<minor>" in a freshly allocated string.
+ * @mode is left untouched.
+ */
+static char *blktap_devnode(struct device *dev, umode_t *mode)
+{
+       unsigned int minor = MINOR(dev->devt);
+
+       return kasprintf(GFP_KERNEL, BLKTAP2_DEV_DIR "blktap%u", minor);
+}
+
+/*
+ * Create the "blktap2" device class with its "verbosity" and "devices"
+ * attribute files.
+ *
+ * The file-scope @class pointer is only published once everything has
+ * been set up; on any failure whatever was created is torn down again
+ * and the error is returned.  Returns -EEXIST when the class already
+ * exists.
+ */
+int __init
+blktap_sysfs_init(void)
+{
+       struct class *cls;
+       int err;
+
+       if (class)
+               return -EEXIST;
+
+       cls = class_create(THIS_MODULE, "blktap2");
+       if (IS_ERR(cls))
+               return PTR_ERR(cls);
+
+       cls->devnode = blktap_devnode;
+
+       err = class_create_file(cls, &class_attr_verbosity);
+       if (err)
+               goto fail_class;
+
+       err = class_create_file(cls, &class_attr_devices);
+       if (err)
+               goto fail_verbosity;
+
+       class = cls;
+       return 0;
+
+fail_verbosity:
+       class_remove_file(cls, &class_attr_verbosity);
+fail_class:
+       class_destroy(cls);
+       return err;
+}
diff --git a/drivers/xen/blktap2/wait_queue.c b/drivers/xen/blktap2/wait_queue.c

new file mode 100644 (file)

index 0000000..f8995aa
--- /dev/null
+++ b/drivers/xen/blktap2/wait_queue.c
@@ -0,0 +1,40 @@
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include "blktap.h"
+
+static LIST_HEAD(deferred_work_queue);
+static DEFINE_SPINLOCK(deferred_work_lock);
+
+/*
+ * Restart every tap that was queued with blktap_defer().
+ *
+ * The global queue is moved onto a private list and the BLKTAP_DEFERRED
+ * bit of each queued tap is cleared while deferred_work_lock is held;
+ * the taps are then unlinked and restarted one by one with the lock
+ * dropped.
+ */
+void
+blktap_run_deferred(void)
+{
+       LIST_HEAD(queue);
+       struct blktap *tap;
+       unsigned long flags;
+
+       spin_lock_irqsave(&deferred_work_lock, flags);
+       /* take over the whole pending list, leaving the global one empty */
+       list_splice_init(&deferred_work_queue, &queue);
+       list_for_each_entry(tap, &queue, deferred_queue)
+               clear_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
+       spin_unlock_irqrestore(&deferred_work_lock, flags);
+
+       /* always pop the head, unlinking each tap before restarting it */
+       while (!list_empty(&queue)) {
+               tap = list_entry(queue.next, struct blktap, deferred_queue);
+               list_del_init(&tap->deferred_queue);
+               blktap_device_restart(tap);
+       }
+}
+
+/*
+ * Queue @tap for a later blktap_run_deferred() pass.
+ *
+ * A tap that already has BLKTAP_DEFERRED set is left alone, so it is
+ * never linked onto the deferred queue twice.  The bit and the list are
+ * both updated under deferred_work_lock.
+ */
+void
+blktap_defer(struct blktap *tap)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&deferred_work_lock, flags);
+
+       if (test_bit(BLKTAP_DEFERRED, &tap->dev_inuse))
+               goto unlock;
+
+       set_bit(BLKTAP_DEFERRED, &tap->dev_inuse);
+       list_add_tail(&tap->deferred_queue, &deferred_work_queue);
+
+unlock:
+       spin_unlock_irqrestore(&deferred_work_lock, flags);
+}
diff --git a/drivers/xen/char/Makefile b/drivers/xen/char/Makefile

new file mode 100644 (file)

index 0000000..13604ad
--- /dev/null
+++ b/drivers/xen/char/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_XEN_DEVMEM)       := mem.o
diff --git a/drivers/xen/char/mem.c b/drivers/xen/char/mem.c

new file mode 100644 (file)

index 0000000..f55cff3
--- /dev/null
+++ b/drivers/xen/char/mem.c
@@ -0,0 +1,222 @@
+/*
+ *  Originally from linux/drivers/char/mem.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Added devfs support.
+ *    Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
+ *  Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/ptrace.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/hypervisor.h>
+
+/*
+ * Return how many of @size bytes starting at address @start lie within
+ * the page containing @start: the smaller of @size and the distance
+ * from @start to the end of its page.
+ */
+static inline unsigned long size_inside_page(unsigned long start,
+                                            unsigned long size)
+{
+       unsigned long to_page_end = PAGE_SIZE - (start & (PAGE_SIZE - 1));
+
+       return min(to_page_end, size);
+}
+
+/*
+ * Return 1 when the file was opened with O_DSYNC, which requests an
+ * uncached mapping, and 0 otherwise.
+ */
+static inline int uncached_access(struct file *file)
+{
+       /* Xen sets correct MTRR type on non-RAM for us. */
+       return (file->f_flags & O_DSYNC) ? 1 : 0;
+}
+
+/*
+ * Return 1 when every page of the @size byte range starting at page
+ * frame @pfn may be accessed, 0 otherwise.
+ *
+ * Without CONFIG_STRICT_DEVMEM everything is allowed.  With it, each
+ * page frame in the range is checked with devmem_is_allowed() and the
+ * first refusal is logged together with the offending program's name.
+ */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+#ifdef CONFIG_STRICT_DEVMEM
+       u64 from = ((u64)pfn) << PAGE_SHIFT;
+       u64 to = from + size;
+       u64 cursor;
+
+       for (cursor = from; cursor < to; cursor += PAGE_SIZE, pfn++) {
+               if (devmem_is_allowed(pfn))
+                       continue;
+
+               printk(KERN_INFO
+               "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+                       current->comm, from, to);
+               return 0;
+       }
+#endif
+       return 1;
+}
+
+/*
+ * This function reads the *physical* memory. The f_pos points directly to
+ * the memory location.
+ *
+ * The request is served in chunks that never cross a page boundary; each
+ * chunk is mapped with ioremap(), copied to user space and unmapped
+ * again.  Returns the number of bytes read (and advances *ppos by that
+ * much), -EPERM when the range is not allowed, or -EFAULT when user
+ * memory cannot be written; *ppos is left unchanged on error.
+ */
+static ssize_t read_mem(struct file *file, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       unsigned long p = *ppos;
+       ssize_t read = 0, sz;
+       void __iomem *v;
+
+       while (count > 0) {
+               unsigned long remaining;
+
+               /* never go past the end of the current page */
+               sz = size_inside_page(p, count);
+
+               /* note: checks everything still to be read, not just sz */
+               if (!range_is_allowed(p >> PAGE_SHIFT, count))
+                       return -EPERM;
+
+               v = ioremap(p, sz);
+               if (IS_ERR(v) || v == NULL) {
+                       /*
+                        * Some programs (e.g., dmidecode) groove off into
+                        * weird RAM areas where no tables can possibly exist
+                        * (because Xen will have stomped on them!). These
+                        * programs get rather upset if we let them know that
+                        * Xen failed their access, so we fake out a read of
+                        * all zeroes.
+                        */
+                       if (clear_user(buf, count))
+                               return -EFAULT;
+                       /* the whole remainder is reported as read */
+                       read += count;
+                       break;
+               }
+
+               /* copy_to_user() returns the number of bytes NOT copied */
+               remaining = copy_to_user(buf, v, sz);
+               iounmap(v);
+               if (remaining)
+                       return -EFAULT;
+
+               buf += sz;
+               p += sz;
+               count -= sz;
+               read += sz;
+       }
+
+       *ppos += read;
+       return read;
+}
+
+/*
+ * Write user data to *physical* memory at the location given by *ppos.
+ *
+ * The request is served in chunks that never cross a page boundary; each
+ * chunk is mapped with ioremap(), filled from user space and unmapped
+ * again.  Returns the number of bytes written (and advances *ppos by
+ * that much), which may be short; -EPERM when a chunk is not allowed;
+ * the ioremap() error or -EFAULT when nothing at all could be written.
+ */
+static ssize_t write_mem(struct file *file, const char __user *buf,
+                        size_t count, loff_t *ppos)
+{
+       unsigned long p = *ppos, ignored;
+       ssize_t written = 0, sz;
+       void __iomem *v;
+
+       while (count > 0) {
+               /* never go past the end of the current page */
+               sz = size_inside_page(p, count);
+
+               if (!range_is_allowed(p >> PAGE_SHIFT, sz))
+                       return -EPERM;
+
+               v = ioremap(p, sz);
+               /* no mapping: stop and report what was written so far */
+               if (v == NULL)
+                       break;
+               if (IS_ERR(v)) {
+                       /* only report the error if nothing was written yet */
+                       if (written == 0)
+                               return PTR_ERR(v);
+                       break;
+               }
+
+               /* copy_from_user() returns the number of bytes NOT copied */
+               ignored = copy_from_user(v, buf, sz);
+               iounmap(v);
+               if (ignored) {
+                       /* count the part of this chunk that did get copied */
+                       written += sz - ignored;
+                       if (written)
+                               break;
+                       return -EFAULT;
+               }
+               buf += sz;
+               p += sz;
+               count -= sz;
+               written += sz;
+       }
+
+       *ppos += written;
+       return written;
+}
+
+#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM
+/*
+ * VMA operations for mappings created by xen_mmap_mem().  The only
+ * callback, .access, is provided when the architecture has
+ * CONFIG_HAVE_IOREMAP_PROT.
+ */
+static struct vm_operations_struct mmap_mem_ops = {
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+       .access = generic_access_phys
+#endif
+};
+
+/*
+ * mmap handler: map the physical range selected by vma->vm_pgoff and the
+ * VMA's length into the caller's address space.
+ *
+ * Returns -EPERM when the range is not allowed, -EINVAL when
+ * phys_mem_access_prot_allowed() rejects it, otherwise the result of
+ * direct_remap_pfn_range().
+ *
+ * NOTE(review): this function only exists when
+ * ARCH_HAS_DEV_MEM_MMAP_MEM is undefined; presumably the architecture
+ * supplies its own xen_mmap_mem otherwise -- confirm.
+ */
+static int xen_mmap_mem(struct file *file, struct vm_area_struct *vma)
+{
+       size_t size = vma->vm_end - vma->vm_start;
+
+       /* the page protection is adjusted before any of the checks run */
+       if (uncached_access(file))
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+       if (!range_is_allowed(vma->vm_pgoff, size))
+               return -EPERM;
+
+       if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
+                                               &vma->vm_page_prot))
+               return -EINVAL;
+
+       vma->vm_ops = &mmap_mem_ops;
+
+       /* We want to return the real error code, not EAGAIN. */
+       return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+                                     size, vma->vm_page_prot, DOMID_IO);
+}
+#endif
+
+/*
+ * The memory devices use the full 32/64 bits of the offset, and so we cannot
+ * check against negative addresses: they are ok. The return value is weird,
+ * though, in that case (0).
+ *
+ * also note that seeking relative to the "end of file" isn't supported:
+ * it has no meaning, so it returns -EINVAL.
+ *
+ * The file position is updated with the inode's i_mutex held.  Offsets
+ * in the topmost 4096 values of the unsigned range are refused with
+ * -EOVERFLOW.
+ */
+static loff_t memory_lseek(struct file *file, loff_t offset, int orig)
+{
+       loff_t ret;
+
+       mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
+       switch (orig) {
+       case SEEK_CUR:
+               offset += file->f_pos;
+               /* fall through: the now absolute offset is handled below */
+       case SEEK_SET:
+               /* to avoid userland mistaking f_pos=-9 as -EBADF=-9 */
+               if ((unsigned long long)offset >= ~0xFFFULL) {
+                       ret = -EOVERFLOW;
+                       break;
+               }
+               file->f_pos = offset;
+               ret = file->f_pos;
+               force_successful_syscall_return();
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
+       return ret;
+}
+
+/* Open handler: only callers holding CAP_SYS_RAWIO may open the device. */
+static int open_mem(struct inode * inode, struct file * filp)
+{
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       return 0;
+}
+
+/*
+ * File operations of the Xen variant of the physical memory device.
+ * Not static: the table is meant to be referenced from outside this
+ * file.
+ */
+const struct file_operations mem_fops = {
+       .llseek         = memory_lseek,
+       .read           = read_mem,
+       .write          = write_mem,
+       .mmap           = xen_mmap_mem,
+       .open           = open_mem,
+};
diff --git a/drivers/xen/console/Makefile b/drivers/xen/console/Makefile

new file mode 100644 (file)

index 0000000..35de3e9
--- /dev/null
+++ b/drivers/xen/console/Makefile
@@ -0,0 +1,2 @@
+
+obj-y  := console.o xencons_ring.o
diff --git a/drivers/xen/console/console.c b/drivers/xen/console/console.c

new file mode 100644 (file)

index 0000000..aef55bb
--- /dev/null
+++ b/drivers/xen/console/console.c
@@ -0,0 +1,748 @@
+/******************************************************************************
+ * console.c
+ * 
+ * Virtual console driver.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser.
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+#include <linux/serial.h>
+#include <linux/major.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/console.h>
+#include <linux/sysrq.h>
+#include <linux/vt.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+#include "xencons.h"
+
+/*
+ * Modes:
+ *  'xencons=off'  [XC_OFF]:     Console is disabled.
+ *  'xencons=tty'  [XC_TTY]:     Console attached to '/dev/tty[0-9]+'.
+ *  'xencons=ttyS' [XC_SERIAL]:  Console attached to '/dev/ttyS[0-9]+'.
+ *  'xencons=xvc'  [XC_XVC]:     Console attached to '/dev/xvc0'.
+ *  'xencons=hvc'  [XC_HVC]:     Console attached to '/dev/hvc0'.
+ *  default:                     XC_XVC
+ * 
+ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses
+ * warnings from standard distro startup scripts.
+ */
+static enum {
+       XC_OFF, XC_TTY, XC_SERIAL, XC_XVC, XC_HVC
+} xc_mode = XC_XVC;
+static int xc_num = -1;
+
+/* /dev/xvc0 device number allocated by lanana.org. */
+#define XEN_XVC_MAJOR 204
+#define XEN_XVC_MINOR 191
+
+/* /dev/hvc0 device number */
+#define XEN_HVC_MAJOR 229
+#define XEN_HVC_MINOR 0
+
+/*
+ * Parse the "xencons=" boot parameter: a mode name ("ttyS", "tty",
+ * "xvc", "hvc" or "off") optionally followed by a decimal device
+ * number.
+ *
+ * The mode goes to xc_mode and the number, when present, to xc_num.
+ * console_use_vt is set to 1 for every value except "tty", for which it
+ * is 0.  An unrecognized name leaves xc_mode untouched.  Always
+ * returns 1.
+ */
+static int __init xencons_setup(char *str)
+{
+       char *q;
+       int n;
+
+       console_use_vt = 1;
+       /* "ttyS" has to be tested before its prefix "tty" */
+       if (!strncmp(str, "ttyS", 4)) {
+               xc_mode = XC_SERIAL;
+               str += 4;
+       } else if (!strncmp(str, "tty", 3)) {
+               xc_mode = XC_TTY;
+               str += 3;
+               console_use_vt = 0;
+       } else if (!strncmp(str, "xvc", 3)) {
+               xc_mode = XC_XVC;
+               str += 3;
+       } else if (!strncmp(str, "hvc", 3)) {
+               xc_mode = XC_HVC;
+               str += 3;
+       } else if (!strncmp(str, "off", 3)) {
+               xc_mode = XC_OFF;
+               str += 3;
+       }
+
+       /* q only moves past str if at least one digit was consumed */
+       n = simple_strtol(str, &q, 10);
+       if (q != str)
+               xc_num = n;
+
+       return 1;
+}
+__setup("xencons=", xencons_setup);
+
+/* The kernel and user-land drivers share a common transmit buffer. */
+static unsigned int wbuf_size = 4096;
+#define WBUF_MASK(_i) ((_i)&(wbuf_size-1))
+static char *wbuf;
+static unsigned int wc, wp; /* write_cons, write_prod */
+
+/*
+ * Parse the "xencons_bufsz=" boot parameter.
+ *
+ * The requested size is rounded up to a power of two (WBUF_MASK()
+ * relies on that) and only ever enlarges wbuf_size; zero or unparsable
+ * input is ignored.  Always returns 1.
+ */
+static int __init xencons_bufsz_setup(char *str)
+{
+       unsigned int goal = simple_strtoul(str, NULL, 0);
+
+       if (!goal)
+               return 1;
+
+       goal = roundup_pow_of_two(goal);
+       if (goal > wbuf_size)
+               wbuf_size = goal;
+
+       return 1;
+}
+__setup("xencons_bufsz=", xencons_bufsz_setup);
+
+/* This lock protects accesses to the common transmit buffer. */
+static DEFINE_SPINLOCK(xencons_lock);
+
+/* Common transmit-kick routine. */
+static void __xencons_tx_flush(void);
+
+static struct tty_driver *xencons_driver;
+
+/******************** Kernel console driver ********************************/
+
+/*
+ * Kernel console write method used when the output goes through the
+ * shared transmit buffer (installed by xen_console_init() for
+ * non-initial domains).
+ *
+ * Copies @count bytes from @s into the circular buffer wbuf, storing a
+ * '\r' after every '\n', and kicks the transmitter.  Whenever the
+ * buffer fills up the transmitter is kicked and copying resumes, all
+ * with xencons_lock held and interrupts disabled.
+ */
+static void kcons_write(struct console *c, const char *s, unsigned int count)
+{
+       int           i = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+
+       while (i < count) {
+               for (; i < count; i++) {
+                       /*
+                        * Stop one byte short of full -- presumably so the
+                        * extra '\r' stored below always fits.
+                        */
+                       if ((wp - wc) >= (wbuf_size - 1))
+                               break;
+                       if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n')
+                               wbuf[WBUF_MASK(wp++)] = '\r';
+               }
+
+               __xencons_tx_flush();
+       }
+
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * Kernel console write method for the initial domain: hand the text
+ * straight to the hypervisor with CONSOLEIO_write.
+ *
+ * The hypercall may accept only part of the data, so it is repeated for
+ * the remainder until everything has been written or it returns zero or
+ * an error, in which case the rest is dropped.
+ */
+static void kcons_write_dom0(struct console *c, const char *s, unsigned int count)
+{
+       int rc;
+
+       for (; count > 0; count -= rc, s += rc) {
+               rc = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s);
+               if (rc <= 0)
+                       break;
+       }
+}
+
+/*
+ * Console ->device method: report xencons_driver, line 0, as the tty
+ * that belongs to this console.
+ */
+static struct tty_driver *kcons_device(struct console *c, int *index)
+{
+       *index = 0;
+       return xencons_driver;
+}
+
+/*
+ * The kernel console itself.  Its .name and .write members are filled
+ * in by xen_console_init() according to the selected mode and domain
+ * type before the console is registered.
+ */
+static struct console kcons_info = {
+       .device = kcons_device,
+       .flags  = CON_PRINTBUFFER | CON_ENABLED,
+       .index  = -1,
+};
+
+/*
+ * Console initcall: pick the write method and console name, allocate
+ * the transmit buffer and register the kernel console.
+ *
+ * Nothing is registered when not running on Xen, when a non-initial
+ * domain has no console event channel, or when the mode is XC_OFF.
+ * Always returns 0.
+ */
+static int __init xen_console_init(void)
+{
+       if (!is_running_on_xen())
+               goto out;
+
+       /* the initial domain writes via hypercall, others via the ring */
+       if (is_initial_xendomain()) {
+               kcons_info.write = kcons_write_dom0;
+       } else {
+               if (!xen_start_info->console.domU.evtchn)
+                       goto out;
+               kcons_info.write = kcons_write;
+       }
+
+       /* xc_num of -1 means "no number given"; apply the mode's default */
+       switch (xc_mode) {
+       case XC_XVC:
+               strcpy(kcons_info.name, "xvc");
+               if (xc_num == -1)
+                       xc_num = 0;
+               break;
+
+       case XC_HVC:
+               strcpy(kcons_info.name, "hvc");
+               if (xc_num == -1)
+                       xc_num = 0;
+               if (!is_initial_xendomain())
+                       add_preferred_console(kcons_info.name, xc_num, NULL);
+               break;
+
+       case XC_SERIAL:
+               strcpy(kcons_info.name, "ttyS");
+               if (xc_num == -1)
+                       xc_num = 0;
+               break;
+
+       case XC_TTY:
+               strcpy(kcons_info.name, "tty");
+               if (xc_num == -1)
+                       xc_num = 1;
+               break;
+
+       default:
+               goto out;
+       }
+
+       /*
+        * NOTE(review): the allocation result is not checked, and
+        * kcons_write() stores into wbuf unconditionally -- a failed
+        * allocation would be dereferenced on the first console write of
+        * a non-initial domain.
+        */
+       wbuf = kmalloc(wbuf_size, GFP_KERNEL);
+
+       register_console(&kcons_info);
+
+ out:
+       return 0;
+}
+console_initcall(xen_console_init);
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+/*** Useful function for console debugging -- goes straight to Xen. ***/
+/*
+ * Format the message into a static 1024 byte buffer and pass it to the
+ * hypervisor console via kcons_write_dom0().  Output that does not fit
+ * is truncated.  Always returns 0.
+ *
+ * NOTE(review): the static buffer is not protected by any lock here, so
+ * concurrent callers can garble each other's output.
+ */
+int xprintk(const char *fmt, ...)
+{
+       va_list args;
+       int printk_len;
+       static char printk_buf[1024];
+
+       /*
+        * Emit the output into the temporary buffer.  vscnprintf() returns
+        * the number of characters actually stored; vsnprintf() would
+        * return the untruncated length, and an oversize message would
+        * then make the write below run past the end of printk_buf.
+        */
+       va_start(args, fmt);
+       printk_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+       va_end(args);
+
+       /* Send the processed output directly to Xen. */
+       kcons_write_dom0(NULL, printk_buf, printk_len);
+
+       return 0;
+}
+#endif
+
+/*** Forcibly flush console data before dying. ***/
+/*
+ * Busy-wait until everything queued in the transmit buffer has been
+ * handed to xencons_ring_send().  Does nothing when not running on Xen,
+ * in the initial domain, or without a console event channel.
+ */
+void xencons_force_flush(void)
+{
+       int sz;
+
+       /* Emergency console is synchronous, so there's nothing to flush. */
+       if (!is_running_on_xen() ||
+           is_initial_xendomain() ||
+           !xen_start_info->console.domU.evtchn)
+               return;
+
+       /* Spin until console data is flushed through to the daemon. */
+       while (wc != wp) {
+               unsigned int pending = wp - wc;
+               unsigned int contig = wbuf_size - WBUF_MASK(wc);
+               int sent;
+
+               /*
+                * wbuf is circular: only the bytes up to the end of the
+                * array are contiguous.  Send no more than that in one go
+                * so the sender never reads past the end of wbuf; data
+                * that wrapped around is picked up by the next iteration.
+                */
+               sz = (pending > contig) ? contig : pending;
+
+               sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+               if (sent > 0)
+                       wc += sent;
+       }
+}
+
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#include <linux/screen_info.h>
+
+/*
+ * Fill the global screen_info from the VGA console description @info,
+ * which is @size bytes long.
+ *
+ * screen_info is first set to 80x25 text mode defaults.  The fields for
+ * the reported video type are then copied over, but only after checking
+ * that @size covers them; when it does not, the defaults (or whatever
+ * has been copied so far) are kept.  Unknown video types leave the
+ * defaults in place.
+ */
+void __init dom0_init_screen_info(const struct dom0_vga_console_info *info, size_t size)
+{
+       /* This is drawn from a dump from vgacon:startup in
+        * standard Linux. */
+       screen_info.orig_video_mode = 3;
+       screen_info.orig_video_isVGA = 1;
+       screen_info.orig_video_lines = 25;
+       screen_info.orig_video_cols = 80;
+       screen_info.orig_video_ega_bx = 3;
+       screen_info.orig_video_points = 16;
+       screen_info.orig_y = screen_info.orig_video_lines - 1;
+
+       switch (info->video_type) {
+       case XEN_VGATYPE_TEXT_MODE_3:
+               /* need the complete text_mode_3 member */
+               if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+                          + sizeof(info->u.text_mode_3))
+                       break;
+               screen_info.orig_video_lines = info->u.text_mode_3.rows;
+               screen_info.orig_video_cols = info->u.text_mode_3.columns;
+               screen_info.orig_x = info->u.text_mode_3.cursor_x;
+               screen_info.orig_y = info->u.text_mode_3.cursor_y;
+               screen_info.orig_video_points =
+                       info->u.text_mode_3.font_height;
+               break;
+
+       case XEN_VGATYPE_VESA_LFB:
+       case XEN_VGATYPE_EFI_LFB:
+               /* need every vesa_lfb field that precedes gbl_caps */
+               if (size < offsetof(struct dom0_vga_console_info,
+                                   u.vesa_lfb.gbl_caps))
+                       break;
+               screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
+               screen_info.lfb_width = info->u.vesa_lfb.width;
+               screen_info.lfb_height = info->u.vesa_lfb.height;
+               screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+               screen_info.lfb_base = info->u.vesa_lfb.lfb_base;
+               screen_info.lfb_size = info->u.vesa_lfb.lfb_size;
+               screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+               screen_info.red_size = info->u.vesa_lfb.red_size;
+               screen_info.red_pos = info->u.vesa_lfb.red_pos;
+               screen_info.green_size = info->u.vesa_lfb.green_size;
+               screen_info.green_pos = info->u.vesa_lfb.green_pos;
+               screen_info.blue_size = info->u.vesa_lfb.blue_size;
+               screen_info.blue_pos = info->u.vesa_lfb.blue_pos;
+               screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size;
+               screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+               /* EFI framebuffers stop here; the fields below are not read */
+               if (info->video_type == XEN_VGATYPE_EFI_LFB) {
+                       screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
+                       break;
+               }
+               /* gbl_caps and mode_attrs are each copied only if present */
+               if (size >= offsetof(struct dom0_vga_console_info,
+                                    u.vesa_lfb.gbl_caps)
+                           + sizeof(info->u.vesa_lfb.gbl_caps))
+                       screen_info.capabilities = info->u.vesa_lfb.gbl_caps;
+               if (size >= offsetof(struct dom0_vga_console_info,
+                                    u.vesa_lfb.mode_attrs)
+                           + sizeof(info->u.vesa_lfb.mode_attrs))
+                       screen_info.vesa_attributes = info->u.vesa_lfb.mode_attrs;
+               break;
+       }
+}
+#endif
+
+
+/******************** User-space console driver (/dev/console) ************/
+
+#define DRV(_d)         (_d)
+#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) &&                \
+                        ((_tty)->index != (xc_num - 1)))
+
+static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
+static struct tty_struct *xencons_tty;
+static int xencons_priv_irq;
+static char x_char;
+
+/*
+ * xencons_rx - feed received console bytes to the tty layer.
+ *
+ * @buf: received characters.
+ * @len: number of characters in @buf.
+ *
+ * Input is dropped when no tty is attached.  With CONFIG_MAGIC_SYSRQ, a ^O
+ * followed within two seconds by another character is treated as a SysRq
+ * request: that character goes to handle_sysrq() instead of the tty.  Two
+ * ^O in a row deliver a single literal ^O.
+ */
+void xencons_rx(char *buf, unsigned len)
+{
+       unsigned int  i;
+       unsigned long flags;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       if (xencons_tty == NULL)
+               goto out;
+
+       for (i = 0; i < len; i++) {
+#ifdef CONFIG_MAGIC_SYSRQ
+               static unsigned long sysrq_requested;
+
+               if (buf[i] == '\x0f') { /* ^O */
+                       if (!sysrq_requested) {
+                               sysrq_requested = jiffies;
+                               continue; /* don't print sysrq key */
+                       }
+                       sysrq_requested = 0;
+               } else if (sysrq_requested) {
+                       unsigned long sysrq_timeout = sysrq_requested + HZ*2;
+
+                       sysrq_requested = 0;
+                       if (time_before(jiffies, sysrq_timeout)) {
+                               spin_unlock_irqrestore(&xencons_lock, flags);
+                               handle_sysrq(buf[i]);
+                               spin_lock_irqsave(&xencons_lock, flags);
+                               /*
+                                * The lock was dropped, so the tty may have
+                                * been closed (xencons_tty cleared) in the
+                                * meantime; re-check before using it again.
+                                */
+                               if (xencons_tty == NULL)
+                                       goto out;
+                               continue;
+                       }
+               }
+#endif
+               tty_insert_flip_char(xencons_tty, buf[i], 0);
+       }
+       tty_flip_buffer_push(xencons_tty);
+
+ out:
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * __xencons_tx_flush - push pending output towards the console.
+ *
+ * Sends the pending out-of-band character x_char (if any) first, then as
+ * much of the wbuf ring (wc..wp) as can be sent.  In the initial domain
+ * data goes through kcons_write_dom0(); otherwise through
+ * xencons_ring_send().  If anything was sent and a tty is attached, writers
+ * waiting on that tty are woken.
+ *
+ * The callers visible in this file all hold xencons_lock.
+ */
+static void __xencons_tx_flush(void)
+{
+       int sent, sz, work_done = 0;
+
+       if (x_char) {
+               if (is_initial_xendomain())
+                       kcons_write_dom0(NULL, &x_char, 1);
+               else
+                       /* Retry until the ring accepts the single byte. */
+                       while (x_char)
+                               if (xencons_ring_send(&x_char, 1) == 1)
+                                       break;
+               x_char = 0;
+               work_done = 1;
+       }
+
+       while (wc != wp) {
+               sz = wp - wc;
+               /* Limit the chunk to the contiguous part before wbuf wraps. */
+               if (sz > (wbuf_size - WBUF_MASK(wc)))
+                       sz = wbuf_size - WBUF_MASK(wc);
+               if (is_initial_xendomain()) {
+                       kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz);
+                       wc += sz;
+               } else {
+                       sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz);
+                       /* Nothing accepted: leave the rest for a later flush. */
+                       if (sent == 0)
+                               break;
+                       wc += sent;
+               }
+               work_done = 1;
+       }
+
+       if (work_done && (xencons_tty != NULL)) {
+               wake_up_interruptible(&xencons_tty->write_wait);
+               tty_wakeup(xencons_tty);
+       }
+}
+
+/*
+ * xencons_tx - flush pending console output.
+ *
+ * Locked wrapper around __xencons_tx_flush(): takes xencons_lock with
+ * interrupts disabled for the duration of the flush.
+ */
+void xencons_tx(void)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       __xencons_tx_flush();
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * Privileged receive callback and transmit kicker.
+ *
+ * Reads input with CONSOLEIO_read until the hypercall returns no more data,
+ * passing each chunk to xencons_rx(), then flushes pending output.  Always
+ * returns IRQ_HANDLED.
+ */
+static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
+{
+       static char rbuf[16];
+       int         l;
+
+       /* Size the request from the buffer itself so the two cannot drift. */
+       while ((l = HYPERVISOR_console_io(CONSOLEIO_read, sizeof(rbuf),
+                                         rbuf)) > 0)
+               xencons_rx(rbuf, l);
+
+       xencons_tx();
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Return the free space in the wbuf ring: wbuf_size minus the number of
+ * bytes currently queued between wc and wp.  @tty is not used.
+ */
+static int xencons_write_room(struct tty_struct *tty)
+{
+       return wbuf_size - (wp - wc);
+}
+
+/*
+ * Return the number of bytes queued in the wbuf ring and not yet sent
+ * (producer index wp minus consumer index wc).  @tty is not used.
+ */
+static int xencons_chars_in_buffer(struct tty_struct *tty)
+{
+       return wp - wc;
+}
+
+/*
+ * xencons_send_xchar - send one out-of-band character immediately.
+ *
+ * @tty: tty the request is for; dummy ttys are ignored.
+ * @ch:  character to send.
+ *
+ * Stores @ch in x_char and flushes, so it is sent ahead of whatever is
+ * still queued in wbuf (__xencons_tx_flush() handles x_char first).
+ */
+static void xencons_send_xchar(struct tty_struct *tty, char ch)
+{
+       unsigned long flags;
+
+       if (DUMMY_TTY(tty))
+               return;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       x_char = ch;
+       __xencons_tx_flush();
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * xencons_throttle - tty throttle hook.
+ *
+ * For a real (non-dummy) tty with IXOFF set, sends the tty's STOP
+ * character; otherwise does nothing.
+ */
+static void xencons_throttle(struct tty_struct *tty)
+{
+       if (!DUMMY_TTY(tty) && I_IXOFF(tty))
+               xencons_send_xchar(tty, STOP_CHAR(tty));
+}
+
+/*
+ * xencons_unthrottle - tty unthrottle hook.
+ *
+ * Only acts on a real (non-dummy) tty with IXOFF set.  If an out-of-band
+ * character is still pending in x_char it is cancelled; otherwise the
+ * tty's START character is sent.
+ */
+static void xencons_unthrottle(struct tty_struct *tty)
+{
+       if (DUMMY_TTY(tty))
+               return;
+
+       if (!I_IXOFF(tty))
+               return;
+
+       if (x_char != 0) {
+               /* Drop the pending character instead of sending START. */
+               x_char = 0;
+               return;
+       }
+
+       xencons_send_xchar(tty, START_CHAR(tty));
+}
+
+/*
+ * xencons_flush_buffer - discard all queued, unsent output.
+ *
+ * Resets both ring indices (wc and wp) to zero under xencons_lock.  Dummy
+ * ttys are ignored.
+ */
+static void xencons_flush_buffer(struct tty_struct *tty)
+{
+       unsigned long flags;
+
+       if (DUMMY_TTY(tty))
+               return;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       wc = wp = 0;
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * __xencons_put_char - append one byte to the wbuf ring.
+ *
+ * Returns 1 if the byte was queued, 0 if the ring already holds wbuf_size
+ * bytes.  Takes no lock itself.
+ */
+static inline int __xencons_put_char(int ch)
+{
+       /* Ring full: nothing can be queued. */
+       if ((wp - wc) == wbuf_size)
+               return 0;
+
+       wbuf[WBUF_MASK(wp)] = (char)ch;
+       wp++;
+
+       return 1;
+}
+
+/*
+ * xencons_write - tty write hook.
+ *
+ * @tty:   tty being written to.
+ * @buf:   data to write.
+ * @count: number of bytes in @buf.
+ *
+ * Queues as many bytes as fit into the wbuf ring and, if any were queued,
+ * flushes.  Returns the number of bytes queued.  For a dummy tty nothing is
+ * queued and @count is returned.
+ */
+static int xencons_write(
+       struct tty_struct *tty,
+       const unsigned char *buf,
+       int count)
+{
+       unsigned long irqflags;
+       int queued = 0;
+
+       if (DUMMY_TTY(tty))
+               return count;
+
+       spin_lock_irqsave(&xencons_lock, irqflags);
+
+       /* Stop at the end of the data or as soon as the ring is full. */
+       while (queued < count && __xencons_put_char(buf[queued]))
+               queued++;
+
+       if (queued > 0)
+               __xencons_tx_flush();
+
+       spin_unlock_irqrestore(&xencons_lock, irqflags);
+
+       return queued;
+}
+
+/*
+ * xencons_put_char - tty put_char hook.
+ *
+ * Queues @ch in the wbuf ring under xencons_lock without flushing.
+ * Returns 1 if the byte was queued, 0 if the ring was full or @tty is a
+ * dummy tty.
+ */
+static int xencons_put_char(struct tty_struct *tty, u_char ch)
+{
+       unsigned long flags;
+       int ret;
+
+       if (DUMMY_TTY(tty))
+               return 0;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       ret = __xencons_put_char(ch);
+       spin_unlock_irqrestore(&xencons_lock, flags);
+       return ret;
+}
+
+/*
+ * xencons_flush_chars - tty flush_chars hook.
+ *
+ * Flushes pending output under xencons_lock.  Dummy ttys are ignored.
+ */
+static void xencons_flush_chars(struct tty_struct *tty)
+{
+       unsigned long flags;
+
+       if (DUMMY_TTY(tty))
+               return;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       __xencons_tx_flush();
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/*
+ * xencons_wait_until_sent - tty wait_until_sent hook.
+ *
+ * @tty:     tty to wait on; dummy ttys return immediately.
+ * @timeout: maximum time to wait, in jiffies; 0 means no time limit.
+ *
+ * Sleeps one jiffy at a time until tty_chars_in_buffer() reports nothing
+ * left, a signal is pending for the current task, or @timeout has elapsed.
+ */
+static void xencons_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+       unsigned long orig_jiffies = jiffies;
+
+       if (DUMMY_TTY(tty))
+               return;
+
+       while (tty_chars_in_buffer(tty)) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(1);
+               if (signal_pending(current))
+                       break;
+               if (timeout && time_after(jiffies, orig_jiffies + timeout))
+                       break;
+       }
+
+       /* Both break paths above can leave the task state set to sleeping. */
+       set_current_state(TASK_RUNNING);
+}
+
+/*
+ * xencons_open - tty open hook.
+ *
+ * Records @tty as the active console tty if none is recorded yet, clears
+ * tty->driver_data and flushes pending output.  Dummy ttys are accepted
+ * without any of that.  Always returns 0.
+ */
+static int xencons_open(struct tty_struct *tty, struct file *filp)
+{
+       unsigned long flags;
+
+       if (DUMMY_TTY(tty))
+               return 0;
+
+       spin_lock_irqsave(&xencons_lock, flags);
+       tty->driver_data = NULL;
+       if (xencons_tty == NULL)
+               xencons_tty = tty;
+       __xencons_tx_flush();
+       spin_unlock_irqrestore(&xencons_lock, flags);
+
+       return 0;
+}
+
+/*
+ * xencons_close - tty close hook.
+ *
+ * Only the last close (tty->count == 1) of a real tty does any work: the
+ * tty is marked TTY_CLOSING, pending output is waited for and then
+ * flushed from the driver and line discipline, and xencons_tty is cleared.
+ */
+static void xencons_close(struct tty_struct *tty, struct file *filp)
+{
+       unsigned long flags;
+
+       if (DUMMY_TTY(tty))
+               return;
+
+       /*
+        * Must follow lock nesting; callers are prepared for this
+        * (__tty_hangup) or don't care as they drop the lock right after our
+        * return (tty_release) in order to then acquire both in proper order.
+        */
+       tty_unlock();
+       mutex_lock(&tty_mutex);
+       tty_lock();
+
+       /* Not the last reference: leave the tty attached. */
+       if (tty->count != 1) {
+               mutex_unlock(&tty_mutex);
+               return;
+       }
+
+       /* Prevent other threads from re-opening this tty. */
+       set_bit(TTY_CLOSING, &tty->flags);
+       mutex_unlock(&tty_mutex);
+
+       tty->closing = 1;
+       tty_wait_until_sent(tty, 0);
+       tty_driver_flush_buffer(tty);
+       if (tty->ldisc->ops->flush_buffer)
+               tty->ldisc->ops->flush_buffer(tty);
+       tty->closing = 0;
+       /* Detach under the lock so xencons_rx()/tx see a consistent pointer. */
+       spin_lock_irqsave(&xencons_lock, flags);
+       xencons_tty = NULL;
+       spin_unlock_irqrestore(&xencons_lock, flags);
+}
+
+/* tty driver entry points; installed by xencons_init() via tty_set_operations(). */
+static const struct tty_operations xencons_ops = {
+       .open = xencons_open,
+       .close = xencons_close,
+       .write = xencons_write,
+       .write_room = xencons_write_room,
+       .put_char = xencons_put_char,
+       .flush_chars = xencons_flush_chars,
+       .chars_in_buffer = xencons_chars_in_buffer,
+       .send_xchar = xencons_send_xchar,
+       .flush_buffer = xencons_flush_buffer,
+       .throttle = xencons_throttle,
+       .unthrottle = xencons_unthrottle,
+       .wait_until_sent = xencons_wait_until_sent,
+};
+
+/*
+ * xencons_init - register the Xen virtual console tty driver.
+ *
+ * Returns -ENODEV when not running on Xen and 0 without doing anything
+ * when the console is switched off (XC_OFF).  Otherwise sets up the
+ * console ring (non-initial domains only), allocates and registers a tty
+ * driver named according to xc_mode, and in the initial domain binds
+ * VIRQ_CONSOLE to xencons_priv_interrupt().
+ *
+ * Returns 0 on success or a negative errno.
+ *
+ * NOTE(review): the error returns after xencons_ring_init() has succeeded
+ * do not undo that call here - confirm whether the binding it makes needs
+ * releasing on those paths.
+ */
+static int __init xencons_init(void)
+{
+       int rc;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       if (xc_mode == XC_OFF)
+               return 0;
+
+       if (!is_initial_xendomain()) {
+               rc = xencons_ring_init();
+               if (rc)
+                       return rc;
+       }
+
+       /* XC_TTY registers MAX_NR_CONSOLES lines; every other mode just one. */
+       xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ?
+                                         MAX_NR_CONSOLES : 1);
+       if (xencons_driver == NULL)
+               return -ENOMEM;
+
+       /* Defaults; name, major and minor are overridden per mode below. */
+       DRV(xencons_driver)->name            = "xencons";
+       DRV(xencons_driver)->major           = TTY_MAJOR;
+       DRV(xencons_driver)->type            = TTY_DRIVER_TYPE_SERIAL;
+       DRV(xencons_driver)->subtype         = SERIAL_TYPE_NORMAL;
+       DRV(xencons_driver)->init_termios    = tty_std_termios;
+       DRV(xencons_driver)->flags           =
+               TTY_DRIVER_REAL_RAW |
+               TTY_DRIVER_RESET_TERMIOS;
+       DRV(xencons_driver)->termios         = xencons_termios;
+
+       switch (xc_mode) {
+       case XC_XVC:
+               DRV(xencons_driver)->name        = "xvc";
+               DRV(xencons_driver)->major       = XEN_XVC_MAJOR;
+               DRV(xencons_driver)->minor_start = XEN_XVC_MINOR;
+               DRV(xencons_driver)->name_base   = xc_num;
+               break;
+       case XC_HVC:
+               DRV(xencons_driver)->name        = "hvc";
+               DRV(xencons_driver)->major       = XEN_HVC_MAJOR;
+               DRV(xencons_driver)->minor_start = XEN_HVC_MINOR;
+               DRV(xencons_driver)->name_base   = xc_num;
+               break;
+       case XC_SERIAL:
+               DRV(xencons_driver)->name        = "ttyS";
+               DRV(xencons_driver)->minor_start = 64 + xc_num;
+               DRV(xencons_driver)->name_base   = xc_num;
+               break;
+       default:
+               DRV(xencons_driver)->name        = "tty";
+               DRV(xencons_driver)->minor_start = 1;
+               DRV(xencons_driver)->name_base   = 1;
+               break;
+       }
+
+       tty_set_operations(xencons_driver, &xencons_ops);
+
+       if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) {
+               pr_warning("WARNING: Failed to register Xen virtual "
+                          "console driver as '%s%d'\n",
+                          DRV(xencons_driver)->name,
+                          DRV(xencons_driver)->name_base);
+               put_tty_driver(xencons_driver);
+               xencons_driver = NULL;
+               return rc;
+       }
+
+       if (is_initial_xendomain()) {
+               xencons_priv_irq = bind_virq_to_irqhandler(
+                       VIRQ_CONSOLE,
+                       0,
+                       xencons_priv_interrupt,
+                       0,
+                       "console",
+                       NULL);
+               /* A failed bind is treated as fatal. */
+               BUG_ON(xencons_priv_irq < 0);
+       }
+
+       pr_info("Xen virtual console successfully installed as %s%d\n",
+               DRV(xencons_driver)->name, xc_num);
+
+       return 0;
+}
+
+module_init(xencons_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/console/xencons.h b/drivers/xen/console/xencons.h

new file mode 100644 (file)

index 0000000..1b2ee66
--- /dev/null
+++ b/drivers/xen/console/xencons.h
@@ -0,0 +1,12 @@
+#include <xen/evtchn.h>
+#include <xen/xencons.h>
+
+void xencons_force_flush(void);
+
+/* Interrupt work hooks. Receive data, or kick data out. */
+struct pt_regs;
+void xencons_rx(char *buf, unsigned len);
+void xencons_tx(void);
+
+int xencons_ring_init(void);
+int xencons_ring_send(const char *data, unsigned len);
diff --git a/drivers/xen/console/xencons_ring.c b/drivers/xen/console/xencons_ring.c

new file mode 100644 (file)

index 0000000..01651e3
--- /dev/null
+++ b/drivers/xen/console/xencons_ring.c
@@ -0,0 +1,129 @@
+/* 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <xen/interface/io/console.h>
+#include "xencons.h"
+
+static int xencons_irq;
+
+/*
+ * Return the console ring structure, located through the machine frame
+ * number stored in the start info (console.domU.mfn).
+ */
+static inline struct xencons_interface *xencons_interface(void)
+{
+       return mfn_to_virt(xen_start_info->console.domU.mfn);
+}
+
+/* Signal the console event channel named in the start info. */
+static inline void notify_daemon(void)
+{
+       /* Use evtchn: this is called early, before irq is set up. */
+       notify_remote_via_evtchn(xen_start_info->console.domU.evtchn);
+}
+
+/*
+ * xencons_ring_send - copy bytes into the console output ring.
+ *
+ * @data: bytes to send.
+ * @len:  number of bytes in @data.
+ *
+ * Copies until @data is exhausted or the ring is full, publishes the new
+ * producer index and notifies the daemon (even when nothing was copied).
+ * Returns the number of bytes copied, which may be less than @len,
+ * including 0.
+ */
+int xencons_ring_send(const char *data, unsigned len)
+{
+       int sent = 0;
+       struct xencons_interface *intf = xencons_interface();
+       XENCONS_RING_IDX cons, prod;
+
+       cons = intf->out_cons;
+       prod = intf->out_prod;
+       mb();
+       /* More data outstanding than the ring can hold means corrupt indices. */
+       BUG_ON((prod - cons) > sizeof(intf->out));
+
+       while ((sent < len) && ((prod - cons) < sizeof(intf->out)))
+               intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++];
+
+       /* Order the data writes before the producer index update. */
+       wmb();
+       intf->out_prod = prod;
+
+       notify_daemon();
+
+       return sent;
+}
+
+/*
+ * handle_input - interrupt handler for the console event channel.
+ *
+ * Hands every byte currently in the input ring to xencons_rx(), one at a
+ * time, publishes the new consumer index, notifies the daemon and finally
+ * flushes pending output.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t handle_input(int irq, void *unused)
+{
+       struct xencons_interface *intf = xencons_interface();
+       XENCONS_RING_IDX cons, prod;
+
+       cons = intf->in_cons;
+       prod = intf->in_prod;
+       mb();
+       /* More data outstanding than the ring can hold means corrupt indices. */
+       BUG_ON((prod - cons) > sizeof(intf->in));
+
+       while (cons != prod) {
+               xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
+               cons++;
+       }
+
+       /* Order the ring reads before the consumer index update. */
+       mb();
+       intf->in_cons = cons;
+
+       notify_daemon();
+
+       xencons_tx();
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * xencons_ring_init - bind the console event channel to handle_input().
+ *
+ * Returns -ENODEV when the start info carries no console event channel,
+ * the negative value returned by the bind call on failure, or 0 on
+ * success, in which case the IRQ is remembered in xencons_irq.
+ *
+ * Only marked __init when CONFIG_PM_SLEEP is off, because
+ * xencons_resume() calls it again otherwise.
+ */
+int
+#ifndef CONFIG_PM_SLEEP
+__init
+#endif
+xencons_ring_init(void)
+{
+       int irq;
+
+       if (!xen_start_info->console.domU.evtchn)
+               return -ENODEV;
+
+       irq = bind_caller_port_to_irqhandler(
+               xen_start_info->console.domU.evtchn,
+               handle_input, 0, "xencons", NULL);
+       if (irq < 0) {
+               pr_err("XEN console request irq failed %i\n", irq);
+               return irq;
+       }
+
+       xencons_irq = irq;
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * xencons_resume - re-establish the console event channel binding.
+ *
+ * Drops the IRQ binding recorded in xencons_irq, if any.  In a non-initial
+ * domain it then binds the console event channel again through
+ * xencons_ring_init() and, provided a console event channel exists,
+ * notifies the daemon.
+ */
+void xencons_resume(void)
+{
+       if (xencons_irq)
+               unbind_from_irqhandler(xencons_irq, NULL);
+       xencons_irq = 0;
+
+       /*
+        * The domU console description is only consulted for a non-initial
+        * domain (as in xencons_init()), so there is nothing to re-bind or
+        * notify otherwise.
+        */
+       if (!is_running_on_xen() || is_initial_xendomain())
+               return;
+
+       /* A failure is already reported by xencons_ring_init() itself. */
+       (void)xencons_ring_init();
+
+       /*
+        * In case we have in-flight data after save/restore...
+        * Never signal port 0: xencons_ring_init() treats it as "no console".
+        */
+       if (xen_start_info->console.domU.evtchn)
+               notify_daemon();
+}
+#endif
diff --git a/drivers/xen/core/Makefile b/drivers/xen/core/Makefile

new file mode 100644 (file)

index 0000000..ab06252
--- /dev/null
+++ b/drivers/xen/core/Makefile
@@ -0,0 +1,16 @@
+#
+# Makefile for the linux kernel.
+#
+
+obj-y := evtchn.o gnttab.o reboot.o machine_reboot.o
+
+obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += firmware.o pcpu.o
+obj-$(CONFIG_PROC_FS)          += xen_proc.o
+obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
+obj-$(CONFIG_XEN_SMPBOOT)      += smpboot.o
+obj-$(CONFIG_SMP)              += spinlock.o
+obj-$(CONFIG_KEXEC)            += machine_kexec.o
+obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
+obj-$(CONFIG_XEN_DOMCTL)       += domctl.o
+CFLAGS_domctl.o                        := -D__XEN_PUBLIC_XEN_H__ -D__XEN_PUBLIC_GRANT_TABLE_H__
+CFLAGS_domctl.o                        += -D__XEN_TOOLS__ -imacros xen/interface/domctl.h -imacros xen/interface/sysctl.h
diff --git a/drivers/xen/core/acpi_memhotplug.c b/drivers/xen/core/acpi_memhotplug.c

new file mode 100644 (file)

index 0000000..c993a5a
--- /dev/null
+++ b/drivers/xen/core/acpi_memhotplug.c
@@ -0,0 +1,190 @@
+/*
+ *  xen_acpi_memhotplug.c - interface to notify Xen on memory device hotadd
+ *
+ *  Copyright (C) 2008, Intel corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <xen/interface/platform.h>
+#include <asm/hypervisor.h>
+
+/* One hot-added memory range waiting to be reported to the hypervisor. */
+struct xen_hotmem_entry {
+       struct list_head hotmem_list;   /* link in xen_hotmem.list */
+       uint64_t start;                 /* first byte address of the range */
+       uint64_t end;                   /* start + length, i.e. exclusive end */
+       uint32_t flags;                 /* passed through as mem_add.flags */
+       uint32_t pxm;                   /* passed through as mem_add.pxm */
+};
+
+/* Queue of pending xen_hotmem_entry items together with its length. */
+struct xen_hotmem_list {
+       struct list_head list;          /* head of the xen_hotmem_entry queue */
+       unsigned int entry_nr;          /* number of entries currently queued */
+};
+
+/* The single pending-range queue; starts out empty. */
+static struct xen_hotmem_list xen_hotmem = {
+       .list = LIST_HEAD_INIT(xen_hotmem.list)
+};
+/* Taken around every update of xen_hotmem in this file. */
+static DEFINE_SPINLOCK(xen_hotmem_lock);
+
+/*
+ * xen_hyper_addmem - report one memory range to the hypervisor.
+ *
+ * Issues XENPF_mem_hotadd for @entry, converting its byte addresses to
+ * page frame numbers.  Returns the result of the hypercall.
+ */
+static int xen_hyper_addmem(struct xen_hotmem_entry *entry)
+{
+       xen_platform_op_t op;
+
+       op.cmd = XENPF_mem_hotadd;
+       op.u.mem_add.spfn = entry->start >> PAGE_SHIFT;
+       op.u.mem_add.epfn = entry->end >> PAGE_SHIFT;
+       op.u.mem_add.flags = entry->flags;
+       op.u.mem_add.pxm = entry->pxm;
+
+       return HYPERVISOR_platform_op(&op);
+}
+
+/*
+ * add_hotmem_entry - queue a memory range for later reporting.
+ *
+ * @pxm:    proximity domain of the range; must not be negative.
+ * @start:  first byte address of the range.
+ * @length: size of the range in bytes; must not be zero.
+ * @flags:  flags to pass along with the range.
+ *
+ * Returns 0 on success, -EINVAL for a negative @pxm or zero @length, and
+ * -ENOMEM if no entry could be allocated.
+ */
+static int add_hotmem_entry(int pxm, uint64_t start,
+                       uint64_t length, uint32_t flags)
+{
+       struct xen_hotmem_entry *node;
+
+       if (!length || pxm < 0)
+               return -EINVAL;
+
+       node = kzalloc(sizeof(*node), GFP_ATOMIC);
+       if (node == NULL)
+               return -ENOMEM;
+
+       node->pxm = pxm;
+       node->flags = flags;
+       node->start = start;
+       node->end = start + length;
+       INIT_LIST_HEAD(&node->hotmem_list);
+
+       /* Append to the pending queue and keep its counter in step. */
+       spin_lock(&xen_hotmem_lock);
+       list_add_tail(&node->hotmem_list, &xen_hotmem.list);
+       xen_hotmem.entry_nr++;
+       spin_unlock(&xen_hotmem_lock);
+
+       return 0;
+}
+
+/*
+ * Unlink @entry from the list it is on and free it.  Always returns 0.
+ * Takes no lock itself; the one caller visible here holds xen_hotmem_lock.
+ */
+static int free_hotmem_entry(struct xen_hotmem_entry *entry)
+{
+       list_del(&entry->hotmem_list);
+       kfree(entry);
+
+       return 0;
+}
+
+/*
+ * xen_hotadd_mem_dpc - work handler that drains the pending-range queue.
+ *
+ * With xen_hotmem_lock held, reports every queued range to the hypervisor
+ * and then frees it.  A failed report is logged; the entry is dropped
+ * either way.
+ */
+static void xen_hotadd_mem_dpc(struct work_struct *work)
+{
+       struct xen_hotmem_entry *cur, *next;
+       unsigned long irqflags;
+       int rc;
+
+       spin_lock_irqsave(&xen_hotmem_lock, irqflags);
+       /* The _safe iterator is required: each entry is freed in the loop. */
+       list_for_each_entry_safe(cur, next, &xen_hotmem.list, hotmem_list) {
+               rc = xen_hyper_addmem(cur);
+               if (rc)
+                       pr_warn("xen addmem failed with %x\n", rc);
+               free_hotmem_entry(cur);
+               xen_hotmem.entry_nr--;
+       }
+       spin_unlock_irqrestore(&xen_hotmem_lock, irqflags);
+}
+
+static DECLARE_WORK(xen_hotadd_mem_work, xen_hotadd_mem_dpc);
+
+/*
+ * xen_acpi_get_pxm - find the proximity domain that applies to @h.
+ *
+ * Evaluates "_PXM" on @h and then on each successive parent until one
+ * evaluation succeeds.  Returns that value, or -1 if the parent chain ends
+ * without any object providing "_PXM".
+ */
+static int xen_acpi_get_pxm(acpi_handle h)
+{
+       unsigned long long pxm;
+       acpi_handle node = h;
+       acpi_handle parent;
+
+       for (;;) {
+               if (ACPI_SUCCESS(acpi_evaluate_integer(node, "_PXM", NULL,
+                                                      &pxm)))
+                       return pxm;
+
+               /* No _PXM here: move one level up, or give up at the top. */
+               if (!ACPI_SUCCESS(acpi_get_parent(node, &parent)))
+                       break;
+               node = parent;
+       }
+
+       return -1;
+}
+
+/*
+ * xen_hotadd_memory - queue the resources of a memory device for hot-add.
+ *
+ * Queues every not-yet-enabled, non-empty resource of @mem_device via
+ * add_hotmem_entry() and marks it enabled, then schedules the work item
+ * that reports the queued ranges to the hypervisor.
+ *
+ * Returns 0 if at least one resource is enabled afterwards, and -EINVAL
+ * if @mem_device is NULL, no proximity domain can be determined, or no
+ * resource ended up enabled.
+ */
+static int xen_hotadd_memory(struct acpi_memory_device *mem_device)
+{
+       int pxm, result;
+       int num_enabled = 0;
+       struct acpi_memory_info *info;
+
+       if (!mem_device)
+               return -EINVAL;
+
+       pxm = xen_acpi_get_pxm(mem_device->device->handle);
+
+       if (pxm < 0)
+               return -EINVAL;
+
+       /*
+        * Always return success to ACPI driver, and notify hypervisor later
+        * because hypervisor will utilize the memory in memory hotadd hypercall
+        */
+       list_for_each_entry(info, &mem_device->res_list, list) {
+               if (info->enabled) { /* just sanity check...*/
+                       num_enabled++;
+                       continue;
+               }
+               /*
+                * Skip zero-sized memory blocks; there is nothing to
+                * hot-add for them.
+                */
+               if (!info->length)
+                       continue;
+
+               result = add_hotmem_entry(pxm, info->start_addr,
+                                         info->length, 0);
+               /* A range that could not be queued is left disabled. */
+               if (result)
+                       continue;
+               info->enabled = 1;
+               num_enabled++;
+       }
+
+       if (!num_enabled)
+               return -EINVAL;
+
+       schedule_work(&xen_hotadd_mem_work);
+
+       return 0;
+}
+
+/* Returns 0 in the initial domain and -ENODEV in any other domain. */
+static int xen_hotadd_mem_init(void)
+{
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       return 0;
+}
+
+/* Wait for already scheduled work, including the hot-add work item, to finish. */
+static void xen_hotadd_mem_exit(void)
+{
+       flush_scheduled_work();
+}
diff --git a/drivers/xen/core/clockevents.c b/drivers/xen/core/clockevents.c

new file mode 100644 (file)

index 0000000..e3a914a
--- /dev/null
+++ b/drivers/xen/core/clockevents.c
@@ -0,0 +1,293 @@
+/*
+ *     Xen clockevent functions
+ *
+ *     See arch/x86/xen/time.c for copyright and credits for derived
+ *     portions of this file.
+ *
+ * Xen clockevent implementation
+ *
+ * Xen has two clockevent implementations:
+ *
+ * The old timer_op one works with all released versions of Xen prior
+ * to version 3.0.4.  This version of the hypervisor provides a
+ * single-shot timer with nanosecond resolution.  However, sharing the
+ * same event channel is a 100Hz tick which is delivered while the
+ * vcpu is running.  We don't care about or use this tick, but it will
+ * cause the core time code to think the timer fired too soon, and
+ * will end up resetting it each time.  It could be filtered, but
+ * doing so has complications when the ktime clocksource is not yet
+ * the xen clocksource (ie, at boot time).
+ *
+ * The new vcpu_op-based timer interface allows the tick timer period
+ * to be changed or turned off.  The tick timer is not useful as a
+ * periodic timer because events are only delivered to running vcpus.
+ * The one-shot timer can report when a timeout is in the past, so
+ * set_next_event is capable of returning -ETIME when appropriate.
+ * This interface is used when available.
+ */
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <asm/hypervisor.h>
+#include <xen/clock.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+
+#define XEN_SHIFT 22
+
+/* Xen may fire a timer up to this many ns early */
+#define TIMER_SLOP     100000
+#define NS_PER_TICK    (1000000000LL / HZ)
+
+/*
+ * Get a hypervisor absolute time.  In theory we could maintain an
+ * offset between the kernel's time and the hypervisor's time, and
+ * apply that to a kernel's absolute timeout.  Unfortunately the
+ * hypervisor and kernel times can drift even if the kernel is using
+ * the Xen clocksource, because ntp can warp the kernel's clocksource.
+ *
+ * @delta is the relative timeout; the result is xen_local_clock() plus
+ * @delta.
+ */
+static u64 get_abs_timeout(unsigned long delta)
+{
+       return xen_local_clock() + delta;
+}
+
+#if CONFIG_XEN_COMPAT <= 0x030004
+/*
+ * timerop_set_mode - set_mode callback for the timer_op interface.
+ *
+ * Periodic mode is unsupported and only warns.  One-shot and resume need
+ * no action.  Unused and shutdown cancel any pending timeout.
+ */
+static void timerop_set_mode(enum clock_event_mode mode,
+                            struct clock_event_device *evt)
+{
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               WARN_ON(1); /* unsupported */
+               break;
+
+       case CLOCK_EVT_MODE_ONESHOT:
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               if (HYPERVISOR_set_timer_op(0)) /* cancel timeout */
+                       BUG();
+               break;
+       }
+}
+
+/*
+ * timerop_set_next_event - set_next_event callback for the timer_op
+ * interface.
+ *
+ * Programs a timeout @delta from now.  A failing hypercall is treated as
+ * fatal.  Always returns 0.
+ */
+static int timerop_set_next_event(unsigned long delta,
+                                 struct clock_event_device *evt)
+{
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
+               BUG();
+
+       /*
+        * We may have missed the deadline, but there's no real way of
+        * knowing for sure.  If the event was in the past, then we'll
+        * get an immediate interrupt.
+        */
+
+       return 0;
+}
+#endif
+
+/*
+ * vcpuop_set_mode - set_mode callback for the vcpu_op interface.
+ *
+ * Periodic mode is unsupported and only warns.  Unused and shutdown stop
+ * the single-shot timer and then, by falling through, also the periodic
+ * timer.  One-shot stops only the periodic timer.  Resume needs no action.
+ * All hypercalls are issued for the current CPU; a failure is fatal.
+ */
+static void vcpuop_set_mode(enum clock_event_mode mode,
+                           struct clock_event_device *evt)
+{
+       switch (mode) {
+       case CLOCK_EVT_MODE_PERIODIC:
+               WARN_ON(1); /* unsupported */
+               break;
+
+       case CLOCK_EVT_MODE_UNUSED:
+       case CLOCK_EVT_MODE_SHUTDOWN:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer,
+                                      smp_processor_id(), NULL))
+                       BUG();
+               /* fall through */
+       case CLOCK_EVT_MODE_ONESHOT:
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+                                      smp_processor_id(), NULL))
+                       BUG();
+               break;
+
+       case CLOCK_EVT_MODE_RESUME:
+               break;
+       }
+}
+
+/*
+ * vcpuop_set_next_event - set_next_event callback for the vcpu_op
+ * interface.
+ *
+ * Arms the current CPU's single-shot timer @delta from now with
+ * VCPU_SSHOTTMR_future set.  Returns 0 or -ETIME as reported by the
+ * hypercall; any other result is fatal.
+ */
+static int vcpuop_set_next_event(unsigned long delta,
+                                struct clock_event_device *evt)
+{
+       struct vcpu_set_singleshot_timer single;
+       int ret;
+
+       WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+
+       single.timeout_abs_ns = get_abs_timeout(delta);
+       single.flags = VCPU_SSHOTTMR_future;
+
+       ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
+                                smp_processor_id(), &single);
+
+       BUG_ON(ret != 0 && ret != -ETIME);
+
+       return ret;
+}
+
+/*
+ * Per-CPU one-shot clock event device.  set_mode and set_next_event are
+ * filled in at run time (see xen_clockevents_init() and
+ * local_setup_timer()), as is irq.
+ */
+static DEFINE_PER_CPU(struct clock_event_device, xen_clock_event) = {
+       .name           = "xen",
+       .features       = CLOCK_EVT_FEAT_ONESHOT,
+
+       .max_delta_ns   = 0xffffffff,
+       .min_delta_ns   = TIMER_SLOP,
+
+       /* mult 1 / shift 0: device ticks are nanoseconds. */
+       .mult           = 1,
+       .shift          = 0,
+       .rating         = 500,
+
+       .irq            = -1,
+};
+
+/* snapshots of runstate info */
+static DEFINE_PER_CPU(u64, runnable_snapshot);
+static DEFINE_PER_CPU(u64, offline_snapshot);
+
+/* unused ns of stolen time */
+static DEFINE_PER_CPU(unsigned int, residual_stolen);
+
+/*
+ * init_missing_ticks_accounting - reset stolen-time accounting for @cpu.
+ *
+ * Sets up the runstate area for @cpu and clears its residual stolen time.
+ * The runnable/offline snapshots are only refreshed when @cpu is the CPU
+ * this runs on, since they are read through the this-CPU accessors.
+ */
+static void init_missing_ticks_accounting(unsigned int cpu)
+{
+       setup_runstate_area(cpu);
+       if (cpu == smp_processor_id()) {
+               this_cpu_write(runnable_snapshot,
+                              this_vcpu_read(runstate.time[RUNSTATE_runnable]));
+               this_cpu_write(offline_snapshot,
+                              this_vcpu_read(runstate.time[RUNSTATE_offline]));
+       }
+       per_cpu(residual_stolen, cpu) = 0;
+}
+
+/*
+ * timer_interrupt - handler for the timer IRQ.
+ *
+ * Runs the clock event handler of this CPU's device, if one is installed,
+ * checks for a wallclock update, and accounts stolen time: the growth of
+ * the runnable and offline runstate times since the last snapshot, plus
+ * the residue carried over from the previous interrupt.
+ *
+ * Returns IRQ_HANDLED if an event handler ran, IRQ_NONE otherwise.
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id)
+{
+       struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+       u64 runnable, offline;
+       s64 stolen;
+       irqreturn_t ret = IRQ_NONE;
+
+       if (evt->event_handler) {
+               evt->event_handler(evt);
+               ret = IRQ_HANDLED;
+       }
+
+       xen_check_wallclock_update();
+
+       runnable = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+       offline = this_vcpu_read(runstate.time[RUNSTATE_offline]);
+
+       stolen = runnable - __this_cpu_read(runnable_snapshot)
+                + offline - __this_cpu_read(offline_snapshot)
+                + __this_cpu_read(residual_stolen);
+
+       /*
+        * Account whole ticks and keep the remainder as the new residue;
+        * below one tick, carry the (non-negative) amount forward as is.
+        */
+       if (stolen >= NS_PER_TICK)
+               account_steal_ticks(div_u64_rem(stolen, NS_PER_TICK,
+                                               &__get_cpu_var(residual_stolen)));
+       else
+               __this_cpu_write(residual_stolen, stolen > 0 ? stolen : 0);
+
+       __this_cpu_write(runnable_snapshot, runnable);
+       __this_cpu_write(offline_snapshot, offline);
+
+       return ret;
+}
+
+/* IRQ action bound to VIRQ_TIMER for each CPU; see bind_virq_to_irqaction() calls. */
+static struct irqaction timer_action = {
+       .handler = timer_interrupt,
+       .flags   = IRQF_DISABLED|IRQF_TIMER,
+       .name    = "timer"
+};
+
+/*
+ * xen_setup_cpu_clockevents - register the current CPU's clock event
+ * device.
+ *
+ * Resets the CPU's stolen-time accounting, restricts the device to this
+ * CPU and registers it with the clockevents core.
+ */
+void __cpuinit xen_setup_cpu_clockevents(void)
+{
+       unsigned int cpu = smp_processor_id();
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       init_missing_ticks_accounting(cpu);
+
+       evt->cpumask = cpumask_of(cpu);
+       clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_SMP
+/*
+ * local_setup_timer - prepare the timer of another CPU.
+ *
+ * @cpu: CPU to set up; must not be the CPU this runs on.
+ *
+ * Binds VIRQ_TIMER for @cpu and copies the set_mode/set_next_event
+ * callbacks from the calling CPU's device.  The IRQ obtained is expected
+ * to equal CPU 0's.  Returns 0 on success or the negative value from the
+ * bind call.
+ */
+int __cpuinit local_setup_timer(unsigned int cpu)
+{
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       BUG_ON(cpu == smp_processor_id());
+
+       evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+       if (evt->irq < 0)
+               return evt->irq;
+       BUG_ON(per_cpu(xen_clock_event.irq, 0) != evt->irq);
+
+       evt->set_mode = this_cpu_read(xen_clock_event.set_mode);
+       evt->set_next_event = this_cpu_read(xen_clock_event.set_next_event);
+
+       return 0;
+}
+
+/*
+ * local_teardown_timer - undo local_setup_timer() for @cpu.
+ *
+ * Unbinds @cpu from the timer IRQ.  Must not be called for CPU 0.
+ */
+void __cpuinit local_teardown_timer(unsigned int cpu)
+{
+       struct clock_event_device *evt = &per_cpu(xen_clock_event, cpu);
+
+       BUG_ON(cpu == 0);
+       unbind_from_per_cpu_irq(evt->irq, cpu, &timer_action);
+}
+#endif
+
+/*
+ * xen_clockevents_resume - restore timer state after a resume.
+ *
+ * Only acts when the vcpu_op interface is in use.  For every online CPU
+ * the stolen-time accounting is reset and the periodic timer is stopped
+ * again; a failing hypercall is fatal.
+ */
+void xen_clockevents_resume(void)
+{
+       unsigned int cpu;
+
+       if (__this_cpu_read(xen_clock_event.set_mode) != vcpuop_set_mode)
+               return;
+
+       for_each_online_cpu(cpu) {
+               init_missing_ticks_accounting(cpu);
+               if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+                       BUG();
+       }
+}
+
+/*
+ * xen_clockevents_init - select the timer interface and set up the boot
+ * CPU's clock event device.
+ *
+ * Probes the vcpu_op interface by trying to stop the periodic timer.  On
+ * success the vcpu_op callbacks are used; on -ENOSYS, and only if
+ * CONFIG_XEN_COMPAT permits, the timer_op callbacks; any other result is
+ * fatal.  Then binds VIRQ_TIMER for this CPU (failure is fatal) and
+ * registers the device.
+ */
+void __init xen_clockevents_init(void)
+{
+       unsigned int cpu = smp_processor_id();
+       struct clock_event_device *evt = &__get_cpu_var(xen_clock_event);
+
+       switch (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
+                                  cpu, NULL)) {
+       case 0:
+               /*
+                * Successfully turned off 100Hz tick, so we have the
+                * vcpuop-based timer interface
+                */
+               evt->set_mode = vcpuop_set_mode;
+               evt->set_next_event = vcpuop_set_next_event;
+               break;
+#if CONFIG_XEN_COMPAT <= 0x030004
+       case -ENOSYS:
+               printk(KERN_DEBUG "Xen: using timerop interface\n");
+               evt->set_mode = timerop_set_mode;
+               evt->set_next_event = timerop_set_next_event;
+               break;
+#endif
+       default:
+               BUG();
+       }
+
+       evt->irq = bind_virq_to_irqaction(VIRQ_TIMER, cpu, &timer_action);
+       BUG_ON(evt->irq < 0);
+
+       xen_setup_cpu_clockevents();
+}
diff --git a/drivers/xen/core/cpu_hotplug.c b/drivers/xen/core/cpu_hotplug.c

new file mode 100644 (file)

index 0000000..171ab0e
--- /dev/null
+++ b/drivers/xen/core/cpu_hotplug.c
@@ -0,0 +1,182 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kobject.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/xenbus.h>
+
+/*
+ * Set of CPUs that remote admin software will allow us to bring online.
+ * Notified to us via xenbus.
+ */
+static cpumask_var_t xenbus_allowed_cpumask;
+
+/* Set of CPUs that local admin will allow us to bring online. */
+static cpumask_var_t local_allowed_cpumask;
+
+/*
+ * Tell whether the current hotplug request originates from the local admin.
+ *
+ * Heuristic: a request made from a userspace process, i.e. a task that owns
+ * a real mm_struct, is considered local.  Returns 1 in that case, else 0.
+ */
+static int local_cpu_hotplug_request(void)
+{
+       if (current->mm == NULL)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Bring @cpu online or offline according to its xenstore "availability" node.
+ *
+ * @cpu: CPU number; silently ignored if out of range or not possible.
+ * @dev: optional CPU device; when non-NULL a KOBJ_ONLINE/KOBJ_OFFLINE uevent
+ *       is emitted after a successful cpu_up()/cpu_down().
+ *
+ * "online" sets, and "offline" clears, the CPU's bit in
+ * xenbus_allowed_cpumask before the state change is attempted.  Any other
+ * value is logged and ignored.
+ */
+static void __cpuinit vcpu_hotplug(unsigned int cpu, struct device *dev)
+{
+       int err;
+       char dir[32], state[32];
+
+       if ((cpu >= NR_CPUS) || !cpu_possible(cpu))
+               return;
+
+       snprintf(dir, sizeof(dir), "cpu/%u", cpu);
+       /*
+        * Limit the conversion to the buffer size: the value comes from
+        * xenstore and an unbounded "%s" could overrun state[].
+        */
+       err = xenbus_scanf(XBT_NIL, dir, "availability", "%31s", state);
+       if (err != 1) {
+               pr_err("XENBUS: Unable to read cpu state\n");
+               return;
+       }
+
+       if (strcmp(state, "online") == 0) {
+               cpumask_set_cpu(cpu, xenbus_allowed_cpumask);
+               if (!cpu_up(cpu) && dev)
+                       kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+       } else if (strcmp(state, "offline") == 0) {
+               cpumask_clear_cpu(cpu, xenbus_allowed_cpumask);
+               if (!cpu_down(cpu) && dev)
+                       kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+       } else {
+               pr_err("XENBUS: unknown state(%s) on CPU%u\n",
+                      state, cpu);
+       }
+}
+
+/*
+ * xenbus watch callback for the "cpu" subtree.
+ *
+ * Extracts the CPU number from the changed path (the part starting at
+ * "cpu/<n>") and re-evaluates that CPU's availability via vcpu_hotplug().
+ * Paths that contain no "cpu/" component or no parsable number are ignored.
+ */
+static void __cpuinit handle_vcpu_hotplug_event(
+       struct xenbus_watch *watch, const char **vec, unsigned int len)
+{
+       unsigned int cpu;
+       const char *cpustr = strstr(vec[XS_WATCH_PATH], "cpu/");
+
+       if (cpustr == NULL)
+               return;
+
+       /* A path such as "cpu/" would otherwise leave cpu uninitialised. */
+       if (sscanf(cpustr, "cpu/%u", &cpu) != 1)
+               return;
+
+       vcpu_hotplug(cpu, get_cpu_device(cpu));
+}
+
+/*
+ * CPU notifier: when a CPU_DOWN_PREPARE event is triggered by the local
+ * admin, drop that CPU from local_allowed_cpumask.  Always returns NOTIFY_OK.
+ */
+static int smpboot_cpu_notify(struct notifier_block *notifier,
+                             unsigned long action, void *hcpu)
+{
+       unsigned int cpu = (long)hcpu;
+
+       if (action != CPU_DOWN_PREPARE)
+               return NOTIFY_OK;
+
+       /*
+        * This lives in a notifier callback instead of __cpu_disable()
+        * because the latter always runs inside a stopmachine kthread,
+        * where local_cpu_hotplug_request() cannot give a useful answer.
+        */
+       if (local_cpu_hotplug_request())
+               cpumask_clear_cpu(cpu, local_allowed_cpumask);
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Xenstore notifier callback: registers a watch on the "cpu" node and, in
+ * any domain other than the initial one, applies the current xenstore
+ * availability of every possible CPU.  Always returns NOTIFY_DONE.
+ */
+static int __cpuinit setup_cpu_watcher(struct notifier_block *notifier,
+                                      unsigned long event, void *data)
+{
+       unsigned int i;
+
+       static struct xenbus_watch __cpuinitdata cpu_watch = {
+               .node = "cpu",
+               .callback = handle_vcpu_hotplug_event,
+               .flags = XBWF_new_thread };
+       /* The registration result is deliberately discarded. */
+       (void)register_xenbus_watch(&cpu_watch);
+
+       if (!is_initial_xendomain()) {
+               for_each_possible_cpu(i)
+                       vcpu_hotplug(i, get_cpu_device(i));
+               pr_info("Brought up %ld CPUs\n", (long)num_online_cpus());
+       }
+
+       return NOTIFY_DONE;
+}
+
+/*
+ * Initcall: when running on Xen, register the CPU notifier
+ * (smpboot_cpu_notify) and the xenstore notifier (setup_cpu_watcher).
+ * Returns -ENODEV when not running on Xen, 0 otherwise.
+ */
+static int __init setup_vcpu_hotplug_event(void)
+{
+       static struct notifier_block hotplug_cpu = {
+               .notifier_call = smpboot_cpu_notify };
+       static struct notifier_block __cpuinitdata xsn_cpu = {
+               .notifier_call = setup_cpu_watcher };
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       register_cpu_notifier(&hotplug_cpu);
+       register_xenstore_notifier(&xsn_cpu);
+
+       return 0;
+}
+
+arch_initcall(setup_vcpu_hotplug_event);
+
+/*
+ * Take every online CPU except CPU 0 down.
+ *
+ * If one cpu_down() fails, the error is logged, every possible CPU is
+ * re-evaluated against its xenstore availability through vcpu_hotplug(),
+ * and the cpu_down() error is returned.  Returns 0 on success.
+ */
+int __ref smp_suspend(void)
+{
+       unsigned int cpu, i;
+       int err;
+
+       for_each_online_cpu(cpu) {
+               if (cpu == 0)
+                       continue;
+
+               err = cpu_down(cpu);
+               if (!err)
+                       continue;
+
+               pr_crit("Failed to take all CPUs down: %d\n", err);
+               for_each_possible_cpu(i)
+                       vcpu_hotplug(i, NULL);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Re-apply the xenstore availability of every possible CPU other than
+ * CPU 0.  No uevents are sent (the device argument is NULL).
+ */
+void __ref smp_resume(void)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i) {
+               if (i != 0)
+                       vcpu_hotplug(i, NULL);
+       }
+}
+
+/*
+ * Decide whether @cpu may be brought online.
+ *
+ * A request from the local admin marks the CPU as locally allowed and then
+ * only needs the remote (xenbus) permission; a missing permission is logged.
+ * Any other request needs both the local and the remote permission.
+ * Returns 0 if allowed, -EBUSY otherwise.
+ */
+int cpu_up_check(unsigned int cpu)
+{
+       if (!local_cpu_hotplug_request()) {
+               if (cpumask_test_cpu(cpu, local_allowed_cpumask) &&
+                   cpumask_test_cpu(cpu, xenbus_allowed_cpumask))
+                       return 0;
+               return -EBUSY;
+       }
+
+       cpumask_set_cpu(cpu, local_allowed_cpumask);
+       if (cpumask_test_cpu(cpu, xenbus_allowed_cpumask))
+               return 0;
+
+       pr_warning("%s: attempt to bring up CPU %u disallowed "
+                  "by remote admin.\n", __FUNCTION__, cpu);
+       return -EBUSY;
+}
+
+/*
+ * Allocate and initialise both permission masks: xenbus_allowed_cpumask
+ * starts out as a copy of cpu_present_mask, local_allowed_cpumask starts
+ * with every bit set.  An allocation failure is fatal.
+ */
+void __init init_xenbus_allowed_cpumask(void)
+{
+       if (alloc_cpumask_var(&xenbus_allowed_cpumask, GFP_KERNEL))
+               cpumask_copy(xenbus_allowed_cpumask, cpu_present_mask);
+       else
+               BUG();
+
+       if (alloc_cpumask_var(&local_allowed_cpumask, GFP_KERNEL))
+               cpumask_setall(local_allowed_cpumask);
+       else
+               BUG();
+}
diff --git a/drivers/xen/core/domctl.c b/drivers/xen/core/domctl.c

new file mode 100644 (file)

index 0000000..700149a
--- /dev/null
+++ b/drivers/xen/core/domctl.c
@@ -0,0 +1,562 @@
+/*
+ * !!!  dirty hack alert  !!!
+ *
+ * Problem: old guest kernels don't have a "protocol" node
+ *          in the frontend xenstore directory, so mixing
+ *          32 and 64bit domains doesn't work.
+ *
+ * Upstream plans to solve this in the tools, by letting them
+ * create a protocol node, which certainly makes sense.
+ * But it isn't trivial and isn't done yet.  Too bad.
+ *
+ * So for the time being we use the get_address_size domctl
+ * hypercall for a pretty good guess.  This is not nice, as the
+ * domctl hypercall isn't supposed to be used by the kernel: we
+ * don't want to have dependencies between dom0 kernel and
+ * xen kernel versions.  Now we have one.  Ouch.
+ */
+#undef __XEN_PUBLIC_XEN_H__
+#undef __XEN_PUBLIC_GRANT_TABLE_H__
+#undef __XEN_TOOLS__
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <asm/hypervisor.h>
+#include <xen/blkif.h>
+
+#include "domctl.h"
+
+/* stuff copied from xen/interface/domctl.h, which we can't
+ * include directly for the reasons outlined above .... */
+
+/*
+ * Payload of the get_address_size domctl.  xen_guest_address_size() accepts
+ * a returned size of 32 or 64 and reports it as the guest's bit width.
+ */
+typedef struct xen_domctl_address_size {
+       uint32_t size;
+} xen_domctl_address_size_t;
+
+typedef __attribute__((aligned(8))) uint64_t uint64_aligned_t;
+
+/*
+ * CPU bitmap descriptor as used with domctl interface version 4: a guest
+ * handle to the bitmap bytes plus a count (nr_cpus), without the explicit
+ * 64-bit alignment member that the v5 variant adds.
+ */
+struct xenctl_cpumap_v4 {
+       XEN_GUEST_HANDLE(uint8) bitmap;
+       uint32_t nr_cpus;
+};
+
+/*
+ * CPU bitmap descriptor as used with domctl interface versions 5 and later.
+ * The anonymous union overlays the bitmap handle with an 8-byte-aligned
+ * 64-bit member.
+ */
+struct xenctl_cpumap_v5 {
+       union {
+               XEN_GUEST_HANDLE(uint8) bitmap;
+               uint64_aligned_t _align;
+       };
+       uint32_t nr_cpus;
+};
+
+/* vCPU affinity domctl payload, interface version 4 layout. */
+struct xen_domctl_vcpuaffinity_v4 {
+    uint32_t vcpu;
+    struct xenctl_cpumap_v4 cpumap;
+};
+
+/* vCPU affinity domctl payload, layout for interface versions 5 and later. */
+struct xen_domctl_vcpuaffinity_v5 {
+    uint32_t vcpu;
+    struct xenctl_cpumap_v5 cpumap;
+};
+
+/*
+ * Local, trimmed-down copy of the domctl hypercall argument.  Each member of
+ * the union is the layout for one or more interface versions; only the
+ * payloads used in this file (address_size, vcpu_affinity) are declared, and
+ * dummy_pad keeps the payload union at 128 bytes.
+ */
+union xen_domctl {
+       /* v4: sle10 sp1: xen 3.0.4 + 32-on-64 patches */
+       struct {
+               uint32_t cmd;
+               uint32_t interface_version;
+               domid_t  domain;
+               union {
+                       /* left out lots of other struct xen_domctl_foobar */
+                       struct xen_domctl_address_size       address_size;
+                       struct xen_domctl_vcpuaffinity_v4    vcpu_affinity;
+                       uint64_t                             dummy_align;
+                       uint8_t                              dummy_pad[128];
+               };
+       } v4;
+
+       /*
+        * v5: upstream: xen 3.1
+        * v6: upstream: xen 4.0
+        * v7: upstream: xen 4.1; sle11 sp1: xen 4.0 + cpupools patches
+        * v8: upstream: xen 4.2
+        */
+       struct {
+               uint32_t cmd;
+               uint32_t interface_version;
+               domid_t  domain;
+               union {
+                       struct xen_domctl_address_size       address_size;
+                       struct xen_domctl_vcpuaffinity_v5    vcpu_affinity;
+                       uint64_aligned_t                     dummy_align;
+                       uint8_t                              dummy_pad[128];
+               };
+       } v5, v6, v7, v8;
+};
+
+/*
+ * physinfo sysctl payload, interface version 6 layout.  In this file only
+ * max_cpu_id and the cpu_to_node handle are used (see pm_op_cputopo in
+ * xen_get_topology_info()).
+ */
+struct xen_sysctl_physinfo_v6 {
+       uint32_t threads_per_core;
+       uint32_t cores_per_socket;
+       uint32_t nr_cpus;
+       uint32_t nr_nodes;
+       uint32_t cpu_khz;
+       uint64_aligned_t total_pages;
+       uint64_aligned_t free_pages;
+       uint64_aligned_t scrub_pages;
+       uint32_t hw_cap[8];
+       uint32_t max_cpu_id;
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_node;
+               uint64_aligned_t _ctn_align;
+       };
+       uint32_t capabilities;
+};
+
+/*
+ * physinfo sysctl payload, interface version 7 layout.  Same member sizes
+ * and order as the v6 variant; the fourth field is named max_node_id here
+ * instead of nr_nodes.
+ */
+struct xen_sysctl_physinfo_v7 {
+       uint32_t threads_per_core;
+       uint32_t cores_per_socket;
+       uint32_t nr_cpus;
+       uint32_t max_node_id;
+       uint32_t cpu_khz;
+       uint64_aligned_t total_pages;
+       uint64_aligned_t free_pages;
+       uint64_aligned_t scrub_pages;
+       uint32_t hw_cap[8];
+       uint32_t max_cpu_id;
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_node;
+               uint64_aligned_t _ctn_align;
+       };
+       uint32_t capabilities;
+};
+
+#define XEN_SYSCTL_pm_op_get_cputopo 0x20
+/*
+ * Argument of the pm_op "get cputopo" sub-command (v6 layout): the caller
+ * supplies max_cpus and handles to the cpu_to_core/cpu_to_socket arrays.
+ * Each handle is overlaid with an 8-byte-aligned 64-bit member.
+ */
+struct xen_get_cputopo_v6 {
+       uint32_t max_cpus;
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_core;
+               uint64_aligned_t _ctc_align;
+       };
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_socket;
+               uint64_aligned_t _cts_align;
+       };
+       uint32_t nr_cpus;
+};
+
+/*
+ * pm_op sysctl payload (v6 layout); only the get_topo sub-command argument
+ * is declared here.  The v7 name is an alias for the very same structure.
+ */
+struct xen_sysctl_pm_op_v6 {
+       uint32_t cmd;
+       uint32_t cpuid;
+       union {
+               struct xen_get_cputopo_v6 get_topo;
+       };
+};
+#define xen_sysctl_pm_op_v7 xen_sysctl_pm_op_v6
+
+/*
+ * topologyinfo sysctl payload (v8 layout, also used for v9 below).
+ * max_cpu_index is written by the caller before the hypercall and read back
+ * afterwards; the three handles point at per-CPU core, socket and node
+ * arrays and are each overlaid with an 8-byte-aligned 64-bit member.
+ */
+struct xen_sysctl_topologyinfo_v8 {
+       uint32_t max_cpu_index;
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_core;
+               uint64_aligned_t _ctc_align;
+       };
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_socket;
+               uint64_aligned_t _cts_align;
+       };
+       union {
+               XEN_GUEST_HANDLE(uint32) cpu_to_node;
+               uint64_aligned_t _ctn_align;
+       };
+};
+
+/*
+ * Local, trimmed-down copy of the sysctl hypercall argument, one layout per
+ * supported interface version.  Only the payloads needed by
+ * xen_get_topology_info() are declared.
+ */
+union xen_sysctl {
+       /* v6: Xen 3.4.x */
+       struct {
+               uint32_t cmd;
+               uint32_t interface_version;
+               union {
+                       struct xen_sysctl_physinfo_v6 physinfo;
+                       struct xen_sysctl_pm_op_v6 pm_op;
+               };
+       } v6;
+       /* v7: Xen 4.0.x */
+       struct {
+               uint32_t cmd;
+               uint32_t interface_version;
+               union {
+                       struct xen_sysctl_physinfo_v7 physinfo;
+                       struct xen_sysctl_pm_op_v7 pm_op;
+               };
+       } v7;
+       /*
+        * v8: Xen 4.1.x
+        * v9: Xen 4.2+
+        */
+       struct {
+               uint32_t cmd;
+               uint32_t interface_version;
+               union {
+                       struct xen_sysctl_topologyinfo_v8 topologyinfo;
+               };
+       } v8, v9;
+};
+
+/* The actual code comes here */
+
+/*
+ * Issue the domctl hypercall with @domctl as its single argument and return
+ * the hypercall's int result.  The argument is untyped because the callers
+ * pass the locally defined union xen_domctl.
+ */
+static inline int hypervisor_domctl(void *domctl)
+{
+       return _hypercall1(int, domctl, domctl);
+}
+
+/*
+ * Issue the sysctl hypercall with @sysctl as its single argument and return
+ * the hypercall's int result.  The argument is untyped because the callers
+ * pass the locally defined union xen_sysctl.
+ */
+static inline int hypervisor_sysctl(void *sysctl)
+{
+       return _hypercall1(int, sysctl, sysctl);
+}
+
+/*
+ * Guess the address size (32 or 64) of domain @domid.
+ *
+ * Issues the get_address_size domctl with interface version 8 and then, as
+ * far as CONFIG_XEN_COMPAT requires, with successively older versions.  The
+ * first attempt that yields 32 or 64 is logged and returned.  If none does,
+ * a warning is logged and BITS_PER_LONG is returned, i.e. the guest is
+ * assumed to match the native width.
+ */
+int xen_guest_address_size(int domid)
+{
+       union xen_domctl domctl;
+       int low, ret;
+
+/*
+ * One attempt with interface version "ver".  Note that the macro returns
+ * from the enclosing function on success and records the version tried in
+ * "low" for the final warning.
+ */
+#define guest_address_size(ver) do {                                   \
+       memset(&domctl, 0, sizeof(domctl));                             \
+       domctl.v##ver.cmd = XEN_DOMCTL_get_address_size;                \
+       domctl.v##ver.interface_version = low = ver;                    \
+       domctl.v##ver.domain = domid;                                   \
+       ret = hypervisor_domctl(&domctl) ?: domctl.v##ver.address_size.size; \
+       if (ret == 32 || ret == 64) {                                   \
+               pr_info("v" #ver " domctl worked ok: dom%d is %d-bit\n",\
+                       domid, ret);                                    \
+               return ret;                                             \
+       }                                                               \
+} while (0)
+
+       /* Fail the build if the headers know a newer version than handled. */
+       BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 8);
+       guest_address_size(8);
+#if CONFIG_XEN_COMPAT < 0x040200
+       guest_address_size(7);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040100
+       guest_address_size(6);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040000
+       guest_address_size(5);
+#endif
+#if CONFIG_XEN_COMPAT < 0x030100
+       guest_address_size(4);
+#endif
+
+       ret = BITS_PER_LONG;
+       pr_warn("v%d...%d domctls failed, assuming dom%d is native: %d\n",
+               low, XEN_DOMCTL_INTERFACE_VERSION, domid, ret);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(xen_guest_address_size);
+
+/*
+ * Map the address size of domain @domid to a blkif protocol constant:
+ * the native protocol when the guest matches BITS_PER_LONG (or reports an
+ * unexpected size), otherwise the x86_32 or x86_64 protocol.
+ */
+int xen_guest_blkif_protocol(int domid)
+{
+       int bits = xen_guest_address_size(domid);
+
+       if (bits != BITS_PER_LONG) {
+               if (bits == 32)
+                       return BLKIF_PROTOCOL_X86_32;
+               if (bits == 64)
+                       return BLKIF_PROTOCOL_X86_64;
+       }
+
+       return BLKIF_PROTOCOL_NATIVE;
+}
+EXPORT_SYMBOL_GPL(xen_guest_blkif_protocol);
+
+#ifdef CONFIG_X86
+
+/*
+ * Issue a get/set vcpuaffinity domctl ("what" is get or set) using the
+ * layout of interface version "ver".  Expects "domctl", "nr" and "mask" to
+ * be in scope at the point of use.  The request targets domain 0 (left zero
+ * by the memset) and the vCPU numbered smp_processor_id(); the expression
+ * evaluates to the hypercall result.
+ */
+#define vcpuaffinity(what, ver) ({                                     \
+       memset(&domctl, 0, sizeof(domctl));                             \
+       domctl.v##ver.cmd = XEN_DOMCTL_##what##vcpuaffinity;            \
+       domctl.v##ver.interface_version = ver;                          \
+       /* domctl.v##ver.domain = 0; */                                 \
+       domctl.v##ver.vcpu_affinity.vcpu = smp_processor_id();          \
+       domctl.v##ver.vcpu_affinity.cpumap.nr_cpus = nr;                \
+       set_xen_guest_handle(domctl.v##ver.vcpu_affinity.cpumap.bitmap, \
+                            mask);                                     \
+       hypervisor_domctl(&domctl);                                     \
+})
+
+/*
+ * Read the affinity of the current vCPU into @mask, which describes @nr
+ * CPUs.  Interface version 8 is tried first; while the call keeps failing,
+ * the older versions still enabled by CONFIG_XEN_COMPAT are tried in turn.
+ * Returns the result of the last attempt (0 on success).
+ */
+static inline int get_vcpuaffinity(unsigned int nr, void *mask)
+{
+       union xen_domctl domctl;
+       int rc;
+
+       BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 8);
+       rc = vcpuaffinity(get, 8);
+#if CONFIG_XEN_COMPAT < 0x040200
+       if (rc)
+               rc = vcpuaffinity(get, 7);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040100
+       if (rc)
+               rc = vcpuaffinity(get, 6);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040000
+       if (rc)
+               rc = vcpuaffinity(get, 5);
+#endif
+#if CONFIG_XEN_COMPAT < 0x030100
+       if (rc)
+               rc = vcpuaffinity(get, 4);
+#endif
+       return rc;
+}
+
+/*
+ * Set the affinity of the current vCPU from @mask, which describes @nr
+ * CPUs.  Interface version 8 is tried first; while the call keeps failing,
+ * the older versions still enabled by CONFIG_XEN_COMPAT are tried in turn.
+ * Returns the result of the last attempt (0 on success).
+ */
+static inline int set_vcpuaffinity(unsigned int nr, void *mask)
+{
+       union xen_domctl domctl;
+       int rc;
+
+       BUILD_BUG_ON(XEN_DOMCTL_INTERFACE_VERSION > 8);
+       rc = vcpuaffinity(set, 8);
+#if CONFIG_XEN_COMPAT < 0x040200
+       if (rc)
+               rc = vcpuaffinity(set, 7);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040100
+       if (rc)
+               rc = vcpuaffinity(set, 6);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040000
+       if (rc)
+               rc = vcpuaffinity(set, 5);
+#endif
+#if CONFIG_XEN_COMPAT < 0x030100
+       if (rc)
+               rc = vcpuaffinity(set, 4);
+#endif
+       return rc;
+}
+
+/*
+ * Per-CPU pointer to the page holding the affinity bitmap saved by
+ * xen_set_physical_cpu_affinity(); NULL while nothing is saved.
+ */
+static DEFINE_PER_CPU(void *, saved_pcpu_affinity);
+
+/* Number of bits in one page, i.e. the capacity of a saved bitmap. */
+#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_LONG / sizeof(long))
+
+/*
+ * Temporarily restrict the current vCPU to one physical CPU, or undo that.
+ *
+ * @pcpu >= 0: save the current affinity in a freshly allocated page (kept
+ *             in this CPU's saved_pcpu_affinity) and set the affinity to
+ *             the single physical CPU @pcpu.
+ * @pcpu <  0: restore the saved affinity and free the page; a no-op
+ *             returning 0 if nothing is saved.
+ *
+ * Returns 0 on success, -EPERM outside the initial domain, -ERANGE for a
+ * too large @pcpu, -EBUSY if an affinity is already saved on this CPU,
+ * -ENOMEM on allocation failure, or the hypercall's error.
+ */
+int xen_set_physical_cpu_affinity(int pcpu)
+{
+       int rc;
+
+       if (!is_initial_xendomain())
+               return -EPERM;
+
+       if (pcpu >= 0) {
+               void *oldmap;
+
+               /*
+                * NOTE(review): ">" lets pcpu == BITS_PER_PAGE through, one
+                * past the last bit a saved page can describe -- confirm
+                * whether ">=" was intended.
+                */
+               if (pcpu > BITS_PER_PAGE)
+                       return -ERANGE;
+
+               if (this_cpu_read(saved_pcpu_affinity))
+                       return -EBUSY;
+
+               oldmap = (void *)get_zeroed_page(GFP_KERNEL);
+               if (!oldmap)
+                       return -ENOMEM;
+
+               rc = get_vcpuaffinity(BITS_PER_PAGE, oldmap);
+               if (!rc) {
+                       /* Temporary bitmap just large enough for bit pcpu. */
+                       void *newmap = kcalloc(BITS_TO_LONGS(pcpu + 1),
+                                              sizeof(long), GFP_KERNEL);
+
+                       if (newmap) {
+                               __set_bit(pcpu, newmap);
+                               rc = set_vcpuaffinity(pcpu + 1, newmap);
+                               kfree(newmap);
+                       } else
+                               rc = -ENOMEM;
+               }
+
+               /* Keep the saved map only if the new affinity took effect. */
+               if (!rc)
+                       this_cpu_write(saved_pcpu_affinity, oldmap);
+               else
+                       free_page((unsigned long)oldmap);
+       } else {
+               if (!this_cpu_read(saved_pcpu_affinity))
+                       return 0;
+               rc = set_vcpuaffinity(BITS_PER_PAGE,
+                                     this_cpu_read(saved_pcpu_affinity));
+               /* The saved page is released even if the restore failed. */
+               free_page((unsigned long)this_cpu_read(saved_pcpu_affinity));
+               this_cpu_write(saved_pcpu_affinity, NULL);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(xen_set_physical_cpu_affinity);
+
+/*
+ * Ask the hypervisor for the core, socket and node IDs of CPU @cpu.
+ *
+ * @core, @sock, @node: where to store the respective ID; each may be NULL
+ *                      if that ID is not wanted.
+ *
+ * The topologyinfo sysctl is tried first (interface version 9, then 8 if
+ * enabled); for older CONFIG_XEN_COMPAT settings the pm_op/physinfo based
+ * query of versions 7 and 6 serves as fallback.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EDOM if @cpu lies
+ * beyond the range reported back by the hypervisor, -ENOENT if a requested
+ * ID is INVALID_TOPOLOGY_ID (the output is still written in that case), or
+ * the hypercall's error.
+ */
+int xen_get_topology_info(unsigned int cpu, u32 *core, u32 *sock, u32 *node)
+{
+       union xen_sysctl sysctl;
+       uint32_t *cores = NULL, *socks = NULL, *nodes = NULL;
+       unsigned int nr;
+       int rc;
+
+       /* One array of cpu + 1 entries per requested kind of ID. */
+       if (core)
+               cores = kmalloc((cpu + 1) * sizeof(*cores), GFP_KERNEL);
+       if (sock)
+               socks = kmalloc((cpu + 1) * sizeof(*socks), GFP_KERNEL);
+       if (node)
+               nodes = kmalloc((cpu + 1) * sizeof(*nodes), GFP_KERNEL);
+       if ((core && !cores) || (sock && !socks) || (node && !nodes)) {
+               kfree(cores);
+               kfree(socks);
+               kfree(nodes);
+               return -ENOMEM;
+       }
+
+/* Single topologyinfo request; sets rc and nr. */
+#define topologyinfo(ver) do {                                         \
+       memset(&sysctl, 0, sizeof(sysctl));                             \
+       sysctl.v##ver.cmd = XEN_SYSCTL_topologyinfo;                    \
+       sysctl.v##ver.interface_version = ver;                          \
+       sysctl.v##ver.topologyinfo.max_cpu_index = cpu;                 \
+       set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_core,    \
+                            cores);                                    \
+       set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_socket,  \
+                            socks);                                    \
+       set_xen_guest_handle(sysctl.v##ver.topologyinfo.cpu_to_node,    \
+                            nodes);                                    \
+       rc = hypervisor_sysctl(&sysctl);                                \
+       nr = sysctl.v##ver.topologyinfo.max_cpu_index + 1;              \
+} while (0)
+
+       BUILD_BUG_ON(XEN_SYSCTL_INTERFACE_VERSION > 9);
+       topologyinfo(9);
+#if CONFIG_XEN_COMPAT < 0x040200
+       if (rc)
+               topologyinfo(8);
+#endif
+
+#if CONFIG_XEN_COMPAT < 0x040100
+/*
+ * Older interface: core/socket IDs come from pm_op, node IDs from physinfo.
+ * rc ends up as the physinfo error if there is one, else the pm_op result.
+ */
+#define pm_op_cputopo(ver) do {                                                \
+       memset(&sysctl, 0, sizeof(sysctl));                             \
+       sysctl.v##ver.cmd = XEN_SYSCTL_pm_op;                           \
+       sysctl.v##ver.interface_version = ver;                          \
+       sysctl.v##ver.pm_op.cmd = XEN_SYSCTL_pm_op_get_cputopo;         \
+       sysctl.v##ver.pm_op.cpuid = 0;                                  \
+       sysctl.v##ver.pm_op.get_topo.max_cpus = cpu + 1;                \
+       set_xen_guest_handle(sysctl.v##ver.pm_op.get_topo.cpu_to_core,  \
+                            cores);                                    \
+       set_xen_guest_handle(sysctl.v##ver.pm_op.get_topo.cpu_to_socket,\
+                            socks);                                    \
+       rc = hypervisor_sysctl(&sysctl);                                \
+       memset(&sysctl, 0, sizeof(sysctl));                             \
+       sysctl.v##ver.cmd = XEN_SYSCTL_physinfo;                        \
+       sysctl.v##ver.interface_version = ver;                          \
+       sysctl.v##ver.physinfo.max_cpu_id = cpu;                        \
+       set_xen_guest_handle(sysctl.v##ver.physinfo.cpu_to_node, nodes);\
+       rc = hypervisor_sysctl(&sysctl) ?: rc;                          \
+       nr = sysctl.v##ver.physinfo.max_cpu_id + 1;                     \
+} while (0)
+
+       if (rc)
+               pm_op_cputopo(7);
+#endif
+#if CONFIG_XEN_COMPAT < 0x040000
+       if (rc)
+               pm_op_cputopo(6);
+#endif
+
+       /* The requested CPU lies beyond what the hypervisor reported. */
+       if (!rc && cpu >= nr)
+               rc = -EDOM;
+
+       /* Copy out each requested ID; the arrays are freed in any case. */
+       if (!rc && core && (*core = cores[cpu]) == INVALID_TOPOLOGY_ID)
+               rc = -ENOENT;
+       kfree(cores);
+
+       if (!rc && sock && (*sock = socks[cpu]) == INVALID_TOPOLOGY_ID)
+               rc = -ENOENT;
+       kfree(socks);
+
+       if (!rc && node && (*node = nodes[cpu]) == INVALID_TOPOLOGY_ID)
+               rc = -ENOENT;
+       kfree(nodes);
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(xen_get_topology_info);
+
+#include <xen/pcpu.h>
+#include <asm/msr.h>
+
+/*
+ * Read MSR @msr_no on physical CPU @pcpu into @l / @h.
+ *
+ * If pinning to @pcpu succeeds, the rdmsr_safe() result is returned and the
+ * previous affinity is restored.  If pinning fails with -EINVAL, the
+ * request is handed to rdmsr_safe_on_cpu() and its error, or 1 on success,
+ * is returned.  Any other pinning error is returned unchanged.
+ */
+int rdmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no, u32 *l, u32 *h)
+{
+       int rc = xen_set_physical_cpu_affinity(pcpu);
+
+       if (rc == 0) {
+               rc = rdmsr_safe(msr_no, l, h);
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (rc == -EINVAL) {
+               /* Fall back in case this is due to dom0_vcpus_pinned. */
+               rc = rdmsr_safe_on_cpu(pcpu, msr_no, l, h) ?: 1;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(rdmsr_safe_on_pcpu);
+
+/*
+ * Write @l / @h to MSR @msr_no on physical CPU @pcpu.
+ *
+ * If pinning to @pcpu succeeds, the wrmsr_safe() result is returned and the
+ * previous affinity is restored.  If pinning fails with -EINVAL, the
+ * request is handed to wrmsr_safe_on_cpu() and its error, or 1 on success,
+ * is returned.  Any other pinning error is returned unchanged.
+ */
+int wrmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no, u32 l, u32 h)
+{
+       int rc = xen_set_physical_cpu_affinity(pcpu);
+
+       if (rc == 0) {
+               rc = wrmsr_safe(msr_no, l, h);
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (rc == -EINVAL) {
+               /* Fall back in case this is due to dom0_vcpus_pinned. */
+               rc = wrmsr_safe_on_cpu(pcpu, msr_no, l, h) ?: 1;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(wrmsr_safe_on_pcpu);
+
+/*
+ * Register-array variant of rdmsr_safe_on_pcpu(): runs rdmsr_safe_regs()
+ * with @regs while pinned to physical CPU @pcpu.
+ *
+ * If pinning fails with -EINVAL, rdmsr_safe_regs_on_cpu() is used instead
+ * and its error, or 1 on success, is returned.  Any other pinning error is
+ * returned unchanged.
+ */
+int rdmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs)
+{
+       int rc = xen_set_physical_cpu_affinity(pcpu);
+
+       if (rc == 0) {
+               rc = rdmsr_safe_regs(regs);
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (rc == -EINVAL) {
+               /* Fall back in case this is due to dom0_vcpus_pinned. */
+               rc = rdmsr_safe_regs_on_cpu(pcpu, regs) ?: 1;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(rdmsr_safe_regs_on_pcpu);
+
+/*
+ * Register-array variant of wrmsr_safe_on_pcpu(): runs wrmsr_safe_regs()
+ * with @regs while pinned to physical CPU @pcpu.
+ *
+ * If pinning fails with -EINVAL, wrmsr_safe_regs_on_cpu() is used instead
+ * and its error, or 1 on success, is returned.  Any other pinning error is
+ * returned unchanged.
+ */
+int wrmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs)
+{
+       int rc = xen_set_physical_cpu_affinity(pcpu);
+
+       if (rc == 0) {
+               rc = wrmsr_safe_regs(regs);
+               WARN_ON_ONCE(xen_set_physical_cpu_affinity(-1));
+       } else if (rc == -EINVAL) {
+               /* Fall back in case this is due to dom0_vcpus_pinned. */
+               rc = wrmsr_safe_regs_on_cpu(pcpu, regs) ?: 1;
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(wrmsr_safe_regs_on_pcpu);
+
+#endif /* CONFIG_X86 */
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/core/domctl.h b/drivers/xen/core/domctl.h

new file mode 100644 (file)

index 0000000..e8a26a2
--- /dev/null
+++ b/drivers/xen/core/domctl.h
@@ -0,0 +1,4 @@
+int xen_guest_address_size(int domid);
+int xen_guest_blkif_protocol(int domid);
+int xen_set_physical_cpu_affinity(int pcpu);
+int xen_get_topology_info(unsigned int cpu, u32 *core, u32 *socket, u32 *node);
diff --git a/drivers/xen/core/evtchn.c b/drivers/xen/core/evtchn.c

new file mode 100644 (file)

index 0000000..66401d4
--- /dev/null
+++ b/drivers/xen/core/evtchn.c
@@ -0,0 +1,2000 @@
+/******************************************************************************
+ * evtchn.c
+ * 
+ * Communication via Xen event channels.
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/ftrace.h>
+#include <linux/atomic.h>
+#include <asm/barrier.h>
+#include <asm/ptrace.h>
+#include <xen/evtchn.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/physdev.h>
+#include <asm/hypervisor.h>
+#include <linux/mc146818rtc.h> /* RTC_IRQ */
+#include "../../../kernel/irq/internals.h" /* IRQS_AUTODETECT, IRQS_PENDING */
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* IRQ <-> event-channel mappings. */
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
+       [0 ...  NR_EVENT_CHANNELS-1] = -1 };
+
+/*
+ * Per-CPU VIRQ support, only built for SMP x86.  Otherwise
+ * PER_CPU_VIRQ_IRQ is defined and BUG_IF_VIRQ_PER_CPU() expands to nothing.
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_X86)
+/* Linked list node wrapping an irqaction together with a CPU mask. */
+static struct percpu_irqaction {
+       struct irqaction action; /* must be first */
+       struct percpu_irqaction *next;
+       cpumask_var_t cpus;
+} *virq_actions[NR_VIRQS];
+/* IRQ <-> VIRQ mapping. */
+static DECLARE_BITMAP(virq_per_cpu, NR_VIRQS) __read_mostly;
+static DEFINE_PER_CPU_READ_MOSTLY(int[NR_VIRQS], virq_to_evtchn);
+/* Trap if irq_cfg describes a VIRQ whose bit is set in virq_per_cpu. */
+#define BUG_IF_VIRQ_PER_CPU(irq_cfg) \
+       BUG_ON(type_from_irq_cfg(irq_cfg) == IRQT_VIRQ \
+              && test_bit(index_from_irq_cfg(irq_cfg), virq_per_cpu))
+#else
+#define BUG_IF_VIRQ_PER_CPU(irq_cfg) ((void)0)
+#define PER_CPU_VIRQ_IRQ
+#endif
+
+/*
+ * IRQ <-> IPI mapping.
+ *
+ * SMP x86 uses a single ipi_irq with a per-CPU pending bitmap and a per-CPU
+ * event channel; every other configuration defines PER_CPU_IPI_IRQ instead.
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_X86)
+static int __read_mostly ipi_irq = -1;
+DEFINE_PER_CPU(DECLARE_BITMAP(, NR_IPIS), ipi_pending);
+static DEFINE_PER_CPU_READ_MOSTLY(evtchn_port_t, ipi_evtchn);
+#else
+#define PER_CPU_IPI_IRQ
+#endif
+/* BUG_IF_IPI() traps on IPI-type bindings on !SMP or single-IPI-IRQ builds. */
+#if !defined(CONFIG_SMP) || !defined(PER_CPU_IPI_IRQ)
+#define BUG_IF_IPI(irq_cfg) BUG_ON(type_from_irq_cfg(irq_cfg) == IRQT_IPI)
+#else
+#define BUG_IF_IPI(irq_cfg) ((void)0)
+#endif
+
+/* Binding types. */
+/*
+ * Values of the type field packed into irq_cfg::info (see mk_irq_info()).
+ * _IRQT_COUNT is the number of types and must fit into _IRQT_BITS bits.
+ */
+enum {
+       IRQT_UNBOUND,
+       IRQT_PIRQ,
+       IRQT_VIRQ,
+       IRQT_IPI,
+       IRQT_LOCAL_PORT,
+       IRQT_CALLER_PORT,
+       _IRQT_COUNT
+};
+
+#define _IRQT_BITS 4
+#define _EVTCHN_BITS 12
+#define _INDEX_BITS (32 - _IRQT_BITS - _EVTCHN_BITS)
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND    (IRQT_UNBOUND << (32 - _IRQT_BITS))
+
+/*
+ * Statically allocated irq_cfg entries, all initialised to IRQ_UNBOUND.
+ * With CONFIG_SPARSE_IRQ the table holds NR_IRQS_LEGACY entries and the
+ * build fails unless PIRQ_BASE is zero; otherwise it holds NR_IRQS entries.
+ */
+static struct irq_cfg _irq_cfg[] = {
+       [0 ...
+#ifdef CONFIG_SPARSE_IRQ
+              BUILD_BUG_ON_ZERO(PIRQ_BASE) + NR_IRQS_LEGACY
+#else
+              NR_IRQS
+#endif
+                      - 1].info = IRQ_UNBOUND
+};
+
+/*
+ * Look up the irq_cfg of @irq: the IRQ's chip data with CONFIG_SPARSE_IRQ,
+ * otherwise the matching _irq_cfg[] entry, or NULL if @irq >= NR_IRQS.
+ */
+static inline struct irq_cfg *__pure irq_cfg(unsigned int irq)
+{
+#ifdef CONFIG_SPARSE_IRQ
+       return irq_get_chip_data(irq);
+#else
+       return irq < NR_IRQS ? _irq_cfg + irq : NULL;
+#endif
+}
+
+/* Return the irq_cfg stored as chip data in @data. */
+static inline struct irq_cfg *__pure irq_data_cfg(struct irq_data *data)
+{
+       return irq_data_get_irq_chip_data(data);
+}
+
+/*
+ * Pack a binding description into one 32-bit word: @type in the top
+ * _IRQT_BITS bits, @index in the middle _INDEX_BITS bits and @evtchn in the
+ * low _EVTCHN_BITS bits.  The field widths are verified at build time; an
+ * @index that does not fit is a BUG.
+ */
+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+       u32 info;
+
+       BUILD_BUG_ON(_IRQT_COUNT > (1U << _IRQT_BITS));
+       BUILD_BUG_ON(NR_PIRQS > (1U << _INDEX_BITS));
+       BUILD_BUG_ON(NR_VIRQS > (1U << _INDEX_BITS));
+#if defined(PER_CPU_IPI_IRQ) && defined(NR_IPIS)
+       BUILD_BUG_ON(NR_IPIS > (1U << _INDEX_BITS));
+#endif
+       BUILD_BUG_ON(NR_EVENT_CHANNELS > (1U << _EVTCHN_BITS));
+
+       BUG_ON(index >> _INDEX_BITS);
+
+       info = type << (32 - _IRQT_BITS);
+       info |= index << _EVTCHN_BITS;
+       info |= evtchn;
+
+       return info;
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+
+/* Extract the index field (the _INDEX_BITS above the event channel). */
+static inline unsigned int index_from_irq_cfg(const struct irq_cfg *cfg)
+{
+       const unsigned int index_mask = (1U << _INDEX_BITS) - 1;
+
+       return (cfg->info >> _EVTCHN_BITS) & index_mask;
+}
+
+/* Index field of @irq's binding, or 0 when @irq has no irq_cfg. */
+static inline unsigned int index_from_irq(int irq)
+{
+       const struct irq_cfg *c = irq_cfg(irq);
+
+       if (!c)
+               return 0;
+
+       return index_from_irq_cfg(c);
+}
+
+/* Extract the binding type, stored in the top _IRQT_BITS bits of info. */
+static inline unsigned int type_from_irq_cfg(const struct irq_cfg *cfg)
+{
+       return cfg->info >> (32 - _IRQT_BITS);
+}
+
+/* Binding type of @irq, or IRQT_UNBOUND when @irq has no irq_cfg. */
+static inline unsigned int type_from_irq(int irq)
+{
+       const struct irq_cfg *c = irq_cfg(irq);
+
+       if (!c)
+               return IRQT_UNBOUND;
+
+       return type_from_irq_cfg(c);
+}
+
+/*
+ * Event channel of a per-CPU binding on @cpu: looked up in virq_to_evtchn
+ * for VIRQs and in ipi_evtchn for IPIs.  Each case is only compiled in when
+ * the corresponding PER_CPU_*_IRQ macro is not defined; any other binding
+ * type is a BUG.
+ */
+static inline unsigned int evtchn_from_per_cpu_irq(const struct irq_cfg *cfg,
+                                                  unsigned int cpu)
+{
+       switch (type_from_irq_cfg(cfg)) {
+#ifndef PER_CPU_VIRQ_IRQ
+       case IRQT_VIRQ:
+               return per_cpu(virq_to_evtchn, cpu)[index_from_irq_cfg(cfg)];
+#endif
+#ifndef PER_CPU_IPI_IRQ
+       case IRQT_IPI:
+               return per_cpu(ipi_evtchn, cpu);
+#endif
+       }
+       BUG();
+       return 0;
+}
+
+/*
+ * Event channel of the binding described by @cfg.  For the per-CPU VIRQ/IPI
+ * types that are compiled in, the channel of the current CPU is returned;
+ * for everything else it is the low _EVTCHN_BITS bits of cfg->info.
+ */
+static inline unsigned int evtchn_from_irq_cfg(const struct irq_cfg *cfg)
+{
+       switch (type_from_irq_cfg(cfg)) {
+#ifndef PER_CPU_VIRQ_IRQ
+       case IRQT_VIRQ:
+#endif
+#ifndef PER_CPU_IPI_IRQ
+       case IRQT_IPI:
+#endif
+               return evtchn_from_per_cpu_irq(cfg, smp_processor_id());
+       }
+       return cfg->info & ((1U << _EVTCHN_BITS) - 1);
+}
+
+/* Event channel bound to @data's IRQ, or 0 when it has no irq_cfg. */
+static inline unsigned int evtchn_from_irq_data(struct irq_data *data)
+{
+       const struct irq_cfg *c = irq_data_cfg(data);
+
+       if (!c)
+               return 0;
+
+       return evtchn_from_irq_cfg(c);
+}
+
+/* Event channel bound to @irq, or 0 when @irq has no irq_data. */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+       struct irq_data *d = irq_get_irq_data(irq);
+
+       if (!d)
+               return 0;
+
+       return evtchn_from_irq_data(d);
+}
+
+/*
+ * Translate event channel @port into the IRQ it is bound to.
+ *
+ * Returns the evtchn_to_irq[] entry, which is -1 (converted to unsigned int)
+ * for an unbound port.  An out-of-range @port is reported the same way
+ * rather than indexing past the end of the table.
+ */
+unsigned int irq_from_evtchn(unsigned int port)
+{
+       if (port >= NR_EVENT_CHANNELS)
+               return -1;
+
+       return evtchn_to_irq[port];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
+/*
+ * IRQ <-> VIRQ mapping: a per-CPU table indexed by VIRQ number. Every slot
+ * starts out as -1, the value the bind code tests for "no IRQ bound yet".
+ */
+DEFINE_PER_CPU(int[NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
+
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
+/*
+ * IRQ <-> IPI mapping: a per-CPU table indexed by IPI number. Every slot
+ * starts out as -1, the value the bind code tests for "no IRQ bound yet".
+ * NR_IPIS falls back to 1 when nothing else has defined it.
+ */
+#ifndef NR_IPIS
+#define NR_IPIS 1
+#endif
+DEFINE_PER_CPU(int[NR_IPIS], ipi_to_irq) = {[0 ... NR_IPIS-1] = -1};
+#endif
+
+#ifdef CONFIG_SMP
+
+#if CONFIG_NR_CPUS <= 256
+static u8 cpu_evtchn[NR_EVENT_CHANNELS]; /* CPU each channel is bound to */
+#else
+static u16 cpu_evtchn[NR_EVENT_CHANNELS]; /* wider entries for > 256 CPUs */
+#endif
+static DEFINE_PER_CPU(unsigned long[BITS_TO_LONGS(NR_EVENT_CHANNELS)],
+                     cpu_evtchn_mask); /* bitmap of channels bound to the CPU */
+
+/*
+ * Word @idx of the event channels this CPU should service right now: those
+ * that are pending, bound to this CPU, and not masked.
+ */
+static inline unsigned long active_evtchns(unsigned int idx)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+       unsigned long pending = shinfo->evtchn_pending[idx];
+       unsigned long bound_here = percpu_read(cpu_evtchn_mask[idx]);
+       unsigned long unmasked = ~shinfo->evtchn_mask[idx];
+
+       return pending & bound_here & unmasked;
+}
+
+/*
+ * Record that event channel @chn is now delivered to @cpu.
+ *
+ * Updates the affinity mask of the IRQ bound to the channel (if any), moves
+ * the channel's bit from the previous CPU's cpu_evtchn_mask to @cpu's, and
+ * stores @cpu in cpu_evtchn[]. The channel must be masked on entry.
+ */
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+       const int irq = evtchn_to_irq[chn];
+
+       BUG_ON(!test_bit(chn, shinfo->evtchn_mask));
+
+       if (irq != -1) {
+               struct irq_data *irqd = irq_get_irq_data(irq);
+
+               /* Per-CPU IRQs accumulate CPUs; others have exactly one. */
+               if (irqd_is_per_cpu(irqd))
+                       cpumask_set_cpu(cpu, irqd->affinity);
+               else
+                       cpumask_copy(irqd->affinity, cpumask_of(cpu));
+       }
+
+       /* Hand the channel's bit over from the old CPU to the new one. */
+       clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_evtchn[chn]));
+       set_bit(chn, per_cpu(cpu_evtchn_mask, cpu));
+       cpu_evtchn[chn] = cpu;
+}
+
+/*
+ * Reset all bindings so that every IRQ's affinity and every event channel
+ * points at CPU 0.
+ */
+static void init_evtchn_cpu_bindings(void)
+{
+       int i;
+
+       /* By default all event channels notify CPU#0. */
+       for (i = 0; i < nr_irqs; i++) {
+               struct irq_data *irqd = irq_get_irq_data(i);
+
+               if (!irqd)
+                       continue;
+               cpumask_copy(irqd->affinity, cpumask_of(0));
+       }
+
+       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+
+       /* CPU 0's mask becomes all-ones; every other CPU's mask is cleared. */
+       for_each_possible_cpu(i) {
+               const int fill = (i == 0) ? -1 : 0;
+
+               memset(per_cpu(cpu_evtchn_mask, i), fill,
+                      sizeof(per_cpu(cpu_evtchn_mask, i)));
+       }
+}
+
+/* CPU that @evtchn is currently recorded as bound to in cpu_evtchn[]. */
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+       const unsigned int bound_cpu = cpu_evtchn[evtchn];
+
+       return bound_cpu;
+}
+
+#else
+
+/*
+ * Non-SMP variant: word @idx of the event channels that are pending and
+ * not masked (there is no per-CPU binding mask to apply).
+ */
+static inline unsigned long active_evtchns(unsigned int idx)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+       unsigned long pending = shinfo->evtchn_pending[idx];
+       unsigned long unmasked = ~shinfo->evtchn_mask[idx];
+
+       return pending & unmasked;
+}
+
+/* Non-SMP variant: there is no per-CPU binding state to update. */
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+       /* Intentionally empty. */
+}
+
+/* Non-SMP variant: there is no per-CPU binding state to initialise. */
+static void init_evtchn_cpu_bindings(void)
+{
+       /* Intentionally empty. */
+}
+
+/* Non-SMP variant: every event channel is reported as bound to CPU 0. */
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+       const unsigned int only_cpu = 0;
+
+       return only_cpu;
+}
+
+#endif
+
+#ifdef CONFIG_X86
+/* Declared here because init_IRQ() below calls it. */
+void __init xen_init_IRQ(void);
+
+/*
+ * x86 init_IRQ(): set up the IRQ context for CPU 0, then run the
+ * Xen-specific IRQ initialisation.
+ */
+void __init init_IRQ(void)
+{
+       irq_ctx_init(0);
+       xen_init_IRQ();
+}
+#include <asm/idle.h>
+#endif
+
+/*
+ * Xen will never allocate port zero for any purpose, so the accessors in
+ * this file return 0 to mean "no event channel" and callers test for it
+ * with VALID_EVTCHN().
+ */
+#define VALID_EVTCHN(chn)      ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ *
+ * The hypercall used is a version query whose result is deliberately
+ * discarded; only the round trip into Xen matters here.
+ */
+void force_evtchn_callback(void)
+{
+       VOID(HYPERVISOR_xen_version(0, NULL));
+}
+/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */
+EXPORT_SYMBOL(force_evtchn_callback);
+
+#define UPC_INACTIVE 0 /* no upcall is being processed on this CPU */
+#define UPC_ACTIVE 1 /* evtchn_do_upcall() is running on this CPU */
+#define UPC_NESTED_LATCH 2 /* a nested upcall arrived while one was active */
+#define UPC_RESTART (UPC_ACTIVE|UPC_NESTED_LATCH) /* active; must rescan */
+static DEFINE_PER_CPU(unsigned int, upcall_state); /* UPC_* value above */
+static DEFINE_PER_CPU(unsigned int, current_l1i); /* selector bit to resume at */
+static DEFINE_PER_CPU(unsigned int, current_l2i); /* bit in that word to resume at */
+
+#ifndef vcpu_info_xchg
+#define vcpu_info_xchg(fld, val) xchg(&current_vcpu_info()->fld, val)
+#endif
+
+/*
+ * Upcall entry point: dispatch every pending, unmasked event channel bound
+ * to this CPU to the handler of the IRQ it is mapped to.
+ *
+ * The scan is two-level: each set bit in the VCPU's evtchn_pending_sel
+ * word (l1) selects one word of active_evtchns() (l2), and each set bit in
+ * that word is a port. Scanning resumes from the per-CPU
+ * current_l1i/current_l2i position saved by the previous pass instead of
+ * always starting at bit 0. A nested invocation only latches
+ * UPC_NESTED_LATCH and returns; the outer invocation then repeats its scan.
+ *
+ * NB. Interrupts are disabled on entry.
+ */
+asmlinkage void __irq_entry evtchn_do_upcall(struct pt_regs *regs)
+{
+       unsigned long       l1, l2;
+       unsigned long       masked_l1, masked_l2;
+       unsigned int        l1i, l2i, start_l1i, start_l2i, port, i;
+       int                 irq;
+       struct pt_regs     *old_regs;
+
+       /*
+        * Nested invocations bail immediately, after latching a flag that
+        * makes the outer invocation go round its scan loop once more.
+        */
+       if (unlikely(__this_cpu_cmpxchg(upcall_state, UPC_INACTIVE,
+                                       UPC_ACTIVE) != UPC_INACTIVE)) {
+               __this_cpu_or(upcall_state, UPC_NESTED_LATCH);
+               /* Avoid a callback storm when we reenable delivery. */
+               vcpu_info_write(evtchn_upcall_pending, 0);
+               return;
+       }
+
+       old_regs = set_irq_regs(regs);
+       xen_spin_irq_enter();
+       irq_enter();
+       exit_idle();
+
+       do {
+               vcpu_info_write(evtchn_upcall_pending, 0);
+
+#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
+               /* Clear master flag /before/ clearing selector flag. */
+               wmb();
+#else
+               barrier();
+#endif
+
+#ifndef CONFIG_NO_HZ
+               /*
+                * Handle timer interrupts before all others, so that all
+                * hardirq handlers see an up-to-date system time even if we
+                * have just woken from a long idle period.
+                */
+#ifdef PER_CPU_VIRQ_IRQ
+               if ((irq = percpu_read(virq_to_irq[VIRQ_TIMER])) != -1) {
+                       port = evtchn_from_irq(irq);
+#else
+               port = __this_cpu_read(virq_to_evtchn[VIRQ_TIMER]);
+               if (VALID_EVTCHN(port)) {
+#endif
+                       l1i = port / BITS_PER_LONG;
+                       l2i = port % BITS_PER_LONG;
+                       if (active_evtchns(l1i) & (1ul<<l2i)) {
+                               mask_evtchn(port);
+                               clear_evtchn(port);
+#ifndef PER_CPU_VIRQ_IRQ
+                               irq = evtchn_to_irq[port];
+                               BUG_ON(irq == -1);
+#endif
+                               if (!handle_irq(irq, regs))
+                                       BUG();
+                       }
+               }
+#endif /* CONFIG_NO_HZ */
+
+               l1 = vcpu_info_xchg(evtchn_pending_sel, 0);
+
+               start_l1i = l1i = percpu_read(current_l1i);
+               start_l2i = percpu_read(current_l2i);
+
+               for (i = 0; l1 != 0; i++) {
+                       masked_l1 = l1 & ((~0UL) << l1i);
+                       /* If we masked out all events, wrap to beginning. */
+                       if (masked_l1 == 0) {
+                               l1i = l2i = 0;
+                               continue;
+                       }
+                       l1i = __ffs(masked_l1);
+
+                       l2 = active_evtchns(l1i);
+                       l2i = 0; /* usually scan entire word from start */
+                       if (l1i == start_l1i) {
+                               /* We scan the starting word in two parts. */
+                               if (i == 0)
+                                       /* 1st time: start in the middle */
+                                       l2i = start_l2i;
+                               else
+                                       /* 2nd time: mask bits done already */
+                                       l2 &= (1ul << start_l2i) - 1;
+                       }
+
+                       do {
+                               bool handled = false;
+
+                               masked_l2 = l2 & ((~0UL) << l2i);
+                               if (masked_l2 == 0)
+                                       break;
+                               l2i = __ffs(masked_l2);
+
+                               /* process port */
+                               port = (l1i * BITS_PER_LONG) + l2i;
+                               mask_evtchn(port);
+                               if ((irq = evtchn_to_irq[port]) != -1) {
+#ifndef PER_CPU_IPI_IRQ
+                                       if (port != __this_cpu_read(ipi_evtchn))
+#endif
+                                               clear_evtchn(port);
+                                       handled = handle_irq(irq, regs);
+                               }
+                               if (!handled && printk_ratelimit())
+                                       pr_emerg("No handler for irq %d"
+                                                " (port %u)\n",
+                                                irq, port);
+
+                               l2i = (l2i + 1) % BITS_PER_LONG;
+
+                               /* Next caller starts at last processed + 1 */
+                               percpu_write(current_l1i,
+                                       l2i ? l1i : (l1i + 1) % BITS_PER_LONG);
+                               percpu_write(current_l2i, l2i);
+
+                       } while (l2i != 0);
+
+                       /* Scan start_l1i twice; all others once. */
+                       if ((l1i != start_l1i) || (i != 0))
+                               l1 &= ~(1UL << l1i);
+
+                       l1i = (l1i + 1) % BITS_PER_LONG;
+               }
+
+               /* If there were nested callbacks then we have more to do. */
+       } while (unlikely(__this_cpu_cmpxchg(upcall_state, UPC_RESTART,
+                                            UPC_ACTIVE) == UPC_RESTART));
+
+       __this_cpu_write(upcall_state, UPC_INACTIVE);
+       irq_exit();
+       xen_spin_irq_exit();
+       set_irq_regs(old_regs);
+}
+
+/*
+ * Find a dynamic IRQ that can take a new binding.
+ *
+ * Walks DYNIRQ_BASE .. nr_irqs-1, obtaining the cfg for each candidate on
+ * @node, and picks the first IRQ whose chip is either no_irq_chip or @chip
+ * and whose bindcount is zero. The winner is marked noprobe and is given
+ * @chip together with the per-CPU or fasteoi flow handler, per @percpu.
+ *
+ * Returns the IRQ number with *@pcfg set, -ENOMEM if a cfg could not be
+ * obtained, or -ENOSPC (warning once) when nothing is free.
+ */
+static int find_unbound_irq(unsigned int node, struct irq_cfg **pcfg,
+                           struct irq_chip *chip, bool percpu)
+{
+       static int warned;
+       int irq;
+
+       for (irq = DYNIRQ_BASE; irq < nr_irqs; irq++) {
+               struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
+               struct irq_data *irqd = irq_get_irq_data(irq);
+
+               if (unlikely(!cfg))
+                       return -ENOMEM;
+
+               /* Skip IRQs owned by another chip, or already bound. */
+               if (irqd->chip != &no_irq_chip && irqd->chip != chip)
+                       continue;
+               if (cfg->bindcount)
+                       continue;
+
+               *pcfg = cfg;
+               irq_set_noprobe(irq);
+               if (percpu)
+                       irq_set_chip_and_handler_name(irq, chip,
+                                                     handle_percpu_irq,
+                                                     "percpu");
+               else
+                       irq_set_chip_and_handler_name(irq, chip,
+                                                     handle_fasteoi_irq,
+                                                     "fasteoi");
+               return irq;
+       }
+
+       if (!warned) {
+               warned = 1;
+               pr_warning("No available IRQ to bind to: "
+                          "increase NR_DYNIRQS.\n");
+       }
+
+       return -ENOSPC;
+}
+
+static struct irq_chip dynirq_chip; /* forward declaration; initialised after its callbacks */
+
+/*
+ * Bind the caller-owned event channel @caller_port to an IRQ, reusing the
+ * existing IRQ if the port already has one, and take a binding reference.
+ * Returns the IRQ, or the negative error from find_unbound_irq().
+ */
+static int bind_caller_port_to_irq(unsigned int caller_port)
+{
+       struct irq_cfg *cfg;
+       int irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = evtchn_to_irq[caller_port];
+       if (irq != -1) {
+               cfg = irq_cfg(irq);
+       } else {
+               irq = find_unbound_irq(numa_node_id(), &cfg,
+                                      &dynirq_chip, false);
+               if (irq < 0)
+                       goto unlock;
+
+               evtchn_to_irq[caller_port] = irq;
+               cfg->info = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port);
+       }
+
+       cfg->bindcount++;
+
+ unlock:
+       spin_unlock(&irq_mapping_update_lock);
+       return irq;
+}
+
+/*
+ * Bind the freshly allocated event channel @local_port, which must not yet
+ * map to any IRQ, to a new IRQ. If no IRQ is available the port is closed
+ * again and the negative error is returned.
+ */
+static int bind_local_port_to_irq(unsigned int local_port)
+{
+       struct irq_cfg *cfg;
+       int irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       BUG_ON(evtchn_to_irq[local_port] != -1);
+
+       irq = find_unbound_irq(numa_node_id(), &cfg, &dynirq_chip, false);
+       if (irq >= 0) {
+               evtchn_to_irq[local_port] = irq;
+               cfg->info = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port);
+               cfg->bindcount++;
+       } else if (close_evtchn(local_port)) {
+               /* Could not even give the port back. */
+               BUG();
+       }
+
+       spin_unlock(&irq_mapping_update_lock);
+       return irq;
+}
+
+/*
+ * Allocate an unbound event channel in this domain that @remote_domain may
+ * connect to, and bind it to an IRQ. Returns the IRQ, or the error from the
+ * hypercall or from the binding step.
+ */
+static int bind_listening_port_to_irq(unsigned int remote_domain)
+{
+       struct evtchn_alloc_unbound alloc_unbound;
+       int err;
+
+       alloc_unbound.dom        = DOMID_SELF;
+       alloc_unbound.remote_dom = remote_domain;
+
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+                                         &alloc_unbound);
+       if (err)
+               return err;
+
+       return bind_local_port_to_irq(alloc_unbound.port);
+}
+
+/*
+ * Connect to @remote_port in @remote_domain and bind the resulting local
+ * port to an IRQ. Returns the IRQ, or the error from the hypercall or from
+ * the binding step.
+ */
+static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+                                         unsigned int remote_port)
+{
+       struct evtchn_bind_interdomain bind_interdomain;
+       int err;
+
+       bind_interdomain.remote_dom  = remote_domain;
+       bind_interdomain.remote_port = remote_port;
+
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+                                         &bind_interdomain);
+       if (err)
+               return err;
+
+       return bind_local_port_to_irq(bind_interdomain.local_port);
+}
+
+/*
+ * Bind @virq on @cpu to an IRQ and take a binding reference.
+ *
+ * If the VIRQ already has an IRQ on @cpu only the reference is taken.
+ * Otherwise a fresh IRQ is found, the VIRQ is bound through Xen (failure is
+ * fatal), and the port/IRQ tables are filled in. Returns the IRQ, or the
+ * negative error from find_unbound_irq().
+ */
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+       struct evtchn_bind_virq bind_virq;
+       struct irq_cfg *cfg;
+       int evtchn, irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = per_cpu(virq_to_irq, cpu)[virq];
+       if (irq != -1) {
+               cfg = irq_cfg(irq);
+               goto take_ref;
+       }
+
+       irq = find_unbound_irq(cpu_to_node(cpu), &cfg, &dynirq_chip, false);
+       if (irq < 0)
+               goto unlock;
+
+       bind_virq.virq = virq;
+       bind_virq.vcpu = cpu;
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                       &bind_virq) != 0)
+               BUG();
+       evtchn = bind_virq.port;
+
+       evtchn_to_irq[evtchn] = irq;
+#ifndef PER_CPU_VIRQ_IRQ
+       {
+               unsigned int each;
+
+               /* Every CPU's table points at the one channel. */
+               for_each_possible_cpu(each)
+                       per_cpu(virq_to_evtchn, each)[virq] = evtchn;
+       }
+#endif
+       cfg->info = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+       per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+       bind_evtchn_to_cpu(evtchn, cpu);
+
+ take_ref:
+       cfg->bindcount++;
+
+ unlock:
+       spin_unlock(&irq_mapping_update_lock);
+       return irq;
+}
+
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
+/*
+ * Bind IPI number @ipi on @cpu to an IRQ and take a binding reference.
+ *
+ * If the IPI already has an IRQ on @cpu only the reference is taken.
+ * Otherwise a fresh IRQ is found and an IPI event channel is bound through
+ * Xen (failure is fatal). Returns the IRQ, or the negative error from
+ * find_unbound_irq().
+ */
+static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
+{
+       struct evtchn_bind_ipi bind_ipi;
+       struct irq_cfg *cfg;
+       int evtchn, irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = per_cpu(ipi_to_irq, cpu)[ipi];
+       if (irq != -1) {
+               cfg = irq_cfg(irq);
+               goto take_ref;
+       }
+
+       irq = find_unbound_irq(cpu_to_node(cpu), &cfg, &dynirq_chip, false);
+       if (irq < 0)
+               goto unlock;
+
+       bind_ipi.vcpu = cpu;
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+                                       &bind_ipi) != 0)
+               BUG();
+       evtchn = bind_ipi.port;
+
+       evtchn_to_irq[evtchn] = irq;
+       cfg->info = mk_irq_info(IRQT_IPI, ipi, evtchn);
+
+       per_cpu(ipi_to_irq, cpu)[ipi] = irq;
+
+       bind_evtchn_to_cpu(evtchn, cpu);
+
+ take_ref:
+       cfg->bindcount++;
+
+ unlock:
+       spin_unlock(&irq_mapping_update_lock);
+       return irq;
+}
+#endif
+
+/*
+ * Drop one binding reference on @irq. When the last reference goes and the
+ * IRQ has a valid event channel, the channel is closed (unless it is a
+ * caller-owned port), the lookup tables are reset and the IRQ is cleaned
+ * up. Must not be used on shared per-CPU VIRQ/IPI IRQs.
+ */
+static void unbind_from_irq(unsigned int irq)
+{
+       struct irq_cfg *cfg = irq_cfg(irq);
+       unsigned int evtchn = evtchn_from_irq_cfg(cfg);
+       unsigned int type;
+
+       BUG_IF_VIRQ_PER_CPU(cfg);
+       BUG_IF_IPI(cfg);
+
+       spin_lock(&irq_mapping_update_lock);
+
+       /* The reference is always dropped; teardown needs the last one. */
+       if (--cfg->bindcount || !VALID_EVTCHN(evtchn))
+               goto unlock;
+
+       type = type_from_irq_cfg(cfg);
+
+       if (type != IRQT_CALLER_PORT && close_evtchn(evtchn))
+               BUG();
+
+       switch (type) {
+       case IRQT_VIRQ:
+               per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+                       [index_from_irq_cfg(cfg)] = -1;
+#ifndef PER_CPU_VIRQ_IRQ
+               {
+                       unsigned int cpu;
+
+                       for_each_possible_cpu(cpu)
+                               per_cpu(virq_to_evtchn, cpu)
+                                       [index_from_irq_cfg(cfg)] = 0;
+               }
+#endif
+               break;
+#if defined(CONFIG_SMP) && defined(PER_CPU_IPI_IRQ)
+       case IRQT_IPI:
+               per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn))
+                       [index_from_irq_cfg(cfg)] = -1;
+               break;
+#endif
+       default:
+               break;
+       }
+
+       /* Closed ports are implicitly re-bound to VCPU0. */
+       bind_evtchn_to_cpu(evtchn, 0);
+
+       evtchn_to_irq[evtchn] = -1;
+       cfg->info = IRQ_UNBOUND;
+
+       dynamic_irq_cleanup(irq);
+
+ unlock:
+       spin_unlock(&irq_mapping_update_lock);
+}
+
+#if !defined(PER_CPU_IPI_IRQ) || !defined(PER_CPU_VIRQ_IRQ)
+/*
+ * Allocate a zeroed percpu_irqaction together with its zeroed CPU mask.
+ *
+ * @gfp: allocation flags. They are applied to both the structure and the
+ *       cpumask, so the caller's choice is honoured for the whole object
+ *       rather than only for the mask.
+ *
+ * Returns the new object, or NULL if either allocation failed; a partially
+ * built object is freed before returning.
+ */
+static inline struct percpu_irqaction *alloc_percpu_irqaction(gfp_t gfp)
+{
+       struct percpu_irqaction *new = kzalloc(sizeof(*new), gfp);
+
+       if (new && !zalloc_cpumask_var(&new->cpus, gfp)) {
+               kfree(new);
+               new = NULL;
+       }
+       return new;
+}
+
+/* Release @action and its CPU mask; a NULL @action is ignored. */
+static inline void free_percpu_irqaction(struct percpu_irqaction *action)
+{
+       if (action) {
+               free_cpumask_var(action->cpus);
+               kfree(action);
+       }
+}
+
+/*
+ * Undo @cpu's binding of the shared per-CPU IRQ @irq.
+ *
+ * The binding reference is dropped and, unless another VIRQ action still
+ * uses @cpu's channel, the event channel is closed and the per-CPU port
+ * table entry is reset. For VIRQs, @action identifies the registration
+ * (it is compared against each list entry's dev_id); once that entry has
+ * no CPUs left it is unlinked and handed to free_irq() after the lock is
+ * released.
+ */
+void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu,
+                            struct irqaction *action)
+{
+       struct evtchn_close close;
+       struct irq_data *data = irq_get_irq_data(irq);
+       struct irq_cfg *cfg = irq_data_cfg(data);
+       unsigned int evtchn = evtchn_from_per_cpu_irq(cfg, cpu);
+       struct percpu_irqaction *free_action = NULL;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       if (VALID_EVTCHN(evtchn)) {
+               mask_evtchn(evtchn);
+
+               /* The bind paths take one extra reference, so > 1 here. */
+               BUG_ON(cfg->bindcount <= 1);
+               cfg->bindcount--;
+
+#ifndef PER_CPU_VIRQ_IRQ
+               if (type_from_irq_cfg(cfg) == IRQT_VIRQ) {
+                       unsigned int virq = index_from_irq_cfg(cfg);
+                       struct percpu_irqaction *cur, *prev = NULL;
+
+                       /*
+                        * Remove @cpu from the matching action; any other
+                        * action still listing @cpu keeps the channel open
+                        * (signalled by zeroing evtchn).
+                        */
+                       cur = virq_actions[virq];
+                       while (cur) {
+                               if (cur->action.dev_id == action) {
+                                       cpumask_clear_cpu(cpu, cur->cpus);
+                                       if (cpumask_empty(cur->cpus)) {
+                                               WARN_ON(free_action);
+                                               if (prev)
+                                                       prev->next = cur->next;
+                                               else
+                                                       virq_actions[virq]
+                                                               = cur->next;
+                                               free_action = cur;
+                                       }
+                               } else if (cpumask_test_cpu(cpu, cur->cpus))
+                                       evtchn = 0;
+                               cur = (prev = cur)->next;
+                       }
+                       if (!VALID_EVTCHN(evtchn))
+                               goto done;
+               }
+#endif
+
+               cpumask_clear_cpu(cpu, data->affinity);
+
+               close.port = evtchn;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
+                       BUG();
+
+               switch (type_from_irq_cfg(cfg)) {
+#ifndef PER_CPU_VIRQ_IRQ
+               case IRQT_VIRQ:
+                       per_cpu(virq_to_evtchn, cpu)
+                               [index_from_irq_cfg(cfg)] = 0;
+                       break;
+#endif
+#ifndef PER_CPU_IPI_IRQ
+               case IRQT_IPI:
+                       per_cpu(ipi_evtchn, cpu) = 0;
+                       break;
+#endif
+               default:
+                       BUG();
+                       break;
+               }
+
+               /* Closed ports are implicitly re-bound to VCPU0. */
+               bind_evtchn_to_cpu(evtchn, 0);
+
+               evtchn_to_irq[evtchn] = -1;
+       }
+
+#ifndef PER_CPU_VIRQ_IRQ
+done:
+#endif
+       spin_unlock(&irq_mapping_update_lock);
+
+       if (free_action) {
+               /*
+                * NOTE(review): the mask is fetched before free_irq(),
+                * presumably because free_irq() releases the storage that
+                * holds free_action -- confirm against struct
+                * percpu_irqaction's layout.
+                */
+               cpumask_t *cpus = free_action->cpus;
+
+               free_irq(irq, free_action->action.dev_id);
+               free_cpumask_var(cpus);
+       }
+}
+EXPORT_SYMBOL_GPL(unbind_from_per_cpu_irq);
+#endif /* !PER_CPU_IPI_IRQ || !PER_CPU_VIRQ_IRQ */
+
+/*
+ * Bind caller-owned event channel @caller_port to an IRQ and install
+ * @handler on it via request_irq(). Returns the IRQ on success; on failure
+ * the binding is undone and the negative error is returned.
+ */
+int bind_caller_port_to_irqhandler(
+       unsigned int caller_port,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id)
+{
+       int irq, err;
+
+       irq = bind_caller_port_to_irq(caller_port);
+       if (irq < 0)
+               return irq;
+
+       err = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (err == 0)
+               return irq;
+
+       unbind_from_irq(irq);
+       return err;
+}
+EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler);
+
+/*
+ * Allocate a listening event channel for @remote_domain, bind it to an IRQ
+ * and install @handler on it via request_irq(). Returns the IRQ on success;
+ * on failure the binding is undone and the negative error is returned.
+ */
+int bind_listening_port_to_irqhandler(
+       unsigned int remote_domain,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id)
+{
+       int irq, err;
+
+       irq = bind_listening_port_to_irq(remote_domain);
+       if (irq < 0)
+               return irq;
+
+       err = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (err == 0)
+               return irq;
+
+       unbind_from_irq(irq);
+       return err;
+}
+EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler);
+
+/*
+ * Connect to @remote_port of @remote_domain, bind the local end to an IRQ
+ * and install @handler on it via request_irq(). Returns the IRQ on success;
+ * on failure the binding is undone and the negative error is returned.
+ */
+int bind_interdomain_evtchn_to_irqhandler(
+       unsigned int remote_domain,
+       unsigned int remote_port,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id)
+{
+       int irq, err;
+
+       irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
+       if (irq < 0)
+               return irq;
+
+       err = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (err == 0)
+               return irq;
+
+       unbind_from_irq(irq);
+       return err;
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+
+/*
+ * Bind @virq on @cpu to an IRQ and install @handler on it via
+ * request_irq(). Returns the IRQ on success; on failure the binding is
+ * undone and the negative error is returned. When VIRQs can be shared
+ * per-CPU IRQs, @virq must not be one of those flagged in virq_per_cpu.
+ */
+int bind_virq_to_irqhandler(
+       unsigned int virq,
+       unsigned int cpu,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id)
+{
+       int irq, err;
+
+#ifndef PER_CPU_VIRQ_IRQ
+       BUG_ON(test_bit(virq, virq_per_cpu));
+#endif
+
+       irq = bind_virq_to_irq(virq, cpu);
+       if (irq < 0)
+               return irq;
+
+       err = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (err == 0)
+               return irq;
+
+       unbind_from_irq(irq);
+       return err;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+#ifdef CONFIG_SMP
+#ifndef PER_CPU_VIRQ_IRQ
+/*
+ * Bind the per-CPU VIRQ @virq on @cpu to the shared per-CPU IRQ for that
+ * VIRQ, registering @action for it.
+ *
+ * @virq must be flagged in virq_per_cpu, and @action->dev_id must be NULL
+ * (-EINVAL otherwise): a private copy of @action is kept on
+ * virq_actions[@virq] with its dev_id pointing back at @action, which is
+ * how later calls for the same @action find the copy again.
+ *
+ * The first call for a given @action installs the copy with setup_irq();
+ * later calls (other CPUs) only add @cpu to the copy's mask and unmask the
+ * CPU's event channel. Binding the VIRQ in Xen must succeed (BUG otherwise).
+ *
+ * Returns the IRQ number, -ENOMEM if the private copy could not be
+ * allocated, or a negative error from find_unbound_irq()/setup_irq().
+ */
+int bind_virq_to_irqaction(
+       unsigned int virq,
+       unsigned int cpu,
+       struct irqaction *action)
+{
+       struct evtchn_bind_virq bind_virq;
+       struct irq_cfg *cfg;
+       unsigned int evtchn;
+       int irq, retval = 0;
+       struct percpu_irqaction *cur = NULL, *new;
+
+       BUG_ON(!test_bit(virq, virq_per_cpu));
+
+       if (action->dev_id)
+               return -EINVAL;
+
+       /* Allocate up front; the copy is only needed if none exists yet. */
+       new = alloc_percpu_irqaction(GFP_ATOMIC);
+       if (new) {
+               new->action = *action;
+               new->action.dev_id = action;
+       }
+
+       spin_lock(&irq_mapping_update_lock);
+
+       for (cur = virq_actions[virq]; cur; cur = cur->next)
+               if (cur->action.dev_id == action)
+                       break;
+       if (!cur) {
+               if (!new) {
+                       spin_unlock(&irq_mapping_update_lock);
+                       return -ENOMEM;
+               }
+               /* Ownership of the copy moves to the list. */
+               new->next = virq_actions[virq];
+               virq_actions[virq] = cur = new;
+               new = NULL;
+               retval = 1;
+       }
+       cpumask_set_cpu(cpu, cur->cpus);
+       action = &cur->action;
+
+       if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) {
+               unsigned int nr;
+
+               BUG_ON(!retval);
+
+               if ((irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
+                                           &dynirq_chip, true)) < 0) {
+                       /*
+                        * retval != 0 here, so cur is the entry linked in
+                        * just above and new is already NULL: unlink cur
+                        * and free cur itself, otherwise it is leaked.
+                        */
+                       virq_actions[virq] = cur->next;
+                       spin_unlock(&irq_mapping_update_lock);
+                       free_percpu_irqaction(cur);
+                       return irq;
+               }
+
+               /* Extra reference so count will never drop to zero. */
+               cfg->bindcount++;
+
+               for_each_possible_cpu(nr)
+                       per_cpu(virq_to_irq, nr)[virq] = irq;
+               cfg->info = mk_irq_info(IRQT_VIRQ, virq, 0);
+       } else
+               cfg = irq_cfg(irq);
+
+       evtchn = per_cpu(virq_to_evtchn, cpu)[virq];
+       if (!VALID_EVTCHN(evtchn)) {
+               bind_virq.virq = virq;
+               bind_virq.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                               &bind_virq) != 0)
+                       BUG();
+               evtchn = bind_virq.port;
+               evtchn_to_irq[evtchn] = irq;
+               per_cpu(virq_to_evtchn, cpu)[virq] = evtchn;
+
+               bind_evtchn_to_cpu(evtchn, cpu);
+       }
+
+       cfg->bindcount++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       /* Non-NULL only when an existing copy was reused. */
+       free_percpu_irqaction(new);
+
+       if (retval == 0) {
+               unsigned long flags;
+
+               local_irq_save(flags);
+               unmask_evtchn(evtchn);
+               local_irq_restore(flags);
+       } else {
+               action->flags |= IRQF_PERCPU;
+               retval = setup_irq(irq, action);
+               if (retval) {
+                       /*
+                        * NOTE(review): action now points at the private
+                        * copy, while unbind_from_per_cpu_irq() matches
+                        * list entries by dev_id (the caller's original
+                        * pointer) -- confirm this cleanup finds the entry.
+                        */
+                       unbind_from_per_cpu_irq(irq, cpu, action);
+                       BUG_ON(retval > 0);
+                       irq = retval;
+               }
+       }
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqaction);
+#endif
+
+#ifdef PER_CPU_IPI_IRQ
+/*
+ * Bind IPI number @ipi on @cpu to an IRQ and install @handler on it via
+ * request_irq(), always adding IRQF_NO_SUSPEND to @irqflags. Returns the
+ * IRQ on success; on failure the binding is undone and the negative error
+ * is returned.
+ */
+int bind_ipi_to_irqhandler(
+       unsigned int ipi,
+       unsigned int cpu,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id)
+{
+       int irq, err;
+
+       irq = bind_ipi_to_irq(ipi, cpu);
+       if (irq < 0)
+               return irq;
+
+       err = request_irq(irq, handler, irqflags | IRQF_NO_SUSPEND,
+                         devname, dev_id);
+       if (err == 0)
+               return irq;
+
+       unbind_from_irq(irq);
+       return err;
+}
+#else
+/*
+ * Bind a new IPI event channel for @cpu to the shared IPI IRQ (ipi_irq).
+ *
+ * Returns -EBUSY if @cpu already has an IPI channel. The first successful
+ * call finds an IRQ, stores it in ipi_irq and installs @action on it with
+ * setup_irq(); later calls only bind and unmask a channel for their CPU.
+ * Binding the IPI in Xen must succeed (BUG otherwise).
+ *
+ * The return value is ipi_irq. Note that both failure paths store the
+ * negative error in ipi_irq itself before returning it.
+ */
+int __cpuinit bind_ipi_to_irqaction(
+       unsigned int cpu,
+       struct irqaction *action)
+{
+       struct evtchn_bind_ipi bind_ipi;
+       struct irq_cfg *cfg;
+       unsigned int evtchn;
+       int retval = 0;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       if (VALID_EVTCHN(per_cpu(ipi_evtchn, cpu))) {
+               spin_unlock(&irq_mapping_update_lock);
+               return -EBUSY;
+       }
+
+       /* A negative ipi_irq means the shared IRQ has not been set up. */
+       if (ipi_irq < 0) {
+               if ((ipi_irq = find_unbound_irq(cpu_to_node(cpu), &cfg,
+                                               &dynirq_chip, true)) < 0) {
+                       spin_unlock(&irq_mapping_update_lock);
+                       return ipi_irq;
+               }
+
+               /* Extra reference so count will never drop to zero. */
+               cfg->bindcount++;
+
+               cfg->info = mk_irq_info(IRQT_IPI, 0, 0);
+               /* Remember that the action still has to be installed. */
+               retval = 1;
+       } else
+               cfg = irq_cfg(ipi_irq);
+
+       bind_ipi.vcpu = cpu;
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi))
+               BUG();
+
+       evtchn = bind_ipi.port;
+       evtchn_to_irq[evtchn] = ipi_irq;
+       per_cpu(ipi_evtchn, cpu) = evtchn;
+
+       bind_evtchn_to_cpu(evtchn, cpu);
+
+       cfg->bindcount++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       if (retval == 0) {
+               unsigned long flags;
+
+               local_irq_save(flags);
+               unmask_evtchn(evtchn);
+               local_irq_restore(flags);
+       } else {
+               action->flags |= IRQF_PERCPU | IRQF_NO_SUSPEND;
+               retval = setup_irq(ipi_irq, action);
+               if (retval) {
+                       unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
+                       BUG_ON(retval > 0);
+                       ipi_irq = retval;
+               }
+       }
+
+       return ipi_irq;
+}
+#endif /* PER_CPU_IPI_IRQ */
+#endif /* CONFIG_SMP */
+
+/*
+ * Counterpart of the bind_*_to_irqhandler() functions: remove the handler
+ * registered with @dev_id from @irq, then drop the event-channel binding.
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id)
+{
+       /* The handler goes first, then the binding it relied on. */
+       free_irq(irq, dev_id);
+       unbind_from_irq(irq);
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+#ifdef CONFIG_SMP
+/*
+ * irq_set_affinity callback: rebind the IRQ's event channel to one CPU
+ * picked from @dest.
+ *
+ * The channel is masked around the rebind and unmasked afterwards only if
+ * it was not masked on entry. Returns -ENXIO when the IRQ has no valid
+ * event channel, the hypercall's error on failure, and otherwise
+ * IRQ_SET_MASK_OK_NOCOPY (or IRQ_SET_MASK_OK if the port maps to no IRQ).
+ * Shared per-CPU VIRQ/IPI IRQs are not allowed here.
+ */
+static int set_affinity_irq(struct irq_data *data,
+                           const struct cpumask *dest, bool force)
+{
+       const struct irq_cfg *cfg = irq_data_cfg(data);
+       unsigned int port = evtchn_from_irq_cfg(cfg);
+       unsigned int cpu = cpumask_any(dest);
+       struct evtchn_bind_vcpu ebv = { .port = port, .vcpu = cpu };
+       bool was_masked;
+       int rc;
+
+       BUG_IF_VIRQ_PER_CPU(cfg);
+       BUG_IF_IPI(cfg);
+
+       if (!VALID_EVTCHN(port))
+               return -ENXIO;
+
+       was_masked = test_and_set_evtchn_mask(port);
+
+       rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &ebv);
+       if (rc == 0) {
+               bind_evtchn_to_cpu(port, cpu);
+               if (evtchn_to_irq[port] != -1)
+                       rc = IRQ_SET_MASK_OK_NOCOPY;
+               else
+                       rc = IRQ_SET_MASK_OK;
+       }
+
+       if (!was_masked)
+               unmask_evtchn(port);
+
+       return rc;
+}
+#endif
+
+/*
+ * irq_retrigger callback: mark the IRQ's event channel pending again.
+ *
+ * The channel is masked while the pending bit is set and is unmasked
+ * afterwards only if it was not masked on entry. Always returns 1, even
+ * when the IRQ has no valid event channel.
+ */
+int resend_irq_on_evtchn(struct irq_data *data)
+{
+       unsigned int evtchn = evtchn_from_irq_data(data);
+
+       if (VALID_EVTCHN(evtchn)) {
+               const bool was_masked = test_and_set_evtchn_mask(evtchn);
+
+               set_evtchn(evtchn);
+               if (!was_masked)
+                       unmask_evtchn(evtchn);
+       }
+
+       return 1;
+}
+
+/*
+ * Interface to generic handling in irq.c
+ */
+
+/* Unmask the event channel behind @data, if it has a valid one. */
+static void unmask_dynirq(struct irq_data *data)
+{
+       unsigned int port = evtchn_from_irq_data(data);
+
+       if (!VALID_EVTCHN(port))
+               return;
+
+       unmask_evtchn(port);
+}
+
+/* Mask the event channel behind @data, if it has a valid one. */
+static void mask_dynirq(struct irq_data *data)
+{
+       unsigned int port = evtchn_from_irq_data(data);
+
+       if (!VALID_EVTCHN(port))
+               return;
+
+       mask_evtchn(port);
+}
+
+/* irq_startup callback: starting a dynamic IRQ just unmasks its channel. */
+static unsigned int startup_dynirq(struct irq_data *data)
+{
+       unmask_dynirq(data);
+
+       return 0;
+}
+
+/* Shutting a dynamic IRQ down is the same as masking it. */
+#define shutdown_dynirq mask_dynirq
+
+/*
+ * irq_eoi callback: unless the IRQ has been disabled, carry out any pending
+ * affinity move while the channel is still masked, then unmask it.
+ */
+static void end_dynirq(struct irq_data *data)
+{
+       if (irqd_irq_disabled(data))
+               return;
+
+       irq_move_masked_irq(data);
+       unmask_dynirq(data);
+}
+
+static struct irq_chip dynirq_chip = {
+       .name             = "Dynamic",
+       .irq_startup      = startup_dynirq,
+       .irq_shutdown     = shutdown_dynirq, /* macro alias of mask_dynirq */
+       .irq_enable       = unmask_dynirq, /* enable and unmask share a callback */
+       .irq_disable      = mask_dynirq, /* disable and mask share a callback */
+       .irq_mask         = mask_dynirq,
+       .irq_unmask       = unmask_dynirq,
+       .irq_eoi          = end_dynirq, /* unmasks again unless the IRQ is disabled */
+#ifdef CONFIG_SMP
+       .irq_set_affinity = set_affinity_irq,
+#endif
+       .irq_retrigger    = resend_irq_on_evtchn,
+};
+
+/*
+ * Set by xen_init_IRQ() when Xen accepted PHYSDEVOP_pirq_eoi_gmfn for the
+ * pirq_needs_eoi page; selects the EOI strategy in pirq_unmask_and_notify().
+ */
+static bool pirq_eoi_does_unmask;
+/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+static unsigned long *pirq_needs_eoi;
+/* PIRQs for which set_type_pirq() requested probing; cleared by enable_pirq(). */
+static DECLARE_BITMAP(probing_pirq, NR_PIRQS);
+
+/*
+ * Unmask @evtchn and, where needed, send Xen a PHYSDEVOP_eoi for the
+ * physical IRQ behind @irq.
+ *
+ * With pirq_eoi_does_unmask set, a PIRQ flagged in pirq_needs_eoi gets only
+ * the EOI hypercall and any other PIRQ gets only an unmask.  Without it, a
+ * flagged PIRQ gets an unmask plus an EOI (batched into one multicall when
+ * the channel is bound to a different CPU), and an unflagged one is just
+ * unmasked.
+ */
+static void pirq_unmask_and_notify(unsigned int evtchn, unsigned int irq)
+{
+       struct physdev_eoi eoi = { .irq = evtchn_get_xen_pirq(irq) };
+
+       if (pirq_eoi_does_unmask) {
+               if (!test_bit(eoi.irq, pirq_needs_eoi))
+                       unmask_evtchn(evtchn);
+               else
+                       VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi));
+               return;
+       }
+
+       if (!test_bit(irq - PIRQ_BASE, pirq_needs_eoi)) {
+               unmask_evtchn(evtchn);
+               return;
+       }
+
+       if (smp_processor_id() == cpu_from_evtchn(evtchn)) {
+               /* Local channel: unmask first, then EOI. */
+               unmask_evtchn(evtchn);
+               VOID(HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi));
+       } else {
+               /* Remote channel: issue unmask and EOI as one multicall. */
+               struct evtchn_unmask unmask = { .port = evtchn };
+               struct multicall_entry mcl[2];
+
+               mcl[0].op = __HYPERVISOR_event_channel_op;
+               mcl[0].args[0] = EVTCHNOP_unmask;
+               mcl[0].args[1] = (unsigned long)&unmask;
+
+               mcl[1].op = __HYPERVISOR_physdev_op;
+               mcl[1].args[0] = PHYSDEVOP_eoi;
+               mcl[1].args[1] = (unsigned long)&eoi;
+
+               if (HYPERVISOR_multicall(mcl, 2))
+                       BUG();
+       }
+}
+
+/*
+ * Refresh the pirq_needs_eoi bit of @irq from PHYSDEVOP_irq_status_query.
+ * Does nothing when pirq_eoi_does_unmask is set.  A failing query is treated
+ * as "no flags", i.e. the bit ends up clear.
+ */
+static inline void pirq_query_unmask(int irq)
+{
+       struct physdev_irq_status_query query;
+       bool needs_eoi;
+
+       if (pirq_eoi_does_unmask)
+               return;
+
+       query.irq = evtchn_get_xen_pirq(irq);
+       if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query))
+               query.flags = 0;
+       needs_eoi = query.flags & XENIRQSTAT_needs_eoi;
+
+       clear_bit(irq - PIRQ_BASE, pirq_needs_eoi);
+       if (needs_eoi)
+               set_bit(irq - PIRQ_BASE, pirq_needs_eoi);
+}
+
+/*
+ * irq_set_type hook of pirq_chip.
+ *
+ * Only IRQ_TYPE_PROBE is accepted; any other type yields -EINVAL.  For a
+ * probe request the IRQ is flagged in probing_pirq so that enable_pirq()
+ * binds it without BIND_PIRQ__WILL_SHARE.
+ *
+ * probing_pirq only has NR_PIRQS bits and enable_pirq() consults it only for
+ * pirq < nr_pirqs, so IRQs outside that range must not touch the bitmap
+ * (doing so would write past its end).  Such requests still return 0.
+ */
+static int set_type_pirq(struct irq_data *data, unsigned int type)
+{
+       unsigned int pirq = data->irq - PIRQ_BASE;
+
+       if (type != IRQ_TYPE_PROBE)
+               return -EINVAL;
+       if (pirq < nr_pirqs)
+               set_bit(pirq, probing_pirq);
+       return 0;
+}
+
+/*
+ * irq_enable hook of pirq_chip (also used by startup_pirq()).
+ *
+ * An IRQ that already has an event channel just drops its probing flag and
+ * is unmasked.  Otherwise the PIRQ is bound via EVTCHNOP_bind_pirq, the new
+ * channel is recorded for the IRQ and bound to CPU 0, and it is unmasked.
+ * A failed bind is logged only when sharing had been requested.
+ */
+static void enable_pirq(struct irq_data *data)
+{
+       struct irq_cfg *cfg = irq_data_cfg(data);
+       unsigned int port = evtchn_from_irq_cfg(cfg);
+       unsigned int irq = data->irq;
+       unsigned int pirq = irq - PIRQ_BASE;
+       struct evtchn_bind_pirq bind_pirq;
+       bool exclusive;
+
+       if (VALID_EVTCHN(port)) {
+               if (pirq < nr_pirqs)
+                       clear_bit(pirq, probing_pirq);
+               pirq_unmask_and_notify(port, irq);
+               return;
+       }
+
+       bind_pirq.pirq = evtchn_get_xen_pirq(irq);
+
+       /* NB. We are happy to share unless we are probing. */
+       exclusive = pirq < nr_pirqs && test_and_clear_bit(pirq, probing_pirq);
+       if (!exclusive)
+               exclusive = irq_to_desc(irq)->istate & IRQS_AUTODETECT;
+       bind_pirq.flags = exclusive ? 0 : BIND_PIRQ__WILL_SHARE;
+
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) {
+               if (bind_pirq.flags)
+                       pr_info("Failed to obtain physical IRQ %d\n", irq);
+               return;
+       }
+       port = bind_pirq.port;
+
+       pirq_query_unmask(irq);
+
+       evtchn_to_irq[port] = irq;
+       bind_evtchn_to_cpu(port, 0);
+       cfg->info = mk_irq_info(IRQT_PIRQ, bind_pirq.pirq, port);
+
+       pirq_unmask_and_notify(port, irq);
+}
+
+#define disable_pirq mask_pirq
+
+/*
+ * irq_startup hook of pirq_chip: starting a PIRQ is the same as enabling it.
+ * Always returns 0, even when enable_pirq() failed to bind the PIRQ.
+ */
+static unsigned int startup_pirq(struct irq_data *data)
+{
+       enable_pirq(data);
+       return 0;
+}
+
+/*
+ * irq_shutdown hook of pirq_chip: mask and close the IRQ's event channel,
+ * then drop the evtchn <-> IRQ mapping while keeping the PIRQ index in
+ * cfg->info.  No-op when no valid channel is bound.  A failing
+ * close_evtchn() is fatal (BUG).
+ */
+static void shutdown_pirq(struct irq_data *data)
+{
+       struct irq_cfg *cfg = irq_data_cfg(data);
+       unsigned int port = evtchn_from_irq_cfg(cfg);
+
+       if (VALID_EVTCHN(port)) {
+               mask_evtchn(port);
+
+               if (close_evtchn(port))
+                       BUG();
+
+               bind_evtchn_to_cpu(port, 0);
+               evtchn_to_irq[port] = -1;
+               cfg->info = mk_irq_info(IRQT_PIRQ, index_from_irq_cfg(cfg),
+                                       0);
+       }
+}
+
+/*
+ * irq_unmask hook of pirq_chip: unmask (and EOI where needed) the IRQ's
+ * event channel; no-op without a valid channel.
+ */
+static void unmask_pirq(struct irq_data *data)
+{
+       unsigned int port = evtchn_from_irq_data(data);
+
+       if (!VALID_EVTCHN(port))
+               return;
+
+       pirq_unmask_and_notify(port, data->irq);
+}
+
+#define mask_pirq mask_dynirq
+
+/*
+ * irq_eoi hook of pirq_chip.
+ *
+ * Enabled IRQ: run irq_move_masked_irq(), then unmask.
+ * Disabled IRQ with IRQS_PENDING set: shut the PIRQ down.
+ * Disabled IRQ otherwise: unmask only.
+ */
+static void end_pirq(struct irq_data *data)
+{
+       if (!irqd_irq_disabled(data)) {
+               irq_move_masked_irq(data);
+               unmask_pirq(data);
+               return;
+       }
+
+       if (irq_to_desc(data->irq)->istate & IRQS_PENDING)
+               shutdown_pirq(data);
+       else
+               unmask_pirq(data);
+}
+
+/* irq_chip callbacks for the "Phys" chip (IRQs backed by a Xen PIRQ). */
+static struct irq_chip pirq_chip = {
+       .name             = "Phys",
+
+       /* Binding / unbinding of the PIRQ's event channel. */
+       .irq_startup      = startup_pirq,
+       .irq_enable       = enable_pirq,
+       .irq_shutdown     = shutdown_pirq,
+
+       /* Flow control on an already bound channel. */
+       .irq_disable      = disable_pirq,
+       .irq_mask         = mask_pirq,
+       .irq_unmask       = unmask_pirq,
+       .irq_eoi          = end_pirq,
+
+       .irq_set_type     = set_type_pirq,
+       .irq_retrigger    = resend_irq_on_evtchn,
+#ifdef CONFIG_SMP
+       .irq_set_affinity = set_affinity_irq,
+#endif
+};
+
+/*
+ * Return 1 when Xen reports @irq as shared (XENIRQSTAT_shared), else 0.
+ * Also returns 0 when not running on Xen, when @irq is not below nr_pirqs,
+ * or when the status query fails.
+ */
+int irq_ignore_unhandled(unsigned int irq)
+{
+       struct physdev_irq_status_query query = { .irq = irq };
+
+       if (is_running_on_xen() && irq < nr_pirqs
+           && HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &query) == 0)
+               return (query.flags & XENIRQSTAT_shared) != 0;
+
+       return 0;
+}
+
+#if defined(CONFIG_SMP) && !defined(PER_CPU_IPI_IRQ)
+/*
+ * Send IPI number @ipi to @cpu.
+ *
+ * Where NMI_VECTOR is defined, an NMI request goes through VCPUOP_send_nmi
+ * (a failure is warned about once).  Any other IPI is flagged in the
+ * target's ipi_pending bitmap; the event channel is notified only if that
+ * flag was newly set and test_evtchn() is false for the channel.
+ */
+void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu)
+{
+       unsigned int port = per_cpu(ipi_evtchn, cpu);
+
+#ifdef NMI_VECTOR
+       if (ipi == NMI_VECTOR) {
+               int err = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL);
+
+               if (err)
+                       pr_warn_once("Unable (%d) to send NMI to CPU#%u\n",
+                                    err, cpu);
+               return;
+       }
+#endif
+
+       if (!VALID_EVTCHN(port))
+               return;
+       if (test_and_set_bit(ipi, per_cpu(ipi_pending, cpu)))
+               return;
+       if (test_evtchn(port))
+               return;
+
+       notify_remote_via_evtchn(port);
+}
+
+/*
+ * Call clear_evtchn() on the current CPU's IPI event channel.  The channel
+ * must be valid (BUG otherwise).
+ */
+void clear_ipi_evtchn(void)
+{
+       unsigned int port = this_cpu_read(ipi_evtchn);
+
+       BUG_ON(!VALID_EVTCHN(port));
+
+       clear_evtchn(port);
+}
+#endif
+
+/*
+ * Notify the remote end of the event channel bound to @irq.
+ *
+ * Warns once and returns if @irq has no irq_cfg.  VIRQ and IPI bindings must
+ * not be used here (BUG).  Does nothing when no valid channel is bound.
+ */
+void notify_remote_via_irq(int irq)
+{
+       const struct irq_cfg *cfg = irq_cfg(irq);
+       unsigned int port;
+
+       if (WARN_ON_ONCE(!cfg))
+               return;
+
+       BUG_ON(type_from_irq_cfg(cfg) == IRQT_VIRQ);
+       BUG_IF_IPI(cfg);
+
+       port = evtchn_from_irq_cfg(cfg);
+       if (!VALID_EVTCHN(port))
+               return;
+
+       notify_remote_via_evtchn(port);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+/*
+ * Multicall flavour of notify_remote_via_irq(): pass @mcl and the event
+ * channel of @irq to multi_notify_remote_via_evtchn().
+ *
+ * Returns 0 on success, -EINVAL if @irq has no irq_cfg (warned once) or no
+ * valid event channel.  VIRQ and IPI bindings are a BUG.
+ */
+int multi_notify_remote_via_irq(multicall_entry_t *mcl, int irq)
+{
+       const struct irq_cfg *cfg = irq_cfg(irq);
+       unsigned int port;
+
+       if (WARN_ON_ONCE(!cfg))
+               return -EINVAL;
+
+       BUG_ON(type_from_irq_cfg(cfg) == IRQT_VIRQ);
+       BUG_IF_IPI(cfg);
+
+       port = evtchn_from_irq_cfg(cfg);
+       if (VALID_EVTCHN(port)) {
+               multi_notify_remote_via_evtchn(mcl, port);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(multi_notify_remote_via_irq);
+#endif
+
+/*
+ * Return the event channel recorded for @irq, or 0 when @irq has no irq_cfg.
+ * The BUG_IF_VIRQ_PER_CPU() and BUG_IF_IPI() checks are applied to the cfg.
+ */
+int irq_to_evtchn_port(int irq)
+{
+       const struct irq_cfg *cfg = irq_cfg(irq);
+
+       if (cfg) {
+               BUG_IF_VIRQ_PER_CPU(cfg);
+               BUG_IF_IPI(cfg);
+               return evtchn_from_irq_cfg(cfg);
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(irq_to_evtchn_port);
+
+/* Set @port's bit in the shared-info event-channel mask. */
+void mask_evtchn(int port)
+{
+       sync_set_bit(port, HYPERVISOR_shared_info->evtchn_mask);
+}
+EXPORT_SYMBOL_GPL(mask_evtchn);
+
+/*
+ * Unmask event channel @port.  Must be called with interrupts disabled
+ * (enforced by BUG_ON).
+ *
+ * A port bound to another CPU is unmasked through the EVTCHNOP_unmask
+ * hypercall.  A port bound to the calling CPU has its mask bit cleared
+ * directly in the shared-info page; if the port is already pending at that
+ * point, its selector bit and the vCPU's upcall-pending flag are set.
+ */
+void unmask_evtchn(int port)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       unsigned int cpu = smp_processor_id();
+
+       BUG_ON(!irqs_disabled());
+
+       /* Slow path (hypercall) if this is a non-local port. */
+       if (unlikely(cpu != cpu_from_evtchn(port))) {
+               struct evtchn_unmask unmask = { .port = port };
+               VOID(HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask));
+               return;
+       }
+
+       sync_clear_bit(port, s->evtchn_mask);
+
+       /* Did we miss an interrupt 'edge'? Re-fire if so. */
+       if (sync_test_bit(port, s->evtchn_pending)) {
+               vcpu_info_t *v = current_vcpu_info();
+
+               /* Flag the upcall only if the selector bit was newly set. */
+               if (!sync_test_and_set_bit(port / BITS_PER_LONG,
+                                          &v->evtchn_pending_sel))
+                       v->evtchn_upcall_pending = 1;
+       }
+}
+EXPORT_SYMBOL_GPL(unmask_evtchn);
+
+/* Mask every event channel that is currently bound to the calling CPU. */
+void disable_all_local_evtchn(void)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+       unsigned int self = smp_processor_id();
+       unsigned int port;
+
+       for (port = 0; port < NR_EVENT_CHANNELS; port++) {
+               if (cpu_from_evtchn(port) != self)
+                       continue;
+               sync_set_bit(port, &shinfo->evtchn_mask[0]);
+       }
+}
+
+/*
+ * Test an irq's pending state: returns 1 if @irq has a valid event channel
+ * for which test_evtchn() is true, 0 otherwise.
+ */
+int xen_test_irq_pending(int irq)
+{
+       unsigned int port = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(port))
+               return 0;
+
+       return test_evtchn(port) ? 1 : 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+#include <linux/syscore_ops.h>
+
+/*
+ * Re-bind every VIRQ for which @cpu has an IRQ recorded: obtain a fresh
+ * event channel from Xen, record the new evtchn <-> IRQ mapping, bind the
+ * channel to @cpu and unmask it.  Called from evtchn_resume().
+ */
+static void restore_cpu_virqs(unsigned int cpu)
+{
+       struct evtchn_bind_virq bind_virq;
+       int virq;
+
+       for (virq = 0; virq < NR_VIRQS; virq++) {
+               int irq = per_cpu(virq_to_irq, cpu)[virq];
+               int port;
+
+               if (irq == -1)
+                       continue;
+
+#ifndef PER_CPU_VIRQ_IRQ
+               /* Skip per-CPU VIRQs with no valid channel recorded here. */
+               if (test_bit(virq, virq_per_cpu)
+                   && !VALID_EVTCHN(per_cpu(virq_to_evtchn, cpu)[virq]))
+                       continue;
+#endif
+
+               BUG_ON(irq_cfg(irq)->info != mk_irq_info(IRQT_VIRQ, virq, 0));
+
+               /* Get a new binding from Xen. */
+               bind_virq.virq = virq;
+               bind_virq.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                               &bind_virq) != 0)
+                       BUG();
+               port = bind_virq.port;
+
+               /* Record the new mapping. */
+               evtchn_to_irq[port] = irq;
+#ifdef PER_CPU_VIRQ_IRQ
+               irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq, port);
+#else
+               if (!test_bit(virq, virq_per_cpu)) {
+                       /* Not per-CPU: every CPU's table gets the channel. */
+                       unsigned int each;
+
+                       irq_cfg(irq)->info = mk_irq_info(IRQT_VIRQ, virq,
+                                                        port);
+                       for_each_possible_cpu(each)
+                               per_cpu(virq_to_evtchn, each)[virq] = port;
+               } else
+                       per_cpu(virq_to_evtchn, cpu)[virq] = port;
+#endif
+               bind_evtchn_to_cpu(port, cpu);
+
+               /* Ready for use. */
+               unmask_evtchn(port);
+       }
+}
+
+/*
+ * Re-bind the IPI event channel(s) of @cpu after resume.  Empty on !SMP.
+ *
+ * With PER_CPU_IPI_IRQ every IPI has its own IRQ and the body below runs as
+ * a loop over all NR_IPIS entries.  Without it there is a single IPI IRQ
+ * (ipi_irq): `ipi' and `irq' are temporarily #define'd so the same body
+ * runs exactly once, and the function returns early if no IPI IRQ exists or
+ * @cpu has no valid IPI channel recorded.
+ */
+static void restore_cpu_ipis(unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+       struct evtchn_bind_ipi bind_ipi;
+       struct irq_data *data;
+       unsigned int evtchn;
+#ifdef PER_CPU_IPI_IRQ
+       int ipi, irq;
+
+       for (ipi = 0; ipi < NR_IPIS; ipi++) {
+               if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1)
+                       continue;
+#else
+#define ipi 0
+#define irq ipi_irq
+               if (irq == -1
+                   || !VALID_EVTCHN(per_cpu(ipi_evtchn, cpu)))
+                       return;
+#endif
+
+               data = irq_get_irq_data(irq);
+               /* The IRQ must still be typed as this IPI, with no channel. */
+               BUG_ON(irq_data_cfg(data)->info != mk_irq_info(IRQT_IPI, ipi, 0));
+
+               /* Get a new binding from Xen. */
+               bind_ipi.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+                                               &bind_ipi) != 0)
+                       BUG();
+               evtchn = bind_ipi.port;
+
+               /* Record the new mapping. */
+               evtchn_to_irq[evtchn] = irq;
+#ifdef PER_CPU_IPI_IRQ
+               irq_data_cfg(data)->info = mk_irq_info(IRQT_IPI, ipi, evtchn);
+#else
+               per_cpu(ipi_evtchn, cpu) = evtchn;
+#endif
+               bind_evtchn_to_cpu(evtchn, cpu);
+
+               /* Ready for use. */
+               if (!irqd_irq_disabled(data))
+                       unmask_evtchn(evtchn);
+#ifdef PER_CPU_IPI_IRQ
+       }
+#else
+#undef irq
+#undef ipi
+#endif
+#endif /* CONFIG_SMP */
+}
+
+/*
+ * syscore resume hook: rebuild all event-channel state after a resume.
+ *
+ * If the current CPU's timer VIRQ channel is still bound as VIRQ_TIMER to
+ * this vCPU, the function returns without changing anything.  Otherwise it
+ * resets the CPU bindings, re-registers the PIRQ EOI page (if in use),
+ * masks every channel, wipes all IRQ <-> event-channel mappings and finally
+ * re-binds the VIRQs and IPIs of every possible CPU.
+ */
+static void evtchn_resume(void)
+{
+       unsigned int cpu, irq, evtchn;
+       struct evtchn_status status;
+
+       /* Avoid doing anything in the 'suspend cancelled' case. */
+       status.dom = DOMID_SELF;
+#ifdef PER_CPU_VIRQ_IRQ
+       status.port = evtchn_from_irq(__this_cpu_read(virq_to_irq[VIRQ_TIMER]));
+#else
+       status.port = __this_cpu_read(virq_to_evtchn[VIRQ_TIMER]);
+#endif
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_status, &status))
+               BUG();
+       if (status.status == EVTCHNSTAT_virq
+           && status.vcpu == smp_processor_id()
+           && status.u.virq == VIRQ_TIMER)
+               return;
+
+       init_evtchn_cpu_bindings();
+
+       /* Hand the pirq_needs_eoi page to Xen again. */
+       if (pirq_eoi_does_unmask) {
+               struct physdev_pirq_eoi_gmfn eoi_gmfn;
+
+               eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
+               if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn))
+                       BUG();
+       }
+
+       /* New event-channel space is not 'live' yet. */
+       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+               mask_evtchn(evtchn);
+
+       /* No IRQ <-> event-channel mappings. */
+       for (irq = 0; irq < nr_irqs; irq++) {
+               struct irq_cfg *cfg = irq_cfg(irq);
+
+               if (!cfg)
+                       continue;
+
+               /* Check that no PIRQs are still bound. */
+#ifdef CONFIG_SPARSE_IRQ
+               if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs)
+                       BUG_ON(type_from_irq_cfg(cfg) == IRQT_PIRQ);
+               else
+#endif
+                       BUG_ON(cfg->info != IRQ_UNBOUND);
+
+               /* Clear the event-channel field; the other info bits stay. */
+               cfg->info &= ~((1U << _EVTCHN_BITS) - 1);
+       }
+       for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
+               evtchn_to_irq[evtchn] = -1;
+
+       for_each_possible_cpu(cpu) {
+               restore_cpu_virqs(cpu);
+               restore_cpu_ipis(cpu);
+       }
+}
+
+/* Syscore hooks: only a resume handler; registered by evtchn_register(). */
+static struct syscore_ops evtchn_syscore_ops = {
+       .resume = evtchn_resume,
+};
+
+/*
+ * core_initcall: register the event-channel resume hook, except in the
+ * initial Xen domain.  Always returns 0.
+ */
+static int __init evtchn_register(void)
+{
+       if (is_initial_xendomain())
+               return 0;
+
+       register_syscore_ops(&evtchn_syscore_ops);
+
+       return 0;
+}
+core_initcall(evtchn_register);
+#endif
+
+/*
+ * Attach the statically allocated irq_cfg entries in _irq_cfg[] as chip
+ * data of the IRQs with the same index.  Always returns 0.
+ */
+int __init arch_early_irq_init(void)
+{
+       unsigned int irq;
+
+       for (irq = 0; irq < ARRAY_SIZE(_irq_cfg); ++irq)
+               irq_set_chip_data(irq, &_irq_cfg[irq]);
+
+       return 0;
+}
+
+/*
+ * Allocate the IRQ descriptor for IRQ @at on @node and return its irq_cfg.
+ *
+ * An already existing descriptor (-EEXIST) is accepted and its chip data is
+ * returned if present.  Any other allocation error yields NULL.  With
+ * CONFIG_SPARSE_IRQ a zeroed irq_cfg is allocated and installed as chip
+ * data; if that allocation fails the descriptor is freed and NULL is
+ * returned.  Without CONFIG_SPARSE_IRQ the result of irq_cfg(@at) is
+ * returned.
+ */
+struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+       struct irq_cfg *cfg = NULL;
+       int rc = irq_alloc_desc_at(at, node);
+
+       if (rc < 0 && rc != -EEXIST)
+               return NULL;
+
+       if (rc < 0) {
+               /* Descriptor already existed: reuse its cfg if it has one. */
+               cfg = irq_get_chip_data(at);
+               if (cfg)
+                       return cfg;
+       }
+
+#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_SMP
+       /* By default all event channels notify CPU#0. */
+       cpumask_copy(irq_get_irq_data(at)->affinity, cpumask_of(0));
+#endif
+
+       cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+       if (!cfg) {
+               irq_free_desc(at);
+               return NULL;
+       }
+
+       irq_set_chip_data(at, cfg);
+       return cfg;
+#else
+       return irq_cfg(at);
+#endif
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_X86_IO_APIC
+#include <asm/io_apic.h>
+#endif
+
+int nr_pirqs = NR_PIRQS;
+EXPORT_SYMBOL_GPL(nr_pirqs);
+
+/*
+ * Size the IRQ space at boot.
+ *
+ * nr_pirqs is capped at the number of GSI-style IRQs (legacy IRQs plus
+ * gsi_top in the initial domain, NR_VECTORS elsewhere).  The number of
+ * additional IRQs starts at 64 + CONFIG_XEN_NR_GUEST_DEVICES, grows by an
+ * MSI allowance when CONFIG_PCI_MSI is set, and is capped at
+ * min(NR_DYNIRQS, NR_EVENT_CHANNELS).  nr_irqs becomes the sum of both,
+ * limited to PAGE_SIZE * 8.
+ *
+ * Returns ARRAY_SIZE(_irq_cfg).
+ */
+int __init arch_probe_nr_irqs(void)
+{
+       const int dyn_limit = min_t(int, NR_DYNIRQS, NR_EVENT_CHANNELS);
+       int nr_dyn = 64 + CONFIG_XEN_NR_GUEST_DEVICES;
+       int nr_irqs_gsi;
+
+       if (!is_initial_xendomain()) {
+               nr_irqs_gsi = NR_VECTORS;
+#ifdef CONFIG_PCI_MSI
+               nr_dyn += max(NR_IRQS_LEGACY * 16, nr_cpu_ids * 8);
+#endif
+       } else {
+               nr_irqs_gsi = NR_IRQS_LEGACY;
+#ifdef CONFIG_X86_IO_APIC
+               nr_irqs_gsi += gsi_top;
+#endif
+#ifdef CONFIG_PCI_MSI
+               nr_dyn += max(nr_irqs_gsi * 16, nr_cpu_ids * 8);
+#endif
+       }
+
+       if (nr_pirqs > nr_irqs_gsi)
+               nr_pirqs = nr_irqs_gsi;
+       if (nr_dyn > dyn_limit)
+               nr_dyn = dyn_limit;
+       nr_irqs = min_t(int, nr_pirqs + nr_dyn, PAGE_SIZE * 8);
+
+       printk(KERN_DEBUG "nr_pirqs: %d\n", nr_pirqs);
+
+       return ARRAY_SIZE(_irq_cfg);
+}
+#endif
+
+#if defined(CONFIG_X86_IO_APIC)
+/*
+ * Make sure @cfg has a vector for PIRQ-range IRQ @irq, asking Xen for one
+ * (PHYSDEVOP_alloc_irq_vector) if none is recorded yet.  @mask is unused.
+ *
+ * Returns 0 on success, -EINVAL if @irq is outside the PIRQ range, -ENOSPC
+ * if the hypercall fails.
+ */
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+       if (irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs)
+               return -EINVAL;
+
+       if (!cfg->vector) {
+               struct physdev_irq op;
+
+               op.irq = irq;
+               if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &op))
+                       return -ENOSPC;
+
+               cfg->vector = op.vector;
+       }
+
+       return 0;
+}
+#define identity_mapped_irq(irq) (!IO_APIC_IRQ((irq) - PIRQ_BASE))
+#elif defined(CONFIG_X86)
+#define identity_mapped_irq(irq) (((irq) - PIRQ_BASE) < NR_IRQS_LEGACY)
+#else
+#define identity_mapped_irq(irq) (1)
+#endif
+
+/*
+ * Set up PIRQ-range IRQ @irq as a PIRQ handled by pirq_chip, using @irq
+ * itself as the Xen PIRQ index.  Identity-mapped IRQs and IRQs that are not
+ * IRQT_UNBOUND are left alone.  @irq outside the PIRQ range is a BUG.
+ */
+void evtchn_register_pirq(int irq)
+{
+       struct irq_cfg *cfg = irq_cfg(irq);
+
+       BUG_ON(irq < PIRQ_BASE || irq - PIRQ_BASE >= nr_pirqs);
+
+       if (identity_mapped_irq(irq))
+               return;
+       if (type_from_irq_cfg(cfg) != IRQT_UNBOUND)
+               return;
+
+       cfg->info = mk_irq_info(IRQT_PIRQ, irq, 0);
+       irq_set_chip_and_handler_name(irq, &pirq_chip, handle_fasteoi_irq,
+                                     "fasteoi");
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Create, remove or verify the mapping between a Linux IRQ and a Xen PIRQ.
+ *
+ * @irq < 0:      allocate an IRQ and bind it to @xen_pirq.  With
+ *                CONFIG_SPARSE_IRQ the IRQ comes from find_unbound_irq();
+ *                otherwise the PIRQ range is scanned downwards for a
+ *                non-identity-mapped IRQ whose cfg index is 0.
+ * @xen_pirq == 0: unmap @irq (must currently be a PIRQ); returns 0.
+ * otherwise:     check that @irq is already mapped to @xen_pirq.
+ *
+ * With CONFIG_SPARSE_IRQ, a non-negative @irq inside the PIRQ range is
+ * rejected with a one-time warning.
+ *
+ * Returns the IRQ number on success (0 for an unmap), or a negative errno:
+ * the find_unbound_irq() error, -ENOMEM, -ENOSPC or -EINVAL.
+ */
+int evtchn_map_pirq(int irq, int xen_pirq)
+{
+       if (irq < 0) {
+#ifdef CONFIG_SPARSE_IRQ
+               struct irq_cfg *cfg;
+
+               spin_lock(&irq_mapping_update_lock);
+               irq = find_unbound_irq(numa_node_id(), &cfg, &pirq_chip,
+                                      false);
+               if (irq >= 0) {
+                       BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND);
+                       cfg->bindcount++;
+                       cfg->info = mk_irq_info(IRQT_PIRQ, xen_pirq, 0);
+               }
+               spin_unlock(&irq_mapping_update_lock);
+               if (irq < 0)
+                       return irq;
+       } else if (irq >= PIRQ_BASE && irq < PIRQ_BASE + nr_pirqs) {
+               WARN_ONCE(1, "Non-MSI IRQ#%d (Xen %d)\n", irq, xen_pirq);
+               return -EINVAL;
+#else
+               static DEFINE_SPINLOCK(irq_alloc_lock);
+
+               /* Scan from the top of the PIRQ range downwards. */
+               irq = PIRQ_BASE + nr_pirqs - 1;
+               spin_lock(&irq_alloc_lock);
+               do {
+                       struct irq_cfg *cfg;
+
+                       if (identity_mapped_irq(irq))
+                               continue;
+                       cfg = alloc_irq_and_cfg_at(irq, numa_node_id());
+                       if (unlikely(!cfg)) {
+                               spin_unlock(&irq_alloc_lock);
+                               return -ENOMEM;
+                       }
+                       /* A zero index marks the slot as free. */
+                       if (!index_from_irq_cfg(cfg)) {
+                               BUG_ON(type_from_irq_cfg(cfg) != IRQT_UNBOUND);
+                               cfg->info = mk_irq_info(IRQT_PIRQ,
+                                                       xen_pirq, 0);
+                               break;
+                       }
+               } while (--irq >= PIRQ_BASE);
+               spin_unlock(&irq_alloc_lock);
+               if (irq < PIRQ_BASE)
+                       return -ENOSPC;
+               irq_set_chip_and_handler_name(irq, &pirq_chip,
+                                             handle_fasteoi_irq, "fasteoi");
+#endif
+       } else if (!xen_pirq) {
+               struct irq_cfg *cfg = irq_cfg(irq);
+
+               if (!cfg || unlikely(type_from_irq_cfg(cfg) != IRQT_PIRQ))
+                       return -EINVAL;
+               /*
+                * dynamic_irq_cleanup(irq) would seem to be the correct thing
+                * here, but cannot be used as we get here also during shutdown
+                * when a driver didn't free_irq() its MSI(-X) IRQ(s), which
+                * then causes a warning in dynamic_irq_cleanup().
+                */
+               irq_set_chip_and_handler(irq, NULL, NULL);
+               cfg->info = IRQ_UNBOUND;
+#ifdef CONFIG_SPARSE_IRQ
+               cfg->bindcount--;
+#endif
+               return 0;
+       } else if (type_from_irq(irq) != IRQT_PIRQ
+                  || index_from_irq(irq) != xen_pirq) {
+               pr_err("IRQ#%d is already mapped to %d:%u - "
+                      "cannot map to PIRQ#%u\n",
+                      irq, type_from_irq(irq), index_from_irq(irq), xen_pirq);
+               return -EINVAL;
+       }
+       /* A mapping whose recorded index is 0 is reported as invalid. */
+       return index_from_irq(irq) ? irq : -EINVAL;
+}
+#endif
+
+/*
+ * Translate Linux IRQ @irq into the Xen PIRQ number.  Identity-mapped IRQs
+ * translate to themselves; any other IRQ must be bound as IRQT_PIRQ (BUG
+ * otherwise) and yields the index stored in its irq_cfg.
+ */
+int evtchn_get_xen_pirq(int irq)
+{
+       struct irq_cfg *cfg = irq_cfg(irq);
+
+       if (!identity_mapped_irq(irq)) {
+               BUG_ON(type_from_irq_cfg(cfg) != IRQT_PIRQ);
+               return index_from_irq_cfg(cfg);
+       }
+
+       return irq;
+}
+
+/*
+ * Boot-time initialisation of the event-channel IRQ layer:
+ *  - mark the per-CPU VIRQs (unless PER_CPU_VIRQ_IRQ is defined),
+ *  - reset the event-channel -> CPU bindings,
+ *  - allocate the pirq_needs_eoi bitmap and offer it to Xen; if Xen accepts
+ *    it (PHYSDEVOP_pirq_eoi_gmfn), pirq_eoi_does_unmask is set,
+ *  - mask all event channels,
+ *  - install dynirq_chip on the dynamic IRQ range (non-sparse only) and
+ *    pirq_chip on the identity-mapped PIRQs.
+ */
+void __init xen_init_IRQ(void)
+{
+       unsigned int i;
+       struct physdev_pirq_eoi_gmfn eoi_gmfn;
+
+#ifndef PER_CPU_VIRQ_IRQ
+       __set_bit(VIRQ_TIMER, virq_per_cpu);
+       __set_bit(VIRQ_DEBUG, virq_per_cpu);
+       __set_bit(VIRQ_XENOPROF, virq_per_cpu);
+#ifdef CONFIG_IA64
+       __set_bit(VIRQ_ITC, virq_per_cpu);
+#endif
+#endif
+
+       init_evtchn_cpu_bindings();
+
+       /* One bit per IRQ (sparse) or per PIRQ, rounded up to whole pages. */
+#ifdef CONFIG_SPARSE_IRQ
+       i = nr_irqs;
+#else
+       i = nr_pirqs;
+#endif
+       i = get_order(sizeof(unsigned long) * BITS_TO_LONGS(i));
+       pirq_needs_eoi = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, i);
+       /*
+        * The bitmap is handed to Xen and dereferenced on every PIRQ unmask;
+        * there is no way to continue without it.
+        */
+       BUG_ON(!pirq_needs_eoi);
+       BUILD_BUG_ON(NR_PIRQS > PAGE_SIZE * 8);
+       eoi_gmfn.gmfn = virt_to_machine(pirq_needs_eoi) >> PAGE_SHIFT;
+       if (HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn, &eoi_gmfn) == 0)
+               pirq_eoi_does_unmask = true;
+
+       /* No event channels are 'live' right now. */
+       for (i = 0; i < NR_EVENT_CHANNELS; i++)
+               mask_evtchn(i);
+
+#ifndef CONFIG_SPARSE_IRQ
+       for (i = DYNIRQ_BASE; i < (DYNIRQ_BASE + NR_DYNIRQS); i++) {
+               irq_set_noprobe(i);
+               irq_set_chip_and_handler_name(i, &dynirq_chip,
+                                             handle_fasteoi_irq, "fasteoi");
+       }
+
+       for (i = PIRQ_BASE; i < (PIRQ_BASE + nr_pirqs); i++) {
+#else
+       for (i = PIRQ_BASE; i < (PIRQ_BASE + NR_IRQS_LEGACY); i++) {
+#endif
+               /* Only identity-mapped PIRQs are set up statically here. */
+               if (!identity_mapped_irq(i))
+                       continue;
+
+#ifdef RTC_IRQ
+               /* If not domain 0, force our RTC driver to fail its probe. */
+               if (i - PIRQ_BASE == RTC_IRQ && !is_initial_xendomain())
+                       continue;
+#endif
+
+               irq_set_chip_and_handler_name(i, &pirq_chip,
+                                             handle_fasteoi_irq, "fasteoi");
+       }
+}
diff --git a/drivers/xen/core/firmware.c b/drivers/xen/core/firmware.c

new file mode 100644 (file)

index 0000000..2f851ee
--- /dev/null
+++ b/drivers/xen/core/firmware.c
@@ -0,0 +1,75 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <video/edid.h>
+#include <xen/interface/platform.h>
+#include <asm/hypervisor.h>
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+/*
+ * Fill the global `edd' structure from Xen's XENPF_firmware_info platform
+ * op (initial domain only): first the per-disk EDD records
+ * (XEN_FW_DISK_INFO), then the MBR signatures (XEN_FW_DISK_MBR_SIGNATURE).
+ * Each query loop ends when the hypercall fails or the respective table in
+ * `edd' is full.
+ */
+void __init copy_edd(void)
+{
+       struct xen_platform_op op;
+
+       if (!is_initial_xendomain())
+               return;
+
+       op.cmd = XENPF_firmware_info;
+
+       /* Pass 1: EDD disk information. */
+       op.u.firmware_info.type = XEN_FW_DISK_INFO;
+       op.u.firmware_info.index = 0;
+       while (edd.edd_info_nr < EDDMAXNR) {
+               struct edd_info *info = &edd.edd_info[edd.edd_info_nr];
+
+               info->params.length = sizeof(info->params);
+               set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
+                                    &info->params);
+               if (HYPERVISOR_platform_op(&op))
+                       break;
+
+               info->device = op.u.firmware_info.u.disk_info.device;
+               info->version = op.u.firmware_info.u.disk_info.version;
+               info->interface_support =
+                       op.u.firmware_info.u.disk_info.interface_support;
+               info->legacy_max_cylinder =
+                       op.u.firmware_info.u.disk_info.legacy_max_cylinder;
+               info->legacy_max_head =
+                       op.u.firmware_info.u.disk_info.legacy_max_head;
+               info->legacy_sectors_per_track =
+                       op.u.firmware_info.u.disk_info.legacy_sectors_per_track;
+
+               edd.edd_info_nr++;
+               op.u.firmware_info.index++;
+       }
+
+       /* Pass 2: MBR signatures. */
+       op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
+       op.u.firmware_info.index = 0;
+       while (edd.mbr_signature_nr < EDD_MBR_SIG_MAX) {
+               if (HYPERVISOR_platform_op(&op))
+                       break;
+
+               edd.mbr_signature[edd.mbr_signature_nr++] =
+                       op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
+               op.u.firmware_info.index++;
+       }
+}
+#endif
+
+/*
+ * Ask Xen (XEN_FW_VBEDDC_INFO) to fill edid_info.dummy; on failure the
+ * buffer is filled with 0x13 bytes.  Only done in the initial domain, and
+ * compiled to an empty function unless CONFIG_FIRMWARE_EDID and CONFIG_X86
+ * are both set.
+ */
+void __init copy_edid(void)
+{
+#if defined(CONFIG_FIRMWARE_EDID) && defined(CONFIG_X86)
+       struct xen_platform_op op;
+
+       if (!is_initial_xendomain())
+               return;
+
+       op.cmd = XENPF_firmware_info;
+       op.u.firmware_info.type = XEN_FW_VBEDDC_INFO;
+       op.u.firmware_info.index = 0;
+       set_xen_guest_handle(op.u.firmware_info.u.vbeddc_info.edid,
+                            edid_info.dummy);
+
+       if (HYPERVISOR_platform_op(&op) == 0)
+               return;
+
+       memset(edid_info.dummy, 0x13, sizeof(edid_info.dummy));
+#endif
+}
diff --git a/drivers/xen/core/gnttab.c b/drivers/xen/core/gnttab.c

new file mode 100644 (file)

index 0000000..7f1b507
--- /dev/null
+++ b/drivers/xen/core/gnttab.c
@@ -0,0 +1,975 @@
+/******************************************************************************
+ * gnttab.c
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/seqlock.h>
+#include <linux/timer.h>
+#include <xen/interface/xen.h>
+#include <xen/gnttab.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/cmpxchg.h>
+#include <asm/io.h>
+#include <xen/interface/memory.h>
+#include <asm/gnttab_dma.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+/* External tools reserve first few grant table entries. */
+#define NR_RESERVED_ENTRIES 8
+/* Link value that terminates a chain of grant references. */
+#define GNTTAB_LIST_END 0xffffffff
+/* Number of grant_entry_t records that fit into one page. */
+#define ENTRIES_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t))
+
+/* Table of page-sized arrays holding the per-reference "next" links. */
+static grant_ref_t **gnttab_list;
+/* Number of grant frames currently accounted for by this driver. */
+static unsigned int nr_grant_frames;
+/* Frame limit recorded by gnttab_init(); caps max_nr_grant_frames(). */
+static unsigned int boot_max_nr_grant_frames;
+/* Length and first element of the global chain of free references. */
+static int gnttab_free_count;
+static grant_ref_t gnttab_free_head;
+/* Taken around updates of the free chain, callback list and deferred list. */
+static DEFINE_SPINLOCK(gnttab_list_lock);
+
+/* Base of the grant entry array; set up by gnttab_map()/gnttab_resume(). */
+static struct grant_entry *shared;
+
+/* Singly linked list of callbacks waiting for enough free entries. */
+static struct gnttab_free_callback *gnttab_free_callback_list;
+
+static int gnttab_expand(unsigned int req_entries);
+
+/* Number of grant_ref_t links stored per page of gnttab_list. */
+#define RPP (PAGE_SIZE / sizeof(grant_ref_t))
+/* Lvalue naming the link slot that belongs to grant reference 'entry'. */
+#define gnttab_entry(entry) (gnttab_list[(entry) / RPP][(entry) % RPP])
+
+/* Pages of gnttab_list needed to hold links for 'grant_frames' frames. */
+#define nr_freelist_frames(grant_frames)                               \
+       (((grant_frames) * ENTRIES_PER_GRANT_FRAME + RPP - 1) / RPP)
+
+static int get_free_entries(int count)
+{
+       unsigned long flags;
+       int ref, rc;
+       grant_ref_t head;
+
+       spin_lock_irqsave(&gnttab_list_lock, flags);
+
+       if ((gnttab_free_count < count) &&
+           ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) {
+               spin_unlock_irqrestore(&gnttab_list_lock, flags);
+               return rc;
+       }
+
+       ref = head = gnttab_free_head;
+       gnttab_free_count -= count;
+       while (count-- > 1)
+               head = gnttab_entry(head);
+       gnttab_free_head = gnttab_entry(head);
+       gnttab_entry(head) = GNTTAB_LIST_END;
+
+       spin_unlock_irqrestore(&gnttab_list_lock, flags);
+
+       return ref;
+}
+
+#define get_free_entry() get_free_entries(1)
+
+/*
+ * Run every queued callback whose requested entry count is currently
+ * available; callbacks that still cannot be satisfied are put back on
+ * gnttab_free_callback_list.
+ */
+static void do_free_callbacks(void)
+{
+       struct gnttab_free_callback *cb = gnttab_free_callback_list;
+       struct gnttab_free_callback *pending;
+
+       gnttab_free_callback_list = NULL;
+
+       for (; cb != NULL; cb = pending) {
+               pending = cb->next;
+
+               if (gnttab_free_count < cb->count) {
+                       /* Not enough entries yet: requeue. */
+                       cb->next = gnttab_free_callback_list;
+                       gnttab_free_callback_list = cb;
+                       continue;
+               }
+
+               cb->next = NULL;
+               cb->queued = 0;
+               cb->fn(cb->arg);
+       }
+}
+
+/* Process the callback list, if there is one. */
+static inline void check_free_callbacks(void)
+{
+       if (likely(!gnttab_free_callback_list))
+               return;
+
+       do_free_callbacks();
+}
+
+/* Push 'ref' onto the front of the free chain and run waiting callbacks. */
+static void put_free_entry(grant_ref_t ref)
+{
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&gnttab_list_lock, irq_state);
+
+       gnttab_free_count++;
+       gnttab_entry(ref) = gnttab_free_head;
+       gnttab_free_head = ref;
+
+       check_free_callbacks();
+
+       spin_unlock_irqrestore(&gnttab_list_lock, irq_state);
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+
+/*
+ * Allocate a free grant reference and use it to grant domain 'domid'
+ * access to 'frame'.
+ *
+ * 'flags' is OR-ed with GTF_permit_access and must not contain
+ * GTF_accept_transfer, GTF_reading or GTF_writing.
+ *
+ * Returns the new grant reference, or -ENOSPC if none could be obtained.
+ */
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+                               int flags)
+{
+       int ref = get_free_entry();
+
+       if (unlikely(ref < 0))
+               return -ENOSPC;
+
+       /* Share the entry set-up with the caller-supplied-ref variant. */
+       gnttab_grant_foreign_access_ref(ref, domid, frame, flags);
+
+       return ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
+
+/*
+ * Fill in the caller-supplied grant entry 'ref' so that domain 'domid' is
+ * permitted access to 'frame'.  'flags' is OR-ed with GTF_permit_access
+ * and must not contain GTF_accept_transfer, GTF_reading or GTF_writing
+ * (BUG otherwise).
+ */
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+                                    unsigned long frame, int flags)
+{
+       shared[ref].frame = frame;
+       shared[ref].domid = domid;
+       /* frame and domid are written before the flags word is stored. */
+       wmb();
+       BUG_ON(flags & (GTF_accept_transfer | GTF_reading | GTF_writing));
+       shared[ref].flags = GTF_permit_access | flags;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref);
+
+
+/*
+ * Report whether grant entry 'ref' currently has GTF_reading or
+ * GTF_writing set.  Returns those flag bits (non-zero if either is set).
+ */
+int gnttab_query_foreign_access(grant_ref_t ref)
+{
+       u16 in_use = shared[ref].flags;
+
+       in_use &= GTF_reading | GTF_writing;
+
+       return in_use;
+}
+EXPORT_SYMBOL_GPL(gnttab_query_foreign_access);
+
+/*
+ * Try to clear the flags word of grant entry 'ref'.
+ *
+ * Returns 1 once the flags have been atomically replaced by 0, or 0
+ * (leaving the entry untouched) as soon as GTF_reading or GTF_writing is
+ * seen in the flags.
+ */
+static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref)
+{
+       u16 flags, nflags;
+
+       nflags = shared[ref].flags;
+       do {
+               /* 'flags' is the value the exchange below expects to find. */
+               if ((flags = nflags) & (GTF_reading|GTF_writing))
+                       return 0;
+       } while ((nflags = sync_cmpxchg(&shared[ref].flags, flags, 0)) !=
+                flags);
+
+       return 1;
+}
+
+/*
+ * Revoke the access grant held in entry 'ref'.  Returns 1 on success; if
+ * the entry is still in use a debug message is logged and 0 is returned.
+ */
+int gnttab_end_foreign_access_ref(grant_ref_t ref)
+{
+       if (!_gnttab_end_foreign_access_ref(ref)) {
+               printk(KERN_DEBUG "WARNING: g.e. %#x still in use!\n", ref);
+               return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref);
+
+/* A grant reference whose release had to be postponed because it was busy. */
+struct deferred_entry {
+       struct list_head list;  /* link in deferred_list */
+       grant_ref_t ref;        /* reference still waiting to be ended */
+       uint16_t warn_delay;    /* retries left before "still pending" is logged */
+       struct page *page;      /* page to free with the reference, or NULL */
+};
+/* Pending entries; modified under gnttab_list_lock. */
+static LIST_HEAD(deferred_list);
+static void gnttab_handle_deferred(unsigned long);
+/* Timer that runs gnttab_handle_deferred() to retry the pending entries. */
+static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0);
+
+/*
+ * Timer callback that retries releasing deferred grant references.
+ *
+ * At most 10 entries are examined per invocation.  Each entry is taken off
+ * deferred_list, the lock is dropped while the release is attempted, and
+ * entries that are still busy are appended to the list again.  The timer
+ * is re-armed for one second later (HZ jiffies) while entries remain.
+ */
+static void gnttab_handle_deferred(unsigned long unused)
+{
+       unsigned int nr = 10;
+       /* First entry found still busy in this run; seeing it again ends the run. */
+       struct deferred_entry *first = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&gnttab_list_lock, flags);
+       while (nr--) {
+               struct deferred_entry *entry
+                       = list_first_entry(&deferred_list,
+                                          struct deferred_entry, list);
+
+               if (entry == first)
+                       break;
+               list_del(&entry->list);
+               /* put_free_entry() takes gnttab_list_lock itself. */
+               spin_unlock_irqrestore(&gnttab_list_lock, flags);
+               if (_gnttab_end_foreign_access_ref(entry->ref)) {
+                       put_free_entry(entry->ref);
+                       if (entry->page) {
+                               printk(KERN_DEBUG
+                                      "freeing g.e. %#x (pfn %#lx)\n",
+                                      entry->ref, page_to_pfn(entry->page));
+                               __free_page(entry->page);
+                       } else
+                               printk(KERN_DEBUG "freeing g.e. %#x\n",
+                                      entry->ref);
+                       kfree(entry);
+                       /* NULL marks the entry as done so it is not re-queued. */
+                       entry = NULL;
+               } else {
+                       /* Log once, when the countdown set at queue time hits zero. */
+                       if (!--entry->warn_delay)
+                               pr_info("g.e. %#x still pending\n",
+                                       entry->ref);
+                       if (!first)
+                               first = entry;
+               }
+               spin_lock_irqsave(&gnttab_list_lock, flags);
+               if (entry)
+                       list_add_tail(&entry->list, &deferred_list);
+               else if (list_empty(&deferred_list))
+                       break;
+       }
+       if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
+               deferred_timer.expires = jiffies + HZ;
+               add_timer(&deferred_timer);
+       }
+       spin_unlock_irqrestore(&gnttab_list_lock, flags);
+}
+
+/*
+ * Queue 'ref' (and optionally 'page') for a later release attempt and make
+ * sure the retry timer is running.  If no tracking entry can be allocated
+ * the reference is not queued and a "leaking" warning is logged instead.
+ */
+static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
+{
+       struct deferred_entry *item;
+       const char *verb;
+       unsigned long irq_state;
+
+       item = kmalloc(sizeof(*item), GFP_ATOMIC);
+       if (item == NULL) {
+               verb = KERN_WARNING "leaking";
+       } else {
+               item->ref = ref;
+               item->page = page;
+               item->warn_delay = 60;
+
+               spin_lock_irqsave(&gnttab_list_lock, irq_state);
+               list_add_tail(&item->list, &deferred_list);
+               if (!timer_pending(&deferred_timer)) {
+                       deferred_timer.expires = jiffies + HZ;
+                       add_timer(&deferred_timer);
+               }
+               spin_unlock_irqrestore(&gnttab_list_lock, irq_state);
+
+               verb = KERN_DEBUG "deferring";
+       }
+
+       printk("%s g.e. %#x (pfn %lx)\n", verb,
+              ref, page ? page_to_pfn(page) : -1);
+}
+
+/*
+ * End the access grant 'ref' and release the reference.  'page' is the
+ * address of a page to free along with it, or 0 for none.  If the grant
+ * is still in use, both are handed to the deferred-release machinery.
+ */
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page)
+{
+       if (!gnttab_end_foreign_access_ref(ref)) {
+               gnttab_add_deferred(ref, page ? virt_to_page(page) : NULL);
+               return;
+       }
+
+       put_free_entry(ref);
+       if (page)
+               free_page(page);
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
+
+/*
+ * Allocate a grant reference and set it up to accept a page transfer from
+ * domain 'domid' at 'pfn'.  Returns the reference, or -ENOSPC if none is
+ * available.
+ */
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
+{
+       int new_ref = get_free_entry();
+
+       if (unlikely(new_ref < 0))
+               return -ENOSPC;
+
+       gnttab_grant_foreign_transfer_ref(new_ref, domid, pfn);
+       return new_ref;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
+
+/*
+ * Fill in the caller-supplied grant entry 'ref' with 'pfn' and 'domid' and
+ * mark it GTF_accept_transfer.
+ */
+void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
+                                      unsigned long pfn)
+{
+       shared[ref].frame = pfn;
+       shared[ref].domid = domid;
+       /* frame and domid are written before the flags word is stored. */
+       wmb();
+       shared[ref].flags = GTF_accept_transfer;
+}
+EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
+
+/*
+ * Finish a transfer grant on entry 'ref'.
+ *
+ * Returns 0 if the transfer had not been committed and the entry's flags
+ * could be cleared; otherwise busy-waits until GTF_transfer_completed is
+ * set and returns the frame number found in the entry.  The grant
+ * reference itself is not put back on the free chain here.
+ */
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
+{
+       unsigned long frame;
+       u16           flags;
+
+       /*
+        * If a transfer is not even yet started, try to reclaim the grant
+        * reference and return failure (== 0).
+        */
+       while (!((flags = shared[ref].flags) & GTF_transfer_committed)) {
+               if (sync_cmpxchg(&shared[ref].flags, flags, 0) == flags)
+                       return 0;
+               cpu_relax();
+       }
+
+       /* If a transfer is in progress then wait until it is completed. */
+       while (!(flags & GTF_transfer_completed)) {
+               flags = shared[ref].flags;
+               cpu_relax();
+       }
+
+       /* Read the frame number /after/ reading completion status. */
+       rmb();
+       frame = shared[ref].frame;
+       /* A completed transfer is expected to leave a non-zero frame here. */
+       BUG_ON(frame == 0);
+
+       return frame;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
+
+/*
+ * Finish the transfer grant 'ref' and put the reference back on the free
+ * chain.  Returns what gnttab_end_foreign_transfer_ref() returned.
+ */
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
+{
+       unsigned long result;
+
+       result = gnttab_end_foreign_transfer_ref(ref);
+       put_free_entry(ref);
+
+       return result;
+}
+EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
+
+/* Return the single grant reference 'ref' to the global free chain. */
+void gnttab_free_grant_reference(grant_ref_t ref)
+{
+       put_free_entry(ref);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_reference);
+
+/*
+ * Return a whole chain of grant references, starting at 'head' and ending
+ * at the GNTTAB_LIST_END link, to the global free chain.  An empty chain
+ * (head == GNTTAB_LIST_END) is a no-op.
+ */
+void gnttab_free_grant_references(grant_ref_t head)
+{
+       grant_ref_t tail;
+       unsigned long irq_state;
+       int chain_len;
+
+       if (head == GNTTAB_LIST_END)
+               return;
+
+       spin_lock_irqsave(&gnttab_list_lock, irq_state);
+
+       /* Find the last element and count the chain while walking it. */
+       chain_len = 1;
+       for (tail = head; gnttab_entry(tail) != GNTTAB_LIST_END;
+            tail = gnttab_entry(tail))
+               chain_len++;
+
+       /* Splice the chain in front of the current free chain. */
+       gnttab_entry(tail) = gnttab_free_head;
+       gnttab_free_head = head;
+       gnttab_free_count += chain_len;
+
+       check_free_callbacks();
+
+       spin_unlock_irqrestore(&gnttab_list_lock, irq_state);
+}
+EXPORT_SYMBOL_GPL(gnttab_free_grant_references);
+
+/*
+ * Reserve a private chain of 'count' grant references and store its first
+ * element in '*head'.  Returns 0 on success or -ENOSPC on failure, in
+ * which case '*head' is left untouched.
+ */
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *head)
+{
+       int first = get_free_entries(count);
+
+       if (first >= 0) {
+               *head = first;
+               return 0;
+       }
+
+       return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references);
+
+/* Returns non-zero if the private chain '*private_head' has no entries. */
+int gnttab_empty_grant_references(const grant_ref_t *private_head)
+{
+       return (*private_head == GNTTAB_LIST_END);
+}
+EXPORT_SYMBOL_GPL(gnttab_empty_grant_references);
+
+/*
+ * Take the first reference off the private chain '*private_head'.
+ * Returns that reference, or -ENOSPC if the chain is empty.  No lock is
+ * taken here.
+ */
+int gnttab_claim_grant_reference(grant_ref_t *private_head)
+{
+       grant_ref_t claimed = *private_head;
+
+       if (likely(claimed != GNTTAB_LIST_END)) {
+               *private_head = gnttab_entry(claimed);
+               return claimed;
+       }
+
+       return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference);
+
+/*
+ * Push 'release' back onto the front of the private chain
+ * '*private_head'.  No lock is taken here.
+ */
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+                                   grant_ref_t release)
+{
+       gnttab_entry(release) = *private_head;
+       *private_head = release;
+}
+EXPORT_SYMBOL_GPL(gnttab_release_grant_reference);
+
+/*
+ * Register 'callback' so that fn(arg) is invoked once at least 'count'
+ * grant references are free.  A callback that is already queued is left
+ * as it is.  The list is checked right away, so 'fn' may run before this
+ * function returns.
+ */
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+                                 void (*fn)(void *), void *arg, u16 count)
+{
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&gnttab_list_lock, irq_state);
+
+       if (!callback->queued) {
+               callback->fn = fn;
+               callback->arg = arg;
+               callback->count = count;
+               callback->queued = 1;
+
+               /* Insert at the head of the waiting list. */
+               callback->next = gnttab_free_callback_list;
+               gnttab_free_callback_list = callback;
+
+               check_free_callbacks();
+       }
+
+       spin_unlock_irqrestore(&gnttab_list_lock, irq_state);
+}
+EXPORT_SYMBOL_GPL(gnttab_request_free_callback);
+
+/*
+ * Remove 'callback' from the waiting list if it is on it and clear its
+ * 'queued' mark.  Does nothing if the callback is not found.
+ */
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback)
+{
+       struct gnttab_free_callback **link = &gnttab_free_callback_list;
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&gnttab_list_lock, irq_state);
+
+       while (*link != NULL) {
+               if (*link != callback) {
+                       link = &(*link)->next;
+                       continue;
+               }
+
+               /* Unlink by pointing the predecessor past the callback. */
+               *link = callback->next;
+               callback->queued = 0;
+               break;
+       }
+
+       spin_unlock_irqrestore(&gnttab_list_lock, irq_state);
+}
+EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback);
+
+/*
+ * Extend the free-list bookkeeping by 'more_frames' grant frames.
+ *
+ * Allocates any additional gnttab_list pages that are needed, chains the
+ * new grant references together, puts them in front of the existing free
+ * chain and updates nr_grant_frames.  Waiting callbacks are then checked.
+ *
+ * Returns 0 on success, or -ENOMEM if a list page could not be allocated;
+ * in that case the pages allocated by this call are freed again and no
+ * counters are changed.
+ */
+static int grow_gnttab_list(unsigned int more_frames)
+{
+       unsigned int new_nr_grant_frames, extra_entries, i;
+       unsigned int nr_glist_frames, new_nr_glist_frames;
+
+       new_nr_grant_frames = nr_grant_frames + more_frames;
+       extra_entries       = more_frames * ENTRIES_PER_GRANT_FRAME;
+
+       nr_glist_frames = nr_freelist_frames(nr_grant_frames);
+       new_nr_glist_frames = nr_freelist_frames(new_nr_grant_frames);
+       for (i = nr_glist_frames; i < new_nr_glist_frames; i++) {
+               gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC);
+               if (!gnttab_list[i])
+                       goto grow_nomem;
+       }
+
+       /* Link every new reference to its successor ... */
+       for (i = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
+            i < ENTRIES_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++)
+               gnttab_entry(i) = i + 1;
+
+       /* ... and the last new one to the old head of the free chain. */
+       gnttab_entry(i) = gnttab_free_head;
+       gnttab_free_head = ENTRIES_PER_GRANT_FRAME * nr_grant_frames;
+       gnttab_free_count += extra_entries;
+
+       nr_grant_frames = new_nr_grant_frames;
+
+       check_free_callbacks();
+
+       return 0;
+
+grow_nomem:
+       /*
+        * 'i' is the slot whose allocation failed.  Free only the slots
+        * below it that this call filled; testing before the decrement
+        * keeps the unsigned counter from wrapping in the comparison.
+        */
+       while (i-- > nr_glist_frames)
+               free_page((unsigned long) gnttab_list[i]);
+       return -ENOMEM;
+}
+
+/*
+ * Ask the hypervisor (GNTTABOP_query_size) for the maximum number of
+ * grant frames of this domain.  Falls back to 4 if the query fails.
+ */
+static unsigned int __max_nr_grant_frames(void)
+{
+       struct gnttab_query_size query;
+       int rc;
+
+       query.dom = DOMID_SELF;
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1);
+
+       if (rc >= 0 && query.status == GNTST_okay)
+               return query.max_nr_frames;
+
+       /* Legacy max supported number of frames */
+       return 4;
+}
+
+/*
+ * Current frame limit: the hypervisor's present answer, capped at the
+ * value recorded in boot_max_nr_grant_frames.
+ */
+static inline unsigned int max_nr_grant_frames(void)
+{
+       unsigned int limit = __max_nr_grant_frames();
+
+       return (limit > boot_max_nr_grant_frames) ? boot_max_nr_grant_frames
+                                                 : limit;
+}
+
+#ifdef CONFIG_XEN
+
+#ifdef CONFIG_X86
+/*
+ * apply_to_page_range() callback: point the PTE for 'addr' at the next
+ * machine frame of the array that 'data' refers to, then advance that
+ * array pointer by one element.  Always returns 0.
+ */
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+                     unsigned long addr, void *data)
+{
+       unsigned long **cursor = data;
+       unsigned long mfn = **cursor;
+
+       set_pte_at(&init_mm, addr, pte, pfn_pte_ma(mfn, PAGE_KERNEL));
+       ++*cursor;
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * apply_to_page_range() callback: clear the PTE for 'addr' in init_mm.
+ * 'pmd_page' and 'data' are unused.  Always returns 0.
+ */
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+                       unsigned long addr, void *data)
+{
+
+       set_pte_at(&init_mm, addr, pte, __pte(0));
+       return 0;
+}
+#endif
+
+/*
+ * Reserve a kernel virtual area big enough for max_nr_grant_frames()
+ * pages and return its start address.  'frames' is not used here.
+ * Failure to reserve the area is fatal (BUG).
+ */
+void *arch_gnttab_alloc_shared(unsigned long *frames)
+{
+       struct vm_struct *vm;
+
+       vm = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames(), NULL);
+       BUG_ON(!vm);
+
+       return vm->addr;
+}
+#endif /* CONFIG_X86 */
+
+/*
+ * Set up the grant table for frames 0..end_idx and map it at 'shared'.
+ * 'start_idx' is not used by this variant.
+ *
+ * Returns 0 on success, -ENOMEM if the temporary frame array cannot be
+ * allocated, or -ENOSYS if the hypercall reports -ENOSYS.  Any other
+ * hypercall failure is fatal (BUG).
+ */
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+       struct gnttab_setup_table setup;
+       unsigned long *frames;
+       unsigned int nr_gframes = end_idx + 1;
+       int rc;
+
+       /* Scratch array the hypercall fills with one entry per frame. */
+       frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
+       if (!frames)
+               return -ENOMEM;
+
+       setup.dom        = DOMID_SELF;
+       setup.nr_frames  = nr_gframes;
+       set_xen_guest_handle(setup.frame_list, frames);
+
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1);
+       if (rc == -ENOSYS) {
+               kfree(frames);
+               return -ENOSYS;
+       }
+
+       BUG_ON(rc || setup.status != GNTST_okay);
+
+       /* The virtual area is obtained only on the first call. */
+       if (shared == NULL)
+               shared = arch_gnttab_alloc_shared(frames);
+
+#ifdef CONFIG_X86
+       rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+                                PAGE_SIZE * nr_gframes,
+                                map_pte_fn, &frames);
+       BUG_ON(rc);
+       /* map_pte_fn() advanced 'frames' once per page; rewind before kfree(). */
+       frames -= nr_gframes; /* adjust after map_pte_fn() */
+#endif /* CONFIG_X86 */
+
+       kfree(frames);
+
+       return 0;
+}
+
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+
+static DEFINE_SEQLOCK(gnttab_dma_lock);
+
+/*
+ * Release routine installed via SetPageForeign() in
+ * gnttab_copy_grant_page().  Only order-0 pages are accepted (BUG
+ * otherwise).  The foreign mark is cleared, the page counters are reset,
+ * the reserved flag is cleared and the page reference is dropped.
+ */
+static void gnttab_page_free(struct page *page, unsigned int order)
+{
+       BUG_ON(order);
+       ClearPageForeign(page);
+       gnttab_reset_grant_page(page);
+       ClearPageReserved(page);
+       put_page(page);
+}
+
+/*
+ * Must not be called with IRQs off.  This should only be used on the
+ * slow path.
+ *
+ * Copy a foreign granted page to local memory.
+ *
+ * On success '*pagep' is replaced by a newly allocated page holding a
+ * copy of the data, and 0 is returned.  Returns -ENOENT if no reference
+ * on the original page could be taken, -ENOMEM if no new page could be
+ * allocated, and -EBUSY if page_mapped() reports the original page as
+ * mapped.  Hypercall failures are fatal (BUG).
+ */
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep)
+{
+       struct gnttab_unmap_and_replace unmap;
+       mmu_update_t mmu;
+       struct page *page;
+       struct page *new_page;
+       void *new_addr;
+       void *addr;
+       paddr_t pfn;
+       maddr_t mfn;
+       maddr_t new_mfn;
+       int err;
+
+       page = *pagep;
+       /* Hold a reference on the original page for the duration. */
+       if (!get_page_unless_zero(page))
+               return -ENOENT;
+
+       err = -ENOMEM;
+       new_page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+       if (!new_page)
+               goto out;
+
+       new_addr = page_address(new_page);
+       addr = page_address(page);
+       copy_page(new_addr, addr);
+
+       pfn = page_to_pfn(page);
+       mfn = pfn_to_mfn(pfn);
+       new_mfn = virt_to_mfn(new_addr);
+
+       /* Writer side of the seqlock read in __gnttab_dma_map_page(). */
+       write_seqlock_bh(&gnttab_dma_lock);
+
+       /* Make seq visible before checking page_mapped. */
+       smp_mb();
+
+       /* Has the page been DMA-mapped? */
+       if (unlikely(page_mapped(page))) {
+               write_sequnlock_bh(&gnttab_dma_lock);
+               put_page(new_page);
+               err = -EBUSY;
+               goto out;
+       }
+
+       /* Make the original pfn refer to the machine frame of the copy. */
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               set_phys_to_machine(pfn, new_mfn);
+
+       gnttab_set_replace_op(&unmap, (unsigned long)addr,
+                             (unsigned long)new_addr, ref);
+
+       err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+                                       &unmap, 1);
+       BUG_ON(err);
+       BUG_ON(unmap.status != GNTST_okay);
+
+       write_sequnlock_bh(&gnttab_dma_lock);
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               /* The new page's own pfn no longer has a machine frame. */
+               set_phys_to_machine(page_to_pfn(new_page), INVALID_P2M_ENTRY);
+
+               /* Record in the machine-to-phys table that new_mfn backs pfn. */
+               mmu.ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+               mmu.val = pfn;
+               err = HYPERVISOR_mmu_update(&mmu, 1, NULL, DOMID_SELF);
+               BUG_ON(err);
+       }
+
+       /* Carry mapping, index and flags over to the page handed back. */
+       new_page->mapping = page->mapping;
+       new_page->index = page->index;
+       set_bit(PG_foreign, &new_page->flags);
+       if (PageReserved(page))
+               SetPageReserved(new_page);
+       *pagep = new_page;
+
+       /* The original page is released through gnttab_page_free(). */
+       SetPageForeign(page, gnttab_page_free);
+       page->mapping = NULL;
+
+out:
+       put_page(page);
+       return err;
+}
+EXPORT_SYMBOL_GPL(gnttab_copy_grant_page);
+
+/*
+ * Reset the reference count and map count of 'page' to their initial
+ * values via init_page_count() and reset_page_mapcount().
+ */
+void gnttab_reset_grant_page(struct page *page)
+{
+       init_page_count(page);
+       reset_page_mapcount(page);
+}
+EXPORT_SYMBOL_GPL(gnttab_reset_grant_page);
+
+/*
+ * Keep track of foreign pages marked as PageForeign so that we don't
+ * return them to the remote domain prematurely.
+ *
+ * PageForeign pages are pinned down by increasing their mapcount.
+ *
+ * All other pages are simply returned as is.
+ *
+ * Nothing is done when not running on Xen or when 'page' is not
+ * PageForeign.  The loop repeats if gnttab_dma_lock was write-locked
+ * meanwhile (as gnttab_copy_grant_page() does).
+ */
+void __gnttab_dma_map_page(struct page *page)
+{
+       unsigned int seq;
+
+       if (!is_running_on_xen() || !PageForeign(page))
+               return;
+
+       do {
+               seq = read_seqbegin(&gnttab_dma_lock);
+
+               /* NOTE(review): presumably true once the page is backed locally - confirm. */
+               if (gnttab_dma_local_pfn(page))
+                       break;
+
+               /* Mark the page as mapped so page_mapped() sees it in use. */
+               atomic_set(&page->_mapcount, 0);
+
+               /* Make _mapcount visible before read_seqretry. */
+               smp_mb();
+       } while (unlikely(read_seqretry(&gnttab_dma_lock, seq)));
+}
+
+#endif /* CONFIG_XEN_BACKEND */
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+
+static unsigned int GNTMAP_pte_special;
+
+/*
+ * Prepare a batch of 'count' map operations before they are issued.
+ *
+ * Only GNTTABOP_map_grant_ref batches are looked at.  For every operation
+ * that has both GNTMAP_host_map and GNTMAP_application_map set, the
+ * GNTMAP_pte_special bits are added to its flags when they are available.
+ *
+ * Returns true as soon as such an operation is found while
+ * GNTMAP_pte_special is zero (operations after it are not examined),
+ * false otherwise.
+ */
+bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *map,
+                          unsigned int count)
+{
+       struct gnttab_map_grant_ref *op, *end;
+
+       if (unlikely(cmd != GNTTABOP_map_grant_ref))
+               return false;
+
+       end = map + count;
+       for (op = map; op != end; ++op) {
+               if ((op->flags & GNTMAP_host_map)
+                   && (op->flags & GNTMAP_application_map)) {
+                       if (!GNTMAP_pte_special) {
+                               BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+                               return true;
+                       }
+                       op->flags |= GNTMAP_pte_special;
+               }
+       }
+
+       return false;
+}
+EXPORT_SYMBOL(gnttab_pre_map_adjust);
+
+#if CONFIG_XEN_COMPAT < 0x030400
+/*
+ * Rewrite the page table entries of a batch of 'count' map operations.
+ *
+ * For every operation with both GNTMAP_host_map and
+ * GNTMAP_application_map set, a PTE including _PAGE_SPECIAL is built from
+ * dev_bus_addr and written through a hypercall.  Processing stops at the
+ * first failing hypercall.
+ *
+ * Returns 0 on success or the result of the failing hypercall.
+ */
+int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *map, unsigned int count)
+{
+       unsigned int i;
+       int rc = 0;
+
+       for (i = 0; i < count && rc == 0; ++i, ++map) {
+               pte_t pte;
+
+               if (!(map->flags & GNTMAP_host_map)
+                   || !(map->flags & GNTMAP_application_map))
+                       continue;
+
+#ifdef CONFIG_X86
+               /* Bits not in __supported_pte_mask are dropped. */
+               pte = __pte_ma((map->dev_bus_addr | _PAGE_PRESENT | _PAGE_USER
+                               | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX
+                               | _PAGE_SPECIAL)
+                              & __supported_pte_mask);
+#else
+#error Architecture not yet supported.
+#endif
+               if (!(map->flags & GNTMAP_readonly))
+                       pte = pte_mkwrite(pte);
+
+               if (map->flags & GNTMAP_contains_pte) {
+                       /* host_addr is passed as the location of the PTE itself. */
+                       mmu_update_t u;
+
+                       u.ptr = map->host_addr;
+                       u.val = __pte_val(pte);
+                       rc = HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+               } else
+                       rc = HYPERVISOR_update_va_mapping(map->host_addr, pte, 0);
+       }
+
+       return rc;
+}
+EXPORT_SYMBOL(gnttab_post_map_adjust);
+#endif
+
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
+/*
+ * (Re-)establish the mapping of the grant frames in use.  Returns 0
+ * without mapping anything when more frames are in use than the current
+ * limit allows, otherwise the result of gnttab_map().
+ */
+int gnttab_resume(void)
+{
+       unsigned int limit = max_nr_grant_frames();
+
+       if (nr_grant_frames > limit)
+               return 0;
+
+       return gnttab_map(0, nr_grant_frames - 1);
+}
+
+#ifdef CONFIG_PM_SLEEP
+#include <linux/syscore_ops.h>
+
+#ifdef CONFIG_X86
+/*
+ * Suspend hook: clear the PTEs covering the nr_grant_frames pages that
+ * start at 'shared'.  The result of apply_to_page_range() is ignored and
+ * 0 is always returned.
+ */
+static int gnttab_suspend(void)
+{
+       apply_to_page_range(&init_mm, (unsigned long)shared,
+                           PAGE_SIZE * nr_grant_frames,
+                           unmap_pte_fn, NULL);
+       return 0;
+}
+#else
+#define gnttab_suspend NULL
+#endif
+
+/* Resume hook: a failing gnttab_resume() is fatal (BUG). */
+static void _gnttab_resume(void)
+{
+       int rc = gnttab_resume();
+
+       if (rc)
+               BUG();
+}
+
+/* Suspend/resume hooks; registered by gnttab_init() outside the initial domain. */
+static struct syscore_ops gnttab_syscore_ops = {
+       .resume         = _gnttab_resume,
+       .suspend        = gnttab_suspend,
+};
+#endif
+
+#else /* !CONFIG_XEN */
+
+#include <platform-pci.h>
+
+static unsigned long resume_frames;
+
+/*
+ * Add grant table frames start_idx..end_idx (inclusive) to the physmap,
+ * placing frame i at guest pfn (resume_frames >> PAGE_SHIFT) + i.
+ * A failing hypercall is fatal (BUG).  Always returns 0.
+ */
+static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
+{
+       struct xen_add_to_physmap xatp;
+       unsigned int i = end_idx;
+
+       /* Loop backwards, so that the first hypercall has the largest index,
+        * ensuring that the table will grow only once.
+        */
+       do {
+               xatp.domid = DOMID_SELF;
+               xatp.idx = i;
+               xatp.space = XENMAPSPACE_grant_table;
+               xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i;
+               if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+                       BUG();
+       /* The body has already run for i == start_idx when this test fails. */
+       } while (i-- > start_idx);
+
+       return 0;
+}
+
+/*
+ * (Re-)establish the grant table mapping.
+ *
+ * On the first call an MMIO range for max_nr_grant_frames() pages is
+ * obtained and ioremap()ed to 'shared'.  The frames in use are then added
+ * to the physmap.
+ *
+ * Returns 0 on success, -ENOSYS if more frames are in use than the
+ * current limit allows, or -1 if ioremap() fails.
+ */
+int gnttab_resume(void)
+{
+       unsigned int max_nr_gframes, nr_gframes;
+
+       nr_gframes = nr_grant_frames;
+       max_nr_gframes = max_nr_grant_frames();
+       if (max_nr_gframes < nr_gframes)
+               return -ENOSYS;
+
+       if (!resume_frames) {
+               resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+               shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
+               if (shared == NULL) {
+                       /*
+                        * NOTE(review): resume_frames stays set here, so a
+                        * later call skips this branch with shared == NULL.
+                        */
+                       pr_warning("error to ioremap gnttab share frames\n");
+                       return -1;
+               }
+       }
+
+       /* NOTE(review): return value ignored; this gnttab_map() always returns 0. */
+       gnttab_map(0, nr_gframes - 1);
+
+       return 0;
+}
+
+#endif /* !CONFIG_XEN */
+
+/*
+ * Grow the grant table by enough whole frames to provide at least
+ * 'req_entries' more entries.
+ *
+ * Returns 0 on success, -ENOSPC if that would exceed the frame limit, or
+ * the error from gnttab_map() / grow_gnttab_list().
+ */
+static int gnttab_expand(unsigned int req_entries)
+{
+       unsigned int in_use = nr_grant_frames;
+       unsigned int needed;
+       int rc;
+
+       /* Round the request up to whole grant frames. */
+       needed = DIV_ROUND_UP(req_entries, ENTRIES_PER_GRANT_FRAME);
+
+       if (in_use + needed > max_nr_grant_frames())
+               return -ENOSPC;
+
+       rc = gnttab_map(in_use, in_use + needed - 1);
+       if (rc != 0)
+               return rc;
+
+       return grow_gnttab_list(needed);
+}
+
+#ifdef CONFIG_XEN
+static int __init
+#else
+int __devinit
+#endif
+gnttab_init(void)
+{
+       int i;
+       unsigned int max_nr_glist_frames, nr_glist_frames;
+       unsigned int nr_init_grefs;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       nr_grant_frames = 1;
+       boot_max_nr_grant_frames = __max_nr_grant_frames();
+
+       /* Determine the maximum number of frames required for the
+        * grant reference free list on the current hypervisor.
+        */
+       max_nr_glist_frames = nr_freelist_frames(boot_max_nr_grant_frames);
+
+       gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
+                             GFP_KERNEL);
+       if (gnttab_list == NULL)
+               return -ENOMEM;
+
+       nr_glist_frames = nr_freelist_frames(nr_grant_frames);
+       for (i = 0; i < nr_glist_frames; i++) {
+               gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
+               if (gnttab_list[i] == NULL)
+                       goto ini_nomem;
+       }
+
+       if (gnttab_resume() < 0)
+               return -ENODEV;
+
+       nr_init_grefs = nr_grant_frames * ENTRIES_PER_GRANT_FRAME;
+
+       for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
+               gnttab_entry(i) = i + 1;
+
+       gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
+       gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
+       gnttab_free_head  = NR_RESERVED_ENTRIES;
+
+#if defined(CONFIG_XEN) && defined(__HAVE_ARCH_PTE_SPECIAL)
+       if (!xen_feature(XENFEAT_auto_translated_physmap)
+           && xen_feature(XENFEAT_gnttab_map_avail_bits)) {
+#ifdef CONFIG_X86
+               GNTMAP_pte_special = (__pte_val(pte_mkspecial(__pte_ma(0)))
+                                     >> _PAGE_BIT_UNUSED1) << _GNTMAP_guest_avail0;
+#else
+#error Architecture not yet supported.
+#endif
+       }
+#endif
+
+#if defined(CONFIG_XEN) && defined(CONFIG_PM_SLEEP)
+       if (!is_initial_xendomain())
+               register_syscore_ops(&gnttab_syscore_ops);
+#endif
+
+       return 0;
+
+ ini_nomem:
+       for (i--; i >= 0; i--)
+               free_page((unsigned long)gnttab_list[i]);
+       kfree(gnttab_list);
+       return -ENOMEM;
+}
+
+#ifdef CONFIG_XEN
+core_initcall(gnttab_init);
+#endif
diff --git a/drivers/xen/core/machine_kexec.c b/drivers/xen/core/machine_kexec.c

new file mode 100644 (file)

index 0000000..f053d28
--- /dev/null
+++ b/drivers/xen/core/machine_kexec.c
@@ -0,0 +1,403 @@
+/*
+ * drivers/xen/core/machine_kexec.c 
+ * handle transition of Linux booting another kernel
+ */
+
+#include <linux/kexec.h>
+#include <linux/slab.h>
+#include <xen/interface/kexec.h>
+#include <xen/interface/platform.h>
+#include <linux/reboot.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <xen/pcpu.h>
+
+extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, 
+                                        struct kimage *image);
+extern int machine_kexec_setup_resources(struct resource *hypervisor,
+                                        struct resource *phys_cpus,
+                                        int nr_phys_cpus);
+extern int machine_kexec_setup_resource(struct resource *hypervisor,
+                                       struct resource *phys_cpu);
+extern void machine_kexec_register_resources(struct resource *res);
+
+/*
+ * Bookkeeping for the per-pCPU crash note resources.
+ *
+ * xen_phys_cpus is an array of xen_max_nr_phys_cpus entries allocated at
+ * boot; entries needed beyond that are allocated individually and chained
+ * on xen_phys_cpu_list.  xen_hypervisor_res describes the machine address
+ * range occupied by the hypervisor itself.
+ */
+static unsigned int xen_nr_phys_cpus, xen_max_nr_phys_cpus;
+static struct resource xen_hypervisor_res;
+static struct resource *xen_phys_cpus;
+static struct xen_phys_cpu_entry {
+       struct xen_phys_cpu_entry *next;
+       struct resource res;
+} *xen_phys_cpu_list;
+
+/*
+ * Size and start address of the hypervisor's vmcoreinfo range; both are
+ * zero when the hypervisor could not report it.
+ */
+size_t vmcoreinfo_size_xen;
+unsigned long paddr_vmcoreinfo_xen;
+
+/*
+ * Ask the hypervisor for the crash note range of physical CPU @cpu and
+ * describe it in @res.
+ *
+ * Returns 0 on success, the hypercall's error code if the query failed,
+ * or -ENODEV if the reported range is empty.  @res is written only on
+ * success, and only its name, start, end and flags fields are set.
+ */
+static int fill_crash_res(struct resource *res, unsigned int cpu)
+{
+       xen_kexec_range_t note = {
+               .range = KEXEC_RANGE_MA_CPU,
+               .nr = cpu
+       };
+       int err;
+
+       err = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &note);
+       if (err)
+               return err;
+       if (!note.size)
+               return -ENODEV;
+
+       res->name = "Crash note";
+       res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+       res->start = note.start;
+       res->end = note.start + note.size - 1;
+
+       return 0;
+}
+
+/*
+ * Find the registered (parent != NULL) crash note resource whose range is
+ * exactly [r->start, r->end].
+ *
+ * The boot-time array is searched first, then the overflow list.  If @idx
+ * is non-NULL and a match is found, *@idx receives the match's position in
+ * that combined sequence (list entries are numbered after the array slots).
+ *
+ * Returns the matching resource, or NULL if there is none.
+ */
+static struct resource *find_crash_res(const struct resource *r,
+                                      unsigned int *idx)
+{
+       struct xen_phys_cpu_entry *node;
+       struct resource *found = NULL;
+       unsigned int pos = 0;
+
+       while (pos < xen_max_nr_phys_cpus) {
+               struct resource *cand = &xen_phys_cpus[pos];
+
+               if (cand->parent && cand->start == r->start
+                   && cand->end == r->end) {
+                       found = cand;
+                       break;
+               }
+               ++pos;
+       }
+
+       /* pos keeps counting so list entries get indices past the array. */
+       for (node = xen_phys_cpu_list; !found && node;
+            node = node->next, ++pos)
+               if (node->res.parent && node->res.start == r->start
+                   && node->res.end == r->end) {
+                       found = &node->res;
+                       break;
+               }
+
+       if (found && idx)
+               *idx = pos;
+
+       return found;
+}
+
+/*
+ * pCPU notifier: keep the crash note resources in sync with physical CPU
+ * online/offline events.
+ *
+ * @hcpu carries the pCPU number.  CPU_ONLINE registers a crash note
+ * resource for the CPU, CPU_DEAD releases the one belonging to it.
+ * Always returns NOTIFY_OK; failures are only logged.
+ */
+static int kexec_cpu_callback(struct notifier_block *nfb,
+                             unsigned long action, void *hcpu)
+{
+       unsigned int i, cpu = (unsigned long)hcpu;
+       struct xen_phys_cpu_entry *ent;
+       struct resource *res = NULL, r;
+
+       /*
+        * From here on xen_nr_phys_cpus is used as the size of the index
+        * space (all array slots plus one per list entry), see the bitmap
+        * below.
+        */
+       if (xen_nr_phys_cpus < xen_max_nr_phys_cpus)
+               xen_nr_phys_cpus = xen_max_nr_phys_cpus;
+       switch (action) {
+       case CPU_ONLINE:
+               /* Reuse an unregistered slot: array first, then the list. */
+               for (i = 0; i < xen_max_nr_phys_cpus; ++i)
+                       if (!xen_phys_cpus[i].parent) {
+                               res = xen_phys_cpus + i;
+                               break;
+                       }
+               if (!res)
+                       for (ent = xen_phys_cpu_list; ent; ent = ent->next)
+                               if (!ent->res.parent) {
+                                       res = &ent->res;
+                                       break;
+                               }
+               if (!res) {
+                       /*
+                        * Must be zeroed: fill_crash_res() only sets name,
+                        * start, end and flags, so with kmalloc() the
+                        * parent/sibling/child links would be heap garbage
+                        * when the resource is handed on for registration.
+                        */
+                       ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+                       res = ent ? &ent->res : NULL;
+               } else
+                       ent = NULL;
+               if (res && !fill_crash_res(res, cpu)
+                   && !machine_kexec_setup_resource(&xen_hypervisor_res,
+                                                    res)) {
+                       /* Only a freshly allocated entry needs linking in. */
+                       if (ent) {
+                               ent->next = xen_phys_cpu_list;
+                               xen_phys_cpu_list = ent;
+                               ++xen_nr_phys_cpus;
+                       }
+               } else {
+                       pr_warn("Could not set up crash note for pCPU#%u\n",
+                               cpu);
+                       kfree(ent);
+               }
+               break;
+
+       case CPU_DEAD:
+               /* Easy case: the range of the dead CPU can still be queried. */
+               if (!fill_crash_res(&r, cpu))
+                       res = find_crash_res(&r, NULL);
+               if (!res) {
+                       /*
+                        * Otherwise mark every registered resource that is
+                        * still claimed by some pCPU and release the first
+                        * registered one that nobody claims.
+                        */
+                       unsigned long *map;
+                       xen_platform_op_t op;
+
+                       map = kcalloc(BITS_TO_LONGS(xen_nr_phys_cpus),
+                                     sizeof(long), GFP_KERNEL);
+                       if (!map)
+                               break;
+
+                       op.cmd = XENPF_get_cpuinfo;
+                       op.u.pcpu_info.xen_cpuid = 0;
+                       if (HYPERVISOR_platform_op(&op) == 0)
+                               i = op.u.pcpu_info.max_present + 1;
+                       else
+                               i = xen_nr_phys_cpus;
+
+                       for (cpu = 0; cpu < i; ++cpu) {
+                               unsigned int idx;
+
+                               if (fill_crash_res(&r, cpu))
+                                       continue;
+                               if (find_crash_res(&r, &idx)) {
+                                       BUG_ON(idx >= xen_nr_phys_cpus);
+                                       __set_bit(idx, map);
+                               }
+                       }
+
+                       for (i = 0; i < xen_max_nr_phys_cpus; ++i)
+                               if (xen_phys_cpus[i].parent && !test_bit(i, map)) {
+                                       res = xen_phys_cpus + i;
+                                       break;
+                               }
+                       /* i continues past the array, matching find_crash_res(). */
+                       for (ent = xen_phys_cpu_list; !res && ent;
+                            ent = ent->next, ++i)
+                               if (ent->res.parent && !test_bit(i, map))
+                                       res = &ent->res;
+                       kfree(map);
+               }
+               if (res)
+                       release_resource(res);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+/* Notifier block handed to register_pcpu_notifier() during setup. */
+static struct notifier_block kexec_cpu_notifier = {
+       .notifier_call = kexec_cpu_callback
+};
+
+/*
+ * Boot-time setup of the kexec/kdump related resources in the initial
+ * domain: the crash kernel region, one crash note resource per physical
+ * CPU, the hypervisor's own range and the location of its vmcoreinfo.
+ * All ranges are obtained from the hypervisor via
+ * KEXEC_CMD_kexec_get_range.
+ *
+ * Returns silently (leaving xen_nr_phys_cpus at 0) when not running as
+ * the initial domain or when any required piece of information is
+ * unavailable.
+ */
+void __init xen_machine_kexec_setup_resources(void)
+{
+       xen_kexec_range_t range;
+       xen_platform_op_t op;
+       unsigned int k = 0, nr = 0;
+       int rc;
+
+       if (strstr(boot_command_line, "crashkernel="))
+               pr_warning("Ignoring crashkernel command line, "
+                          "parameter will be supplied by xen\n");
+
+       if (!is_initial_xendomain())
+               return;
+
+       /* fill in crashk_res if range is reserved by hypervisor */
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_CRASH;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)
+           || !range.size)
+               return;
+
+       crashk_res.start = range.start;
+       crashk_res.end = range.start + range.size - 1;
+
+       /* determine maximum number of physical cpus */
+       op.cmd = XENPF_get_cpuinfo;
+       op.u.pcpu_info.xen_cpuid = 0;
+       if (HYPERVISOR_platform_op(&op) == 0)
+               k = op.u.pcpu_info.max_present + 1;
+#if CONFIG_XEN_COMPAT < 0x040000
+       /* Fallback: probe CPU ranges one by one until the query fails. */
+       else while (1) {
+               memset(&range, 0, sizeof(range));
+               range.range = KEXEC_RANGE_MA_CPU;
+               range.nr = k;
+
+               if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+                       break;
+
+               k++;
+       }
+#endif
+
+       if (k == 0)
+               return;
+
+       xen_max_nr_phys_cpus = k;
+
+       /* allocate xen_phys_cpus */
+
+       xen_phys_cpus = alloc_bootmem(k * sizeof(struct resource));
+
+       /* fill in xen_phys_cpus with per-cpu crash note information */
+
+       /* Entries are packed: nr only advances for CPUs that have a range. */
+       for (k = 0; k < xen_max_nr_phys_cpus; k++)
+               if (!fill_crash_res(xen_phys_cpus + nr, k))
+                       ++nr;
+
+       if (nr == 0)
+               goto free;
+
+       /* fill in xen_hypervisor_res with hypervisor machine address range */
+
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_XEN;
+
+       if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
+               goto free;
+
+       xen_hypervisor_res.name = "Hypervisor code and data";
+       xen_hypervisor_res.start = range.start;
+       xen_hypervisor_res.end = range.start + range.size - 1;
+       xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+
+       /* get physical address of vmcoreinfo */
+       memset(&range, 0, sizeof(range));
+       range.range = KEXEC_RANGE_MA_VMCOREINFO;
+
+       rc = HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range);
+
+       if (rc == 0) {
+               /* Hypercall succeeded */
+               vmcoreinfo_size_xen = range.size;
+               paddr_vmcoreinfo_xen = range.start;
+
+       } else {
+               /* Hypercall failed.
+                * Indicate not to create sysfs file by resetting globals
+                */
+               vmcoreinfo_size_xen = 0;
+               paddr_vmcoreinfo_xen = 0;
+               
+#if CONFIG_XEN_COMPAT < 0x030300
+               /* The KEXEC_CMD_kexec_get_range hypercall did not implement
+                * KEXEC_RANGE_MA_VMCOREINFO until Xen 3.3.
+                * Do not bail out if it fails for this reason.
+                */
+               if (rc != -EINVAL)
+#endif
+                       goto free;
+       }
+
+       if (machine_kexec_setup_resources(&xen_hypervisor_res, xen_phys_cpus,
+                                         nr)) {
+               /*
+                * It's too cumbersome to properly free xen_phys_cpus here.
+                * Failure at this stage is unexpected and the amount of
+                * memory is small therefore we tolerate the potential leak.
+                */
+               goto err;
+       }
+
+       xen_nr_phys_cpus = nr;
+       /* A registration failure is only logged; the boot-time setup stays. */
+       rc = register_pcpu_notifier(&kexec_cpu_notifier);
+       if (rc)
+               pr_warn("kexec: pCPU notifier registration failed (%d)\n", rc);
+
+       return;
+
+ free:
+       /*
+        * NOTE(review): xen_phys_cpus and xen_max_nr_phys_cpus keep their
+        * values after the array is freed; this looks harmless as long as
+        * the notifier was not registered - confirm.
+        */
+       free_bootmem(__pa(xen_phys_cpus),
+                    xen_max_nr_phys_cpus * sizeof(*xen_phys_cpus));
+ err:
+       xen_nr_phys_cpus = 0;
+}
+
+#ifndef CONFIG_X86
+/*
+ * Register the hypervisor range and the boot-time crash note resources
+ * below @res, then let the architecture code register its own resources.
+ *
+ * Crash notes that already have a parent are skipped.  Failures of
+ * request_resource() are deliberately ignored (best effort).
+ */
+void __init xen_machine_kexec_register_resources(struct resource *res)
+{
+       /* unsigned to match xen_nr_phys_cpus; avoids a mixed-sign compare */
+       unsigned int k;
+       struct resource *r;
+
+       request_resource(res, &xen_hypervisor_res);
+       for (k = 0; k < xen_nr_phys_cpus; k++) {
+               r = xen_phys_cpus + k;
+               if (r->parent == NULL) /* out of xen_hypervisor_res range */
+                       request_resource(res, r);
+       }
+       machine_kexec_register_resources(res);
+}
+#endif
+
+/*
+ * Fill the hypervisor image descriptor @xki from @image: the
+ * architecture hook runs first, then the indirection page and start
+ * address are copied from the kimage.
+ */
+static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
+{
+       machine_kexec_setup_load_arg(xki, image);
+
+       xki->indirection_page = image->head;
+       xki->start_address = image->start;
+}
+
+/*
+ * Load the image into xen so xen can kdump itself
+ * This might have been done in prepare, but prepare
+ * is currently called too early. It might make sense
+ * to move prepare, but for now, just add an extra hook.
+ *
+ * Returns the result of the KEXEC_CMD_kexec_load hypercall.
+ */
+int xen_machine_kexec_load(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       /* Zero the whole argument so no stack garbage reaches the hypervisor. */
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       setup_load_arg(&xkl.image, image);
+       return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
+}
+
+/*
+ * Unload the image that was stored by xen_machine_kexec_load().
+ * This might have been done in machine_kexec_cleanup() but it
+ * is called too late, and it's possible xen could try and kdump
+ * using resources that have been freed.
+ *
+ * A failing hypercall only triggers a warning.
+ */
+void xen_machine_kexec_unload(struct kimage *image)
+{
+       xen_kexec_load_t xkl;
+
+       memset(&xkl, 0, sizeof(xkl));
+       xkl.type = image->type;
+       WARN_ON(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl));
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ *
+ * This has the hypervisor move to the preferred reboot CPU,
+ * stop all CPUs and kexec. That is, it combines machine_shutdown()
+ * and machine_kexec() in Linux kexec terms.
+ *
+ * Never returns: if the hypercall comes back, the kernel panics.
+ */
+void __noreturn machine_kexec(struct kimage *image)
+{
+       xen_kexec_exec_t xke;
+
+       memset(&xke, 0, sizeof(xke));
+       xke.type = image->type;
+       VOID(HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke));
+       panic("KEXEC_CMD_kexec hypercall should not return\n");
+}
+
+#ifdef CONFIG_X86
+/* Return the machine address of the kernel's vmcoreinfo note. */
+unsigned long paddr_vmcoreinfo_note(void)
+{
+       return virt_to_machine(&vmcoreinfo_note);
+}
+#endif
+
+/*
+ * Intentionally empty: per the comment on machine_kexec(), the hypervisor
+ * stops all CPUs as part of the kexec hypercall.
+ */
+void machine_shutdown(void)
+{
+       /* do nothing */
+}
+
+/* Crash-time shutdown hook: only disables local interrupts; @regs is unused. */
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+       /* The kernel is broken so disable interrupts */
+       local_irq_disable();
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
diff --git a/drivers/xen/core/machine_reboot.c b/drivers/xen/core/machine_reboot.c

new file mode 100644 (file)

index 0000000..fb3ac79
--- /dev/null
+++ b/drivers/xen/core/machine_reboot.c
@@ -0,0 +1,285 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/export.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <linux/stringify.h>
+#include <linux/stop_machine.h>
+#include <linux/syscore_ops.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <linux/cpu.h>
+#include <xen/clock.h>
+#include <xen/gnttab.h>
+#include <xen/xencons.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/interface/vcpu.h>
+#include "../../base/base.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <asm/pci_x86.h>
+/* TBD: Dom0 should propagate the determined value to Xen. */
+bool port_cf9_safe = false;
+
+/*
+ * Power off function, if any
+ */
+void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
+
+/* Flush the Xen console, then ask the hypervisor to reboot the domain. */
+void machine_emergency_restart(void)
+{
+       /* We really want to get pending console data out before we die. */
+       xencons_force_flush();
+       HYPERVISOR_shutdown(SHUTDOWN_reboot);
+}
+
+/* Restart is the same as an emergency restart here; the argument is ignored. */
+void machine_restart(char * __unused)
+{
+       machine_emergency_restart();
+}
+
+/* Halting is implemented as a power-off. */
+void machine_halt(void)
+{
+       machine_power_off();
+}
+
+/*
+ * Flush the Xen console, run the pm_power_off hook if one is installed,
+ * then ask the hypervisor to power the domain off.
+ */
+void machine_power_off(void)
+{
+       /* We really want to get pending console data out before we die. */
+       xencons_force_flush();
+       if (pm_power_off)
+               pm_power_off();
+       HYPERVISOR_shutdown(SHUTDOWN_poweroff);
+}
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * Last step before the suspend hypercall: point HYPERVISOR_shared_info at
+ * empty_zero_page, clear the shared-info fixmap mapping, and convert the
+ * xenstore and console frame numbers in xen_start_info with mfn_to_pfn().
+ * post_suspend() reverses this.
+ */
+static void pre_suspend(void)
+{
+       HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+       WARN_ON(HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
+                                            __pte_ma(0), 0));
+
+       xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+       xen_start_info->console.domU.mfn =
+               mfn_to_pfn(xen_start_info->console.domU.mfn);
+}
+
+/*
+ * Counterpart of pre_suspend(), run after the suspend hypercall returns.
+ *
+ * @suspend_cancelled: non-zero if the domain continues in place; the
+ *     frame numbers converted by pre_suspend() are then converted back.
+ *     Zero means a real resume: per-CPU runstate areas (and vcpu_info
+ *     placement, if configured) are set up again and the pfn-to-mfn frame
+ *     list is rebuilt.
+ * @fast_suspend: non-zero if the other CPUs were left online; they are
+ *     then taken down and brought back up around setup_vcpu_info().
+ *
+ * In both cases the shared info page is mapped again.
+ */
+static void post_suspend(int suspend_cancelled, int fast_suspend)
+{
+       unsigned long shinfo_mfn;
+
+       if (suspend_cancelled) {
+               xen_start_info->store_mfn =
+                       pfn_to_mfn(xen_start_info->store_mfn);
+               xen_start_info->console.domU.mfn =
+                       pfn_to_mfn(xen_start_info->console.domU.mfn);
+       } else {
+               unsigned int i;
+
+#ifdef CONFIG_SMP
+               cpumask_copy(vcpu_initialized_mask, cpu_online_mask);
+#endif
+               for_each_possible_cpu(i) {
+                       setup_runstate_area(i);
+
+#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT
+                       if (fast_suspend && i != smp_processor_id()
+                           && cpu_online(i)
+                           && HYPERVISOR_vcpu_op(VCPUOP_down, i, NULL))
+                               BUG();
+
+                       setup_vcpu_info(i);
+
+                       if (fast_suspend && i != smp_processor_id()
+                           && cpu_online(i)
+                           && HYPERVISOR_vcpu_op(VCPUOP_up, i, NULL))
+                               BUG();
+#endif
+               }
+       }
+
+       shinfo_mfn = xen_start_info->shared_info >> PAGE_SHIFT;
+       if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO),
+                                        pfn_pte_ma(shinfo_mfn, PAGE_KERNEL),
+                                        0))
+               BUG();
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
+       /*
+        * empty_zero_page stood in for the shared info page since
+        * pre_suspend(); presumably this scrubs anything written to it
+        * in the meantime - confirm.
+        */
+       clear_page(empty_zero_page);
+
+       if (!suspend_cancelled)
+               setup_pfn_to_mfn_frame_list(NULL);
+}
+#endif
+
+#else /* !(defined(__i386__) || defined(__x86_64__)) */
+
+#ifndef HAVE_XEN_PRE_SUSPEND
+#define xen_pre_suspend()      ((void)0)
+#endif
+
+#ifndef HAVE_XEN_POST_SUSPEND
+#define xen_post_suspend(x)    ((void)0)
+#endif
+
+#define switch_idle_mm()       ((void)0)
+#define mm_pin_all()           ((void)0)
+#define pre_suspend()          xen_pre_suspend()
+#define post_suspend(x, f)     xen_post_suspend(x)
+
+#endif
+
+#ifdef CONFIG_PM_SLEEP
+/* Arguments passed to take_machine_down(). */
+struct suspend {
+       int fast_suspend;               /* non-zero: other CPUs stay online */
+       void (*resume_notifier)(int);   /* called with the suspend result */
+};
+
+/*
+ * Perform the actual suspend; must be called with interrupts disabled.
+ * @_suspend points to a struct suspend.
+ *
+ * Returns a negative error if syscore_suspend() failed, otherwise the
+ * result of the suspend hypercall (see the comment in the body).
+ */
+static int take_machine_down(void *_suspend)
+{
+       struct suspend *suspend = _suspend;
+       int suspend_cancelled;
+
+       BUG_ON(!irqs_disabled());
+
+       mm_pin_all();
+       suspend_cancelled = syscore_suspend();
+       if (!suspend_cancelled) {
+               pre_suspend();
+
+               /*
+                * This hypercall returns 1 if suspend was cancelled or the domain was
+                * merely checkpointed, and 0 if it is resuming in a new domain.
+                */
+               suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
+       } else
+               BUG_ON(suspend_cancelled > 0);
+       suspend->resume_notifier(suspend_cancelled);
+       /* A negative value means pre_suspend() never ran: nothing to undo. */
+       if (suspend_cancelled >= 0)
+               post_suspend(suspend_cancelled, suspend->fast_suspend);
+       if (!suspend_cancelled)
+               xen_clockevents_resume();
+       if (suspend_cancelled >= 0)
+               syscore_resume();
+       if (!suspend_cancelled) {
+#ifdef __x86_64__
+               /*
+                * Older versions of Xen do not save/restore the user %cr3.
+                * We do it here just in case, but there's no need if we are
+                * in fast-suspend mode as that implies a new enough Xen.
+                */
+               if (!suspend->fast_suspend)
+                       xen_new_user_pt(current->active_mm->pgd);
+#endif
+       }
+
+       return suspend_cancelled;
+}
+
+/*
+ * Suspend the domain.  Must run on CPU 0 outside interrupt context.
+ *
+ * @fast_suspend: non-zero to leave the other CPUs online and stop them
+ *     with stop_machine(); zero to take them down with smp_suspend()
+ *     first.  Forced to zero when only one CPU is possible.
+ * @resume_notifier: callback invoked from take_machine_down() with the
+ *     suspend result.
+ *
+ * Returns 0 on success (including a cancelled suspend), -EOPNOTSUPP in
+ * auto_translated_physmap mode on x86, or the negative error of the step
+ * that failed.
+ */
+int __xen_suspend(int fast_suspend, void (*resume_notifier)(int))
+{
+       int err, suspend_cancelled;
+       const char *what;
+       struct suspend suspend;
+
+/* Call fn(args), keeping its name in 'what' and its result in 'err'. */
+#define _check(fn, args...) ({ \
+       what = #fn; \
+       err = (fn)(args); \
+})
+
+       BUG_ON(smp_processor_id() != 0);
+       BUG_ON(in_interrupt());
+
+#if defined(__i386__) || defined(__x86_64__)
+       if (xen_feature(XENFEAT_auto_translated_physmap)) {
+               pr_warning("Can't suspend in auto_translated_physmap mode\n");
+               return -EOPNOTSUPP;
+       }
+#endif
+
+       /* If we are definitely UP then 'slow mode' is actually faster. */
+       if (num_possible_cpus() == 1)
+               fast_suspend = 0;
+
+       suspend.fast_suspend = fast_suspend;
+       suspend.resume_notifier = resume_notifier;
+
+       if (_check(dpm_suspend_start, PMSG_SUSPEND)) {
+               dpm_resume_end(PMSG_RESUME);
+               pr_err("%s() failed: %d\n", what, err);
+               return err;
+       }
+
+       if (fast_suspend) {
+               xenbus_suspend();
+
+               if (_check(dpm_suspend_end, PMSG_SUSPEND)) {
+                       xenbus_suspend_cancel();
+                       dpm_resume_end(PMSG_RESUME);
+                       pr_err("%s() failed: %d\n", what, err);
+                       return err;
+               }
+
+               err = stop_machine(take_machine_down, &suspend,
+                                  &cpumask_of_cpu(0));
+               if (err < 0)
+                       xenbus_suspend_cancel();
+       } else {
+               BUG_ON(irqs_disabled());
+
+               /* Retry until this CPU is the only one left online. */
+               for (;;) {
+                       xenbus_suspend();
+
+                       /*
+                        * If dpm_suspend_end() succeeded but smp_suspend()
+                        * failed, undo the former before bailing out below.
+                        */
+                       if (!_check(dpm_suspend_end, PMSG_SUSPEND)
+                           && _check(smp_suspend))
+                               dpm_resume_start(PMSG_RESUME);
+                       if (err) {
+                               xenbus_suspend_cancel();
+                               dpm_resume_end(PMSG_RESUME);
+                               pr_err("%s() failed: %d\n", what, err);
+                               return err;
+                       }
+
+                       /*
+                        * NOTE(review): the loop is left with preemption
+                        * disabled and no matching preempt_enable() is
+                        * visible in this function - confirm where the
+                        * count is rebalanced.
+                        */
+                       preempt_disable();
+
+                       if (num_online_cpus() == 1)
+                               break;
+
+                       preempt_enable();
+
+                       dpm_resume_start(PMSG_RESUME);
+
+                       xenbus_suspend_cancel();
+               }
+
+               local_irq_disable();
+               err = take_machine_down(&suspend);
+               local_irq_enable();
+       }
+
+       dpm_resume_start(PMSG_RESUME);
+
+       if (err >= 0) {
+               /* Non-negative err is the result of the suspend hypercall. */
+               suspend_cancelled = err;
+               if (!suspend_cancelled) {
+                       xencons_resume();
+                       xenbus_resume();
+               } else {
+                       xenbus_suspend_cancel();
+                       err = 0;
+               }
+
+               if (!fast_suspend)
+                       smp_resume();
+       }
+
+       dpm_resume_end(PMSG_RESUME);
+
+       return err;
+}
+#endif
diff --git a/drivers/xen/core/pcpu.c b/drivers/xen/core/pcpu.c

new file mode 100644 (file)

index 0000000..4280699
--- /dev/null
+++ b/drivers/xen/core/pcpu.c
@@ -0,0 +1,447 @@
+/*
+ * pcpu.c - management of physical CPUs in a dom0 environment
+ */
+#include <linux/acpi.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/kobject.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <asm/hypervisor.h>
+#include <xen/interface/platform.h>
+#include <xen/evtchn.h>
+#include <xen/pcpu.h>
+#include <acpi/processor.h>
+
+/* Dom0-side record of one physical CPU known to the hypervisor. */
+struct pcpu {
+       struct list_head pcpu_list;     /* link in xen_pcpus */
+       struct device dev;              /* dev.id holds the Xen CPU id */
+       uint32_t apic_id;
+       uint32_t acpi_id;
+       uint32_t flags;                 /* XEN_PCPU_FLAGS_* */
+};
+
+/* Return 1 if XEN_PCPU_FLAGS_ONLINE is set in @flags, 0 otherwise. */
+static inline int xen_pcpu_online(uint32_t flags)
+{
+       return (flags & XEN_PCPU_FLAGS_ONLINE) != 0;
+}
+
+static DEFINE_MUTEX(xen_pcpu_lock);
+
+/*
+ * No need for irq disable since hotplug notify is in workqueue context.
+ * The macros expand to a plain expression (no trailing semicolon) so they
+ * behave like ordinary function calls, e.g. in an unbraced if/else.
+ */
+#define get_pcpu_lock() mutex_lock(&xen_pcpu_lock)
+#define put_pcpu_lock() mutex_unlock(&xen_pcpu_lock)
+
+static LIST_HEAD(xen_pcpus);
+
+static BLOCKING_NOTIFIER_HEAD(pcpu_chain);
+
+/* Encode the pCPU's id (dev.id) as the pointer-sized notifier argument. */
+static inline void *notifier_param(const struct pcpu *pcpu)
+{
+       return (void *)(unsigned long)pcpu->dev.id;
+}
+
+/*
+ * Add @nb to the pCPU notifier chain.  On success the new notifier is
+ * immediately called with CPU_ONLINE for every pCPU currently marked
+ * online, all under the pcpu lock.
+ *
+ * Returns the result of blocking_notifier_chain_register().
+ */
+int register_pcpu_notifier(struct notifier_block *nb)
+{
+       struct pcpu *cur;
+       int err;
+
+       get_pcpu_lock();
+
+       err = blocking_notifier_chain_register(&pcpu_chain, nb);
+       if (err)
+               goto unlock;
+
+       list_for_each_entry(cur, &xen_pcpus, pcpu_list) {
+               if (!xen_pcpu_online(cur->flags))
+                       continue;
+               nb->notifier_call(nb, CPU_ONLINE, notifier_param(cur));
+       }
+
+ unlock:
+       put_pcpu_lock();
+
+       return err;
+}
+EXPORT_SYMBOL_GPL(register_pcpu_notifier);
+
+/* Remove @nb from the pCPU notifier chain, under the pcpu lock. */
+void unregister_pcpu_notifier(struct notifier_block *nb)
+{
+       get_pcpu_lock();
+       blocking_notifier_chain_unregister(&pcpu_chain, nb);
+       put_pcpu_lock();
+}
+EXPORT_SYMBOL_GPL(unregister_pcpu_notifier);
+
+/*
+ * Issue XENPF_cpu_offline for physical CPU @xen_id.
+ * Returns the hypercall's result.
+ */
+static int xen_pcpu_down(uint32_t xen_id)
+{
+       xen_platform_op_t op;
+
+       op.cmd = XENPF_cpu_offline;
+       op.u.cpu_ol.cpuid = xen_id;
+       return HYPERVISOR_platform_op(&op);
+}
+
+/*
+ * Issue XENPF_cpu_online for physical CPU @xen_id.
+ * Returns the hypercall's result.
+ */
+static int xen_pcpu_up(uint32_t xen_id)
+{
+       xen_platform_op_t op;
+
+       op.cmd = XENPF_cpu_online;
+       op.u.cpu_ol.cpuid = xen_id;
+       return HYPERVISOR_platform_op(&op);
+}
+
+/* sysfs "online" read: prints 1 or 0 from the cached pCPU flags. */
+static ssize_t show_online(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+       return sprintf(buf, "%d\n", xen_pcpu_online(cpu->flags));
+}
+
+/*
+ * sysfs "online" write: a leading '0' requests offlining, a leading '1'
+ * onlining of the pCPU; only the first byte of @buf is examined.
+ *
+ * Returns @count on success, -EINVAL for empty or unrecognised input, or
+ * the negative result of the hypercall.
+ */
+static ssize_t store_online(struct device *dev,
+                           struct device_attribute *attr,
+                           const char *buf, size_t count)
+{
+       ssize_t status;
+
+       if (!count)
+               return -EINVAL;
+
+       if (buf[0] == '0')
+               status = xen_pcpu_down(dev->id);
+       else if (buf[0] == '1')
+               status = xen_pcpu_up(dev->id);
+       else
+               status = -EINVAL;
+
+       if (status < 0)
+               return status;
+
+       return count;
+}
+
+static DEVICE_ATTR(online, 0644, show_online, store_online);
+
+/* sysfs "apic_id" read: prints the cached APIC id in hexadecimal. */
+static ssize_t show_apicid(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+       return sprintf(buf, "%#x\n", cpu->apic_id);
+}
+static DEVICE_ATTR(apic_id, 0444, show_apicid, NULL);
+
+/* sysfs "acpi_id" read: prints the cached ACPI id in hexadecimal. */
+static ssize_t show_acpiid(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct pcpu *cpu = container_of(dev, struct pcpu, dev);
+
+       return sprintf(buf, "%#x\n", cpu->acpi_id);
+}
+static DEVICE_ATTR(acpi_id, 0444, show_acpiid, NULL);
+
+/* Subsystem the pCPU devices are attached to (see init_pcpu()). */
+static struct bus_type xen_pcpu_subsys = {
+       .name = "xen_pcpu",
+       .dev_name = "xen_pcpu",
+};
+
+/*
+ * Tear down @pcpu: remove its sysfs attributes, unregister the device,
+ * unlink it from xen_pcpus and free it.  A NULL @pcpu is accepted.
+ * Always returns 0.
+ *
+ * NOTE(review): the embedded struct device is freed with kfree() right
+ * after device_unregister(), and init_pcpu() sets no dev.release
+ * callback - confirm nothing can still hold a reference at this point.
+ */
+static int xen_pcpu_free(struct pcpu *pcpu)
+{
+       if (!pcpu)
+               return 0;
+
+       device_remove_file(&pcpu->dev, &dev_attr_online);
+       device_remove_file(&pcpu->dev, &dev_attr_apic_id);
+       device_remove_file(&pcpu->dev, &dev_attr_acpi_id);
+       device_unregister(&pcpu->dev);
+       list_del(&pcpu->pcpu_list);
+       kfree(pcpu);
+
+       return 0;
+}
+
+/*
+ * Return 1 if @info describes the same physical CPU as @pcpu, i.e. both
+ * the APIC id and the Xen CPU id match; 0 otherwise.
+ */
+static inline int same_pcpu(struct xenpf_pcpuinfo *info,
+                           struct pcpu *pcpu)
+{
+       if (pcpu->apic_id != info->apic_id)
+               return 0;
+
+       return pcpu->dev.id == info->xen_cpuid;
+}
+
+/*
+ * Compare the online state reported in @info with the cached state of
+ * @pcpu.  On a change the cached flag is updated, the pCPU notifier chain
+ * is called (CPU_ONLINE or CPU_DEAD) and a uevent is sent.
+ *
+ * Return 1 if online status changed, 0 otherwise (including when @info
+ * refers to a different CPU id).
+ */
+static int xen_pcpu_online_check(struct xenpf_pcpuinfo *info,
+                                struct pcpu *pcpu)
+{
+       int result = 0;
+
+       if (info->xen_cpuid != pcpu->dev.id)
+               return 0;
+
+       if (xen_pcpu_online(info->flags) && !xen_pcpu_online(pcpu->flags)) {
+               /* the pcpu is onlined */
+               pcpu->flags |= XEN_PCPU_FLAGS_ONLINE;
+               blocking_notifier_call_chain(&pcpu_chain, CPU_ONLINE,
+                                            notifier_param(pcpu));
+               kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE);
+               result = 1;
+       } else if (!xen_pcpu_online(info->flags) &&
+                  xen_pcpu_online(pcpu->flags))  {
+               /* The pcpu is offlined now */
+               pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE;
+               blocking_notifier_call_chain(&pcpu_chain, CPU_DEAD,
+                                            notifier_param(pcpu));
+               kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE);
+               result = 1;
+       }
+
+       return result;
+}
+
+/*
+ * Register the device embedded in @cpu and create its sysfs attributes.
+ * Returns the result of device_register(); failures of
+ * device_create_file() are not checked.
+ */
+static int pcpu_dev_init(struct pcpu *cpu)
+{
+       int err = device_register(&cpu->dev);
+
+       if (!err) {
+               device_create_file(&cpu->dev, &dev_attr_online);
+               device_create_file(&cpu->dev, &dev_attr_apic_id);
+               device_create_file(&cpu->dev, &dev_attr_acpi_id);
+       }
+       return err;
+}
+
+/*
+ * Look up the pCPU record whose device id equals @xen_id.
+ * Returns the record, or NULL if xen_pcpus holds no such entry.
+ */
+static struct pcpu *get_pcpu(unsigned int xen_id)
+{
+       struct pcpu *cur, *match = NULL;
+
+       list_for_each_entry(cur, &xen_pcpus, pcpu_list) {
+               if (cur->dev.id != xen_id)
+                       continue;
+               match = cur;
+               break;
+       }
+
+       return match;
+}
+
+/*
+ * Allocate and register a record for the physical CPU described by @info
+ * and append it to xen_pcpus.
+ *
+ * Returns the new record, or an ERR_PTR: -EINVAL if @info is flagged
+ * invalid, -ENOMEM on allocation failure, or the device registration
+ * error.
+ */
+static struct pcpu *init_pcpu(struct xenpf_pcpuinfo *info)
+{
+       struct pcpu *pcpu;
+       int err;
+
+       if (info->flags & XEN_PCPU_FLAGS_INVALID)
+               return ERR_PTR(-EINVAL);
+
+       /* The PCPU is just added */
+       pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL);
+       if (!pcpu)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&pcpu->pcpu_list);
+       pcpu->apic_id = info->apic_id;
+       pcpu->acpi_id = info->acpi_id;
+       pcpu->flags = info->flags;
+
+       pcpu->dev.bus = &xen_pcpu_subsys;
+       pcpu->dev.id = info->xen_cpuid;
+
+       err = pcpu_dev_init(pcpu);
+       if (err) {
+               /*
+                * NOTE(review): the record is freed directly after a failed
+                * device_register() - confirm this is safe for the embedded
+                * struct device.
+                */
+               kfree(pcpu);
+               return ERR_PTR(err);
+       }
+
+       list_add_tail(&pcpu->pcpu_list, &xen_pcpus);
+       return pcpu;
+}
+
+/* Result codes of _sync_pcpu(); negative values are errors. */
+#define PCPU_NO_CHANGE                 0
+#define PCPU_ADDED                     1
+#define PCPU_ONLINE_OFFLINE            2
+#define PCPU_REMOVED                   3
+/*
+ * Synchronise the local record of physical CPU @cpu_num with what the
+ * hypervisor reports for it.  If @max_id is non-NULL it receives the
+ * hypervisor's max_present value.
+ *
+ * Caller should hold the pcpu lock
+ * < 0: Something wrong
+ * 0: No changes
+ * > 0: State changed
+ */
+static int _sync_pcpu(unsigned int cpu_num, unsigned int *max_id)
+{
+       struct pcpu *pcpu;
+       struct xenpf_pcpuinfo *info;
+       xen_platform_op_t op;
+       int ret;
+
+       op.cmd = XENPF_get_cpuinfo;
+       info = &op.u.pcpu_info;
+       info->xen_cpuid = cpu_num;
+
+       /* Retry for as long as the hypervisor reports -EBUSY. */
+       do {
+               ret = HYPERVISOR_platform_op(&op);
+       } while (ret == -EBUSY);
+       if (ret)
+               return ret;
+
+       if (max_id)
+               *max_id = op.u.pcpu_info.max_present;
+
+       pcpu = get_pcpu(cpu_num);
+
+       if (info->flags & XEN_PCPU_FLAGS_INVALID) {
+               /* The pcpu has been removed */
+               if (pcpu) {
+                       xen_pcpu_free(pcpu);
+                       return PCPU_REMOVED;
+               }
+               return PCPU_NO_CHANGE;
+       }
+
+
+       /* Valid in the hypervisor but unknown here: create a record. */
+       if (!pcpu) {
+               pcpu = init_pcpu(info);
+               if (!IS_ERR(pcpu))
+                       return PCPU_ADDED;
+               pr_warn("Failed to init pCPU %#x (%ld)\n",
+                       info->xen_cpuid, PTR_ERR(pcpu));
+               return PTR_ERR(pcpu);
+       }
+
+       if (!same_pcpu(info, pcpu)) {
+               /*
+                * Old pCPU is replaced by a new one, which means
+                * several vIRQ-s were missed - can this happen?
+                */
+               pr_warn("pCPU %#x changed!\n", pcpu->dev.id);
+               pcpu->apic_id = info->apic_id;
+               pcpu->acpi_id = info->acpi_id;
+       }
+       if (xen_pcpu_online_check(info, pcpu))
+               return PCPU_ONLINE_OFFLINE;
+       return PCPU_NO_CHANGE;
+}
+
+/*
+ * Sync dom0's pcpu information with xen hypervisor's.
+ *
+ * Walks the CPU ids from 0 up to the max_present value reported by the
+ * hypervisor, stopping at the first error.  On error every pCPU record is
+ * freed.  Returns the result of the last _sync_pcpu() call.
+ */
+static int xen_sync_pcpus(void)
+{
+       unsigned int cpu_num, max_id = 0;
+       int result = 0;
+
+       get_pcpu_lock();
+
+       /* Boot cpu always have cpu_id 0 in xen, so start there. */
+       for (cpu_num = 0; result >= 0 && cpu_num <= max_id; cpu_num++) {
+               result = _sync_pcpu(cpu_num, &max_id);
+
+               /* Anything outside the known result codes is a failure. */
+               if (result != PCPU_NO_CHANGE && result != PCPU_ADDED &&
+                   result != PCPU_ONLINE_OFFLINE && result != PCPU_REMOVED)
+                       pr_warn("Failed to sync pcpu %#x\n", cpu_num);
+       }
+
+       if (result < 0) {
+               struct pcpu *cur, *next;
+
+               list_for_each_entry_safe(cur, next, &xen_pcpus, pcpu_list)
+                       xen_pcpu_free(cur);
+       }
+
+       put_pcpu_lock();
+
+       return result;
+}
+
+/* Work item: resynchronise all pCPU records, logging a failure. */
+static void xen_pcpu_dpc(struct work_struct *work)
+{
+       if (xen_sync_pcpus() < 0)
+               pr_warn("xen_pcpu_dpc: Failed to sync pcpu information\n");
+}
+static DECLARE_WORK(xen_pcpu_work, xen_pcpu_dpc);
+
+/* VIRQ_PCPU_STATE handler: defer the resync to the work item. */
+static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id)
+{
+       schedule_work(&xen_pcpu_work);
+
+       return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+
+/*
+ * Schedule a resync of the pCPU records.  @type is not used.
+ * Always returns 0.
+ */
+int xen_pcpu_hotplug(int type)
+{
+       schedule_work(&xen_pcpu_work);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_hotplug);
+
+/*
+ * Find the Xen CPU number of the physical CPU whose ACPI id (if
+ * @is_acpiid) or APIC id (otherwise) equals @id, by querying the
+ * hypervisor for each CPU number in turn.
+ *
+ * Returns the CPU number, or -1 if no CPU matches.
+ */
+int xen_pcpu_index(uint32_t id, bool is_acpiid)
+{
+       unsigned int cpu_num, max_id;
+       xen_platform_op_t op;
+       struct xenpf_pcpuinfo *info = &op.u.pcpu_info;
+
+       op.cmd = XENPF_get_cpuinfo;
+       for (max_id = cpu_num = 0; cpu_num <= max_id; ++cpu_num) {
+               int ret;
+
+               info->xen_cpuid = cpu_num;
+               do {
+                       ret = HYPERVISOR_platform_op(&op);
+               } while (ret == -EBUSY);
+               /* A CPU number that cannot be queried is simply skipped. */
+               if (ret)
+                       continue;
+
+               /* The upper bound only ever grows while scanning. */
+               if (info->max_present > max_id)
+                       max_id = info->max_present;
+               if (id == (is_acpiid ? info->acpi_id : info->apic_id))
+                       return cpu_num;
+       }
+
+       return -1;
+}
+EXPORT_SYMBOL_GPL(xen_pcpu_index);
+
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+/*
+ * Initcall: in the initial domain, register the xen_pcpu subsystem, build
+ * the initial pCPU list and, if any pCPU was found, bind the
+ * VIRQ_PCPU_STATE handler.
+ *
+ * Returns 0 on success (also when not the initial domain or when no pCPU
+ * was found), or a negative error code.
+ */
+static int __init xen_pcpu_init(void)
+{
+       int err;
+
+       if (!is_initial_xendomain())
+               return 0;
+
+       err = subsys_system_register(&xen_pcpu_subsys, NULL);
+       if (err) {
+               pr_warn("xen_pcpu_init: "
+                       "Failed to register subsys (%d)\n", err);
+               return err;
+       }
+
+       xen_sync_pcpus();
+
+       if (list_empty(&xen_pcpus))
+               return 0;
+
+       /*
+        * Only negative values are failures; a successful bind may yield a
+        * positive value (presumably the IRQ number), which must not be
+        * returned from an initcall as if it were a status.
+        */
+       err = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0,
+                                     xen_pcpu_interrupt, 0,
+                                     "pcpu", NULL);
+       if (err < 0) {
+               pr_warn("xen_pcpu_init: "
+                       "Failed to bind virq (%d)\n", err);
+               return err;
+       }
+
+       return 0;
+}
+subsys_initcall(xen_pcpu_init);
diff --git a/drivers/xen/core/reboot.c b/drivers/xen/core/reboot.c

new file mode 100644 (file)

index 0000000..90789f1
--- /dev/null
+++ b/drivers/xen/core/reboot.c
@@ -0,0 +1,348 @@
+#include <linux/kernel.h>
+#include <linux/unistd.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#undef handle_sysrq
+#endif
+
+/*
+ * Values held in shutting_down.  SHUTDOWN_INVALID means no request is being
+ * processed, SHUTDOWN_RESUMING is set by xen_resume_notifier() while a
+ * suspend is being wound down, and the others name the requested action.
+ */
+#define SHUTDOWN_INVALID  -1
+#define SHUTDOWN_POWEROFF  0
+#define SHUTDOWN_SUSPEND   2
+#define SHUTDOWN_RESUMING  3
+#define SHUTDOWN_HALT      4
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = SHUTDOWN_INVALID;
+
+/* Can we leave APs online when we suspend? */
+static int fast_suspend;
+
+static void __shutdown_handler(struct work_struct *unused);
+static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
+
+int __xen_suspend(int fast_suspend, void (*resume_notifier)(int));
+
+static int shutdown_process(void *__unused)
+{
+       static char *envp[] = { "HOME=/", "TERM=linux",
+                               "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+       static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
+
+       extern asmlinkage long sys_reboot(int magic1, int magic2,
+                                         unsigned int cmd, void *arg);
+
+       if ((shutting_down == SHUTDOWN_POWEROFF) ||
+           (shutting_down == SHUTDOWN_HALT)) {
+               if (call_usermodehelper("/sbin/poweroff", poweroff_argv,
+                                       envp, 0) < 0) {
+#ifdef CONFIG_XEN
+                       sys_reboot(LINUX_REBOOT_MAGIC1,
+                                  LINUX_REBOOT_MAGIC2,
+                                  LINUX_REBOOT_CMD_POWER_OFF,
+                                  NULL);
+#endif /* CONFIG_XEN */
+               }
+       }
+
+       shutting_down = SHUTDOWN_INVALID; /* could try again */
+
+       return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+
+static int setup_suspend_evtchn(void);
+
+/* Was last suspend request cancelled? */
+static int suspend_cancelled;
+
+/*
+ * Callback passed to __xen_suspend() by xen_suspend().
+ *
+ * Atomically switches shutting_down to SHUTDOWN_RESUMING; the previous state
+ * must have been SHUTDOWN_SUSPEND (BUG otherwise).  Also records whether the
+ * suspend was cancelled so that xen_suspend() can check it afterwards.
+ */
+static void xen_resume_notifier(int _suspend_cancelled)
+{
+       int old_state = xchg(&shutting_down, SHUTDOWN_RESUMING);
+       BUG_ON(old_state != SHUTDOWN_SUSPEND);
+       suspend_cancelled = _suspend_cancelled;
+}
+
+/*
+ * Thread body for suspend requests (started by __shutdown_handler() when
+ * shutting_down is SHUTDOWN_SUSPEND).
+ *
+ * Restricts itself to CPU 0, then runs __xen_suspend() in a loop for as long
+ * as new suspend requests arrive while the previous one is being wound down.
+ * Always returns 0; failures are logged and reset the state to
+ * SHUTDOWN_INVALID.
+ */
+static int xen_suspend(void *__unused)
+{
+       int err, old_state;
+
+       daemonize("suspend");
+       err = set_cpus_allowed_ptr(current, cpumask_of(0));
+       if (err) {
+               pr_err("Xen suspend can't run on CPU0 (%d)\n", err);
+               goto fail;
+       }
+
+       do {
+               err = __xen_suspend(fast_suspend, xen_resume_notifier);
+               if (err) {
+                       pr_err("Xen suspend failed (%d)\n", err);
+                       goto fail;
+               }
+               /* After a real (non-cancelled) suspend, rebind the channel. */
+               if (!suspend_cancelled)
+                       setup_suspend_evtchn();
+               /*
+                * Try to go RESUMING -> INVALID.  Finding SUSPEND instead
+                * means switch_shutdown_state() accepted another suspend
+                * request in the meantime, so go round again.
+                */
+               old_state = cmpxchg(
+                       &shutting_down, SHUTDOWN_RESUMING, SHUTDOWN_INVALID);
+       } while (old_state == SHUTDOWN_SUSPEND);
+
+       switch (old_state) {
+       case SHUTDOWN_INVALID:
+       case SHUTDOWN_SUSPEND:
+               /* Neither state is expected once the loop has ended. */
+               BUG();
+       case SHUTDOWN_RESUMING:
+               /* The cmpxchg succeeded; state is SHUTDOWN_INVALID again. */
+               break;
+       default:
+               /*
+                * Some other request replaced RESUMING;
+                * switch_shutdown_state() left kicking off the work to us.
+                */
+               schedule_delayed_work(&shutdown_work, 0);
+               break;
+       }
+
+       return 0;
+
+ fail:
+       /* Give up on this request; the state is expected to still be SUSPEND. */
+       old_state = xchg(&shutting_down, SHUTDOWN_INVALID);
+       BUG_ON(old_state != SHUTDOWN_SUSPEND);
+       return 0;
+}
+
+#else
+# define xen_suspend NULL
+#endif
+
+/*
+ * Request a transition of shutting_down to @new_state.
+ *
+ * The transition is only made from SHUTDOWN_INVALID or SHUTDOWN_RESUMING;
+ * if any other state is found the request is silently dropped.  When the
+ * previous state was SHUTDOWN_INVALID the shutdown work is scheduled here;
+ * when it was SHUTDOWN_RESUMING, xen_suspend() notices the change itself.
+ */
+static void switch_shutdown_state(int new_state)
+{
+       int prev_state, old_state = SHUTDOWN_INVALID;
+
+       /* We only drive shutting_down into an active state. */
+       if (new_state == SHUTDOWN_INVALID)
+               return;
+
+       do {
+               /* We drop this transition if already in an active state. */
+               if ((old_state != SHUTDOWN_INVALID) &&
+                   (old_state != SHUTDOWN_RESUMING))
+                       return;
+               /* Attempt to transition. */
+               prev_state = old_state;
+               /* On a mismatch old_state becomes the value actually found. */
+               old_state = cmpxchg(&shutting_down, old_state, new_state);
+       } while (old_state != prev_state);
+
+       /* Either we kick off the work, or we leave it to xen_suspend(). */
+       if (old_state == SHUTDOWN_INVALID)
+               schedule_delayed_work(&shutdown_work, 0);
+       else
+               BUG_ON(old_state != SHUTDOWN_RESUMING);
+}
+
+/*
+ * Work function behind shutdown_work: spawn the kernel thread that carries
+ * out the pending request (xen_suspend for SHUTDOWN_SUSPEND, otherwise
+ * shutdown_process).  If the thread cannot be created, the work re-arms
+ * itself to try again after HZ/2 jiffies.
+ */
+static void __shutdown_handler(struct work_struct *unused)
+{
+       int (*worker)(void *) = shutdown_process;
+       int pid;
+
+       if (shutting_down == SHUTDOWN_SUSPEND)
+               worker = xen_suspend;
+
+       pid = kernel_thread(worker, NULL, CLONE_FS | CLONE_FILES);
+       if (pid >= 0)
+               return;
+
+       pr_warning("Error creating shutdown process (%d): "
+                  "retrying...\n", -pid);
+       schedule_delayed_work(&shutdown_work, HZ/2);
+}
+
+/*
+ * xenbus watch callback for control/shutdown.
+ *
+ * Reads the request string and overwrites the node with an empty string in
+ * one transaction (restarted on -EAGAIN), then maps the string to a
+ * SHUTDOWN_* state and hands it to switch_shutdown_state().  "reboot" is
+ * handled right away through ctrl_alt_del() and leaves new_state at
+ * SHUTDOWN_INVALID, which switch_shutdown_state() ignores.  Unknown strings
+ * are logged and ignored.
+ */
+static void shutdown_handler(struct xenbus_watch *watch,
+                            const char **vec, unsigned int len)
+{
+       extern void ctrl_alt_del(void);
+       char *str;
+       struct xenbus_transaction xbt;
+       int err, new_state = SHUTDOWN_INVALID;
+
+       /* A request is already being processed: don't even read the node. */
+       if ((shutting_down != SHUTDOWN_INVALID) &&
+           (shutting_down != SHUTDOWN_RESUMING))
+               return;
+
+ again:
+       err = xenbus_transaction_start(&xbt);
+       if (err)
+               return;
+
+       str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
+       /* Ignore read errors and empty reads. */
+       if (XENBUS_IS_ERR_READ(str)) {
+               xenbus_transaction_end(xbt, 1);
+               return;
+       }
+
+       /* Clear the node so the same request isn't seen twice. */
+       xenbus_write(xbt, "control", "shutdown", "");
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN) {
+               /* Drop this attempt's string before re-reading. */
+               kfree(str);
+               goto again;
+       }
+
+       if (strcmp(str, "poweroff") == 0)
+               new_state = SHUTDOWN_POWEROFF;
+       else if (strcmp(str, "reboot") == 0)
+               ctrl_alt_del();
+#ifdef CONFIG_PM_SLEEP
+       else if (strcmp(str, "suspend") == 0)
+               new_state = SHUTDOWN_SUSPEND;
+#endif
+       else if (strcmp(str, "halt") == 0)
+               new_state = SHUTDOWN_HALT;
+       else
+               pr_warning("Ignoring shutdown request: %s\n", str);
+
+       switch_shutdown_state(new_state);
+
+       kfree(str);
+}
+
+static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
+                         unsigned int len)
+{
+       char sysrq_key = '\0';
+       struct xenbus_transaction xbt;
+       int err;
+
+ again:
+       err = xenbus_transaction_start(&xbt);
+       if (err)
+               return;
+       if (xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key) <= 0) {
+               pr_err("Unable to read sysrq code in control/sysrq\n");
+               xenbus_transaction_end(xbt, 1);
+               return;
+       }
+
+       if (sysrq_key != '\0')
+               xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+
+#ifdef CONFIG_MAGIC_SYSRQ
+       if (sysrq_key != '\0')
+               handle_sysrq(sysrq_key);
+#endif
+}
+
+/* Watch on control/shutdown, serviced by shutdown_handler(). */
+static struct xenbus_watch shutdown_watch = {
+       .node = "control/shutdown",
+       .callback = shutdown_handler
+};
+
+/* Watch on control/sysrq, serviced by sysrq_handler(). */
+static struct xenbus_watch sysrq_watch = {
+       .node = "control/sysrq",
+       .callback = sysrq_handler
+};
+
+#ifdef CONFIG_PM_SLEEP
+/*
+ * Interrupt handler for the suspend event channel bound in
+ * setup_suspend_evtchn(): requests a transition to SHUTDOWN_SUSPEND.
+ */
+static irqreturn_t suspend_int(int irq, void* dev_id)
+{
+       switch_shutdown_state(SHUTDOWN_SUSPEND);
+       return IRQ_HANDLED;
+}
+
+/*
+ * (Re)create the event channel used to request a suspend.
+ *
+ * Any IRQ bound by a previous call is released first.  A listening port is
+ * then bound to suspend_int() and its port number is written to
+ * device/suspend/event-channel in xenstore.
+ *
+ * Returns 0 on success, -1 if no port could be bound.
+ */
+static int setup_suspend_evtchn(void)
+{
+       static int irq;         /* survives across calls */
+       char buf[16];
+       int evtchn;
+
+       if (irq > 0)
+               unbind_from_irqhandler(irq, NULL);
+
+       irq = bind_listening_port_to_irqhandler(0, suspend_int, 0, "suspend",
+                                               NULL);
+       if (irq > 0) {
+               evtchn = irq_to_evtchn_port(irq);
+               pr_info("suspend: event channel %d\n", evtchn);
+               sprintf(buf, "%d", evtchn);
+               xenbus_write(XBT_NIL, "device/suspend", "event-channel", buf);
+
+               return 0;
+       }
+
+       return -1;
+}
+#else
+#define setup_suspend_evtchn() 0
+#endif
+
+/*
+ * Register the xenstore watches this file relies on.
+ *
+ * The sysrq watch is registered in every domain.  Outside the initial domain
+ * the platform-feature-multiprocessor-suspend flag is additionally read into
+ * fast_suspend, the shutdown watch is registered and the suspend event
+ * channel is set up.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int setup_shutdown_watcher(void)
+{
+       int rc = register_xenbus_watch(&sysrq_watch);
+
+       if (rc) {
+               pr_err("Failed to set sysrq watcher\n");
+               return rc;
+       }
+
+       /* The initial domain only gets the sysrq watch. */
+       if (is_initial_xendomain())
+               return 0;
+
+       /* Result deliberately unchecked: fast_suspend keeps its old value. */
+       xenbus_scanf(XBT_NIL, "control",
+                    "platform-feature-multiprocessor-suspend",
+                    "%d", &fast_suspend);
+
+       rc = register_xenbus_watch(&shutdown_watch);
+       if (rc) {
+               pr_err("Failed to set shutdown watcher\n");
+               return rc;
+       }
+
+       /* suspend event channel */
+       rc = setup_suspend_evtchn();
+       if (rc)
+               pr_err("Failed to register suspend event channel\n");
+
+       return rc;
+}
+
+#ifdef CONFIG_XEN
+
+/*
+ * Notifier callback registered through register_xenstore_notifier(): sets up
+ * the watches.  The result of setup_shutdown_watcher() is not propagated;
+ * NOTIFY_DONE is returned regardless.
+ */
+static int shutdown_event(struct notifier_block *notifier,
+                         unsigned long event,
+                         void *data)
+{
+       setup_shutdown_watcher();
+       return NOTIFY_DONE;
+}
+
+/*
+ * Initcall (CONFIG_XEN builds): hook shutdown_event() into the xenstore
+ * notifier chain.  Always returns 0.
+ */
+static int __init setup_shutdown_event(void)
+{
+       static struct notifier_block xenstore_notifier = {
+               .notifier_call = shutdown_event
+       };
+       register_xenstore_notifier(&xenstore_notifier);
+
+       return 0;
+}
+
+subsys_initcall(setup_shutdown_event);
+
+#else /* !defined(CONFIG_XEN) */
+
+/*
+ * Entry point for builds without CONFIG_XEN: registers the watches directly
+ * and returns the result of setup_shutdown_watcher().
+ */
+int xen_reboot_init(void)
+{
+       return setup_shutdown_watcher();
+}
+
+#endif /* !defined(CONFIG_XEN) */
diff --git a/drivers/xen/core/smpboot.c b/drivers/xen/core/smpboot.c

new file mode 100644 (file)

index 0000000..bcacc4a
--- /dev/null
+++ b/drivers/xen/core/smpboot.c
@@ -0,0 +1,416 @@
+/*
+ *     Xen SMP booting functions
+ *
+ *     See arch/i386/kernel/smpboot.c for copyright and credits for derived
+ *     portions of this file.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/irq.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
+#include <asm/desc.h>
+#include <asm/pgalloc.h>
+#include <xen/clock.h>
+#include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
+#include <xen/cpu_hotplug.h>
+#include <xen/xenbus.h>
+
+extern int local_setup_timer(unsigned int cpu);
+extern void local_teardown_timer(unsigned int cpu);
+
+extern void hypervisor_callback(void);
+extern void failsafe_callback(void);
+extern void system_call(void);
+extern void smp_trap_init(trap_info_t *);
+
+/* CPUs for which cpu_initialize_context() has already been run. */
+cpumask_var_t vcpu_initialized_mask;
+
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* IRQ used for the per-CPU IPI binding; -1 until xen_smp_intr_init() sets it. */
+static int __read_mostly ipi_irq = -1;
+
+/*
+ * Populate the possible-CPU map by asking the hypervisor which vcpu ids it
+ * knows about, and compute total_cpus.
+ *
+ * Does nothing if some CPU other than the current one is already marked
+ * possible.
+ */
+void __init prefill_possible_map(void)
+{
+       int i, rc;
+
+       /* Bail out if the map already holds more than just this CPU. */
+       for_each_possible_cpu(i)
+           if (i != smp_processor_id())
+               return;
+
+       for (i = 0; i < NR_CPUS; i++) {
+#ifndef CONFIG_HOTPLUG_CPU
+               if (i >= setup_max_cpus)
+                       break;
+#endif
+               /* A non-negative answer means the vcpu id is known to Xen. */
+               rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+               if (rc >= 0) {
+                       set_cpu_possible(i, true);
+                       nr_cpu_ids = i + 1;
+               }
+       }
+       total_cpus = num_possible_cpus();
+       /* Also count vcpus past the point where the loop above stopped. */
+       for (; HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL) >= 0; ++i)
+               if (i != smp_processor_id())
+                       ++total_cpus;
+}
+
+/*
+ * Handler for the per-CPU IPI IRQ.
+ *
+ * Drains this CPU's ipi_pending bitmap, dispatching each set bit to the
+ * matching entry of the handlers[] table.  Returns IRQ_HANDLED if at least
+ * one IPI was dispatched, IRQ_NONE otherwise.
+ */
+static irqreturn_t ipi_interrupt(int irq, void *dev_id)
+{
+       /* Indexed by IPI vector number. */
+       static void (*const handlers[])(struct pt_regs *) = {
+               [RESCHEDULE_VECTOR] = smp_reschedule_interrupt,
+               [CALL_FUNCTION_VECTOR] = smp_call_function_interrupt,
+               [CALL_FUNC_SINGLE_VECTOR] = smp_call_function_single_interrupt,
+               [REBOOT_VECTOR] = smp_reboot_interrupt,
+#ifdef CONFIG_IRQ_WORK
+               [IRQ_WORK_VECTOR] = smp_irq_work_interrupt,
+#endif
+       };
+       unsigned long *pending = __get_cpu_var(ipi_pending);
+       struct pt_regs *regs = get_irq_regs();
+       irqreturn_t ret = IRQ_NONE;
+
+       for (;;) {
+               unsigned int ipi = find_first_bit(pending, NR_IPIS);
+
+               /*
+                * Nothing pending: clear the event channel, then look once
+                * more before giving up.
+                */
+               if (ipi >= NR_IPIS) {
+                       clear_ipi_evtchn();
+                       ipi = find_first_bit(pending, NR_IPIS);
+               }
+               if (ipi >= NR_IPIS)
+                       return ret;
+               ret = IRQ_HANDLED;
+               do {
+                       clear_bit(ipi, pending);
+                       handlers[ipi](regs);
+                       /* Search restarts at the bit just handled (inclusive). */
+                       ipi = find_next_bit(pending, NR_IPIS, ipi);
+               } while (ipi < NR_IPIS);
+       }
+}
+
+/*
+ * Set up the per-CPU interrupt resources for @cpu: the IPI binding, the
+ * spinlock support from xen_spinlock_init() and, for every CPU but 0, the
+ * local timer.
+ *
+ * Returns 0 on success, otherwise the error of the failing step; what was
+ * set up before the failure is torn down again.
+ */
+static int __cpuinit xen_smp_intr_init(unsigned int cpu)
+{
+       static struct irqaction ipi_action = {
+               .handler = ipi_interrupt,
+               .flags   = IRQF_DISABLED,
+               .name    = "ipi"
+       };
+       int rc;
+
+       rc = bind_ipi_to_irqaction(cpu, &ipi_action);
+       if (rc < 0)
+               return rc;
+       /* All CPUs are expected to end up on the same IRQ number. */
+       if (ipi_irq < 0)
+               ipi_irq = rc;
+       else
+               BUG_ON(ipi_irq != rc);
+
+       rc = xen_spinlock_init(cpu);
+       if (rc < 0)
+               goto unbind_ipi;
+
+       if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0))
+               goto fail;
+
+       return 0;
+
+ fail:
+       xen_spinlock_cleanup(cpu);
+ unbind_ipi:
+       unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
+       return rc;
+}
+
+/*
+ * Undo xen_smp_intr_init() for @cpu: tear down the local timer (CPU 0 has
+ * none set up here), drop the IPI binding and release the spinlock support.
+ */
+static void __cpuinit xen_smp_intr_exit(unsigned int cpu)
+{
+       if (cpu != 0)
+               local_teardown_timer(cpu);
+
+       unbind_from_per_cpu_irq(ipi_irq, cpu, NULL);
+       xen_spinlock_cleanup(cpu);
+}
+
+/*
+ * Runs on a CPU that is coming up: performs the per-CPU initialisation,
+ * disables preemption, sets up clock events, notifies the CPU-starting
+ * chain and finally marks the CPU online under the IPI call lock.
+ */
+static void __cpuinit cpu_bringup(void)
+{
+       unsigned int cpu;
+
+       cpu_init();
+       identify_secondary_cpu(__this_cpu_ptr(&cpu_info));
+       touch_softlockup_watchdog();
+       preempt_disable();
+       xen_setup_cpu_clockevents();
+       cpu = smp_processor_id();
+       notify_cpu_starting(cpu);
+       /* __cpu_up() polls cpu_online() for this transition. */
+       ipi_call_lock_irq();
+       set_cpu_online(cpu, true);
+       ipi_call_unlock_irq();
+}
+
+/*
+ * Initial instruction pointer of a secondary vcpu (installed by
+ * cpu_initialize_context()): bring the CPU up, then enter the idle loop.
+ */
+static void __cpuinit cpu_bringup_and_idle(void)
+{
+       cpu_bringup();
+       cpu_idle();
+}
+
+/*
+ * Build the initial register/descriptor context for vcpu @cpu and register
+ * it with the hypervisor through VCPUOP_initialise (BUG on failure).
+ *
+ * Does nothing if @cpu is already set in vcpu_initialized_mask; otherwise
+ * the bit is set as part of that check.
+ */
+static void __cpuinit cpu_initialize_context(unsigned int cpu)
+{
+       /* vcpu_guest_context_t is too large to allocate on the stack.
+        * Hence we allocate statically and protect it with a lock */
+       static vcpu_guest_context_t ctxt;
+       static DEFINE_SPINLOCK(ctxt_lock);
+
+       struct task_struct *idle = idle_task(cpu);
+
+       if (cpumask_test_and_set_cpu(cpu, vcpu_initialized_mask))
+               return;
+
+       spin_lock(&ctxt_lock);
+
+       memset(&ctxt, 0, sizeof(ctxt));
+
+       ctxt.flags = VGCF_IN_KERNEL;
+       ctxt.user_regs.ds = __USER_DS;
+       ctxt.user_regs.es = __USER_DS;
+       ctxt.user_regs.ss = __KERNEL_DS;
+       /* The vcpu starts executing in cpu_bringup_and_idle(). */
+       ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+       ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */
+
+       smp_trap_init(ctxt.trap_ctxt);
+
+       ctxt.gdt_frames[0] = arbitrary_virt_to_mfn(get_cpu_gdt_table(cpu));
+       ctxt.gdt_ents = GDT_SIZE / 8;
+
+       /* Run on the idle task's stack, leaving room for a pt_regs frame. */
+       ctxt.user_regs.cs = __KERNEL_CS;
+       ctxt.user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+
+       ctxt.kernel_ss = __KERNEL_DS;
+       ctxt.kernel_sp = idle->thread.sp0;
+
+       ctxt.event_callback_eip    = (unsigned long)hypervisor_callback;
+       ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+#ifdef __i386__
+       ctxt.event_callback_cs     = __KERNEL_CS;
+       ctxt.failsafe_callback_cs  = __KERNEL_CS;
+
+       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+
+       ctxt.user_regs.fs = __KERNEL_PERCPU;
+       ctxt.user_regs.gs = __KERNEL_STACK_CANARY;
+#else /* __x86_64__ */
+       ctxt.syscall_callback_eip  = (unsigned long)system_call;
+
+       ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt));
+
+       ctxt.gs_base_kernel = per_cpu_offset(cpu);
+#endif
+
+       if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt))
+               BUG();
+
+       spin_unlock(&ctxt_lock);
+}
+
+/*
+ * Boot-time SMP preparation.
+ *
+ * Sets up CPU 0's interrupt resources, allocates vcpu_initialized_mask,
+ * trims the possible map down to @max_cpus (dropping the highest-numbered
+ * CPUs first, but never going below one CPU) and prepares an idle task and
+ * per-CPU state for every remaining secondary CPU.
+ *
+ * NOTE(review): apicid is assigned below but never read in this function.
+ */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+       unsigned int cpu;
+       struct task_struct *idle;
+       int apicid;
+       struct vcpu_get_physid cpu_id;
+       void *gdt_addr;
+
+       apicid = 0;
+       if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
+               apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
+       cpu_data(0) = boot_cpu_data;
+       current_thread_info()->cpu = 0;
+
+       if (xen_smp_intr_init(0))
+               BUG();
+
+       if (!alloc_cpumask_var(&vcpu_initialized_mask, GFP_KERNEL))
+               BUG();
+       /* The boot CPU counts as initialised from the start. */
+       cpumask_copy(vcpu_initialized_mask, cpumask_of(0));
+
+       /* Restrict the possible_map according to max_cpus. */
+       while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+               /* Find the highest-numbered CPU still marked possible. */
+               for (cpu = nr_cpu_ids-1; !cpu_possible(cpu); cpu--)
+                       continue;
+               set_cpu_possible(cpu, false);
+       }
+
+       for_each_possible_cpu (cpu) {
+               if (cpu == 0)
+                       continue;
+
+               idle = fork_idle(cpu);
+               if (IS_ERR(idle))
+                       panic("failed fork for CPU %d", cpu);
+
+               gdt_addr = get_cpu_gdt_table(cpu);
+               make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
+
+               apicid = cpu;
+               if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
+                       apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id);
+               /* Secondary CPUs start out as copies of the boot CPU's data. */
+               cpu_data(cpu) = boot_cpu_data;
+               cpu_data(cpu).cpu_index = cpu;
+
+#ifdef __x86_64__
+               clear_tsk_thread_flag(idle, TIF_FORK);
+               per_cpu(kernel_stack, cpu) =
+                       (unsigned long)task_stack_page(idle) -
+                       KERNEL_STACK_OFFSET + THREAD_SIZE;
+#endif
+               per_cpu(current_task, cpu) = idle;
+
+               irq_ctx_init(cpu);
+
+               /*
+                * With CPU hotplug support only the initial domain marks the
+                * CPU present here; without it every domain does.
+                */
+#ifdef CONFIG_HOTPLUG_CPU
+               if (is_initial_xendomain())
+#endif
+                       set_cpu_present(cpu, true);
+       }
+
+       init_xenbus_allowed_cpumask();
+
+#ifdef CONFIG_X86_IO_APIC
+       /*
+        * Here we can be sure that there is an IO-APIC in the system. Let's
+        * go and set it up:
+        */
+       if (cpu_has_apic && !skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+#endif
+}
+
+/*
+ * Early setup on the boot CPU: switch to its GDT, fill the possible map and
+ * call setup_vcpu_info() for every possible CPU other than the current one.
+ */
+void __init smp_prepare_boot_cpu(void)
+{
+       unsigned int cpu;
+
+       switch_to_new_gdt(smp_processor_id());
+       prefill_possible_map();
+       for_each_possible_cpu(cpu)
+               if (cpu != smp_processor_id())
+                       setup_vcpu_info(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Initialize cpu_present_map late to skip SMP boot code in init/main.c.
+ * But do it early enough to catch critical for_each_present_cpu() loops
+ * in i386-specific code.
+ *
+ * Marks every possible CPU as present.  Always returns 0.
+ */
+static int __init initialize_cpu_present_map(void)
+{
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu)
+               set_cpu_present(cpu, true);
+
+       return 0;
+}
+core_initcall(initialize_cpu_present_map);
+
+/*
+ * Take the calling CPU offline: clear its online bit and fix up IRQs.
+ *
+ * CPU 0 cannot be disabled.  Returns 0 on success, -EBUSY for CPU 0.
+ */
+int __cpuinit __cpu_disable(void)
+{
+       const unsigned int self = smp_processor_id();
+
+       if (!self)
+               return -EBUSY;
+
+       set_cpu_online(self, false);
+       fixup_irqs();
+
+       return 0;
+}
+
+/*
+ * Wait until the hypervisor no longer reports vcpu @cpu as up, sleeping
+ * HZ/10 jiffies between polls, then release its interrupt resources.  When
+ * only one CPU remains online, alternatives_smp_switch(0) is called.
+ *
+ * NOTE(review): any non-zero result of VCPUOP_is_up keeps the loop going,
+ * so a negative (error) return would never terminate it - confirm that
+ * cannot happen for a CPU that was previously up.
+ */
+void __cpuinit __cpu_die(unsigned int cpu)
+{
+       while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
+               current->state = TASK_UNINTERRUPTIBLE;
+               schedule_timeout(HZ/10);
+       }
+
+       xen_smp_intr_exit(cpu);
+
+       if (num_online_cpus() == 1)
+               alternatives_smp_switch(0);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * Bring vcpu @cpu online.
+ *
+ * Sets up its interrupt resources and initial context, asks the hypervisor
+ * to start it and then waits up to 5 seconds for the new CPU to mark itself
+ * online (done in cpu_bringup()).
+ *
+ * Returns 0 on success, the error from cpu_up_check(), xen_smp_intr_init()
+ * or VCPUOP_up, or -ETIMEDOUT if the CPU did not come online in time.  On
+ * failure after xen_smp_intr_init() the interrupt resources are released
+ * again.
+ */
+int __cpuinit __cpu_up(unsigned int cpu)
+{
+       int rc;
+
+       rc = cpu_up_check(cpu);
+       if (rc)
+               return rc;
+
+       rc = xen_smp_intr_init(cpu);
+       if (rc)
+               return rc;
+
+       cpu_initialize_context(cpu);
+
+       /* Going from one online CPU to two. */
+       if (num_online_cpus() == 1)
+               alternatives_smp_switch(1);
+
+       /* This must be done before setting cpu_online_map */
+       wmb();
+
+       rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
+       if (!rc) {
+               /* Wait 5s total for a response. */
+               unsigned long timeout = jiffies + 5 * HZ;
+
+               while (!cpu_online(cpu) && time_before_eq(jiffies, timeout))
+                       HYPERVISOR_yield();
+               if (!cpu_online(cpu)) {
+                       /* Timed out: take the vcpu down again. */
+                       VOID(HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL));
+                       rc = -ETIMEDOUT;
+               }
+       }
+
+       if (rc) {
+               xen_smp_intr_exit(cpu);
+               if (num_online_cpus() == 1)
+                       alternatives_smp_switch(0);
+       }
+
+       return rc;
+}
+
+/*
+ * Called on a CPU that is going offline.  With CONFIG_HOTPLUG_CPU the CPU
+ * leaves the idle task, disables interrupts, clears itself from
+ * cpu_initialized_mask and asks the hypervisor to take this vcpu down.
+ * Without CONFIG_HOTPLUG_CPU reaching this function is a BUG.
+ */
+void __ref play_dead(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+       idle_task_exit();
+       local_irq_disable();
+       cpumask_clear_cpu(smp_processor_id(), cpu_initialized_mask);
+       preempt_enable_no_resched();
+       VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
+       /*
+        * Presumably execution only continues here once the vcpu has been
+        * started again (see __cpu_up()) - TODO confirm.
+        */
+       cpu_bringup();
+#else
+       BUG();
+#endif
+}
+
+/* Runs the NMI self test; @max_cpus is not used. */
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+       nmi_selftest();
+}
+
+#ifndef CONFIG_X86_LOCAL_APIC
+/* Stub for builds without CONFIG_X86_LOCAL_APIC: always fails with -EINVAL. */
+int setup_profiling_timer(unsigned int multiplier)
+{
+       return -EINVAL;
+}
+#endif
diff --git a/drivers/xen/core/spinlock.c b/drivers/xen/core/spinlock.c

new file mode 100644 (file)

index 0000000..35759ac
--- /dev/null
+++ b/drivers/xen/core/spinlock.c
@@ -0,0 +1,409 @@
+/*
+ *     Xen spinlock functions
+ *
+ *     See arch/x86/xen/smp.c for copyright and credits for derived
+ *     portions of this file.
+ */
+#define XEN_SPINLOCK_SOURCE
+#include <linux/spinlock_types.h>
+
+#ifdef TICKET_SHIFT
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <asm/hardirq.h>
+#include <xen/clock.h>
+#include <xen/evtchn.h>
+
+/*
+ * One entry per lock a CPU is currently waiting for in xen_spin_wait().
+ * Entries live on the waiter's stack and are chained through ->prev, so an
+ * interrupted waiter's entry stays reachable from the newer one.
+ */
+struct spinning {
+       arch_spinlock_t *lock;          /* lock being waited for */
+       unsigned int ticket;            /* ticket held; -1 once dropped */
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+       unsigned int irq_count;         /* UINT_MAX unless IRQs were re-enabled */
+#endif
+       struct spinning *prev;          /* entry that was current before this one */
+};
+/* Head of this CPU's chain of spinning entries. */
+static DEFINE_PER_CPU(struct spinning *, _spinning);
+/* Event channel this CPU polls on while waiting; 0 when not set up. */
+static DEFINE_PER_CPU_READ_MOSTLY(evtchn_port_t, poll_evtchn);
+/*
+ * Protect removal of objects: Addition can be done lockless, and even
+ * removal itself doesn't need protection - what needs to be prevented is
+ * removed objects going out of scope (as they're allocated on the stack).
+ *
+ * idx is advanced by sequence(); ctr[] holds two counters, of which
+ * sequence() waits for the one selected by the low bit of the old idx to
+ * drop to zero.
+ */
+struct rm_seq {
+       unsigned int idx;
+#define SEQ_REMOVE_BIAS (1 << !!CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING)
+       atomic_t ctr[2];
+};
+static DEFINE_PER_CPU(struct rm_seq, rm_seq);
+
+/*
+ * Prepare spinlock support for @cpu: set up its runstate area and bind an
+ * IPI event channel that is stored as the CPU's poll_evtchn.
+ *
+ * Warns if a poll channel is already recorded for @cpu.  Returns 0 on
+ * success or the error from the EVTCHNOP_bind_ipi hypercall.
+ */
+int __cpuinit xen_spinlock_init(unsigned int cpu)
+{
+       struct evtchn_bind_ipi op;
+       int err;
+
+       setup_runstate_area(cpu);
+
+       WARN_ON(per_cpu(poll_evtchn, cpu));
+       op.vcpu = cpu;
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &op);
+       if (err) {
+               pr_warning("No spinlock poll event channel for CPU#%u (%d)\n",
+                          cpu, err);
+               return err;
+       }
+
+       per_cpu(poll_evtchn, cpu) = op.port;
+
+       return 0;
+}
+
+/*
+ * Release @cpu's spinlock poll event channel: forget the port first, then
+ * close it, warning if the hypervisor refuses.
+ */
+void __cpuinit xen_spinlock_cleanup(unsigned int cpu)
+{
+       struct evtchn_close op;
+       int err;
+
+       op.port = per_cpu(poll_evtchn, cpu);
+       per_cpu(poll_evtchn, cpu) = 0;
+       err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &op);
+       WARN_ON(err);
+}
+
+#ifdef CONFIG_PM_SLEEP
+#include <linux/syscore_ops.h>
+
+/*
+ * Syscore resume hook: for every online CPU, discard the recorded poll
+ * event channel and set spinlock support up afresh.  The result of
+ * xen_spinlock_init() is not checked here.
+ */
+static void __cpuinit spinlock_resume(void)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               /* Zero first so xen_spinlock_init()'s WARN_ON stays quiet. */
+               per_cpu(poll_evtchn, cpu) = 0;
+               xen_spinlock_init(cpu);
+       }
+}
+
+/* Only a resume hook is provided. */
+static struct syscore_ops __cpuinitdata spinlock_syscore_ops = {
+       .resume = spinlock_resume
+};
+
+/*
+ * Initcall: register the spinlock resume hook in every domain except the
+ * initial one.  Always returns 0.
+ */
+static int __init spinlock_register(void)
+{
+       if (is_initial_xendomain())
+               return 0;
+
+       register_syscore_ops(&spinlock_syscore_ops);
+
+       return 0;
+}
+core_initcall(spinlock_register);
+#endif
+
+/*
+ * Advance this CPU's rm_seq.idx by @bias (toggling the bit given by
+ * SEQ_REMOVE_BIAS / 2), then busy-wait until the counter selected by the
+ * low bit of the previous idx value has dropped to zero.
+ *
+ * NOTE(review): the counters are presumably held elevated by
+ * xen_spin_kick() on other CPUs while they inspect this CPU's spinning
+ * entries - confirm against that function.
+ */
+static inline void sequence(unsigned int bias)
+{
+       unsigned int rm_idx = percpu_read(rm_seq.idx);
+
+       smp_wmb();
+       percpu_write(rm_seq.idx, (rm_idx + bias) ^ (SEQ_REMOVE_BIAS / 2));
+       smp_mb();
+       /* Wait on the counter that belongs to the old index. */
+       rm_idx &= 1;
+       while (percpu_read(rm_seq.ctr[rm_idx].counter))
+               cpu_relax();
+}
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+static DEFINE_PER_CPU(unsigned int, _irq_count);
+
+/*
+ * Walk the chain starting at @spinning looking for entries that wait on
+ * @lock and try to exchange tickets with them: @ticket is stored into the
+ * entry and the ticket the entry held before is returned.
+ *
+ * An entry whose ticket is -1 (already dropped) is not swapped with.  With
+ * a nesting limit of 1 only the first entry for @lock is considered; with a
+ * larger limit, older entries are adjusted first through recursion.
+ *
+ * Returns @ticket unchanged if no exchange took place.
+ */
+static __ticket_t spin_adjust(struct spinning *spinning,
+                             const arch_spinlock_t *lock,
+                             __ticket_t ticket)
+{
+       for (; spinning; spinning = spinning->prev) {
+               unsigned int old = spinning->ticket;
+
+               if (spinning->lock != lock)
+                       continue;
+               /* old + 1 == 0 means the entry's ticket is -1. */
+               while (likely(old + 1)) {
+                       unsigned int cur;
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING > 1
+                       ticket = spin_adjust(spinning->prev, lock, ticket);
+#endif
+                       cur = cmpxchg(&spinning->ticket, old, ticket);
+                       if (cur == old)
+                               return cur;
+                       /* Lost a race: retry with the value now in place. */
+                       old = cur;
+               }
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING == 1
+               break;
+#endif
+       }
+       return ticket;
+}
+
+/*
+ * Run spin_adjust() for @token's tail ticket against this CPU's chain of
+ * spinning entries and refresh the head from @lock.  Returns the updated
+ * token.
+ */
+struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *lock,
+                                    struct __raw_tickets token)
+{
+       token.tail = spin_adjust(percpu_read(_spinning), lock, token.tail);
+       token.head = ACCESS_ONCE(lock->tickets.head);
+       return token;
+}
+
+/*
+ * Give up @ticket held by @spinning and pass the lock on.
+ *
+ * The entry's ticket is atomically replaced by -1; if it no longer equals
+ * @ticket nothing else is done and -1 is returned.  Otherwise the lock's
+ * owner is set to @cpu and the head is advanced by one.
+ *
+ * Returns the ticket that is now at the head if it differs from the tail
+ * (i.e. somebody holds it), or -1 if there is none.
+ */
+static unsigned int ticket_drop(struct spinning *spinning,
+                               unsigned int ticket, unsigned int cpu)
+{
+       arch_spinlock_t *lock = spinning->lock;
+
+       if (cmpxchg(&spinning->ticket, ticket, -1) != ticket)
+               return -1;
+       lock->owner = cpu;
+       __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
+       /* Truncate to ticket width so wrap-around matches the lock's fields. */
+       ticket = (__ticket_t)(ticket + 1);
+       return ticket != lock->tickets.tail ? ticket : -1;
+}
+
+/*
+ * Take a fresh ticket on @lock by atomically bumping the tail.
+ *
+ * If the ticket obtained is already at the head it is returned as is;
+ * otherwise it is first run through spin_adjust() against the chain
+ * starting at @prev.
+ */
+static unsigned int ticket_get(arch_spinlock_t *lock, struct spinning *prev)
+{
+       struct __raw_tickets token = xadd(&lock->tickets,
+                                         (struct __raw_tickets){ .tail = 1 });
+
+       return token.head == token.tail ? token.tail
+                                       : spin_adjust(prev, lock, token.tail);
+}
+
+/*
+ * Interrupt-entry hook: bump this CPU's _irq_count and, for every spinning
+ * entry on this CPU whose ticket has reached the head of its lock, hand the
+ * lock on and queue up again with a new ticket.
+ */
+void xen_spin_irq_enter(void)
+{
+       struct spinning *spinning = percpu_read(_spinning);
+       unsigned int cpu = raw_smp_processor_id();
+
+       percpu_inc(_irq_count);
+       smp_mb();
+       for (; spinning; spinning = spinning->prev) {
+               arch_spinlock_t *lock = spinning->lock;
+
+               /*
+                * Return the ticket if we now own the lock. While just being
+                * desirable generally (to reduce latency on spinning CPUs),
+                * this is essential in the case where interrupts get
+                * re-enabled in xen_spin_wait().
+                * Try to get a new ticket right away (to reduce latency after
+                * the current lock was released), but don't acquire the lock.
+                */
+               while (lock->tickets.head == spinning->ticket) {
+                       unsigned int ticket = ticket_drop(spinning,
+                                                         spinning->ticket,
+                                                         cpu);
+
+                       /* -1: nothing was dropped, or nobody is waiting. */
+                       if (!(ticket + 1))
+                               break;
+                       xen_spin_kick(lock, ticket);
+                       spinning->ticket = ticket_get(lock, spinning->prev);
+                       smp_mb();
+               }
+       }
+}
+
+/*
+ * Interrupt-exit hook: decrement this CPU's _irq_count and obtain new
+ * tickets for those spinning entries whose ticket was dropped (set to -1)
+ * while the interrupt was being handled.
+ */
+void xen_spin_irq_exit(void)
+{
+       struct spinning *spinning = percpu_read(_spinning);
+       unsigned int cpu = raw_smp_processor_id();
+       /*
+        * Despite its counterpart being first in xen_spin_irq_enter() (to make
+        * xen_spin_kick() properly handle locks that get owned after their
+        * tickets were obtained there), it can validly be done first here:
+        * We're guaranteed to see another invocation of xen_spin_irq_enter()
+        * if any of the tickets need to be dropped again.
+        */
+       unsigned int irq_count = this_cpu_dec_return(_irq_count);
+
+       /*
+        * Make sure all xen_spin_kick() instances which may still have seen
+        * our old IRQ count exit their critical region (so that we won't fail
+        * to re-obtain a ticket if ticket_drop() completes only after our
+        * ticket check below).
+        */
+       sequence(0);
+
+       /*
+        * Obtain new tickets for (or acquire) all those locks at the IRQ
+        * nesting level we are about to return to where above we avoided
+        * acquiring them.
+        */
+       for (; spinning; spinning = spinning->prev) {
+               arch_spinlock_t *lock = spinning->lock;
+
+               /* Entries from an outer nesting level: stop here. */
+               if (spinning->irq_count < irq_count)
+                       break;
+               /* Ticket is not -1, i.e. it was never dropped. */
+               if (spinning->ticket + 1)
+                       continue;
+               spinning->ticket = ticket_get(lock, spinning->prev);
+               if (ACCESS_ONCE(lock->tickets.head) == spinning->ticket)
+                       lock->owner = cpu;
+       }
+}
+#endif
+
+/*
+ * Slow path of lock acquisition: block in the hypervisor, polling this
+ * CPU's poll_evtchn, until the ticket in @ptok reaches the head of @lock or
+ * the event channel is signalled.
+ *
+ * @flags is compared against the current event-mask state; with acquire
+ * nesting enabled, interrupts are restored to @flags while polling if that
+ * is less masked than the current state.
+ *
+ * Returns:
+ *   UINT_MAX  if polling is not possible yet (CPU not online or no poll
+ *             event channel) - the caller just spins;
+ *   0         if the lock was acquired (lock->owner has been set);
+ *   1 << 10   otherwise, with @ptok updated to the current head and to the
+ *             ticket now held.  NOTE(review): presumably a spin budget for
+ *             the caller - confirm against the caller.
+ */
+unsigned int xen_spin_wait(arch_spinlock_t *lock, struct __raw_tickets *ptok,
+                          unsigned int flags)
+{
+       unsigned int cpu = raw_smp_processor_id();
+       typeof(vcpu_info(0)->evtchn_upcall_mask) upcall_mask
+               = arch_local_save_flags();
+       struct spinning spinning;
+
+       /* If kicker interrupt not initialized yet, just spin. */
+       if (unlikely(!cpu_online(cpu))
+           || unlikely(!this_cpu_read(poll_evtchn)))
+               return UINT_MAX;
+
+       /* announce we're spinning */
+       spinning.ticket = ptok->tail;
+       spinning.lock = lock;
+       spinning.prev = percpu_read(_spinning);
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+       spinning.irq_count = UINT_MAX;
+       if (upcall_mask > flags) {
+               const struct spinning *other;
+               int nesting = CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING;
+
+               /*
+                * Once the nesting limit for this lock is reached on this
+                * CPU, keep interrupts masked while polling.
+                */
+               for (other = spinning.prev; other; other = other->prev)
+                       if (other->lock == lock && !--nesting) {
+                               flags = upcall_mask;
+                               break;
+                       }
+       }
+       arch_local_irq_disable();
+#endif
+       /* The entry must be fully written before it becomes visible. */
+       smp_wmb();
+       percpu_write(_spinning, &spinning);
+
+       for (;;) {
+               clear_evtchn(percpu_read(poll_evtchn));
+
+               /*
+                * Check again to make sure it didn't become free while
+                * we weren't looking.
+                */
+               if (lock->tickets.head == spinning.ticket) {
+                       /*
+                        * If we interrupted another spinlock while it was
+                        * blocking, make sure it doesn't block (again)
+                        * without rechecking the lock.
+                        */
+                       if (spinning.prev)
+                               set_evtchn(percpu_read(poll_evtchn));
+                       break;
+               }
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+               if (upcall_mask > flags) {
+                       /* Record the IRQ depth at which interrupts reopen. */
+                       spinning.irq_count = percpu_read(_irq_count);
+                       smp_wmb();
+                       arch_local_irq_restore(flags);
+               }
+#endif
+
+               /* Block in the hypervisor unless the event is already pending. */
+               if (!test_evtchn(percpu_read(poll_evtchn)) &&
+                   HYPERVISOR_poll_no_timeout(&__get_cpu_var(poll_evtchn), 1))
+                       BUG();
+
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+               arch_local_irq_disable();
+               smp_wmb();
+               spinning.irq_count = UINT_MAX;
+#endif
+
+               if (test_evtchn(percpu_read(poll_evtchn))) {
+                       inc_irq_stat(irq_lock_count);
+                       break;
+               }
+       }
+
+       /*
+        * Leave the event pending so that any interrupted blocker will
+        * re-check.
+        */
+
+       /* announce we're done */
+       percpu_write(_spinning, spinning.prev);
+       if (!CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING)
+               arch_local_irq_disable();
+       /* Wait until the on-stack entry can safely go out of scope. */
+       sequence(SEQ_REMOVE_BIAS);
+       arch_local_irq_restore(upcall_mask);
+       smp_rmb();
+       if (lock->tickets.head == spinning.ticket) {
+               lock->owner = cpu;
+               return 0;
+       }
+       /* With nesting, the ticket must not be left in the dropped (-1) state. */
+       BUG_ON(CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING && !(spinning.ticket + 1));
+       ptok->head = lock->tickets.head;
+       ptok->tail = spinning.ticket;
+
+       return 1 << 10;
+}
+
+void xen_spin_kick(const arch_spinlock_t *lock, unsigned int ticket)
+{
+       unsigned int cpu = raw_smp_processor_id(), anchor = cpu;
+
+       if (unlikely(!cpu_online(cpu)))
+               cpu = -1, anchor = nr_cpu_ids; /* offline caller: scan all online CPUs once, no wrap */
+
+       while ((cpu = cpumask_next(cpu, cpu_online_mask)) != anchor) {
+               unsigned int flags;
+               atomic_t *rm_ctr;
+               struct spinning *spinning;
+
+               if (cpu >= nr_cpu_ids) { /* ran off the end of the mask: wrap around */
+                       if (anchor == nr_cpu_ids)
+                               return;
+                       cpu = cpumask_first(cpu_online_mask);
+                       if (cpu == anchor)
+                               return;
+               }
+
+               flags = arch_local_irq_save();
+               for (;;) {
+                       unsigned int rm_idx = per_cpu(rm_seq.idx, cpu);
+
+                       rm_ctr = per_cpu(rm_seq.ctr, cpu) + (rm_idx & 1);
+                       atomic_inc(rm_ctr); /* NOTE(review): looks like a reader count guarding the walk below - confirm against sequence() */
+#ifdef CONFIG_X86 /* atomic ops are full CPU barriers */
+                       barrier();
+#else
+                       smp_mb();
+#endif
+                       spinning = per_cpu(_spinning, cpu);
+                       smp_rmb();
+                       if ((rm_idx ^ per_cpu(rm_seq.idx, cpu))
+                           < SEQ_REMOVE_BIAS)
+                               break;
+                       atomic_dec(rm_ctr); /* index changed meanwhile: undo and retry */
+                       if (!vcpu_running(cpu))
+                               HYPERVISOR_yield();
+               }
+
+               for (; spinning; spinning = spinning->prev) /* walk that CPU's chain of nested waits */
+                       if (spinning->lock == lock &&
+                           spinning->ticket == ticket) {
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+                               ticket = spinning->irq_count
+                                        < per_cpu(_irq_count, cpu)
+                                        ? ticket_drop(spinning, ticket, cpu) : -2; /* -2: sentinel meaning "kick this waiter" */
+#endif
+                               break;
+                       }
+
+               atomic_dec(rm_ctr);
+               arch_local_irq_restore(flags);
+
+               if (unlikely(spinning)) { /* non-NULL only if a matching waiter was found */
+#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+                       if (!(ticket + 1)) /* ticket == (unsigned)-1 */
+                               return;
+                       if (ticket + 2) { /* not the -2 sentinel: rescan from the start with the new ticket */
+                               cpu = anchor < nr_cpu_ids ? anchor : -1;
+                               continue;
+                       }
+#endif
+                       notify_remote_via_evtchn(per_cpu(poll_evtchn, cpu));
+                       return;
+               }
+       }
+}
+EXPORT_SYMBOL(xen_spin_kick);
+
+#endif /* TICKET_SHIFT */
diff --git a/drivers/xen/core/xen_proc.c b/drivers/xen/core/xen_proc.c

new file mode 100644 (file)

index 0000000..b4f1136
--- /dev/null
+++ b/drivers/xen/core/xen_proc.c
@@ -0,0 +1,30 @@
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <xen/xen_proc.h>
+
+static struct proc_dir_entry *xen_base;
+
+struct proc_dir_entry *
+#ifndef MODULE
+__init
+#endif
+create_xen_proc_entry(const char *name, mode_t mode)
+{
+       if ( xen_base == NULL ) /* first call: create the /proc/xen directory lazily */
+               if ( (xen_base = proc_mkdir("xen", NULL)) == NULL )
+                       panic("Couldn't create /proc/xen");
+       return create_proc_entry(name, mode, xen_base); /* result is passed through unchecked */
+}
+
+#ifdef MODULE
+#include <linux/export.h>
+
+EXPORT_SYMBOL_GPL(create_xen_proc_entry); 
+#elif defined(CONFIG_XEN_PRIVILEGED_GUEST)
+
+void remove_xen_proc_entry(const char *name)
+{
+       remove_proc_entry(name, xen_base);
+}
+
+#endif
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c

index b1f60a0..58f917f 100644 (file)
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -49,9 +49,15 @@
  #include <linux/cpu.h>
  
  #include <xen/xen.h>
+#ifdef CONFIG_PARAVIRT_XEN
  #include <xen/events.h>
  #include <xen/evtchn.h>
  #include <asm/xen/hypervisor.h>
+#else
+#include <xen/evtchn.h>
+#include <xen/public/evtchn.h>
+#define bind_evtchn_to_irqhandler bind_caller_port_to_irqhandler
+#endif
  
  struct per_user_data {
         struct mutex bind_mutex; /* serialize bind/unbind operations */
@@ -278,6 +284,9 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
         int irq = irq_from_evtchn(port);
  
         unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+#ifdef CONFIG_XEN
+       WARN_ON(close_evtchn(port));
+#endif
  
         set_port_user(port, NULL);
  }
@@ -450,7 +459,8 @@ static int evtchn_open(struct inode *inode, struct file *filp)
         if (u == NULL)
                 return -ENOMEM;
  
-       u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+       u->name = kasprintf(GFP_KERNEL, "evtchn:%s[%d]",
+                           current->comm, current->pid);
         if (u->name == NULL) {
                 kfree(u);
                 return -ENOMEM;
@@ -518,7 +528,12 @@ static const struct file_operations evtchn_fops = {
  
  static struct miscdevice evtchn_miscdev = {
         .minor        = MISC_DYNAMIC_MINOR,
+#ifdef CONFIG_PARAVIRT_XEN
         .name         = "xen/evtchn",
+#else
+       .name         = "evtchn",
+#endif
+       .nodename     = "xen/evtchn",
         .fops         = &evtchn_fops,
  };
  static int __init evtchn_init(void)
@@ -534,10 +549,10 @@ static int __init evtchn_init(void)
  
         spin_lock_init(&port_user_lock);
  
-       /* Create '/dev/misc/evtchn'. */
+       /* Create '/dev/xen/evtchn'. */
         err = misc_register(&evtchn_miscdev);
         if (err != 0) {
-               printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+               pr_alert("Could not register /dev/xen/evtchn\n");
                 return err;
         }
  
@@ -558,3 +573,4 @@ module_init(evtchn_init);
  module_exit(evtchn_cleanup);
  
  MODULE_LICENSE("GPL");
+MODULE_ALIAS("devname:xen/evtchn");
diff --git a/drivers/xen/fbfront/Makefile b/drivers/xen/fbfront/Makefile

new file mode 100644 (file)

index 0000000..e2b8909
--- /dev/null
+++ b/drivers/xen/fbfront/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_XEN_FRAMEBUFFER)  := xenfb.o
+obj-$(CONFIG_XEN_KEYBOARD)     += xenkbd.o
diff --git a/drivers/xen/fbfront/xenfb.c b/drivers/xen/fbfront/xenfb.c

new file mode 100644 (file)

index 0000000..b199b6b
--- /dev/null
+++ b/drivers/xen/fbfront/xenfb.c
@@ -0,0 +1,910 @@
+/*
+ * linux/drivers/xen/fbfront/xenfb.c -- Xen para-virtual frame buffer device
+ *
+ * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
+ *
+ *  Based on linux/drivers/video/q40fb.c
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License. See the file COPYING in the main directory of this archive for
+ *  more details.
+ */
+
+/*
+ * TODO:
+ *
+ * Switch to grant tables when they become capable of dealing with the
+ * frame buffer.
+ */
+
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fb.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/freezer.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/fbif.h>
+#include <xen/interface/io/protocols.h>
+#include <xen/xenbus.h>
+#include <linux/kthread.h>
+
+struct xenfb_mapping
+{
+       struct list_head        link;
+       struct vm_area_struct   *vma;
+       atomic_t                map_refs;
+       int                     faults;
+       struct xenfb_info       *info;
+};
+
+struct xenfb_info
+{
+       struct task_struct      *kthread;
+       wait_queue_head_t       wq;
+
+       unsigned char           *fb;
+       struct fb_info          *fb_info;
+       struct timer_list       refresh;
+       int                     dirty;
+       int                     x1, y1, x2, y2; /* dirty rectangle,
+                                                  protected by dirty_lock */
+       spinlock_t              dirty_lock;
+       struct mutex            mm_lock;
+       int                     nr_pages;
+       struct page             **pages;
+       struct list_head        mappings; /* protected by mm_lock */
+
+       int                     irq;
+       struct xenfb_page       *page;
+       unsigned long           *mfns;
+       int                     feature_resize; /* Backend has resize feature */
+       struct xenfb_resize     resize;
+       int                     resize_dpy;
+       spinlock_t              resize_lock;
+
+       struct xenbus_device    *xbdev;
+};
+
+/*
+ * There are three locks:
+ *    spinlock resize_lock protecting resize_dpy and resize
+ *    spinlock dirty_lock protecting the dirty rectangle
+ *    mutex mm_lock protecting mappings.
+ *
+ * How the dirty and mapping locks work together
+ *
+ * The problem is that dirty rectangle and mappings aren't
+ * independent: the dirty rectangle must cover all faulted pages in
+ * mappings.  We need to prove that our locking maintains this
+ * invariant.
+ *
+ * There are several kinds of critical regions:
+ *
+ * 1. Holding only dirty_lock: xenfb_refresh().  May run in
+ *    interrupts.  Extends the dirty rectangle.  Trivially preserves
+ *    invariant.
+ *
+ * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close().  Touch
+ *    only mappings.  The former creates unfaulted pages.  Preserves
+ *    invariant.  The latter removes pages.  Preserves invariant.
+ *
+ * 3. Holding both locks: xenfb_vm_fault().  Extends the dirty
+ *    rectangle and updates mappings consistently.  Preserves
+ *    invariant.
+ *
+ * 4. The ugliest one: xenfb_update_screen().  Clear the dirty
+ *    rectangle and update mappings consistently.
+ *
+ *    We can't simply hold both locks, because zap_page_range() cannot
+ *    be called with a spinlock held.
+ *
+ *    Therefore, we first clear the dirty rectangle with both locks
+ *    held.  Then we unlock dirty_lock and update the mappings.
+ *    Critical regions that hold only dirty_lock may interfere with
+ *    that.  This can only be region 1: xenfb_refresh().  But that
+ *    just extends the dirty rectangle, which can't harm the
+ *    invariant.
+ *
+ * But FIXME: the invariant is too weak.  It misses that the fault
+ * record in mappings must be consistent with the mapping of pages in
+ * the associated address space!  __do_fault() updates the PTE after
+ * xenfb_vm_fault() returns, i.e. outside the critical region.  This
+ * allows the following race:
+ *
+ * X writes to some address in the Xen frame buffer
+ * Fault - call __do_fault()
+ *     call xenfb_vm_fault()
+ *         grab mm_lock
+ *         map->faults++;
+ *         release mm_lock
+ *     return back to __do_fault()
+ * (preempted, or SMP)
+ * Xen worker thread runs.
+ *      grab mm_lock
+ *      look at mappings
+ *          find this mapping, zaps its pages (but page not in pte yet)
+ *          clear map->faults
+ *      releases mm_lock
+ * (back to X process)
+ *     put page in X's pte
+ *
+ * Oh well, we won't be updating the writes to this page anytime soon.
+ */
+#define MB_ (1024*1024)
+#define XENFB_DEFAULT_FB_LEN (XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8)
+
+enum {KPARAM_MEM, KPARAM_WIDTH, KPARAM_HEIGHT, KPARAM_CNT};
+static int video[KPARAM_CNT] = {2, XENFB_WIDTH, XENFB_HEIGHT};
+module_param_array(video, int, NULL, 0);
+MODULE_PARM_DESC(video,
+               "Size of video memory in MB and width,height in pixels, default = (2,800,600)");
+
+static int xenfb_fps = 20;
+
+static int xenfb_remove(struct xenbus_device *);
+static void xenfb_init_shared_page(struct xenfb_info *, struct fb_info *);
+static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *);
+static void xenfb_disconnect_backend(struct xenfb_info *);
+
+static void xenfb_send_event(struct xenfb_info *info,
+               union xenfb_out_event *event)
+{
+       __u32 prod;
+
+       prod = info->page->out_prod;
+       /* caller ensures !xenfb_queue_full() */
+       mb();                   /* ensure ring space available */
+       XENFB_OUT_RING_REF(info->page, prod) = *event;
+       wmb();                  /* ensure ring contents visible */
+       info->page->out_prod = prod + 1;
+
+       notify_remote_via_irq(info->irq);
+}
+
+static void xenfb_do_update(struct xenfb_info *info,
+                           int x, int y, int w, int h)
+{
+       union xenfb_out_event event;
+
+       memset(&event, 0, sizeof(event));
+       event.type = XENFB_TYPE_UPDATE;
+       event.update.x = x;
+       event.update.y = y;
+       event.update.width = w;
+       event.update.height = h;
+
+       /* caller ensures !xenfb_queue_full() */
+       xenfb_send_event(info, &event);
+}
+
+static void xenfb_do_resize(struct xenfb_info *info)
+{
+       union xenfb_out_event event;
+
+       memset(&event, 0, sizeof(event));
+       event.resize = info->resize;
+
+       /* caller ensures !xenfb_queue_full() */
+       xenfb_send_event(info, &event);
+}
+
+static int xenfb_queue_full(struct xenfb_info *info)
+{
+       __u32 cons, prod; /* out-ring indices read from the shared page */
+
+       prod = info->page->out_prod;
+       cons = info->page->out_cons;
+       return prod - cons == XENFB_OUT_RING_LEN; /* unsigned difference stays valid across index wrap-around */
+}
+
+static void xenfb_update_screen(struct xenfb_info *info)
+{
+       unsigned long flags;
+       int y1, y2, x1, x2;
+       struct xenfb_mapping *map;
+
+       if (xenfb_queue_full(info))
+               return;
+
+       mutex_lock(&info->mm_lock);
+
+       spin_lock_irqsave(&info->dirty_lock, flags);
+       if (info->dirty){
+               info->dirty = 0;
+               y1 = info->y1;
+               y2 = info->y2;
+               x1 = info->x1;
+               x2 = info->x2;
+               info->x1 = info->y1 = INT_MAX;
+               info->x2 = info->y2 = 0;
+       } else {
+               spin_unlock_irqrestore(&info->dirty_lock, flags);
+               mutex_unlock(&info->mm_lock);
+               return;
+       }
+       spin_unlock_irqrestore(&info->dirty_lock, flags);
+
+       list_for_each_entry(map, &info->mappings, link) {
+               if (!map->faults)
+                       continue;
+               zap_page_range(map->vma, map->vma->vm_start,
+                              map->vma->vm_end - map->vma->vm_start, NULL);
+               map->faults = 0;
+       }
+
+       mutex_unlock(&info->mm_lock);
+
+       if (x2 < x1 || y2 < y1) {
+               pr_warning("xenfb_update_screen bogus rect %d %d %d %d\n",
+                          x1, x2, y1, y2);
+               WARN_ON(1);
+       }
+       xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1);
+}
+
+static void xenfb_handle_resize_dpy(struct xenfb_info *info)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->resize_lock, flags);
+       if (info->resize_dpy) {
+               if (!xenfb_queue_full(info)) {
+                       info->resize_dpy = 0;
+                       xenfb_do_resize(info);
+               }
+       }
+       spin_unlock_irqrestore(&info->resize_lock, flags);
+}
+
+static int xenfb_thread(void *data)
+{
+       struct xenfb_info *info = data;
+
+       while (!kthread_should_stop()) {
+               xenfb_handle_resize_dpy(info); /* send a pending resize event, if any */
+               xenfb_update_screen(info); /* send the accumulated dirty rectangle, if any */
+               wait_event_interruptible(info->wq,
+                       kthread_should_stop() || info->dirty); /* woken via info->wq, e.g. by xenfb_timer() */
+               try_to_freeze();
+       }
+       return 0;
+}
+
+static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green,
+                          unsigned blue, unsigned transp,
+                          struct fb_info *info)
+{
+       u32 v; /* packed pixel value for this palette entry */
+
+       if (regno >= info->cmap.len) /* valid entries are 0..len-1; regno == len would overrun the palette */
+               return 1;
+
+       red   >>= (16 - info->var.red.length); /* scale 16-bit components down to the channel width */
+       green >>= (16 - info->var.green.length);
+       blue  >>= (16 - info->var.blue.length);
+
+       v = (red << info->var.red.offset) |
+           (green << info->var.green.offset) |
+           (blue << info->var.blue.offset);
+
+       /* FIXME is this sane?  check against xxxfb_setcolreg()!  */
+       switch (info->var.bits_per_pixel) {
+       case 16:
+       case 24:
+       case 32:
+               ((u32 *)info->pseudo_palette)[regno] = v;
+               break;
+       }
+       
+       return 0; /* transp is ignored; other depths leave the palette untouched */
+}
+
+static void xenfb_timer(unsigned long data)
+{
+       struct xenfb_info *info = (struct xenfb_info *)data;
+       wake_up(&info->wq);
+}
+
+static void __xenfb_refresh(struct xenfb_info *info,
+                           int x1, int y1, int w, int h)
+{
+       int y2, x2; /* far corner of the rectangle being added */
+
+       y2 = y1 + h;
+       x2 = x1 + w;
+
+       if (info->y1 > y1) /* grow the stored dirty rectangle to cover the new one */
+               info->y1 = y1;
+       if (info->y2 < y2)
+               info->y2 = y2;
+       if (info->x1 > x1)
+               info->x1 = x1;
+       if (info->x2 < x2)
+               info->x2 = x2;
+       info->dirty = 1; /* both callers in this file hold dirty_lock here */
+
+       if (timer_pending(&info->refresh)) /* a wake-up is already scheduled */
+               return;
+
+       mod_timer(&info->refresh, jiffies + HZ/xenfb_fps); /* wake the thread after HZ/xenfb_fps jiffies */
+}
+
+static void xenfb_refresh(struct xenfb_info *info,
+                         int x1, int y1, int w, int h)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->dirty_lock, flags);
+       __xenfb_refresh(info, x1, y1, w, h);
+       spin_unlock_irqrestore(&info->dirty_lock, flags);
+}
+
+static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect)
+{
+       struct xenfb_info *info = p->par;
+
+       cfb_fillrect(p, rect);
+       xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height);
+}
+
+static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image)
+{
+       struct xenfb_info *info = p->par;
+
+       cfb_imageblit(p, image);
+       xenfb_refresh(info, image->dx, image->dy, image->width, image->height);
+}
+
+static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area)
+{
+       struct xenfb_info *info = p->par;
+
+       cfb_copyarea(p, area);
+       xenfb_refresh(info, area->dx, area->dy, area->width, area->height);
+}
+
+static void xenfb_vm_open(struct vm_area_struct *vma)
+{
+       struct xenfb_mapping *map = vma->vm_private_data;
+       atomic_inc(&map->map_refs);
+}
+
+static void xenfb_vm_close(struct vm_area_struct *vma)
+{
+       struct xenfb_mapping *map = vma->vm_private_data;
+       struct xenfb_info *info = map->info;
+
+       mutex_lock(&info->mm_lock);
+       if (atomic_dec_and_test(&map->map_refs)) {
+               list_del(&map->link);
+               kfree(map);
+       }
+       mutex_unlock(&info->mm_lock);
+}
+
+static int xenfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct xenfb_mapping *map = vma->vm_private_data;
+       struct xenfb_info *info = map->info;
+       int pgnr = ((long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; /* page index within the mapping */
+       unsigned long flags;
+       struct page *page;
+       int y1, y2;
+
+       if (pgnr >= info->nr_pages) /* beyond the framebuffer */
+               return VM_FAULT_SIGBUS;
+
+       mutex_lock(&info->mm_lock); /* both locks: region 3 of the locking comment above */
+       spin_lock_irqsave(&info->dirty_lock, flags);
+       page = info->pages[pgnr];
+       get_page(page); /* take a reference for the page returned in vmf->page */
+       map->faults++; /* makes xenfb_update_screen() zap this mapping later */
+
+       y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length; /* first scan line touched by this page */
+       y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length; /* last scan line touched */
+       if (y2 > info->fb_info->var.yres)
+               y2 = info->fb_info->var.yres;
+       __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1); /* mark the full-width band dirty */
+       spin_unlock_irqrestore(&info->dirty_lock, flags);
+       mutex_unlock(&info->mm_lock);
+
+       vmf->page = page;
+
+       return VM_FAULT_MINOR;
+}
+
+static struct vm_operations_struct xenfb_vm_ops = {
+       .open   = xenfb_vm_open,
+       .close  = xenfb_vm_close,
+       .fault  = xenfb_vm_fault,
+};
+
+static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma)
+{
+       struct xenfb_info *info = fb_info->par;
+       struct xenfb_mapping *map;
+       int map_pages;
+
+       if (!(vma->vm_flags & VM_WRITE)) /* only writable ... */
+               return -EINVAL;
+       if (!(vma->vm_flags & VM_SHARED)) /* ... shared mappings ... */
+               return -EINVAL;
+       if (vma->vm_pgoff != 0) /* ... starting at offset 0 are accepted */
+               return -EINVAL;
+
+       map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT;
+       if (map_pages > info->nr_pages) /* mapping may not exceed the framebuffer */
+               return -EINVAL;
+
+       map = kzalloc(sizeof(*map), GFP_KERNEL);
+       if (map == NULL)
+               return -ENOMEM;
+
+       map->vma = vma;
+       map->faults = 0;
+       map->info = info;
+       atomic_set(&map->map_refs, 1); /* dropped in xenfb_vm_close(), which frees map at zero */
+
+       mutex_lock(&info->mm_lock);
+       list_add(&map->link, &info->mappings);
+       mutex_unlock(&info->mm_lock);
+
+       vma->vm_ops = &xenfb_vm_ops; /* pages are supplied on demand by xenfb_vm_fault() */
+       vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED);
+       vma->vm_private_data = map;
+
+       return 0;
+}
+
+static int
+xenfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info)
+{
+       struct xenfb_info *xenfb_info;
+       int required_mem_len;
+
+       xenfb_info = info->par;
+
+       if (!xenfb_info->feature_resize) {
+               if (var->xres == video[KPARAM_WIDTH] &&
+                       var->yres == video[KPARAM_HEIGHT] &&
+                       var->bits_per_pixel == xenfb_info->page->depth) {
+                       return 0;
+               }
+               return -EINVAL;
+       }
+
+       /* Can't resize past initial width and height */
+       if (var->xres > video[KPARAM_WIDTH] || var->yres > video[KPARAM_HEIGHT])
+               return -EINVAL;
+
+       required_mem_len = var->xres * var->yres * (xenfb_info->page->depth / 8);
+       if (var->bits_per_pixel == xenfb_info->page->depth &&
+               var->xres <= info->fix.line_length / (XENFB_DEPTH / 8) &&
+               required_mem_len <= info->fix.smem_len) {
+               var->xres_virtual = var->xres;
+               var->yres_virtual = var->yres;
+               return 0;
+       }
+       return -EINVAL;
+}
+
+static int xenfb_set_par(struct fb_info *info)
+{
+       struct xenfb_info *xenfb_info;
+       unsigned long flags;
+
+       xenfb_info = info->par;
+
+       spin_lock_irqsave(&xenfb_info->resize_lock, flags);
+       xenfb_info->resize.type = XENFB_TYPE_RESIZE;
+       xenfb_info->resize.width = info->var.xres;
+       xenfb_info->resize.height = info->var.yres;
+       xenfb_info->resize.stride = info->fix.line_length;
+       xenfb_info->resize.depth = info->var.bits_per_pixel;
+       xenfb_info->resize.offset = 0;
+       xenfb_info->resize_dpy = 1;
+       spin_unlock_irqrestore(&xenfb_info->resize_lock, flags);
+       return 0;
+}
+
+static struct fb_ops xenfb_fb_ops = {
+       .owner          = THIS_MODULE,
+       .fb_setcolreg   = xenfb_setcolreg,
+       .fb_fillrect    = xenfb_fillrect,
+       .fb_copyarea    = xenfb_copyarea,
+       .fb_imageblit   = xenfb_imageblit,
+       .fb_mmap        = xenfb_mmap,
+       .fb_check_var   = xenfb_check_var,
+       .fb_set_par     = xenfb_set_par,
+};
+
+static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
+{
+       /*
+        * No in events recognized, simply ignore them all.
+        * If you need to recognize some, see xenkbd's input_handler()
+        * for how to do that.
+        */
+       struct xenfb_info *info = dev_id;
+       struct xenfb_page *page = info->page;
+
+       if (page->in_cons != page->in_prod) { /* in-ring is not empty */
+               info->page->in_cons = info->page->in_prod; /* consume everything without looking at it */
+               notify_remote_via_irq(info->irq);
+       }
+       return IRQ_HANDLED;
+}
+
+static unsigned long vmalloc_to_mfn(void *address)
+{
+       return pfn_to_mfn(vmalloc_to_pfn(address));
+}
+
+static __devinit void
+xenfb_make_preferred_console(void)
+{
+       struct console *c;
+
+       if (console_set_on_cmdline) /* presumably: an explicit console= choice wins - confirm */
+               return;
+
+       console_lock();
+       for_each_console(c) {
+               if (!strcmp(c->name, "tty") && c->index == 0) /* look for tty0 */
+                       break;
+       }
+       console_unlock();
+       if (c) { /* found: re-register it with CON_CONSDEV set */
+               unregister_console(c);
+               c->flags |= CON_CONSDEV;
+               c->flags &= ~CON_PRINTBUFFER; /* don't print again */
+               register_console(c);
+       }
+}
+
+static int __devinit xenfb_probe(struct xenbus_device *dev,
+                                const struct xenbus_device_id *id)
+{
+       struct xenfb_info *info;
+       struct fb_info *fb_info;
+       int fb_size;
+       int val;
+       int ret;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (info == NULL) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+               return -ENOMEM;
+       }
+
+       /* Limit kernel param videoram amount to what is in xenstore */
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "videoram", "%d", &val) == 1) {
+               if (val < video[KPARAM_MEM])
+                       video[KPARAM_MEM] = val;
+       }
+
+       /* If requested res does not fit in available memory, use default */
+       fb_size = video[KPARAM_MEM] * MB_;
+       if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH/8 > fb_size) {
+               video[KPARAM_WIDTH] = XENFB_WIDTH;
+               video[KPARAM_HEIGHT] = XENFB_HEIGHT;
+               fb_size = XENFB_DEFAULT_FB_LEN;
+       }
+
+       dev_set_drvdata(&dev->dev, info); /* xenfb_remove() finds info through drvdata */
+       info->xbdev = dev;
+       info->irq = -1; /* "not bound" marker checked by xenfb_disconnect_backend() */
+       info->x1 = info->y1 = INT_MAX; /* empty dirty rectangle */
+       spin_lock_init(&info->dirty_lock);
+       spin_lock_init(&info->resize_lock);
+       mutex_init(&info->mm_lock);
+       init_waitqueue_head(&info->wq);
+       init_timer(&info->refresh);
+       info->refresh.function = xenfb_timer;
+       info->refresh.data = (unsigned long)info;
+       INIT_LIST_HEAD(&info->mappings);
+
+       info->fb = vzalloc(fb_size);
+       if (info->fb == NULL)
+               goto error_nomem;
+
+       info->nr_pages = (fb_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+       info->pages = kmalloc(sizeof(struct page *) * info->nr_pages,
+                             GFP_KERNEL);
+       if (info->pages == NULL)
+               goto error_nomem;
+
+       info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages);
+       if (!info->mfns)
+               goto error_nomem;
+
+       /* set up shared page */
+       info->page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+       if (!info->page)
+               goto error_nomem;
+
+       fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL);
+                               /* see fishy hackery below */
+       if (fb_info == NULL)
+               goto error_nomem;
+
+       /* FIXME fishy hackery */
+       fb_info->pseudo_palette = fb_info->par; /* the 256 u32 slots allocated above */
+       fb_info->par = info;
+       /* /FIXME */
+       fb_info->screen_base = info->fb;
+
+       fb_info->fbops = &xenfb_fb_ops;
+       fb_info->var.xres_virtual = fb_info->var.xres = video[KPARAM_WIDTH];
+       fb_info->var.yres_virtual = fb_info->var.yres = video[KPARAM_HEIGHT];
+       fb_info->var.bits_per_pixel = XENFB_DEPTH;
+
+       fb_info->var.red = (struct fb_bitfield){16, 8, 0};
+       fb_info->var.green = (struct fb_bitfield){8, 8, 0};
+       fb_info->var.blue = (struct fb_bitfield){0, 8, 0};
+
+       fb_info->var.activate = FB_ACTIVATE_NOW;
+       fb_info->var.height = -1;
+       fb_info->var.width = -1;
+       fb_info->var.vmode = FB_VMODE_NONINTERLACED;
+
+       fb_info->fix.visual = FB_VISUAL_TRUECOLOR;
+       fb_info->fix.line_length = fb_info->var.xres * (XENFB_DEPTH / 8);
+       fb_info->fix.smem_start = 0;
+       fb_info->fix.smem_len = fb_size;
+       strcpy(fb_info->fix.id, "xen");
+       fb_info->fix.type = FB_TYPE_PACKED_PIXELS;
+       fb_info->fix.accel = FB_ACCEL_NONE;
+
+       fb_info->flags = FBINFO_FLAG_DEFAULT;
+
+       ret = fb_alloc_cmap(&fb_info->cmap, 256, 0);
+       if (ret < 0) {
+               framebuffer_release(fb_info); /* not yet stored in info->fb_info, so release it here */
+               xenbus_dev_fatal(dev, ret, "fb_alloc_cmap");
+               goto error;
+       }
+
+       xenfb_init_shared_page(info, fb_info);
+
+       ret = register_framebuffer(fb_info);
+       if (ret) {
+               fb_dealloc_cmap(&fb_info->cmap); /* use the local: info->fb_info is still NULL here */
+               framebuffer_release(fb_info);
+               xenbus_dev_fatal(dev, ret, "register_framebuffer");
+               goto error;
+       }
+       info->fb_info = fb_info; /* from here on xenfb_remove() tears the framebuffer down */
+
+       ret = xenfb_connect_backend(dev, info);
+       if (ret < 0)
+               goto error;
+
+       xenfb_make_preferred_console();
+       return 0;
+
+ error_nomem:
+       ret = -ENOMEM;
+       xenbus_dev_fatal(dev, ret, "allocating device memory");
+ error:
+       xenfb_remove(dev); /* frees whatever was set up so far, including info */
+       return ret;
+}
+
+static int xenfb_resume(struct xenbus_device *dev)
+{
+       struct xenfb_info *info = dev_get_drvdata(&dev->dev);
+
+       xenfb_disconnect_backend(info);
+       xenfb_init_shared_page(info, info->fb_info);
+       return xenfb_connect_backend(dev, info);
+}
+
+static int xenfb_remove(struct xenbus_device *dev)
+{
+       struct xenfb_info *info = dev_get_drvdata(&dev->dev); /* also called from xenfb_probe()'s error path */
+
+       del_timer(&info->refresh);
+       if (info->kthread) /* the update thread may not exist yet */
+               kthread_stop(info->kthread);
+       xenfb_disconnect_backend(info);
+       if (info->fb_info) { /* set only after register_framebuffer() succeeded */
+               unregister_framebuffer(info->fb_info);
+               fb_dealloc_cmap(&info->fb_info->cmap);
+               framebuffer_release(info->fb_info);
+       }
+       free_page((unsigned long)info->page); /* the frees below run in reverse order of allocation in probe */
+       vfree(info->mfns);
+       kfree(info->pages);
+       vfree(info->fb);
+       kfree(info);
+
+       return 0;
+}
+
+static void xenfb_init_shared_page(struct xenfb_info *info,
+                                   struct fb_info * fb_info)
+{
+       int i;
+       int epd = PAGE_SIZE / sizeof(info->mfns[0]); /* mfn entries that fit in one page */
+
+       for (i = 0; i < info->nr_pages; i++) /* struct page of each framebuffer page, used by xenfb_vm_fault() */
+               info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE);
+
+       for (i = 0; i < info->nr_pages; i++) /* machine frame number of each framebuffer page */
+               info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE);
+
+       for (i = 0; i * epd < info->nr_pages; i++) /* pd[]: one entry per page of the mfns array itself */
+               info->page->pd[i] = vmalloc_to_mfn(&info->mfns[i * epd]);
+
+       info->page->width = fb_info->var.xres;
+       info->page->height = fb_info->var.yres;
+       info->page->depth = fb_info->var.bits_per_pixel;
+       info->page->line_length = fb_info->fix.line_length;
+       info->page->mem_length = fb_info->fix.smem_len;
+       info->page->in_cons = info->page->in_prod = 0; /* start with both rings empty */
+       info->page->out_cons = info->page->out_prod = 0;
+}
+
+static int xenfb_connect_backend(struct xenbus_device *dev,
+                                struct xenfb_info *info)
+{
+       int ret, irq;
+       struct xenbus_transaction xbt;
+
+       irq = bind_listening_port_to_irqhandler(
+               dev->otherend_id, xenfb_event_handler, 0, "xenfb", info);
+       if (irq < 0) {
+               xenbus_dev_fatal(dev, irq,
+                                "bind_listening_port_to_irqhandler");
+               return irq;
+       }
+
+ again: /* the whole transaction is redone when ending it returns -EAGAIN */
+       ret = xenbus_transaction_start(&xbt);
+       if (ret) {
+               xenbus_dev_fatal(dev, ret, "starting transaction");
+               goto unbind_irq;
+       }
+       ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
+                           virt_to_mfn(info->page)); /* machine frame of the shared page */
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                           irq_to_evtchn_port(irq));
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+                           XEN_IO_PROTO_ABI_NATIVE);
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1");
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_transaction_end(xbt, 0);
+       if (ret) {
+               if (ret == -EAGAIN)
+                       goto again;
+               xenbus_dev_fatal(dev, ret, "completing transaction");
+               goto unbind_irq;
+       }
+
+       info->irq = irq; /* recorded only after the xenstore writes succeeded */
+       xenbus_switch_state(dev, XenbusStateInitialised);
+       return 0;
+
+ error_xenbus:
+       xenbus_transaction_end(xbt, 1); /* second argument 1: abort the transaction */
+       xenbus_dev_fatal(dev, ret, "writing xenstore");
+ unbind_irq:
+       unbind_from_irqhandler(irq, info); /* info->irq was never set on these paths */
+       return ret;
+}
+
+/*
+ * Unbind the irq handler if one is currently bound and mark the device
+ * as having no irq (info->irq == -1).
+ */
+static void xenfb_disconnect_backend(struct xenfb_info *info)
+{
+       if (info->irq < 0) {
+               info->irq = -1;
+               return;
+       }
+
+       unbind_from_irqhandler(info->irq, info);
+       info->irq = -1;
+}
+
+/*
+ * Xenbus callback invoked when the backend changes state.
+ *
+ * InitWait moves the frontend to Connected.  Connected (seen while the
+ * frontend itself is already Connected) reads the backend's
+ * "feature-resize" and "request-update" keys and, if updates were
+ * requested and no update thread exists yet, starts xenfb_thread().
+ * Closing closes the frontend.  All other states are ignored.
+ */
+static void xenfb_backend_changed(struct xenbus_device *dev,
+                                 enum xenbus_state backend_state)
+{
+       struct xenfb_info *info = dev_get_drvdata(&dev->dev);
+       int val;
+
+       switch (backend_state) {
+       case XenbusStateInitialising:
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               break;
+
+       case XenbusStateInitWait:
+       InitWait:
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateConnected:
+               /*
+                * Work around xenbus race condition: If backend goes
+                * through InitWait to Connected fast enough, we can
+                * get Connected twice here.
+                */
+               if (dev->state != XenbusStateConnected)
+                       goto InitWait; /* no InitWait seen yet, fudge it */
+
+               /* A missing or unreadable key is treated as "feature off". */
+               if (xenbus_scanf(XBT_NIL, dev->otherend,
+                                "feature-resize", "%d", &val) < 0)
+                       val = 0;
+               info->feature_resize = val;
+
+               if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                "request-update", "%d", &val) < 0)
+                       val = 0;
+
+               if (val && !info->kthread) {
+                       info->kthread = kthread_run(xenfb_thread, info,
+                                                   "xenfb thread");
+                       if (IS_ERR(info->kthread)) {
+                               /*
+                                * Extract the error code before clearing the
+                                * pointer; PTR_ERR(NULL) would report 0.
+                                */
+                               int err = PTR_ERR(info->kthread);
+
+                               info->kthread = NULL;
+                               xenbus_dev_fatal(dev, err, "xenfb_thread");
+                       }
+               }
+               break;
+
+       case XenbusStateClosing:
+               // FIXME is this safe in any dev->state?
+               xenbus_frontend_closed(dev);
+               break;
+       }
+}
+
+/*
+ * Xenbus device types handled by this frontend.  The table ends with an
+ * empty-string entry.
+ */
+static const struct xenbus_device_id xenfb_ids[] = {
+       { "vfb" },
+       { "" }
+};
+MODULE_ALIAS("xen:vfb");
+
+/* Driver callbacks; registered with xenbus by xenfb_init(). */
+static DEFINE_XENBUS_DRIVER(xenfb, ,
+       .probe = xenfb_probe,
+       .remove = xenfb_remove,
+       .resume = xenfb_resume,
+       .otherend_changed = xenfb_backend_changed,
+);
+
+/*
+ * Module init: register the frontend driver, but only when running on
+ * Xen in a domain other than the initial one (dom0 has nothing to do).
+ */
+static int __init xenfb_init(void)
+{
+       if (!is_running_on_xen() || is_initial_xendomain())
+               return -ENODEV;
+
+       return xenbus_register_frontend(&xenfb_driver);
+}
+
+/* Module exit: unregister the frontend driver from xenbus. */
+static void __exit xenfb_cleanup(void)
+{
+       xenbus_unregister_driver(&xenfb_driver);
+}
+
+/* Module entry and exit points. */
+module_init(xenfb_init);
+module_exit(xenfb_cleanup);
+
+MODULE_DESCRIPTION("Xen virtual framebuffer device frontend");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/fbfront/xenkbd.c b/drivers/xen/fbfront/xenkbd.c

new file mode 100644 (file)

index 0000000..a9ebe8f
--- /dev/null
+++ b/drivers/xen/fbfront/xenkbd.c
@@ -0,0 +1,366 @@
+/*
+ * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device
+ *
+ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com>
+ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com>
+ *
+ *  Based on linux/drivers/input/mouse/sermouse.c
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License. See the file COPYING in the main directory of this archive for
+ *  more details.
+ */
+
+/*
+ * TODO:
+ *
+ * Switch to grant tables together with xenfb.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/input.h>
+#include <asm/hypervisor.h>
+#include <xen/evtchn.h>
+#include <xen/interface/io/fbif.h>
+#include <xen/interface/io/kbdif.h>
+#include <xen/xenbus.h>
+
+/* Per-device state of the virtual keyboard/pointer frontend. */
+struct xenkbd_info
+{
+       /* Input device registered as "Xen Virtual Keyboard". */
+       struct input_dev *kbd;
+       /* Input device registered as "Xen Virtual Pointer". */
+       struct input_dev *ptr;
+       /* Page holding the event rings; its mfn is published as "page-ref". */
+       struct xenkbd_page *page;
+       /* Irq bound by xenkbd_connect_backend(); -1 after disconnecting. */
+       int irq;
+       /* The xenbus device this state belongs to. */
+       struct xenbus_device *xbdev;
+       /* "xenbus/<nodename>" string used as the input devices' phys path. */
+       char phys[32];
+};
+
+static int xenkbd_remove(struct xenbus_device *);
+static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *);
+static void xenkbd_disconnect_backend(struct xenkbd_info *);
+
+/*
+ * Note: if you need to send out events, see xenfb_do_update() for how
+ * to do that.
+ */
+
+/*
+ * Interrupt handler: drain the "in" ring of the shared page and turn each
+ * event into input-layer reports.
+ *
+ * Motion and position events go to the pointer device.  Key events go to
+ * whichever device advertises the keycode in its keybit map (the pointer
+ * wins if both do); keycodes known to neither are logged and dropped.
+ * After the ring is drained the consumer index is advanced and the remote
+ * end is notified.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t input_handler(int rq, void *dev_id)
+{
+       struct xenkbd_info *info = dev_id;
+       struct xenkbd_page *page = info->page;
+       __u32 cons, prod;
+
+       prod = page->in_prod;
+       /* Nothing new in the ring. */
+       if (prod == page->in_cons)
+               return IRQ_HANDLED;
+       rmb();                  /* ensure we see ring contents up to prod */
+       for (cons = page->in_cons; cons != prod; cons++) {
+               union xenkbd_in_event *event;
+               struct input_dev *dev;
+               event = &XENKBD_IN_RING_REF(page, cons);
+
+               /* Default target; only key events may pick another device. */
+               dev = info->ptr;
+               switch (event->type) {
+               case XENKBD_TYPE_MOTION:
+                       /* rel_z is reported on REL_WHEEL with its sign flipped. */
+                       if (event->motion.rel_z)
+                               input_report_rel(dev, REL_WHEEL,
+                                                -event->motion.rel_z);
+                       input_report_rel(dev, REL_X, event->motion.rel_x);
+                       input_report_rel(dev, REL_Y, event->motion.rel_y);
+                       break;
+               case XENKBD_TYPE_KEY:
+                       dev = NULL;
+                       if (test_bit(event->key.keycode, info->kbd->keybit))
+                               dev = info->kbd;
+                       if (test_bit(event->key.keycode, info->ptr->keybit))
+                               dev = info->ptr;
+                       if (dev)
+                               input_report_key(dev, event->key.keycode,
+                                                event->key.pressed);
+                       else
+                               pr_warning("xenkbd: unhandled keycode 0x%x\n",
+                                          event->key.keycode);
+                       break;
+               case XENKBD_TYPE_POS:
+                       if (event->pos.rel_z)
+                               input_report_rel(dev, REL_WHEEL,
+                                                -event->pos.rel_z);
+                       input_report_abs(dev, ABS_X, event->pos.abs_x);
+                       input_report_abs(dev, ABS_Y, event->pos.abs_y);
+                       break;
+               }
+               /* dev is NULL only for a key event no device claimed. */
+               if (dev)
+                       input_sync(dev);
+       }
+       mb();                   /* ensure we got ring contents */
+       page->in_cons = cons;
+       notify_remote_via_irq(info->irq);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Probe callback for a "vkbd" device.
+ *
+ * Allocates the per-device state and the shared ring page, asks for
+ * absolute pointer coordinates when the backend offers them, registers
+ * one input device for the keyboard and one for the pointer, and finally
+ * connects to the backend.
+ *
+ * Returns 0 on success or a negative errno.  On failure everything set
+ * up so far is torn down through xenkbd_remove().
+ */
+int __devinit xenkbd_probe(struct xenbus_device *dev,
+                          const struct xenbus_device_id *id)
+{
+       int ret, i, abs;
+       struct xenkbd_info *info;
+       struct input_dev *kbd, *ptr;
+
+       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       if (!info) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+               return -ENOMEM;
+       }
+       dev_set_drvdata(&dev->dev, info);
+       info->xbdev = dev;
+       /*
+        * kzalloc() leaves irq at 0, which xenkbd_disconnect_backend()
+        * treats as a bound irq.  Mark it unbound so that an early error
+        * path does not unbind irq 0.
+        */
+       info->irq = -1;
+       snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename);
+
+       info->page = (void *)__get_free_page(GFP_KERNEL);
+       if (!info->page)
+               goto error_nomem;
+       info->page->in_cons = info->page->in_prod = 0;
+       info->page->out_cons = info->page->out_prod = 0;
+
+       /* A missing "feature-abs-pointer" key means relative mode. */
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-abs-pointer", "%d", &abs) < 0)
+               abs = 0;
+       if (abs)
+               xenbus_printf(XBT_NIL, dev->nodename, "request-abs-pointer", "1");
+
+       /* keyboard */
+       kbd = input_allocate_device();
+       if (!kbd)
+               goto error_nomem;
+       kbd->name = "Xen Virtual Keyboard";
+       kbd->phys = info->phys;
+       kbd->id.bustype = BUS_PCI;
+       kbd->id.vendor = 0x5853;
+       kbd->id.product = 0xffff;
+       __set_bit(EV_KEY, kbd->evbit);
+       for (i = KEY_ESC; i < KEY_UNKNOWN; i++)
+               __set_bit(i, kbd->keybit);
+       for (i = KEY_OK; i < KEY_MAX; i++)
+               __set_bit(i, kbd->keybit);
+
+       ret = input_register_device(kbd);
+       if (ret) {
+               input_free_device(kbd);
+               xenbus_dev_fatal(dev, ret, "input_register_device(kbd)");
+               goto error;
+       }
+       /* Stored only once registered, so teardown never sees a freed device. */
+       info->kbd = kbd;
+
+       /* pointing device */
+       ptr = input_allocate_device();
+       if (!ptr)
+               goto error_nomem;
+       ptr->name = "Xen Virtual Pointer";
+       ptr->phys = info->phys;
+       ptr->id.bustype = BUS_PCI;
+       ptr->id.vendor = 0x5853;
+       ptr->id.product = 0xfffe;
+
+       if (abs) {
+               __set_bit(EV_ABS, ptr->evbit);
+               input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0);
+               input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0);
+       } else {
+               input_set_capability(ptr, EV_REL, REL_X);
+               input_set_capability(ptr, EV_REL, REL_Y);
+       }
+       input_set_capability(ptr, EV_REL, REL_WHEEL);
+
+       __set_bit(EV_KEY, ptr->evbit);
+       for (i = BTN_LEFT; i <= BTN_TASK; i++)
+               __set_bit(i, ptr->keybit);
+
+       ret = input_register_device(ptr);
+       if (ret) {
+               input_free_device(ptr);
+               xenbus_dev_fatal(dev, ret, "input_register_device(ptr)");
+               goto error;
+       }
+       info->ptr = ptr;
+
+       ret = xenkbd_connect_backend(dev, info);
+       if (ret < 0)
+               goto error;
+
+       return 0;
+
+ error_nomem:
+       ret = -ENOMEM;
+       xenbus_dev_fatal(dev, ret, "allocating device memory");
+ error:
+       xenkbd_remove(dev);
+       return ret;
+}
+
+/*
+ * Resume callback: drop the old backend binding, reset all four ring
+ * indices of the shared page and connect to the backend again.
+ * Returns the result of xenkbd_connect_backend().
+ */
+static int xenkbd_resume(struct xenbus_device *dev)
+{
+       struct xenkbd_info *kbd = dev_get_drvdata(&dev->dev);
+
+       xenkbd_disconnect_backend(kbd);
+
+       kbd->page->in_prod = 0;
+       kbd->page->in_cons = 0;
+       kbd->page->out_prod = 0;
+       kbd->page->out_cons = 0;
+
+       return xenkbd_connect_backend(dev, kbd);
+}
+
+/*
+ * Remove callback, also used by xenkbd_probe() to unwind a failed probe.
+ *
+ * Disconnects from the backend, unregisters whichever input devices were
+ * registered, and frees the shared page and the per-device state.
+ * Always returns 0.
+ */
+static int xenkbd_remove(struct xenbus_device *dev)
+{
+       struct xenkbd_info *info = dev_get_drvdata(&dev->dev);
+
+       xenkbd_disconnect_backend(info);
+       /*
+        * When called from a failed probe, kbd and/or ptr may still be
+        * NULL; only unregister devices that were actually registered.
+        */
+       if (info->kbd)
+               input_unregister_device(info->kbd);
+       if (info->ptr)
+               input_unregister_device(info->ptr);
+       free_page((unsigned long)info->page);
+       kfree(info);
+       return 0;
+}
+
+/*
+ * Bind a listening port to input_handler() and advertise the frontend's
+ * connection details in xenstore.
+ *
+ * The bound irq is stored in info->irq straight away.  Within one xenbus
+ * transaction "page-ref" and "event-channel" are written under
+ * dev->nodename; on success the device is switched to
+ * XenbusStateInitialised and 0 is returned.
+ *
+ * On a transaction failure the error is returned with the irq still
+ * recorded in info->irq; this function does not unbind it.
+ */
+static int xenkbd_connect_backend(struct xenbus_device *dev,
+                                 struct xenkbd_info *info)
+{
+       int ret;
+       struct xenbus_transaction xbt;
+
+       ret = bind_listening_port_to_irqhandler(
+               dev->otherend_id, input_handler, 0, "xenkbd", info);
+       if (ret < 0) {
+               xenbus_dev_fatal(dev, ret,
+                                "bind_listening_port_to_irqhandler");
+               return ret;
+       }
+       info->irq = ret;
+
+       /* The whole transaction is restarted when ending it yields -EAGAIN. */
+ again:
+       ret = xenbus_transaction_start(&xbt);
+       if (ret) {
+               xenbus_dev_fatal(dev, ret, "starting transaction");
+               return ret;
+       }
+       ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu",
+                           virt_to_mfn(info->page));
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                           irq_to_evtchn_port(info->irq));
+       if (ret)
+               goto error_xenbus;
+       ret = xenbus_transaction_end(xbt, 0);
+       if (ret) {
+               if (ret == -EAGAIN)
+                       goto again;
+               xenbus_dev_fatal(dev, ret, "completing transaction");
+               return ret;
+       }
+
+       xenbus_switch_state(dev, XenbusStateInitialised);
+       return 0;
+
+       /* A write failed: end the open transaction with the abort flag set. */
+ error_xenbus:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, ret, "writing xenstore");
+       return ret;
+}
+
+/*
+ * Unbind the irq handler if one is currently bound and mark the device
+ * as having no irq (info->irq == -1).
+ */
+static void xenkbd_disconnect_backend(struct xenkbd_info *info)
+{
+       const int bound_irq = info->irq;
+
+       if (bound_irq >= 0)
+               unbind_from_irqhandler(bound_irq, info);
+
+       info->irq = -1;
+}
+
+/*
+ * Xenbus callback invoked when the backend changes state.
+ *
+ * InitWait requests absolute pointer mode if the backend offers it and
+ * moves the frontend to Connected.  Connected (seen while the frontend
+ * itself is already Connected) adjusts the pointer's absolute axis ranges
+ * to the backend's "width"/"height" keys when they can be read.  Closing
+ * closes the frontend.  All other states are ignored.
+ */
+static void xenkbd_backend_changed(struct xenbus_device *dev,
+                                  enum xenbus_state backend_state)
+{
+       struct xenkbd_info *info = dev_get_drvdata(&dev->dev);
+       int ret, val;
+
+       switch (backend_state) {
+       case XenbusStateInitialising:
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               break;
+
+       case XenbusStateInitWait:
+       InitWait:
+               /* A missing "feature-abs-pointer" key means relative mode. */
+               ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                  "feature-abs-pointer", "%d", &val);
+               if (ret < 0)
+                       val = 0;
+               if (val) {
+                       ret = xenbus_printf(XBT_NIL, info->xbdev->nodename,
+                                           "request-abs-pointer", "1");
+                       /*
+                        * Not fatal (the connection still proceeds), but
+                        * make the failure visible instead of dropping it.
+                        */
+                       if (ret)
+                               pr_warning("xenkbd: can't request abs-pointer\n");
+               }
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateConnected:
+               /*
+                * Work around xenbus race condition: If backend goes
+                * through InitWait to Connected fast enough, we can
+                * get Connected twice here.
+                */
+               if (dev->state != XenbusStateConnected)
+                       goto InitWait; /* no InitWait seen yet, fudge it */
+
+               /* Set input abs params to match backend screen res */
+               if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                  "width", "%d", &val) > 0 )
+                       input_set_abs_params(info->ptr, ABS_X, 0, val, 0, 0);
+
+               if (xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                                  "height", "%d", &val) > 0 )
+                       input_set_abs_params(info->ptr, ABS_Y, 0, val, 0, 0);
+
+               break;
+
+       case XenbusStateClosing:
+               xenbus_frontend_closed(dev);
+               break;
+       }
+}
+
+/*
+ * Xenbus device types handled by this frontend.  The table ends with an
+ * empty-string entry.
+ */
+static const struct xenbus_device_id xenkbd_ids[] = {
+       { "vkbd" },
+       { "" }
+};
+MODULE_ALIAS("xen:vkbd");
+
+/* Driver callbacks; registered with xenbus by xenkbd_init(). */
+static DEFINE_XENBUS_DRIVER(xenkbd, ,
+       .probe = xenkbd_probe,
+       .remove = xenkbd_remove,
+       .resume = xenkbd_resume,
+       .otherend_changed = xenkbd_backend_changed,
+);
+
+/*
+ * Module init: register the frontend driver, but only when running on
+ * Xen in a domain other than the initial one (dom0 has nothing to do).
+ */
+static int __init xenkbd_init(void)
+{
+       if (!is_running_on_xen() || is_initial_xendomain())
+               return -ENODEV;
+
+       return xenbus_register_frontend(&xenkbd_driver);
+}
+
+/* Module exit: unregister the frontend driver from xenbus. */
+static void __exit xenkbd_cleanup(void)
+{
+       xenbus_unregister_driver(&xenkbd_driver);
+}
+
+/* Module entry and exit points. */
+module_init(xenkbd_init);
+module_exit(xenkbd_cleanup);
+
+MODULE_DESCRIPTION("Xen virtual keyboard/pointer device frontend");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/features.c b/drivers/xen/features.c

index 99eda16..fece0f2 100644 (file)
--- a/drivers/xen/features.c
+++ b/drivers/xen/features.c
@@ -9,14 +9,21 @@
  #include <linux/cache.h>
  #include <linux/module.h>
  
+#ifdef CONFIG_PARAVIRT_XEN
  #include <asm/xen/hypercall.h>
+#else
+#include <asm/hypervisor.h>
+#endif
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
  
  #include <xen/interface/xen.h>
  #include <xen/interface/version.h>
  #include <xen/features.h>
  
  u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
+EXPORT_SYMBOL(xen_features);
  
  void xen_setup_features(void)
  {
diff --git a/drivers/xen/gntdev/Makefile b/drivers/xen/gntdev/Makefile

new file mode 100644 (file)

index 0000000..8bd8c62
--- /dev/null
+++ b/drivers/xen/gntdev/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_XEN_GRANT_DEV) := gntdev.o
diff --git a/drivers/xen/gntdev/gntdev.c b/drivers/xen/gntdev/gntdev.c

new file mode 100644 (file)

index 0000000..3ec5dbc
--- /dev/null
+++ b/drivers/xen/gntdev/gntdev.c
@@ -0,0 +1,1012 @@
+/******************************************************************************
+ * gntdev.c
+ * 
+ * Device for accessing (in user-space) pages that have been granted by other
+ * domains.
+ *
+ * Copyright (c) 2006-2007, D G Murray.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/public/gntdev.h>
+
+
+#define DRIVER_AUTHOR "Derek G. Murray <Derek.Murray@cl.cam.ac.uk>"
+#define DRIVER_DESC   "User-space granted page access driver"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+
+#define GNTDEV_NAME "gntdev"
+MODULE_ALIAS("devname:xen/" GNTDEV_NAME);
+
+#define MAX_GRANTS_LIMIT   1024
+#define DEFAULT_MAX_GRANTS 128
+
+/* A slot can be in one of three states:
+ *
+ * 0. GNTDEV_SLOT_INVALID:
+ *    This slot is not associated with a grant reference, and is therefore free
+ *    to be overwritten by a new grant reference.
+ *
+ * 1. GNTDEV_SLOT_NOT_YET_MAPPED:
+ *    This slot is associated with a grant reference (via the 
+ *    IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed.
+ *
+ * 2. GNTDEV_SLOT_MAPPED:
+ *    This slot is associated with a grant reference, and has been mmap()-ed.
+ */
+typedef enum gntdev_slot_state {
+       /* No grant reference stored; the slot is free for reuse. */
+       GNTDEV_SLOT_INVALID = 0,
+       /* Grant reference stored, but the slot has not been mmap()-ed. */
+       GNTDEV_SLOT_NOT_YET_MAPPED,
+       /* Grant reference stored and the slot has been mmap()-ed. */
+       GNTDEV_SLOT_MAPPED
+} gntdev_slot_state_t;
+
+#define GNTDEV_INVALID_HANDLE    -1
+#define GNTDEV_FREE_LIST_INVALID -1
+/* Each opened instance of gntdev is associated with a list of grants,
+ * represented by an array of elements of the following type,
+ * gntdev_grant_info_t.
+ */
+typedef struct gntdev_grant_info {
+       /* Current state of this slot. */
+       gntdev_slot_state_t state;
+       union {
+               /* While the slot is free: its position in the free list. */
+               uint32_t free_list_index;
+               /* Once a grant reference has been stored in the slot. */
+               struct {
+                       /* domid and ref are copied from the ioctl request. */
+                       domid_t domid;
+                       grant_ref_t ref;
+                       /* NOTE(review): the handles and bus address are
+                        * presumably filled in when the grant is mapped;
+                        * that code is not visible here - confirm. */
+                       grant_handle_t kernel_handle;
+                       grant_handle_t user_handle;
+                       uint64_t dev_bus_addr;
+               } valid;
+       } u;
+} gntdev_grant_info_t;
+
+/* Private data structure, which is stored in the file pointer for files
+ * associated with this device.
+ */
+typedef struct gntdev_file_private_data {
+  
+       /* Array of grant information. */
+       gntdev_grant_info_t *grants;
+       /* Number of entries in @grants; set by init_private_data(). */
+       uint32_t grants_size;
+
+       /* Read/write semaphore used to protect the grants array. */
+       struct rw_semaphore grants_sem;
+
+       /* An array of indices of free slots in the grants array.
+        * N.B. An entry in this list may temporarily have the value
+        * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed
+        * from the list by the contiguous allocator, but the list has not yet
+        * been compressed. However, this is not visible across invocations of
+        * the device.
+        */
+       int32_t *free_list;
+       
+       /* The number of free slots in the grants array. */
+       uint32_t free_list_size;
+
+       /* Read/write semaphore used to protect the free list. */
+       struct rw_semaphore free_list_sem;
+       
+       /* Index of the next slot after the most recent contiguous allocation, 
+        * for use in a next-fit allocator.
+        */
+       uint32_t next_fit_index;
+
+       /* Used to map grants into the kernel, before mapping them into user
+        * space. Allocated by init_private_data(), one page per slot.
+        */
+       struct page **foreign_pages;
+
+} gntdev_file_private_data_t;
+
+/* Module lifecycle operations. */
+static int __init gntdev_init(void);
+static void __exit gntdev_exit(void);
+
+module_init(gntdev_init);
+module_exit(gntdev_exit);
+
+/* File operations. */
+static int gntdev_open(struct inode *inode, struct file *flip);
+static int gntdev_release(struct inode *inode, struct file *flip);
+static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma);
+static long gntdev_ioctl(struct file *flip,
+                        unsigned int cmd, unsigned long arg);
+
+/* File operations of the gntdev device.  Seeking is not supported
+ * (no_llseek); ioctls are dispatched through unlocked_ioctl.
+ */
+static const struct file_operations gntdev_fops = {
+       .owner = THIS_MODULE,
+       .open = gntdev_open,
+       .llseek = no_llseek,
+       .release = gntdev_release,
+       .mmap = gntdev_mmap,
+       .unlocked_ioctl = gntdev_ioctl
+};
+
+/* VM operations. */
+static void gntdev_vma_close(struct vm_area_struct *vma);
+static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
+                             pte_t *ptep, int is_fullmm);
+
+/* VM operations table for gntdev: a close callback plus a zap_pte hook
+ * implemented by gntdev_clear_pte().
+ */
+static struct vm_operations_struct gntdev_vmops = {
+       .close = gntdev_vma_close,
+       .zap_pte = gntdev_clear_pte
+};
+
+/* Memory mapping functions
+ * ------------------------
+ *
+ * Every granted page is mapped into both kernel and user space, and the two
+ * following functions return the respective virtual addresses of these pages.
+ *
+ * When shadow paging is disabled, the granted page is mapped directly into
+ * user space; when it is enabled, it is mapped into the kernel and remapped
+ * into user space using vm_insert_page() (see gntdev_mmap(), below).
+ */
+
+/* Computes the user-space virtual address of page number @page_index
+ * inside the VM area @vma, counting pages from vma->vm_start.
+ */
+static inline unsigned long get_user_vaddr (struct vm_area_struct *vma,
+                                           int page_index)
+{
+       unsigned long area_base = (unsigned long) vma->vm_start;
+
+       return area_base + (page_index << PAGE_SHIFT);
+}
+
+/* Computes the kernel-space virtual address of the page backing slot
+ * @slot_index of the gntdev instance described by @priv.
+ */
+static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv,
+                                             int slot_index)
+{
+       struct page *backing = priv->foreign_pages[slot_index];
+
+       return (unsigned long) pfn_to_kaddr(page_to_pfn(backing));
+}
+
+/* Helper functions. */
+
+/* Takes the last entry off the free list of @private_data, stores the
+ * grant reference described by @op in that slot and marks the slot as
+ * not yet mapped. *offset receives the offset to pass to mmap() in order
+ * to map the grant reference. Always returns 0.
+ */
+static int add_grant_reference(gntdev_file_private_data_t *private_data,
+                              struct ioctl_gntdev_grant_ref *op,
+                              uint64_t *offset)
+{
+       gntdev_grant_info_t *slot;
+       uint32_t top, idx;
+
+       /* Pop the slot index from the end of the free list. */
+       top = private_data->free_list_size - 1;
+       private_data->free_list_size = top;
+       idx = private_data->free_list[top];
+       private_data->free_list[top] = GNTDEV_FREE_LIST_INVALID;
+
+       /* Record the grant in the slot that was just claimed. */
+       slot = &private_data->grants[idx];
+       slot->state = GNTDEV_SLOT_NOT_YET_MAPPED;
+       slot->u.valid.domid = op->domid;
+       slot->u.valid.ref = op->ref;
+
+       /* The slot index, scaled to pages, is the offset into the
+        * device's virtual "file address space".
+        */
+       *offset = idx << PAGE_SHIFT;
+
+       return 0;
+}
+
+/* Stores the @count grant references in @ops into consecutive slots of
+ * the grants array, starting at @first_slot. @first_slot is expected to
+ * come from find_contiguous_free_range() during the same invocation of
+ * the driver. Always returns 0.
+ */
+static int add_grant_references(gntdev_file_private_data_t *private_data,
+                               uint32_t count,
+                               struct ioctl_gntdev_grant_ref *ops,
+                               uint32_t first_slot)
+{
+       gntdev_grant_info_t *slot = &private_data->grants[first_slot];
+       uint32_t n;
+
+       for (n = 0; n < count; ++n, ++slot) {
+               /* Read the free-list position first: it shares storage
+                * with the fields written below.
+                */
+               uint32_t pos = slot->u.free_list_index;
+
+               private_data->free_list[pos] = GNTDEV_FREE_LIST_INVALID;
+
+               slot->state = GNTDEV_SLOT_NOT_YET_MAPPED;
+               slot->u.valid.domid = ops[n].domid;
+               slot->u.valid.ref = ops[n].ref;
+       }
+
+       return 0;
+}
+
+/* Compacts the free list of @private_data in place: entries marked
+ * GNTDEV_FREE_LIST_INVALID are dropped, the remaining entries are moved
+ * towards the front (keeping their order) and each moved slot's
+ * free_list_index is updated. The recorded size of the free list is
+ * reduced to the number of valid entries.
+ */
+static void compress_free_list(gntdev_file_private_data_t *private_data)
+{
+       const uint32_t old_size = private_data->free_list_size;
+       uint32_t src, dst = 0;
+
+       for (src = 0; src < old_size; ++src) {
+               int32_t slot_index = private_data->free_list[src];
+
+               if (slot_index == GNTDEV_FREE_LIST_INVALID) {
+                       /* One fewer valid entry in the list. */
+                       --private_data->free_list_size;
+                       continue;
+               }
+
+               if (src > dst) {
+                       /* Move the entry down into the first gap. */
+                       private_data->free_list[dst] = slot_index;
+                       private_data->grants[slot_index].u.free_list_index
+                               = dst;
+                       private_data->free_list[src]
+                               = GNTDEV_FREE_LIST_INVALID;
+               }
+               ++dst;
+       }
+}
+
+/* Scans grants[@begin .. @end) of @private_data for the first run of
+ * @num_slots consecutive slots in the GNTDEV_SLOT_INVALID state.
+ *
+ * Returns the index of the first slot of the run, or -ENOMEM if none.
+ */
+static int scan_for_free_range(gntdev_file_private_data_t *private_data,
+                              uint32_t begin, uint32_t end,
+                              uint32_t num_slots)
+{
+       uint32_t i, range_start = 0, range_length = 0;
+
+       for (i = begin; i < end; ++i) {
+               if (private_data->grants[i].state != GNTDEV_SLOT_INVALID) {
+                       /* An occupied slot breaks the run: start over. */
+                       range_length = 0;
+                       continue;
+               }
+               if (range_length == 0)
+                       range_start = i;
+               if (++range_length == num_slots)
+                       return range_start;
+       }
+
+       return -ENOMEM;
+}
+
+/* Searches the grant array of @private_data for a range of @num_slots
+ * contiguous slots in the GNTDEV_SLOT_INVALID state, using a next-fit
+ * strategy: the search starts at next_fit_index and wraps around to the
+ * start of the array.
+ *
+ * Returns the index of the first slot if a range is found, otherwise -ENOMEM.
+ */
+static int find_contiguous_free_range(gntdev_file_private_data_t *private_data,
+                                     uint32_t num_slots)
+{
+       uint32_t size = private_data->grants_size;
+       uint32_t start_index = private_data->next_fit_index;
+       uint32_t tail, wrap_end;
+       int ret;
+
+       /* An empty request or one larger than the array can never fit. */
+       if (num_slots == 0 || num_slots > size)
+               return -ENOMEM;
+
+       /* Never scan past the end of the array. */
+       if (start_index > size)
+               start_index = size;
+
+       /* First search from the start_index to the end of the array. */
+       ret = scan_for_free_range(private_data, start_index, size, num_slots);
+       if (ret >= 0)
+               return ret;
+
+       /* Now search for ranges that begin before start_index. Such a
+        * range may extend up to num_slots - 1 slots past start_index, so
+        * the scan continues that far (bounded by the array size).
+        */
+       tail = size - start_index;
+       wrap_end = start_index + (num_slots - 1 < tail ? num_slots - 1 : tail);
+
+       return scan_for_free_range(private_data, 0, wrap_end, num_slots);
+}
+
+/* Allocates and initialises the per-file grant bookkeeping of @priv for
+ * @max_grants slots: the pages used for kernel mappings, the grants array
+ * and the free list, which initially contains every slot.
+ *
+ * Returns 0 on success or -ENOMEM. On failure nothing stays allocated and
+ * the pointers in @priv that were touched are reset to NULL, so later
+ * non-NULL checks on them cannot lead to a second free.
+ */
+static int init_private_data(gntdev_file_private_data_t *priv,
+                            uint32_t max_grants)
+{
+       uint32_t i;
+
+       /* Allocate space for the kernel-mapping of granted pages. */
+       priv->foreign_pages =
+               alloc_empty_pages_and_pagevec(max_grants);
+       if (!priv->foreign_pages)
+               goto nomem_out;
+
+       /* Allocate the grant list and free-list. */
+       priv->grants = kmalloc(max_grants * sizeof(gntdev_grant_info_t),
+                              GFP_KERNEL);
+       if (!priv->grants)
+               goto nomem_out2;
+       priv->free_list = kmalloc(max_grants * sizeof(int32_t), GFP_KERNEL);
+       if (!priv->free_list)
+               goto nomem_out3;
+
+       /* Initialise the free-list, which contains all slots at first.
+        * Slots are stored in reverse, so slot 0 sits at the end of the
+        * list.
+        */
+       for (i = 0; i < max_grants; ++i) {
+               priv->free_list[max_grants - i - 1] = i;
+               priv->grants[i].state = GNTDEV_SLOT_INVALID;
+               priv->grants[i].u.free_list_index = max_grants - i - 1;
+       }
+       priv->grants_size = max_grants;
+       priv->free_list_size = max_grants;
+       priv->next_fit_index = 0;
+
+       return 0;
+
+nomem_out3:
+       kfree(priv->grants);
+       priv->grants = NULL;
+nomem_out2:
+       free_empty_pages_and_pagevec(priv->foreign_pages, max_grants);
+       priv->foreign_pages = NULL;
+nomem_out:
+       return -ENOMEM;
+
+}
+
+/* Interface functions. */
+
+/* Misc device descriptor registered by gntdev_init(): dynamically
+ * assigned minor number, device node name "xen/gntdev".
+ */
+static struct miscdevice gntdev_miscdev = {
+       .minor        = MISC_DYNAMIC_MINOR,
+       .name         = GNTDEV_NAME,
+       .nodename     = "xen/" GNTDEV_NAME,
+       .fops         = &gntdev_fops,
+};
+
+/* Module init: refuses to load outside Xen, otherwise registers the
+ * gntdev misc device. Returns 0 on success, -ENODEV when not running on
+ * Xen, or the error reported by misc_register().
+ */
+static int __init gntdev_init(void)
+{
+       int rc;
+
+       if (!is_running_on_xen()) {
+               pr_err("You must be running Xen to use gntdev\n");
+               return -ENODEV;
+       }
+
+       rc = misc_register(&gntdev_miscdev);
+       if (rc != 0)
+               pr_err("Could not register gntdev device\n");
+
+       return rc;
+}
+
+/* Cleans up and unregisters the driver. Called when the driver is unloaded.
+ *
+ * The only global state set up by gntdev_init() is the registration of
+ * gntdev_miscdev, so deregistering that misc device is all that is done here.
+ */
+static void __exit gntdev_exit(void)
+{
+       misc_deregister(&gntdev_miscdev);
+}
+
+/* open() handler for the device.
+ *
+ * Marks the file as non-seekable and attaches a freshly allocated per-file
+ * state block to @flip. Only the two semaphores are set up here; the grant
+ * array, free-list and foreign page vector are left NULL and are created
+ * later by init_private_data().
+ *
+ * Returns 0 on success or -ENOMEM if the state block cannot be allocated.
+ */
+static int gntdev_open(struct inode *inode, struct file *flip)
+{
+       gntdev_file_private_data_t *priv;
+
+       nonseekable_open(inode, flip);
+
+       priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+       if (priv == NULL)
+               return -ENOMEM;
+
+       /* NULL marks "not yet initialised" for the lazily created members. */
+       priv->foreign_pages = NULL;
+       priv->free_list = NULL;
+       priv->grants = NULL;
+
+       init_rwsem(&priv->grants_sem);
+       init_rwsem(&priv->free_list_sem);
+
+       flip->private_data = priv;
+
+       return 0;
+}
+
+/* release() handler, run when the device file is closed.
+ *
+ * Frees the foreign page vector (if one was allocated), the grant array, the
+ * free-list and finally the per-file state block itself. Always returns 0.
+ */
+static int gntdev_release(struct inode *inode, struct file *flip)
+{
+       gntdev_file_private_data_t *priv = flip->private_data;
+
+       if (!priv)
+               return 0;
+
+       if (priv->foreign_pages)
+               free_empty_pages_and_pagevec(priv->foreign_pages,
+                                            priv->grants_size);
+
+       /* kfree() ignores NULL, so members that were never allocated are
+        * harmless here.
+        */
+       kfree(priv->grants);
+       kfree(priv->free_list);
+       kfree(priv);
+
+       return 0;
+}
+
+/* Called when an attempt is made to mmap() the device. The private data from
+ * @flip contains the list of grant references that can be mapped. The vm_pgoff
+ * field of @vma contains the index into that list that refers to the grant
+ * reference that will be mapped. Only mappings that are a multiple of
+ * PAGE_SIZE are handled.
+ */
+static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma) 
+{
+       struct gnttab_map_grant_ref op;
+       unsigned long slot_index = vma->vm_pgoff;
+       unsigned long kernel_vaddr, user_vaddr;
+       uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       uint64_t ptep;
+       int ret, exit_ret;
+       int flags;
+       int i;
+       struct page *page;
+       gntdev_file_private_data_t *private_data = flip->private_data;
+
+       if (unlikely(!private_data)) {
+               pr_err("file's private data is NULL\n");
+               return -EINVAL;
+       }
+
+       /* Test to make sure that the grants array has been initialised. */
+       down_read(&private_data->grants_sem);
+       if (unlikely(!private_data->grants)) {
+               up_read(&private_data->grants_sem);
+               pr_err("attempted to mmap before ioctl\n");
+               return -EINVAL;
+       }
+       up_read(&private_data->grants_sem);
+
+       if (unlikely((size <= 0) || 
+                    (size + slot_index) > private_data->grants_size)) {
+               pr_err("Invalid number of pages or offset"
+                      "(num_pages = %d, first_slot = %ld)\n",
+                      size, slot_index);
+               return -ENXIO;
+       }
+
+       if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+               pr_err("writable mappings must be shared\n");
+               return -EINVAL;
+       }
+
+       /* Slots must be in the NOT_YET_MAPPED state. */
+       down_write(&private_data->grants_sem);
+       for (i = 0; i < size; ++i) {
+               if (private_data->grants[slot_index + i].state != 
+                   GNTDEV_SLOT_NOT_YET_MAPPED) {
+                       pr_err("Slot (index = %ld) is in the wrong "
+                              "state (%d)\n", slot_index + i,
+                              private_data->grants[slot_index + i].state);
+                       up_write(&private_data->grants_sem);
+                       return -EINVAL;
+               }
+       }
+
+       /* Install the hook for unmapping. */
+       vma->vm_ops = &gntdev_vmops;
+    
+       /* The VM area contains pages from another VM. */
+       vma->vm_flags |= VM_FOREIGN;
+       vma->vm_private_data = kzalloc(size * sizeof(struct page *),
+                                      GFP_KERNEL);
+       if (vma->vm_private_data == NULL) {
+               pr_err("couldn't allocate mapping structure for VM area\n");
+               return -ENOMEM;
+       }
+
+       /* This flag prevents Bad PTE errors when the memory is unmapped. */
+       vma->vm_flags |= VM_RESERVED;
+
+       /* This flag prevents this VM area being copied on a fork(). A better
+        * behaviour might be to explicitly carry out the appropriate mappings
+        * on fork(), but I don't know if there's a hook for this.
+        */
+       vma->vm_flags |= VM_DONTCOPY;
+
+#ifdef CONFIG_X86
+       /* This flag ensures that the page tables are not unpinned before the
+        * VM area is unmapped. Therefore Xen still recognises the PTE as
+        * belonging to an L1 pagetable, and the grant unmap operation will
+        * succeed, even if the process does not exit cleanly.
+        */
+       vma->vm_mm->context.has_foreign_mappings = 1;
+#endif
+
+       exit_ret = -ENOMEM;
+       for (i = 0; i < size; ++i) {
+
+               flags = GNTMAP_host_map;
+               if (!(vma->vm_flags & VM_WRITE))
+                       flags |= GNTMAP_readonly;
+
+               kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i);
+               user_vaddr = get_user_vaddr(vma, i);
+               page = private_data->foreign_pages[slot_index + i];
+
+               gnttab_set_map_op(&op, kernel_vaddr, flags,   
+                                 private_data->grants[slot_index+i]
+                                 .u.valid.ref, 
+                                 private_data->grants[slot_index+i]
+                                 .u.valid.domid);
+
+               /* Carry out the mapping of the grant reference. */
+               ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, 
+                                               &op, 1);
+               BUG_ON(ret);
+               if (op.status != GNTST_okay) {
+                       if (op.status != GNTST_eagain)
+                               pr_err("Error mapping the grant reference "
+                                      "into the kernel (%d). domid = %d; ref = %d\n",
+                                      op.status,
+                                      private_data->grants[slot_index+i]
+                                      .u.valid.domid,
+                                      private_data->grants[slot_index+i]
+                                      .u.valid.ref);
+                       else
+                               /* Propagate eagain instead of trying to fix it up */
+                               exit_ret = -EAGAIN;
+                       goto undo_map_out;
+               }
+
+               /* Store a reference to the page that will be mapped into user
+                * space.
+                */
+               ((struct page **) vma->vm_private_data)[i] = page;
+
+               /* Mark mapped page as reserved. */
+               SetPageReserved(page);
+
+               /* Record the grant handle, for use in the unmap operation. */
+               private_data->grants[slot_index+i].u.valid.kernel_handle = 
+                       op.handle;
+               private_data->grants[slot_index+i].u.valid.dev_bus_addr = 
+                       op.dev_bus_addr;
+               
+               private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED;
+               private_data->grants[slot_index+i].u.valid.user_handle =
+                       GNTDEV_INVALID_HANDLE;
+
+               /* Now perform the mapping to user space. */
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+
+                       /* NOT USING SHADOW PAGE TABLES. */
+                       /* In this case, we map the grant(s) straight into user
+                        * space.
+                        */
+
+                       /* Get the machine address of the PTE for the user 
+                        *  page.
+                        */
+                       if ((ret = create_lookup_pte_addr(vma->vm_mm, 
+                                                         vma->vm_start 
+                                                         + (i << PAGE_SHIFT), 
+                                                         &ptep)))
+                       {
+                               pr_err("Error obtaining PTE pointer (%d)\n",
+                                      ret);
+                               goto undo_map_out;
+                       }
+                       
+                       /* Configure the map operation. */
+               
+                       /* The reference is to be used by host CPUs. */
+                       flags = GNTMAP_host_map;
+                       
+                       /* Specifies a user space mapping. */
+                       flags |= GNTMAP_application_map;
+                       
+                       /* The map request contains the machine address of the
+                        * PTE to update.
+                        */
+                       flags |= GNTMAP_contains_pte;
+                       
+                       if (!(vma->vm_flags & VM_WRITE))
+                               flags |= GNTMAP_readonly;
+
+                       gnttab_set_map_op(&op, ptep, flags, 
+                                         private_data->grants[slot_index+i]
+                                         .u.valid.ref, 
+                                         private_data->grants[slot_index+i]
+                                         .u.valid.domid);
+
+                       /* Carry out the mapping of the grant reference. */
+                       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+                                                       &op, 1);
+                       BUG_ON(ret);
+                       if (op.status != GNTST_okay) {
+                               pr_err("Error mapping the grant "
+                                      "reference into user space (%d). domid "
+                                      "= %d; ref = %d\n", op.status,
+                                      private_data->grants[slot_index+i].u
+                                      .valid.domid,
+                                      private_data->grants[slot_index+i].u
+                                      .valid.ref);
+                               /* This should never happen after we've mapped into
+                               * the kernel space. */
+                               BUG_ON(op.status == GNTST_eagain);
+                               goto undo_map_out;
+                       }
+                       
+                       /* Record the grant handle, for use in the unmap 
+                        * operation. 
+                        */
+                       private_data->grants[slot_index+i].u.
+                               valid.user_handle = op.handle;
+
+                       /* Update p2m structure with the new mapping. */
+                       set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT,
+                                           FOREIGN_FRAME(private_data->
+                                                         grants[slot_index+i]
+                                                         .u.valid.dev_bus_addr
+                                                         >> PAGE_SHIFT));
+               } else {
+                       /* USING SHADOW PAGE TABLES. */
+                       /* In this case, we simply insert the page into the VM
+                        * area. */
+                       ret = vm_insert_page(vma, user_vaddr, page);
+               }
+
+       }
+       exit_ret = 0;
+
+       up_write(&private_data->grants_sem);
+       return exit_ret;
+
+undo_map_out:
+       /* If we have a mapping failure, the unmapping will be taken care of
+        * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte().
+        * All we need to do here is free the vma_private_data.
+        */
+       kfree(vma->vm_private_data);
+
+       /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file
+        * to NULL on failure. However, we need this in gntdev_clear_pte() to
+        * unmap the grants. Therefore, we smuggle a reference to the file's
+        * private data in the VM area's private data pointer.
+        */
+       vma->vm_private_data = private_data;
+       
+       up_write(&private_data->grants_sem);
+
+       return exit_ret;
+}
+
+/* Clears the PTE at @addr in @vma and, if the grant slot behind it is
+ * currently mapped, tears down the grant mappings for that slot.
+ *
+ * Returns the value the PTE held before it was cleared.
+ */
+static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr,
+                             pte_t *ptep, int is_fullmm)
+{
+       gntdev_file_private_data_t *private_data;
+       gntdev_grant_info_t *slot;
+       struct gnttab_unmap_grant_ref op;
+       pte_t copy;
+       int slot_index, ret;
+
+       /* Normally the per-file state is reached through vma->vm_file. After
+        * a failed mmap() do_mmap_pgoff() has already reset vm_file to NULL,
+        * so gntdev_mmap() leaves the per-file state in vm_private_data for
+        * us to pick up instead.
+        */
+       if (vma->vm_file) {
+               private_data = vma->vm_file->private_data;
+       } else if (vma->vm_private_data) {
+               private_data = vma->vm_private_data;
+       } else {
+               private_data = NULL; /* gcc warning */
+               BUG();
+       }
+
+       /* Work out which grant slot this PTE belongs to. */
+       slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+       slot = &private_data->grants[slot_index];
+
+       /* A slot that was never mapped (e.g. when called from a failing
+        * mmap()) has no grant to undo: just clear the PTE.
+        */
+       if (slot->state != GNTDEV_SLOT_MAPPED)
+               return xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+
+       if (slot->u.valid.user_handle != GNTDEV_INVALID_HANDLE &&
+           !xen_feature(XENFEAT_auto_translated_physmap)) {
+               /* A user-space grant mapping exists: remember the current PTE
+                * value for the caller and let the hypervisor undo it.
+                */
+               copy = *ptep;
+
+               gnttab_set_unmap_op(&op, ptep_to_machine(ptep),
+                                   GNTMAP_contains_pte,
+                                   slot->u.valid.user_handle);
+               ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                               &op, 1);
+               BUG_ON(ret);
+               if (op.status != GNTST_okay)
+                       pr_warning("User unmap grant status = %d\n",
+                                  op.status);
+       } else {
+               /* No user-space grant handle, or auto-translated physmap:
+                * clear the PTE the ordinary way.
+                */
+               copy = xen_ptep_get_and_clear_full(vma, addr, ptep, is_fullmm);
+       }
+
+       /* Next, drop the kernel-space mapping of the grant. */
+       gnttab_set_unmap_op(&op, get_kernel_vaddr(private_data, slot_index),
+                           GNTMAP_host_map, slot->u.valid.kernel_handle);
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
+       BUG_ON(ret);
+       if (op.status != GNTST_okay)
+               pr_warning("Kernel unmap grant status = %d\n",
+                          op.status);
+
+       /* The slot may now be mapped again or removed by a later ioctl. */
+       slot->state = GNTDEV_SLOT_NOT_YET_MAPPED;
+
+       /* Drop the physical-to-machine translation for the backing page. */
+       set_phys_to_machine(
+               page_to_pfn(private_data->foreign_pages[slot_index]),
+               INVALID_P2M_ENTRY);
+
+       return copy;
+}
+
+/* close() hook for a gntdev VM area: releases the buffer that gntdev_mmap()
+ * attached to vm_private_data. kfree() ignores NULL, so no guard is needed.
+ */
+static void gntdev_vma_close(struct vm_area_struct *vma)
+{
+       kfree(vma->vm_private_data);
+}
+
+/* Called when an ioctl is made on the device.
+ *
+ * Commands handled:
+ *   IOCTL_GNTDEV_MAP_GRANT_REF        - reserves slot(s) for the supplied
+ *                                       grant reference(s) and reports the
+ *                                       resulting index back in op.index.
+ *   IOCTL_GNTDEV_UNMAP_GRANT_REF      - returns a range of slots, none of
+ *                                       which may still be mmap()-ed, to the
+ *                                       free-list.
+ *   IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR - reports the offset and page count of
+ *                                       the gntdev VM area starting at a
+ *                                       given user address.
+ *   IOCTL_GNTDEV_SET_MAX_GRANTS       - sizes the grant table; only allowed
+ *                                       before it has been initialised.
+ *
+ * For every command other than SET_MAX_GRANTS the grant table is lazily
+ * initialised with DEFAULT_MAX_GRANTS on first use.
+ *
+ * Returns 0 on success or a negative errno; unknown commands yield
+ * -ENOIOCTLCMD.
+ */
+static long gntdev_ioctl(struct file *flip,
+                        unsigned int cmd, unsigned long arg)
+{
+       int rc = 0;
+       gntdev_file_private_data_t *private_data =
+               (gntdev_file_private_data_t *) flip->private_data;
+
+       /* On the first invocation, we will lazily initialise the grant array
+        * and free-list.
+        */
+       if (unlikely(!private_data->grants)
+           && likely(cmd != IOCTL_GNTDEV_SET_MAX_GRANTS)) {
+               down_write(&private_data->grants_sem);
+
+               /* Re-check under the lock: another caller may have finished
+                * the initialisation in the meantime.
+                */
+               if (unlikely(private_data->grants)) {
+                       up_write(&private_data->grants_sem);
+                       goto private_data_initialised;
+               }
+
+               /* Just use the default. Setting to a non-default is handled
+                * in the ioctl switch.
+                */
+               rc = init_private_data(private_data, DEFAULT_MAX_GRANTS);
+
+               up_write(&private_data->grants_sem);
+
+               if (rc) {
+                       pr_err("Initialising gntdev private data failed\n");
+                       return rc;
+               }
+       }
+
+private_data_initialised:
+       switch (cmd) {
+       case IOCTL_GNTDEV_MAP_GRANT_REF:
+       {
+               struct ioctl_gntdev_map_grant_ref op;
+               struct ioctl_gntdev_grant_ref *refs = NULL;
+
+               if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
+                       return -EFAULT;
+               if (unlikely(op.count <= 0))
+                       return -EINVAL;
+
+               /* A single reference travels inside op itself; several are
+                * fetched separately from the array that follows it in user
+                * memory. Requests larger than the whole table are not copied
+                * and are rejected by the free-list check below.
+                */
+               if (op.count > 1 && op.count <= private_data->grants_size) {
+                       struct ioctl_gntdev_grant_ref *u;
+
+                       refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL);
+                       if (!refs)
+                               return -ENOMEM;
+                       u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs;
+                       if (copy_from_user(refs, (void __user *)u,
+                                          sizeof(*refs) * op.count)) {
+                               kfree(refs);
+                               return -EFAULT;
+                       }
+               }
+
+               down_write(&private_data->grants_sem);
+               down_write(&private_data->free_list_sem);
+
+               if (unlikely(op.count > private_data->free_list_size)) {
+                       rc = -ENOMEM;
+                       goto map_out;
+               }
+
+               if (op.count == 1) {
+                       if ((rc = add_grant_reference(private_data, op.refs,
+                                                     &op.index)) < 0) {
+                               pr_err("Adding grant reference failed (%d)\n",
+                                      rc);
+                               goto map_out;
+                       }
+               } else {
+                       if ((rc = find_contiguous_free_range(private_data,
+                                                            op.count)) < 0) {
+                               pr_err("Finding contiguous range failed"
+                                      " (%d)\n", rc);
+                               goto map_out;
+                       }
+                       op.index = rc << PAGE_SHIFT;
+                       if ((rc = add_grant_references(private_data, op.count,
+                                                      refs, rc))) {
+                               pr_err("Adding grant references failed (%d)\n",
+                                      rc);
+                               goto map_out;
+                       }
+                       compress_free_list(private_data);
+               }
+
+       map_out:
+               up_write(&private_data->free_list_sem);
+               up_write(&private_data->grants_sem);
+
+               kfree(refs);
+
+               if (!rc && copy_to_user((void __user *)arg, &op, sizeof(op)))
+                       rc = -EFAULT;
+               return rc;
+       }
+       case IOCTL_GNTDEV_UNMAP_GRANT_REF:
+       {
+               struct ioctl_gntdev_unmap_grant_ref op;
+               uint32_t i, start_index;
+
+               if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
+                       return -EFAULT;
+
+               start_index = op.index >> PAGE_SHIFT;
+
+               /* Do the bounds check in 64-bit arithmetic. start_index is
+                * only 32 bits wide, so "start_index + op.count" could wrap
+                * around and let a range through whose slots lie far outside
+                * the grants array.
+                */
+               if ((uint64_t)start_index + op.count >
+                   private_data->grants_size)
+                       return -EINVAL;
+
+               down_write(&private_data->grants_sem);
+
+               /* First, check that all pages are in the NOT_YET_MAPPED
+                * state.
+                */
+               for (i = 0; i < op.count; ++i) {
+                       if (unlikely
+                           (private_data->grants[start_index + i].state
+                            != GNTDEV_SLOT_NOT_YET_MAPPED)) {
+                               if (private_data->grants[start_index + i].state
+                                   == GNTDEV_SLOT_INVALID) {
+                                       pr_err("Tried to remove an invalid "
+                                              "grant at offset 0x%x.",
+                                              (start_index + i)
+                                              << PAGE_SHIFT);
+                                       rc = -EINVAL;
+                               } else {
+                                       pr_err("Tried to remove a grant which "
+                                              "is currently mmap()-ed at "
+                                              "offset 0x%x.",
+                                              (start_index + i)
+                                              << PAGE_SHIFT);
+                                       rc = -EBUSY;
+                               }
+                               goto unmap_out;
+                       }
+               }
+
+               down_write(&private_data->free_list_sem);
+
+               /* Unmap pages and add them to the free list.
+                */
+               for (i = 0; i < op.count; ++i) {
+                       private_data->grants[start_index+i].state =
+                               GNTDEV_SLOT_INVALID;
+                       private_data->grants[start_index+i].u.free_list_index =
+                               private_data->free_list_size;
+                       private_data->free_list[private_data->free_list_size] =
+                               start_index + i;
+                       ++private_data->free_list_size;
+               }
+
+               up_write(&private_data->free_list_sem);
+       unmap_out:
+               up_write(&private_data->grants_sem);
+               return rc;
+       }
+       case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR:
+       {
+               struct ioctl_gntdev_get_offset_for_vaddr op;
+               struct vm_area_struct *vma;
+               unsigned long vaddr;
+
+               if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
+                       return -EFAULT;
+
+               vaddr = (unsigned long)op.vaddr;
+
+               down_read(&current->mm->mmap_sem);
+               vma = find_vma(current->mm, vaddr);
+               /* Only VM areas created by gntdev_mmap() are accepted. */
+               if (!vma || vma->vm_ops != &gntdev_vmops) {
+                       rc = -EFAULT;
+                       goto get_offset_out;
+               }
+               if (vma->vm_start != vaddr) {
+                       pr_err("The vaddr specified in an "
+                              "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at "
+                              "the start of the VM area. vma->vm_start = "
+                              "%#lx; vaddr = %#lx\n",
+                              vma->vm_start, vaddr);
+                       rc = -EFAULT;
+                       goto get_offset_out;
+               }
+               op.offset = vma->vm_pgoff << PAGE_SHIFT;
+               op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+       get_offset_out:
+               up_read(&current->mm->mmap_sem);
+               if (!rc && copy_to_user((void __user *)arg, &op, sizeof(op)))
+                       rc = -EFAULT;
+               return rc;
+       }
+       case IOCTL_GNTDEV_SET_MAX_GRANTS:
+       {
+               struct ioctl_gntdev_set_max_grants op;
+
+               if (copy_from_user(&op, (void __user *)arg, sizeof(op)))
+                       return -EFAULT;
+               if (op.count > MAX_GRANTS_LIMIT)
+                       return -EINVAL;
+
+               /* The table can be sized only once, before first use. */
+               down_write(&private_data->grants_sem);
+               if (unlikely(private_data->grants))
+                       rc = -EBUSY;
+               else
+                       rc = init_private_data(private_data, op.count);
+               up_write(&private_data->grants_sem);
+               return rc;
+       }
+       default:
+               return -ENOIOCTLCMD;
+       }
+
+       return 0;
+}
diff --git a/drivers/xen/netback/Makefile b/drivers/xen/netback/Makefile

new file mode 100644 (file)

index 0000000..2bb2677
--- /dev/null
+++ b/drivers/xen/netback/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
+
+netbk-y   := netback.o xenbus.o interface.o accel.o
+netloop-y := loopback.o
diff --git a/drivers/xen/netback/accel.c b/drivers/xen/netback/accel.c

new file mode 100644 (file)

index 0000000..c902e73
--- /dev/null
+++ b/drivers/xen/netback/accel.c
@@ -0,0 +1,269 @@
+/******************************************************************************
+ * drivers/xen/netback/accel.c
+ *
+ * Interface between backend virtual network device and accelerated plugin. 
+ * 
+ * Copyright (C) 2007 Solarflare Communications, Inc
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/atomic.h>
+#include <xen/xenbus.h>
+#include <linux/mutex.h>
+
+#include "common.h"
+
+/*
+ * Compiled-out debugging aid: switch the "#if 0" to "#if 1" to replace the
+ * DPRINTK() inherited from common.h with an unconditional printk() tagged
+ * with the calling function name and line number.
+ */
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...)                                          \
+       printk("netback/accel (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+#endif
+
+/*
+ * A list of available netback accelerator plugin modules (each list
+ * entry is of type struct netback_accelerator).
+ *
+ * The head is initialised at compile time so that it is already a valid
+ * empty list before netif_accel_init() has run; the INIT_LIST_HEAD()
+ * there then merely re-initialises an empty list.
+ */
+static LIST_HEAD(accelerators_list);
+/* Lock used to protect access to accelerators_list */
+DEFINE_MUTEX(accelerators_mutex);
+
+/*
+ * Decide whether a backend should use the given accelerator.
+ *
+ * Reads the "accel" node below the backend's xenbus node name and
+ * compares it with the accelerator's eth_name.  Returns 1 when the two
+ * strings are equal, 0 when they differ or the node cannot be read.
+ * 'be' is accepted from the callers but is not consulted here.
+ */
+static int match_accelerator(struct xenbus_device *xendev,
+                            struct backend_info *be,
+                            struct netback_accelerator *accelerator)
+{
+       char *wanted;
+       int matched;
+
+       wanted = xenbus_read(XBT_NIL, xendev->nodename, "accel", NULL);
+       if (IS_ERR(wanted)) {
+               /* Most likely the node simply is not present */
+               DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
+                       __FUNCTION__, PTR_ERR(wanted));
+               return 0;
+       }
+
+       matched = (strcmp(wanted, accelerator->eth_name) == 0);
+       /* The string handed back by xenbus_read() has to be freed here */
+       kfree(wanted);
+
+       return matched;
+}
+
+
+/*
+ * Bind 'be' to 'accelerator' and invoke the plugin's probe hook.
+ *
+ * The callers in this file take a module reference on the plugin before
+ * calling.  If the hook reports failure (non-zero), the binding, the use
+ * count and that module reference are all undone here.
+ */
+static void do_probe(struct backend_info *be,
+                    struct netback_accelerator *accelerator,
+                    struct xenbus_device *xendev)
+{
+       be->accelerator = accelerator;
+       atomic_inc(&be->accelerator->use_count);
+
+       if (be->accelerator->hooks->probe(xendev) == 0)
+               return;
+
+       /* Probe failed: roll everything back */
+       atomic_dec(&be->accelerator->use_count);
+       module_put(be->accelerator->hooks->owner);
+       be->accelerator = NULL;
+}
+
+
+/*
+ * Notify suitable backends that a new accelerator is available and
+ * connected.  This will also notify the accelerator plugin module
+ * that it is being used for a device through the probe hook.
+ *
+ * Used as the per-device callback of xenbus_for_each_backend(); 'arg'
+ * is the struct netback_accelerator being offered.  Always returns 0.
+ */
+static int netback_accelerator_probe_backend(struct device *dev, void *arg)
+{
+       struct netback_accelerator *accelerator = arg;
+       struct xenbus_device *xendev = to_xenbus_device(dev);
+       struct backend_info *be;
+
+       if (strcmp("vif", xendev->devicetype) != 0)
+               return 0;
+
+       /*
+        * Skip devices that have no backend state attached (do_probe()
+        * dereferences it) and backends that are already bound to an
+        * accelerator: rebinding would overwrite be->accelerator without
+        * dropping the use count and module reference held for it.
+        */
+       be = dev_get_drvdata(&xendev->dev);
+       if (be == NULL || be->accelerator != NULL)
+               return 0;
+
+       if (match_accelerator(xendev, be, accelerator) &&
+           try_module_get(accelerator->hooks->owner))
+               do_probe(be, accelerator, xendev);
+
+       return 0;
+}
+
+
+/*
+ * Notify suitable backends that an accelerator is unavailable.
+ *
+ * Used as the per-device callback of xenbus_for_each_backend(); 'arg'
+ * is the struct netback_accelerator being withdrawn.  Always returns 0.
+ */
+static int netback_accelerator_remove_backend(struct device *dev, void *arg)
+{
+       struct xenbus_device *xendev = to_xenbus_device(dev);
+       struct netback_accelerator *accelerator = arg;
+       struct backend_info *be;
+
+       if (strcmp("vif", xendev->devicetype) != 0)
+               return 0;
+
+       /* A device without backend state cannot be using the accelerator */
+       be = dev_get_drvdata(&xendev->dev);
+       if (be == NULL || be->accelerator != accelerator)
+               return 0;
+
+       /* Mirror of do_probe(): hook first, then drop count and module ref */
+       be->accelerator->hooks->remove(xendev);
+       atomic_dec(&be->accelerator->use_count);
+       module_put(be->accelerator->hooks->owner);
+       be->accelerator = NULL;
+
+       return 0;
+}
+
+
+
+/*
+ * Entry point for a netback accelerator plugin module.  Called to
+ * advertise its presence, and connect to any suitable backends.
+ *
+ * Returns 0 on success, NETBACK_ACCEL_VERSION (> 0) if the caller's
+ * version is newer than ours, -EPROTO if it is older, -EINVAL if
+ * eth_name or hooks is NULL, and -ENOMEM if an allocation fails.
+ */
+int netback_connect_accelerator(unsigned version, int id, const char *eth_name,
+                               struct netback_accel_hooks *hooks)
+{
+       struct netback_accelerator *new_accelerator;
+
+       if (version != NETBACK_ACCEL_VERSION) {
+               if (version > NETBACK_ACCEL_VERSION) {
+                       /* Caller has higher version number, leave it
+                          up to them to decide whether to continue.
+                          They can recall with a lower number if
+                          they're happy to be compatible with us */
+                       return NETBACK_ACCEL_VERSION;
+               } else {
+                       /* We have a more recent version than caller.
+                          Currently reject, but may in future be able
+                          to be backwardly compatible */
+                       return -EPROTO;
+               }
+       }
+
+       /* Both are dereferenced below and on every later probe/remove */
+       if (!eth_name || !hooks)
+               return -EINVAL;
+
+       new_accelerator = kmalloc(sizeof(*new_accelerator), GFP_KERNEL);
+       if (!new_accelerator) {
+               DPRINTK("%s: failed to allocate memory for accelerator\n",
+                       __FUNCTION__);
+               return -ENOMEM;
+       }
+
+       new_accelerator->id = id;
+
+       /* Private copy: the caller's string need not outlive this call */
+       new_accelerator->eth_name = kstrdup(eth_name, GFP_KERNEL);
+       if (!new_accelerator->eth_name) {
+               DPRINTK("%s: failed to allocate memory for eth_name string\n",
+                       __FUNCTION__);
+               kfree(new_accelerator);
+               return -ENOMEM;
+       }
+
+       new_accelerator->hooks = hooks;
+
+       atomic_set(&new_accelerator->use_count, 0);
+
+       mutex_lock(&accelerators_mutex);
+       list_add(&new_accelerator->link, &accelerators_list);
+
+       /* tell existing backends about new plugin */
+       xenbus_for_each_backend(new_accelerator,
+                               netback_accelerator_probe_backend);
+
+       mutex_unlock(&accelerators_mutex);
+
+       return 0;
+
+}
+EXPORT_SYMBOL_GPL(netback_connect_accelerator);
+
+
+/*
+ * Disconnect an accelerator plugin module that has previously been
+ * connected.
+ *
+ * The first registered accelerator whose eth_name equals 'eth_name' is
+ * detached from every backend using it, unlinked and freed; 'id' is not
+ * consulted.  Nothing happens if no accelerator matches.
+ */
+void netback_disconnect_accelerator(int id, const char *eth_name)
+{
+       struct netback_accelerator *pos;
+       struct netback_accelerator *found = NULL;
+
+       mutex_lock(&accelerators_mutex);
+
+       list_for_each_entry(pos, &accelerators_list, link) {
+               if (strcmp(eth_name, pos->eth_name) == 0) {
+                       found = pos;
+                       break;
+               }
+       }
+
+       if (found != NULL) {
+               xenbus_for_each_backend(found,
+                                       netback_accelerator_remove_backend);
+               /* Every user must have let go by now */
+               BUG_ON(atomic_read(&found->use_count) != 0);
+               list_del(&found->link);
+               kfree(found->eth_name);
+               kfree(found);
+       }
+
+       mutex_unlock(&accelerators_mutex);
+}
+EXPORT_SYMBOL_GPL(netback_disconnect_accelerator);
+
+
+/*
+ * Offer a backend to the registered accelerators.
+ *
+ * Walks the accelerator list under accelerators_mutex and hands the
+ * backend to the first accelerator that matches and whose module can be
+ * pinned; the search stops there whether or not its probe succeeds.
+ */
+void netback_probe_accelerators(struct backend_info *be,
+                               struct xenbus_device *dev)
+{
+       struct netback_accelerator *candidate;
+
+       mutex_lock(&accelerators_mutex);
+       list_for_each_entry(candidate, &accelerators_list, link) {
+               if (!match_accelerator(dev, be, candidate))
+                       continue;
+               if (!try_module_get(candidate->hooks->owner))
+                       continue;
+
+               do_probe(be, candidate, dev);
+               break;
+       }
+       mutex_unlock(&accelerators_mutex);
+}
+
+
+/*
+ * Detach a backend from the accelerator it is using, if any: call the
+ * plugin's remove hook, then drop the use count and module reference
+ * and clear the binding.  Runs under accelerators_mutex.
+ */
+void netback_remove_accelerators(struct backend_info *be,
+                                struct xenbus_device *dev)
+{
+       mutex_lock(&accelerators_mutex);
+
+       /* Nothing to notify when no accelerator is bound */
+       if (be->accelerator == NULL)
+               goto unlock;
+
+       be->accelerator->hooks->remove(dev);
+       atomic_dec(&be->accelerator->use_count);
+       module_put(be->accelerator->hooks->owner);
+       be->accelerator = NULL;
+
+unlock:
+       mutex_unlock(&accelerators_mutex);
+}
+
+
+/*
+ * Set up the (empty) list of accelerator plugins.  The other functions
+ * in this file that walk or extend accelerators_list rely on this having
+ * been called first.
+ */
+void netif_accel_init(void)
+{
+       INIT_LIST_HEAD(&accelerators_list);
+}
diff --git a/drivers/xen/netback/common.h b/drivers/xen/netback/common.h

new file mode 100644 (file)

index 0000000..f681999
--- /dev/null
+++ b/drivers/xen/netback/common.h
@@ -0,0 +1,297 @@
+/******************************************************************************
+ * drivers/xen/netback/common.h
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __NETIF__BACKEND__COMMON_H__
+#define __NETIF__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/wait.h>
+#include <xen/interface/io/netif.h>
+#include <xen/xenbus.h>
+#include <xen/interface/event_channel.h>
+
+/*
+ * Logging helpers.  DPRINTK() logs through pr_debug() and prefixes the
+ * message with the source file and line; IPRINTK() and WPRINTK() log
+ * through pr_info()/pr_warning() with a "xen_net: " prefix.
+ */
+#define DPRINTK(_f, _a...)                     \
+       pr_debug("(file=%s, line=%d) " _f,      \
+                __FILE__ , __LINE__ , ## _a )
+#define IPRINTK(fmt, args...) pr_info("xen_net: " fmt, ##args)
+#define WPRINTK(fmt, args...) pr_warning("xen_net: " fmt, ##args)
+
+/*
+ * State of one backend network interface.  It lives in the private area
+ * of the interface's net_device (netif_alloc() obtains it through
+ * netdev_priv()).
+ */
+typedef struct netif_st {
+       /* Unique identifier for this interface. */
+       domid_t          domid;
+       /* Index into xen_netbk[]; UINT_MAX while not assigned to a group. */
+       unsigned int     group;
+       unsigned int     handle;
+
+       u8               fe_dev_addr[6];
+
+       /* Zero until netif_map() has bound the event channel. */
+       unsigned int     irq;
+
+       /* The shared rings and indexes. */
+       netif_tx_back_ring_t tx;
+       netif_rx_back_ring_t rx;
+       struct vm_struct *tx_comms_area;
+       struct vm_struct *rx_comms_area;
+
+       /* Flags that must not be set in dev->features */
+       int features_disabled;
+
+       /* Frontend feature information. */
+       u8 can_sg:1;
+       u8 gso:1;
+       u8 csum:1;
+
+       /* Internal feature information. */
+       u8 can_queue:1; /* can queue packets for receiver? */
+       u8 copying_receiver:1;  /* copy packets to receiver?       */
+
+       /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
+       RING_IDX rx_req_cons_peek;
+
+       /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
+       unsigned long   credit_bytes;
+       unsigned long   credit_usec;
+       unsigned long   remaining_credit;
+       struct timer_list credit_timeout;
+
+       /* Enforce draining of the transmit queue. */
+       struct timer_list tx_queue_timeout;
+
+       /*
+        * Statistics.  interface.c reports these through ethtool by
+        * indexing this structure as an array of unsigned long, so they
+        * have to remain unsigned long.
+        */
+       unsigned long nr_copied_skbs;
+       unsigned long rx_gso_csum_fixups;
+
+       /* Miscellaneous private stuff. */
+       struct list_head list;  /* scheduling list */
+       atomic_t         refcnt;
+       struct net_device *dev;
+
+       /* Private carrier flag, see netback_carrier_on/off/ok(). */
+       unsigned int carrier;
+
+       /* Woken by netif_put() when refcnt drops to zero. */
+       wait_queue_head_t waiting_to_free;
+} netif_t;
+
+/*
+ * Implement our own carrier flag: the network stack's version causes delays
+ * when the carrier is re-enabled (in particular, dev_activate() may not
+ * immediately be called, which can cause packet loss; also the etherbridge
+ * can be rather lazy in activating its port).
+ */
+#define netback_carrier_on(netif)      ((netif)->carrier = 1)
+#define netback_carrier_off(netif)     ((netif)->carrier = 0)
+#define netback_carrier_ok(netif)      ((netif)->carrier)
+
+/*
+ * skb copy policies.  NOTE(review): presumably these are the values that
+ * netbk_copy_skb_mode takes; the variable is defined and used outside
+ * this header, so confirm there.
+ */
+enum {
+       NETBK_DONT_COPY_SKB,
+       NETBK_DELAYED_COPY_SKB,
+       NETBK_ALWAYS_COPY_SKB,
+};
+
+extern int netbk_copy_skb_mode;
+
+/*
+ * Function pointers into netback accelerator plugin modules.
+ *
+ * owner:  module pinned with try_module_get() while a backend is bound
+ *         to the accelerator and released with module_put() afterwards.
+ * probe:  called when a backend is bound; a non-zero return makes
+ *         accel.c undo the binding.
+ * remove: called when a backend is unbound; accel.c ignores the result.
+ */
+struct netback_accel_hooks {
+       struct module *owner;
+       int  (*probe)(struct xenbus_device *dev);
+       int (*remove)(struct xenbus_device *dev);
+};
+
+/*
+ * Structure to track the state of a netback accelerator plugin.
+ *
+ * link:      entry in accel.c's list of registered accelerators.
+ * id:        value passed to netback_connect_accelerator().
+ * eth_name:  heap copy of the plugin's name, compared with a backend's
+ *            "accel" xenbus node to decide on a match.
+ * use_count: number of backends currently bound to this accelerator.
+ * hooks:     the plugin's callbacks.
+ */
+struct netback_accelerator {
+       struct list_head link;
+       int id;
+       char *eth_name;
+       atomic_t use_count;
+       struct netback_accel_hooks *hooks;
+};
+
+/*
+ * Per-device backend state.  accel.c retrieves it from a vif's xenbus
+ * device with dev_get_drvdata().
+ */
+struct backend_info {
+       struct xenbus_device *dev;
+       netif_t *netif;
+       enum xenbus_state frontend_state;
+       struct xenbus_watch hotplug_status_watch;
+       /*
+        * Unsigned so that the one-bit flag holds 0 or 1; a signed
+        * one-bit field can only represent 0 and -1.
+        */
+       unsigned int have_hotplug_status_watch:1;
+
+       /* State relating to the netback accelerator */
+       void *netback_accel_priv;
+       /* The accelerator that this backend is currently using */
+       struct netback_accelerator *accelerator;
+};
+
+#define NETBACK_ACCEL_VERSION 0x00010001
+
+/* 
+ * Connect an accelerator plugin module to netback.  Returns zero on
+ * success, < 0 on error, > 0 (with highest version number supported)
+ * if version mismatch.
+ */
+extern int netback_connect_accelerator(unsigned version,
+                                      int id, const char *eth_name, 
+                                      struct netback_accel_hooks *hooks);
+/* Disconnect a previously connected accelerator plugin module */
+extern void netback_disconnect_accelerator(int id, const char *eth_name);
+
+
+extern
+void netback_probe_accelerators(struct backend_info *be,
+                               struct xenbus_device *dev);
+extern
+void netback_remove_accelerators(struct backend_info *be,
+                                struct xenbus_device *dev);
+extern
+void netif_accel_init(void);
+
+
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
+
+void netif_disconnect(struct backend_info *be);
+
+netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle);
+int netif_map(struct backend_info *be, grant_ref_t tx_ring_ref,
+             grant_ref_t rx_ring_ref, evtchn_port_t evtchn);
+
+/*
+ * Reference counting on a netif_t.  netif_put() wakes the waiters on
+ * waiting_to_free once the count reaches zero; netif_disconnect() sleeps
+ * there until that happens before tearing the interface down.
+ */
+#define netif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define netif_put(_b)                                          \
+       do {                                                    \
+               if ( atomic_dec_and_test(&(_b)->refcnt) )       \
+                       wake_up(&(_b)->waiting_to_free);        \
+       } while (0)
+
+void netif_xenbus_init(void);
+
+/*
+ * True when the net_device is running and the driver's private carrier
+ * flag is set.
+ */
+#define netif_schedulable(netif)                               \
+       (netif_running((netif)->dev) && netback_carrier_ok(netif))
+
+void netif_schedule_work(netif_t *netif);
+void netif_deschedule_work(netif_t *netif);
+
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
+irqreturn_t netif_be_int(int irq, void *dev_id);
+
+/* Report the can_queue flag of the interface behind 'dev'. */
+static inline int netbk_can_queue(struct net_device *dev)
+{
+       return ((netif_t *)netdev_priv(dev))->can_queue;
+}
+
+/* Report the can_sg flag of the interface behind 'dev'. */
+static inline int netbk_can_sg(struct net_device *dev)
+{
+       return ((netif_t *)netdev_priv(dev))->can_sg;
+}
+
+/* A transmit request together with the interface it belongs to. */
+struct pending_tx_info {
+       netif_tx_request_t req;
+       netif_t *netif;
+};
+/* Type of the pending/dealloc producer and consumer indexes in xen_netbk. */
+typedef unsigned int pending_ring_idx_t;
+
+/*
+ * Per-slot receive bookkeeping.
+ * NOTE(review): the meaning of 'id' and 'copy' is set by the receive path,
+ * which is outside this header - confirm before relying on it.
+ */
+struct netbk_rx_meta {
+       skb_frag_t frag;
+       int id;
+       u8 copy:1;
+};
+
+/* List linkage plus allocation timestamp for an in-use pending tx slot. */
+struct netbk_tx_pending_inuse {
+       struct list_head list;
+       unsigned long alloc_time;
+};
+
+#define MAX_PENDING_REQS (1U << CONFIG_XEN_NETDEV_TX_SHIFT)
+#define MAX_MFN_ALLOC 64
+
+/*
+ * State of one netback processing group.  The global xen_netbk points at
+ * an array of these with netbk_nr_groups entries; an interface selects
+ * its entry through netif->group.
+ */
+struct xen_netbk {
+       /*
+        * NOTE(review): presumably only one half of this union is in use,
+        * depending on whether the group runs as tasklets or as a kernel
+        * thread - confirm in netback.c.
+        */
+       union {
+               struct {
+                       struct tasklet_struct net_tx_tasklet;
+                       struct tasklet_struct net_rx_tasklet;
+               };
+               struct {
+                       wait_queue_head_t netbk_action_wq;
+                       struct task_struct *task;
+               };
+       };
+
+       struct sk_buff_head rx_queue;
+       struct sk_buff_head tx_queue;
+
+       struct timer_list net_timer;
+       struct timer_list tx_pending_timer;
+
+       pending_ring_idx_t pending_prod;
+       pending_ring_idx_t pending_cons;
+       pending_ring_idx_t dealloc_prod;
+       pending_ring_idx_t dealloc_cons;
+
+       struct list_head pending_inuse_head;
+       struct list_head schedule_list;
+
+       spinlock_t schedule_list_lock;
+       spinlock_t release_lock;
+
+       struct page **mmap_pages;
+
+       /*
+        * Number of interfaces currently assigned to this group:
+        * incremented in __netif_up(), decremented in __netif_down().
+        */
+       atomic_t nr_groups;
+       unsigned int alloc_index;
+
+       struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+       struct netbk_tx_pending_inuse pending_inuse[MAX_PENDING_REQS];
+       struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+       struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+
+       grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
+       u16 pending_ring[MAX_PENDING_REQS];
+       u16 dealloc_ring[MAX_PENDING_REQS];
+
+       struct multicall_entry rx_mcl[NET_RX_RING_SIZE+3];
+       struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+       struct gnttab_transfer grant_trans_op[NET_RX_RING_SIZE];
+       struct gnttab_copy grant_copy_op[NET_RX_RING_SIZE];
+       DECLARE_BITMAP(rx_notify, NR_DYNIRQS);
+       /* notify_list elements are u16 when NR_DYNIRQS is at most 0x10000 */
+#if !defined(NR_DYNIRQS)
+# error
+#elif NR_DYNIRQS <= 0x10000
+       u16 notify_list[NET_RX_RING_SIZE];
+#else
+       int notify_list[NET_RX_RING_SIZE];
+#endif
+       struct netbk_rx_meta meta[NET_RX_RING_SIZE];
+
+       unsigned long mfn_list[MAX_MFN_ALLOC];
+};
+
+extern struct xen_netbk *xen_netbk;
+extern unsigned int netbk_nr_groups;
+
+#endif /* __NETIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/netback/interface.c b/drivers/xen/netback/interface.c

new file mode 100644 (file)

index 0000000..9ba736b
--- /dev/null
+++ b/drivers/xen/netback/interface.c
@@ -0,0 +1,363 @@
+/******************************************************************************
+ * drivers/xen/netback/interface.c
+ * 
+ * Network-device interface management.
+ * 
+ * Copyright (c) 2004-2005, Keir Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <xen/evtchn.h>
+
+/*
+ * Module parameter 'queue_length':
+ * 
+ * Enables queuing in the network stack when a client has run out of receive
+ * descriptors. Although this feature can improve receive bandwidth by avoiding
+ * packet loss, it can also result in packets sitting in the 'tx_queue' for
+ * unbounded time. This is bad if those packets hold onto foreign resources.
+ * For example, consider a packet that holds onto resources belonging to the
+ * guest for which it is queued (e.g., packet received on vif1.0, destined for
+ * vif1.1 which is not activated in the guest): in this situation the guest
+ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we
+ * run a timer (tx_queue_timeout) to drain the queue when the interface is
+ * blocked.
+ */
+/*
+ * Copied into dev->tx_queue_len by netif_alloc().  The parameter is
+ * writable at run time (mode 0644), but within this file a new value is
+ * only picked up by interfaces allocated afterwards.
+ */
+static unsigned long netbk_queue_length = 32;
+module_param_named(queue_length, netbk_queue_length, ulong, 0644);
+
+/*
+ * Bring an interface into service: assign it to the netback group that
+ * currently holds the fewest interfaces (lowest index wins a tie), then
+ * enable its interrupt and schedule it for work.
+ */
+static void __netif_up(netif_t *netif)
+{
+       unsigned int best = 0;
+       unsigned int lowest = atomic_read(&xen_netbk[0].nr_groups);
+       unsigned int idx;
+
+       for (idx = 1; idx < netbk_nr_groups; idx++) {
+               unsigned int load = atomic_read(&xen_netbk[idx].nr_groups);
+
+               /* Only a strictly smaller population displaces the choice */
+               if (load >= lowest)
+                       continue;
+
+               best = idx;
+               lowest = load;
+       }
+
+       atomic_inc(&xen_netbk[best].nr_groups);
+       netif->group = best;
+
+       enable_irq(netif->irq);
+       netif_schedule_work(netif);
+}
+
+/*
+ * Take an interface out of service: disable its interrupt, remove it
+ * from the work schedule and give up its place in its netback group.
+ */
+static void __netif_down(netif_t *netif)
+{
+       /* Remember the group before the field is invalidated below */
+       const unsigned int group = netif->group;
+
+       disable_irq(netif->irq);
+       netif_deschedule_work(netif);
+
+       netif->group = UINT_MAX;
+       atomic_dec(&xen_netbk[group].nr_groups);
+}
+
+/*
+ * ndo_open handler.  The interface is only started when the private
+ * carrier flag is already set; otherwise nothing is done here.
+ * Always returns 0.
+ */
+static int net_open(struct net_device *dev)
+{
+       netif_t *netif = netdev_priv(dev);
+
+       if (!netback_carrier_ok(netif))
+               return 0;
+
+       __netif_up(netif);
+       netif_start_queue(dev);
+       return 0;
+}
+
+/*
+ * ndo_stop handler.  Takes the interface out of service if the private
+ * carrier flag is set and stops the transmit queue in every case.
+ * Always returns 0.
+ */
+static int net_close(struct net_device *dev)
+{
+       netif_t *netif = netdev_priv(dev);
+       const int connected = netback_carrier_ok(netif);
+
+       if (connected) {
+               __netif_down(netif);
+       }
+       netif_stop_queue(dev);
+
+       return 0;
+}
+
+/*
+ * ndo_change_mtu handler.
+ *
+ * Accepts an MTU between 68 (the smallest MTU the kernel's generic
+ * Ethernet helper accepts) and an upper bound that depends on whether
+ * the frontend supports scatter-gather: 65535 - ETH_HLEN if it does,
+ * ETH_DATA_LEN otherwise.  Returns 0 on success, -EINVAL if 'mtu' is
+ * out of range.
+ */
+static int netbk_change_mtu(struct net_device *dev, int mtu)
+{
+       int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+       if (mtu < 68 || mtu > max)
+               return -EINVAL;
+       dev->mtu = mtu;
+       return 0;
+}
+
+/*
+ * ndo_fix_features handler: strip the offloads that the frontend has
+ * not enabled (scatter-gather, TSO, IP checksum) from 'features' and
+ * return the result.
+ */
+static netdev_features_t netbk_fix_features(struct net_device *dev,
+                                           netdev_features_t features)
+{
+       const netif_t *netif = netdev_priv(dev);
+       netdev_features_t unsupported = 0;
+
+       if (!netif->can_sg)
+               unsupported |= NETIF_F_SG;
+       if (!netif->gso)
+               unsupported |= NETIF_F_TSO;
+       if (!netif->csum)
+               unsupported |= NETIF_F_IP_CSUM;
+
+       return features & ~unsupported;
+}
+
+/*
+ * ethtool get_drvinfo handler: report the driver name "netbk" and, as
+ * bus info, the name of the parent device.  Both copies are bounded by
+ * the size of the destination field.
+ */
+static void netbk_get_drvinfo(struct net_device *dev,
+                             struct ethtool_drvinfo *info)
+{
+       strlcpy(info->driver, "netbk", sizeof(info->driver));
+       strlcpy(info->bus_info, dev_name(dev->dev.parent),
+               sizeof(info->bus_info));
+}
+
+/*
+ * Table of the statistics exported through ethtool.  'offset' is the
+ * position of the counter inside netif_t measured in units of
+ * sizeof(long), because netbk_get_ethtool_stats() indexes the private
+ * area as an array of unsigned long.
+ */
+static const struct netif_stat {
+       char name[ETH_GSTRING_LEN];
+       u16 offset;
+} netbk_stats[] = {
+       { "copied_skbs", offsetof(netif_t, nr_copied_skbs) / sizeof(long) },
+       { "rx_gso_csum_fixups", offsetof(netif_t, rx_gso_csum_fixups) / sizeof(long) },
+};
+
+/*
+ * ethtool get_sset_count handler: the number of entries in netbk_stats
+ * for ETH_SS_STATS, -EOPNOTSUPP for every other string set.
+ */
+static int netbk_get_sset_count(struct net_device *dev, int sset)
+{
+       if (sset == ETH_SS_STATS)
+               return ARRAY_SIZE(netbk_stats);
+
+       return -EOPNOTSUPP;
+}
+
+/*
+ * ethtool get_ethtool_stats handler: copy each counter listed in
+ * netbk_stats into 'data', in table order.  The private area is read as
+ * an array of unsigned long, matching how the table offsets are scaled.
+ */
+static void netbk_get_ethtool_stats(struct net_device *dev,
+                                  struct ethtool_stats *stats, u64 * data)
+{
+       const unsigned long *words = netdev_priv(dev);
+       unsigned int n;
+
+       for (n = 0; n < ARRAY_SIZE(netbk_stats); n++) {
+               const struct netif_stat *stat = &netbk_stats[n];
+
+               data[n] = words[stat->offset];
+       }
+}
+
+/*
+ * ethtool get_strings handler: for ETH_SS_STATS, write the counter
+ * names into 'data' as consecutive ETH_GSTRING_LEN-byte slots; other
+ * string sets are left untouched.
+ */
+static void netbk_get_strings(struct net_device *dev, u32 stringset, u8 * data)
+{
+       u8 *slot = data;
+       unsigned int n;
+
+       if (stringset != ETH_SS_STATS)
+               return;
+
+       for (n = 0; n < ARRAY_SIZE(netbk_stats); n++) {
+               memcpy(slot, netbk_stats[n].name, ETH_GSTRING_LEN);
+               slot += ETH_GSTRING_LEN;
+       }
+}
+
+/*
+ * ethtool operations of a backend interface: driver info, link state
+ * via the generic helper, and the driver statistics listed in
+ * netbk_stats.
+ */
+static const struct ethtool_ops network_ethtool_ops =
+{
+       .get_drvinfo = netbk_get_drvinfo,
+       .get_link = ethtool_op_get_link,
+
+       .get_sset_count = netbk_get_sset_count,
+       .get_ethtool_stats = netbk_get_ethtool_stats,
+       .get_strings = netbk_get_strings,
+};
+
+/*
+ * net_device operations of a backend interface.  MAC address handling
+ * uses the generic Ethernet helpers; everything else is implemented by
+ * this driver.
+ */
+static const struct net_device_ops netif_be_netdev_ops = {
+       .ndo_open               = net_open,
+       .ndo_stop               = net_close,
+       .ndo_start_xmit         = netif_be_start_xmit,
+       .ndo_change_mtu         = netbk_change_mtu,
+       .ndo_fix_features       = netbk_fix_features,
+       .ndo_set_mac_address    = eth_mac_addr,
+       .ndo_validate_addr      = eth_validate_addr,
+};
+
+/*
+ * Allocate and register the net_device of a backend interface.
+ *
+ * The device is named "vif<domid>.<handle>", is parented to 'parent' and
+ * starts with the private carrier flag off, a reference count of one and
+ * no netback group assigned.
+ *
+ * Returns the new netif_t, ERR_PTR(-ENOMEM) if the net_device cannot be
+ * allocated, or ERR_PTR(err) with the error from register_netdevice().
+ */
+netif_t *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
+{
+       int err = 0;
+       struct net_device *dev;
+       netif_t *netif;
+       char name[IFNAMSIZ] = {};
+
+       snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
+       dev = alloc_netdev(sizeof(netif_t), name, ether_setup);
+       if (dev == NULL) {
+               DPRINTK("Could not create netif: out of memory\n");
+               return ERR_PTR(-ENOMEM);
+       }
+
+       SET_NETDEV_DEV(dev, parent);
+
+       netif = netdev_priv(dev);
+       netif->domid  = domid;
+       /* Not in any group until __netif_up() picks one */
+       netif->group = UINT_MAX;
+       netif->handle = handle;
+       netif->can_sg = 1;
+       netif->csum = 1;
+       /* Initial reference; netif_disconnect() drops it */
+       atomic_set(&netif->refcnt, 1);
+       init_waitqueue_head(&netif->waiting_to_free);
+       netif->dev = dev;
+
+       netback_carrier_off(netif);
+
+       netif->credit_bytes = netif->remaining_credit = ~0UL;
+       netif->credit_usec  = 0UL;
+       init_timer(&netif->credit_timeout);
+       /* Initialize 'expires' now: it's used to track the credit window. */
+       netif->credit_timeout.expires = jiffies;
+
+       init_timer(&netif->tx_queue_timeout);
+
+       dev->netdev_ops = &netif_be_netdev_ops;
+
+       dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
+       dev->features = dev->hw_features;
+
+       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
+       dev->tx_queue_len = netbk_queue_length;
+
+       /*
+        * Initialise a dummy MAC address. We choose the numerically
+        * largest non-broadcast address to prevent the address getting
+        * stolen by an Ethernet bridge for STP purposes.
+        * (FE:FF:FF:FF:FF:FF)
+        */ 
+       memset(dev->dev_addr, 0xFF, ETH_ALEN);
+       dev->dev_addr[0] &= ~0x01;
+
+       rtnl_lock();
+       err = register_netdevice(dev);
+       rtnl_unlock();
+       if (err) {
+               DPRINTK("Could not register new net device %s: err=%d\n",
+                       dev->name, err);
+               free_netdev(dev);
+               return ERR_PTR(err);
+       }
+
+       DPRINTK("Successfully created netif\n");
+       return netif;
+}
+
+/*
+ * Connect a backend interface to its frontend: map the tx and rx shared
+ * rings, bind the event channel to an interrupt handler and initialise
+ * the back rings.
+ *
+ * Returns 0 on success, and also when the interface already has an irq
+ * (it is then treated as connected).  On failure a negative errno is
+ * returned and any ring mapped so far is unmapped again.
+ */
+int netif_map(struct backend_info *be, grant_ref_t tx_ring_ref,
+             grant_ref_t rx_ring_ref, evtchn_port_t evtchn)
+{
+       netif_t *netif = be->netif;
+       struct vm_struct *area;
+       int err = -ENOMEM;
+       netif_tx_sring_t *txs;
+       netif_rx_sring_t *rxs;
+
+       /* Already connected through? */
+       if (netif->irq)
+               return 0;
+
+       area = xenbus_map_ring_valloc(be->dev, tx_ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       netif->tx_comms_area = area;
+       area = xenbus_map_ring_valloc(be->dev, rx_ring_ref);
+       if (IS_ERR(area)) {
+               err = PTR_ERR(area);
+               goto err_rx;
+       }
+       netif->rx_comms_area = area;
+
+       err = bind_interdomain_evtchn_to_irqhandler(
+               netif->domid, evtchn, netif_be_int, 0,
+               netif->dev->name, netif);
+       if (err < 0)
+               goto err_hypervisor;
+       BUG_ON(err < DYNIRQ_BASE || err >= DYNIRQ_BASE + NR_DYNIRQS);
+       netif->irq = err;
+       /* Kept disabled here; __netif_up() enables the interrupt */
+       disable_irq(netif->irq);
+
+       txs = (netif_tx_sring_t *)netif->tx_comms_area->addr;
+       BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
+
+       rxs = (netif_rx_sring_t *)
+               ((char *)netif->rx_comms_area->addr);
+       BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
+
+       netif->rx_req_cons_peek = 0;
+
+       /* Reference for the connected state; netif_disconnect() puts it */
+       netif_get(netif);
+
+       rtnl_lock();
+       /* Without scatter-gather the MTU is capped at ETH_DATA_LEN */
+       if (!netif->can_sg && netif->dev->mtu > ETH_DATA_LEN)
+               dev_set_mtu(netif->dev, ETH_DATA_LEN);
+       netdev_update_features(netif->dev);
+       netback_carrier_on(netif);
+       if (netif_running(netif->dev))
+               __netif_up(netif);
+       rtnl_unlock();
+
+       return 0;
+err_hypervisor:
+       xenbus_unmap_ring_vfree(be->dev, netif->rx_comms_area);
+err_rx:
+       xenbus_unmap_ring_vfree(be->dev, netif->tx_comms_area);
+       return err;
+}
+
+/*
+ * Tear down a backend interface completely.
+ *
+ * If the interface is connected, the carrier is switched off, the
+ * interface is taken out of service and the reference taken in
+ * netif_map() is dropped.  The initial reference from netif_alloc() is
+ * then dropped and the function sleeps until no references remain before
+ * stopping the timers, unbinding the interrupt, unregistering the
+ * net_device, unmapping the rings and freeing the device.
+ */
+void netif_disconnect(struct backend_info *be)
+{
+       netif_t *netif = be->netif;
+
+       if (netback_carrier_ok(netif)) {
+               rtnl_lock();
+               netback_carrier_off(netif);
+               netif_carrier_off(netif->dev); /* discard queued packets */
+               if (netif_running(netif->dev))
+                       __netif_down(netif);
+               rtnl_unlock();
+               netif_put(netif);
+       }
+
+       /* Drop the initial reference, then wait for all other holders */
+       atomic_dec(&netif->refcnt);
+       wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0);
+
+       del_timer_sync(&netif->credit_timeout);
+       del_timer_sync(&netif->tx_queue_timeout);
+
+       if (netif->irq)
+               unbind_from_irqhandler(netif->irq, netif);
+       
+       unregister_netdev(netif->dev);
+
+       /* tx.sring is only set once netif_map() has mapped both rings */
+       if (netif->tx.sring) {
+               xenbus_unmap_ring_vfree(be->dev, netif->tx_comms_area);
+               xenbus_unmap_ring_vfree(be->dev, netif->rx_comms_area);
+       }
+
+       free_netdev(netif->dev);
+}
diff --git a/drivers/xen/netback/loopback.c b/drivers/xen/netback/loopback.c

new file mode 100644 (file)

index 0000000..1020d2f
--- /dev/null
+++ b/drivers/xen/netback/loopback.c
@@ -0,0 +1,278 @@
+/******************************************************************************
+ * netback/loopback.c
+ * 
+ * A two-interface loopback device to emulate a local netfront-netback
+ * connection. This ensures that local packet delivery looks identical
+ * to inter-domain delivery. Most importantly, packets delivered locally
+ * originating from other domains will get *copied* when they traverse this
+ * driver. This prevents unbounded delays in socket-buffer queues from
+ * causing the netback driver to "seize up".
+ * 
+ * This driver creates a symmetric pair of loopback interfaces with names
+ * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet
+ * bridge, just like a proper netback interface, while a local IP interface
+ * is configured on 'veth0'.
+ * 
+ * As with a real netback interface, vif0.0 is configured with a suitable
+ * dummy MAC address. No default is provided for veth0: a reasonable strategy
+ * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address
+ * (to avoid confusing the Etherbridge).
+ * 
+ * Copyright (c) 2005 K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/ethtool.h>
+#include <net/dst.h>
+#include <net/xfrm.h>          /* secpath_reset() */
+#include <asm/hypervisor.h>    /* is_initial_xendomain() */
+#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
+
+/*
+ * Number of vif0.N/vethN pairs to create. The default of -1 lets
+ * loopback_init() decide: 4 in the initial Xen domain, 0 otherwise.
+ */
+static int nloopbacks = -1;
+module_param(nloopbacks, int, 0);
+MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create");
+
+/* Per-device private data for one half of a loopback pair. */
+struct net_private {
+       /* The other half of the pair; transmitted packets are received there. */
+       struct net_device *loopback_dev;
+       /* Index of the pair; reported in the ethtool bus_info string. */
+       int loop_idx;
+};
+
+/* Return the driver-private area that belongs to @dev. */
+static inline struct net_private *loopback_priv(struct net_device *dev)
+{
+       struct net_private *np = netdev_priv(dev);
+
+       return np;
+}
+
+/* ndo_open: clear the device statistics and start the transmit queue. */
+static int loopback_open(struct net_device *dev)
+{
+       struct net_device_stats *stats = &dev->stats;
+
+       memset(stats, 0, sizeof(*stats));
+       netif_start_queue(dev);
+
+       return 0;
+}
+
+/* ndo_stop: stop the transmit queue. Always returns 0. */
+static int loopback_close(struct net_device *dev)
+{
+       netif_stop_queue(dev);
+       return 0;
+}
+
+#ifdef CONFIG_X86
+/*
+ * Return non-zero if the frame at @pfn has to be treated as foreign: always
+ * in auto-translated mode, otherwise when its phys-to-machine entry carries
+ * FOREIGN_FRAME_BIT.
+ */
+static int is_foreign(unsigned long pfn)
+{
+       /* NB. Play it safe for auto-translation mode. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return 1;
+
+       return (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT) != 0;
+}
+#else
+/* No way to detect a foreign mapping here, so treat every page as foreign. */
+#define is_foreign(pfn)        (1)
+#endif
+
+/*
+ * Replace every foreign page fragment of @skb with a freshly allocated
+ * local page holding a copy of the fragment's bytes (at the same offset).
+ *
+ * Returns 1 on success and 0 if un-cloning the skb or allocating a page
+ * failed; fragments already replaced stay replaced in the failure case.
+ * The skb must not carry a frag_list.
+ */
+static int skb_remove_foreign_references(struct sk_buff *skb)
+{
+       int i;
+
+       BUG_ON(skb_shinfo(skb)->frag_list);
+
+       /* The fragment array is edited below, so it must not be shared. */
+       if (skb_cloned(skb) &&
+           unlikely(pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+               return 0;
+
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+               struct page *copy;
+               char *src;
+               int off;
+
+               if (!is_foreign(page_to_pfn(skb_frag_page(frag))))
+                       continue;
+
+               copy = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(!copy))
+                       return 0;
+
+               src = kmap_skb_frag(frag);
+               off = frag->page_offset;
+               memcpy(page_address(copy) + off, src + off,
+                      skb_frag_size(frag));
+               kunmap_skb_frag(src);
+
+               /* Drop the foreign page and install the local copy. */
+               skb_frag_unref(skb, i);
+               skb_frag_set_page(skb, i, copy);
+       }
+
+       return 1;
+}
+
+/*
+ * Transmit handler shared by both halves of a loopback pair: hand @skb to
+ * the peer device as if it had just been received there.
+ *
+ * Foreign page fragments are replaced by local copies first; if that fails
+ * the packet is dropped and counted in tx_dropped. Always returns
+ * NETDEV_TX_OK because the skb is consumed on every path.
+ */
+static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       if (!skb_remove_foreign_references(skb)) {
+               dev->stats.tx_dropped++;
+               dev_kfree_skb(skb);
+               return NETDEV_TX_OK;
+       }
+
+       /*
+        * Detach any route. skb_dst_drop() only releases a reference the
+        * skb really holds, so a non-refcounted (noref) dst is not released
+        * by mistake, which an unconditional dst_release() would do.
+        */
+       skb_dst_drop(skb);
+
+       skb_orphan(skb);
+
+       dev->stats.tx_bytes += skb->len;
+       dev->stats.tx_packets++;
+
+       /* Switch to loopback context. */
+       dev = loopback_priv(dev)->loopback_dev;
+
+       /* Account the full length before eth_type_trans() pulls the header. */
+       dev->stats.rx_bytes += skb->len;
+       dev->stats.rx_packets++;
+
+       skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */
+       skb->protocol = eth_type_trans(skb, dev);
+
+       /* Flush netfilter context: rx'ed skbuffs not expected to have any. */
+       nf_reset(skb);
+       secpath_reset(skb);
+
+       netif_rx(skb);
+
+       return NETDEV_TX_OK;
+}
+
+/* ethtool get_drvinfo: report driver "netloop" and bus info "vif-0-<idx>". */
+static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+       const struct net_private *np = loopback_priv(dev);
+
+       strcpy(info->driver, "netloop");
+       snprintf(info->bus_info, ETHTOOL_BUSINFO_LEN, "vif-0-%d",
+                np->loop_idx);
+}
+
+/* ethtool operations shared by both devices of every loopback pair. */
+static const struct ethtool_ops network_ethtool_ops =
+{
+       .get_drvinfo = get_drvinfo,
+       .get_link = ethtool_op_get_link,
+};
+
+/* Net device operations shared by both devices of every loopback pair. */
+static const struct net_device_ops loopback_netdev_ops = {
+       .ndo_open               = loopback_open,
+       .ndo_stop               = loopback_close,
+       .ndo_start_xmit         = loopback_start_xmit,
+       /* Deliberately left unset so the driver imposes no MTU limit. */
+       .ndo_change_mtu         = NULL, /* allow arbitrary mtu */
+};
+
+/*
+ * Initialise @dev as one half of loopback pair @loop_idx whose peer is @lo:
+ * record the peer, install the netdev/ethtool operations and advertise the
+ * offload features.
+ */
+static void loopback_construct(struct net_device *dev, struct net_device *lo,
+                              int loop_idx)
+{
+       struct net_private *priv = loopback_priv(dev);
+
+       priv->loop_idx = loop_idx;
+       priv->loopback_dev = lo;
+
+       dev->tx_queue_len = 0;
+       dev->netdev_ops = &loopback_netdev_ops;
+       SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
+
+       dev->features = NETIF_F_HIGHDMA | NETIF_F_LLTX | NETIF_F_TSO |
+                       NETIF_F_SG | NETIF_F_IP_CSUM;
+
+       /*
+        * The MTU is left at its default rather than raised to a jumbo
+        * value (a 16 KiB setting was deliberately left disabled here).
+        * Otherwise the network stack would try to send large packets that
+        * get dropped by the Ethernet bridge unless the physical Ethernet
+        * interface is configured for jumbo packets. An administrator who
+        * wants a larger MTU can set it with the 'ifconfig' command.
+        */
+}
+
+/*
+ * Create and register loopback pair @i, consisting of "vif0.<i>" and
+ * "veth<i>". Returns 0 on success or a negative errno; on failure neither
+ * device remains allocated or registered.
+ */
+static int __init make_loopback(int i)
+{
+       struct net_device *vif, *veth;
+       char ifname[IFNAMSIZ];
+       int err;
+
+       sprintf(ifname, "vif0.%d", i);
+       vif = alloc_netdev(sizeof(struct net_private), ifname, ether_setup);
+       if (!vif)
+               return -ENOMEM;
+
+       sprintf(ifname, "veth%d", i);
+       veth = alloc_netdev(sizeof(struct net_private), ifname, ether_setup);
+       if (!veth) {
+               err = -ENOMEM;
+               goto free_vif;
+       }
+
+       /* Cross-link the two halves. */
+       loopback_construct(vif, veth, i);
+       loopback_construct(veth, vif, i);
+
+       /*
+        * Give the 'dummy backend' interface a dummy MAC address: the
+        * numerically largest non-broadcast address, so that an Ethernet
+        * bridge does not steal it for STP purposes.
+        */
+       memset(vif->dev_addr, 0xFF, ETH_ALEN);
+       vif->dev_addr[0] &= ~0x01;
+
+       err = register_netdev(vif);
+       if (err != 0)
+               goto free_both;
+
+       err = register_netdev(veth);
+       if (err != 0) {
+               unregister_netdev(vif);
+               goto free_both;
+       }
+
+       return 0;
+
+free_both:
+       free_netdev(veth);
+free_vif:
+       free_netdev(vif);
+       return err;
+}
+
+/*
+ * Module init: create the requested number of loopback pairs. When the
+ * nloopbacks parameter was left at -1, four pairs are created in the
+ * initial Xen domain and none elsewhere.
+ *
+ * Returns 0 on success or the error of the first pair that failed.
+ * NOTE(review): pairs registered before a failing one are not torn down.
+ */
+static int __init loopback_init(void)
+{
+       int i;
+
+       if (nloopbacks == -1)
+               nloopbacks = is_initial_xendomain() ? 4 : 0;
+
+       for (i = 0; i < nloopbacks; i++) {
+               int err = make_loopback(i);
+
+               if (err != 0)
+                       return err;
+       }
+
+       return 0;
+}
+
+/* Only an init hook is registered; this file defines no module_exit(). */
+module_init(loopback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/netback/netback.c b/drivers/xen/netback/netback.c

new file mode 100644 (file)

index 0000000..d45e42c
--- /dev/null
+++ b/drivers/xen/netback/netback.c
@@ -0,0 +1,1883 @@
+/******************************************************************************
+ * drivers/xen/netback/netback.c
+ * 
+ * Back-end of the driver for virtual network devices. This portion of the
+ * driver exports a 'unified' network-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A 
+ * reference front-end implementation can be found in:
+ *  drivers/xen/netfront/netfront.c
+ * 
+ * Copyright (c) 2002-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "common.h"
+#include <linux/if_vlan.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <net/tcp.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/interface/memory.h>
+#include <xen/net-util.h>
+
+/*define NETBE_DEBUG_INTERRUPT*/
+
+/* Array of netback groups, indexed by group number. */
+struct xen_netbk *__read_mostly xen_netbk;
+/* Number of groups; settable through the "groups" module parameter. */
+unsigned int __read_mostly netbk_nr_groups;
+/*
+ * When true, work is kicked by waking a group's wait queue; when false, by
+ * scheduling its tasklet. Cleared by the "tasklets" module parameter.
+ */
+static bool __read_mostly use_kthreads = true;
+/* Set by the "bind" module parameter ("Bind kernel threads to (v)CPUs"). */
+static bool __initdata bind_threads;
+
+/*
+ * Group an interface is assigned to. Callers in this file also tolerate
+ * UINT_MAX (presumably "not assigned to any group" - confirm).
+ */
+#define GET_GROUP_INDEX(netif) ((netif)->group)
+
+static void netif_idx_release(struct xen_netbk *, u16 pending_idx);
+static void make_tx_response(netif_t *netif, 
+                            netif_tx_request_t *txp,
+                            s8       st);
+static netif_rx_response_t *make_rx_response(netif_t *netif, 
+                                            u16      id, 
+                                            s8       st,
+                                            u16      offset,
+                                            u16      size,
+                                            u16      flags);
+
+/* Per-group work handlers; the argument is the group number. */
+static void net_tx_action(unsigned long group);
+static void net_rx_action(unsigned long group);
+
+/* Discriminate from any valid pending_idx value. */
+#define INVALID_PENDING_IDX 0xffff
+
+/* Page frame number of the page stored in slot @idx of @netbk->mmap_pages. */
+static inline unsigned long idx_to_pfn(struct xen_netbk *netbk, u16 idx)
+{
+       struct page *pg = netbk->mmap_pages[idx];
+
+       return page_to_pfn(pg);
+}
+
+/* Kernel virtual address of the page stored in slot @idx of @netbk. */
+static inline unsigned long idx_to_kaddr(struct xen_netbk *netbk, u16 idx)
+{
+       unsigned long pfn = idx_to_pfn(netbk, idx);
+
+       return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+/*
+ * Extra field used in struct page: a (group, index) tag that overlays the
+ * page's ->mapping pointer. Where long is narrower than 64 bits the two
+ * values are packed as bitfields, which also bounds MAX_GROUPS; otherwise
+ * each gets a full unsigned int.
+ */
+union page_ext {
+       struct {
+#if BITS_PER_LONG < 64
+#define GROUP_WIDTH (BITS_PER_LONG - CONFIG_XEN_NETDEV_TX_SHIFT)
+#define MAX_GROUPS ((1U << GROUP_WIDTH) - 1)
+               unsigned int grp:GROUP_WIDTH;
+               unsigned int idx:CONFIG_XEN_NETDEV_TX_SHIFT;
+#else
+#define MAX_GROUPS UINT_MAX
+               unsigned int grp, idx;
+#endif
+       } e;
+       void *mapping;
+};
+
+/*
+ * Tag @pg with @group and @idx by overwriting pg->mapping. The group is
+ * stored as group + 1, so an all-zero mapping never decodes to group 0.
+ */
+static inline void netif_set_page_ext(struct page *pg, unsigned int group,
+                                     unsigned int idx)
+{
+       union page_ext ext = { .e = { .grp = group + 1, .idx = idx } };
+
+       /* The tag must fit into the pointer-sized mapping field. */
+       BUILD_BUG_ON(sizeof(ext) > sizeof(ext.mapping));
+       pg->mapping = ext.mapping;
+}
+
+/* Decode the group number from @pg's mapping tag (stored value minus one). */
+static inline unsigned int netif_page_group(const struct page *pg)
+{
+       union page_ext ext;
+
+       ext.mapping = pg->mapping;
+       return ext.e.grp - 1;
+}
+
+/* Decode the index from @pg's mapping tag. */
+static inline unsigned int netif_page_index(const struct page *pg)
+{
+       union page_ext ext;
+
+       ext.mapping = pg->mapping;
+       return ext.e.idx;
+}
+
+/* Read back the pending index stashed in @frag's page_offset field. */
+static u16 frag_get_pending_idx(const skb_frag_t *frag)
+{
+       u16 pending_idx = frag->page_offset;
+
+       return pending_idx;
+}
+
+/* Stash @pending_idx in @frag's page_offset field. */
+static void frag_set_pending_idx(skb_frag_t *frag, u16 pending_idx)
+{
+       frag->page_offset = pending_idx;
+}
+
+/*
+ * This is the amount of packet we copy rather than map, so that the
+ * guest can't fiddle with the contents of the headers while we do
+ * packet processing on them (netfilter, routing, etc). It covers the
+ * largest Ethernet + VLAN + IPv4 (with options) + TCP (with options)
+ * header stack.
+ */
+#define PKT_PROT_LEN    (ETH_HLEN + VLAN_HLEN + \
+                        sizeof(struct iphdr) + MAX_IPOPTLEN + \
+                        sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE)
+
+/*
+ * Reduce a pending-ring counter to a slot number.
+ * NOTE(review): only correct if MAX_PENDING_REQS is a power of two - confirm.
+ */
+#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
+
+/*
+ * Returns MAX_PENDING_REQS minus the distance between the pending ring's
+ * producer and consumer counters.
+ * NOTE(review): presumably the number of requests currently in flight,
+ * with the ring itself holding the free slots - confirm.
+ */
+static inline pending_ring_idx_t nr_pending_reqs(const struct xen_netbk *netbk)
+{
+       return MAX_PENDING_REQS -
+               netbk->pending_prod + netbk->pending_cons;
+}
+
+/* Setting this allows the safe use of this driver without netloop. */
+static bool MODPARM_copy_skb = true;
+module_param_named(copy_skb, MODPARM_copy_skb, bool, 0);
+MODULE_PARM_DESC(copy_skb, "Copy data received from netfront without netloop");
+/* Runtime-writable by the owner (S_IRUSR|S_IWUSR), unlike the others. */
+static bool MODPARM_permute_returns;
+module_param_named(permute_returns, MODPARM_permute_returns, bool, S_IRUSR|S_IWUSR);
+MODULE_PARM_DESC(permute_returns, "Randomly permute the order in which TX responses are sent to the frontend");
+module_param_named(groups, netbk_nr_groups, uint, 0);
+MODULE_PARM_DESC(groups, "Specify the number of tasklet pairs/threads to use");
+/* invbool: passing tasklets=1 clears use_kthreads. */
+module_param_named(tasklets, use_kthreads, invbool, 0);
+MODULE_PARM_DESC(tasklets, "Use tasklets instead of kernel threads");
+module_param_named(bind, bind_threads, bool, 0);
+MODULE_PARM_DESC(bind, "Bind kernel threads to (v)CPUs");
+
+/* NOTE(review): not referenced nearby; presumably derived from copy_skb. */
+int netbk_copy_skb_mode;
+
+/*
+ * Pop one machine frame number off @netbk's mfn_list. The list must not be
+ * empty (BUG otherwise); check_mfn() is what refills it.
+ */
+static inline unsigned long alloc_mfn(struct xen_netbk *netbk)
+{
+       BUG_ON(netbk->alloc_index == 0);
+
+       netbk->alloc_index--;
+       return netbk->mfn_list[netbk->alloc_index];
+}
+
+/*
+ * Make sure at least @nr frames are available in @netbk's mfn_list, asking
+ * the hypervisor to top the list up to MAX_MFN_ALLOC entries if needed.
+ *
+ * Returns 0 when @nr frames are available afterwards, -ENOMEM otherwise.
+ */
+static int check_mfn(struct xen_netbk *netbk, unsigned int nr)
+{
+       struct xen_memory_reservation reservation = {
+               .extent_order = 0,
+               .domid        = DOMID_SELF
+       };
+       int got;
+
+       if (likely(netbk->alloc_index >= nr))
+               return 0;
+
+       /* New frames are written directly after the ones already held. */
+       set_xen_guest_handle(reservation.extent_start,
+                            netbk->mfn_list + netbk->alloc_index);
+       reservation.nr_extents = MAX_MFN_ALLOC - netbk->alloc_index;
+
+       got = HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);
+       if (likely(got > 0))
+               netbk->alloc_index += got;
+
+       if (netbk->alloc_index < nr)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Kick the worker of @netbk: schedule its TX tasklet in tasklet mode,
+ * otherwise wake whoever sleeps on its action wait queue.
+ */
+static void netbk_schedule(struct xen_netbk *netbk)
+{
+       if (!use_kthreads) {
+               tasklet_schedule(&netbk->net_tx_tasklet);
+               return;
+       }
+
+       wake_up(&netbk->netbk_action_wq);
+}
+
+/* Kick the worker of group number @group. */
+static void netbk_schedule_group(unsigned long group)
+{
+       struct xen_netbk *netbk = &xen_netbk[group];
+
+       netbk_schedule(netbk);
+}
+
+/*
+ * Kick group @group if fewer than half of the pending slots are in use and
+ * at least one interface is waiting on its schedule list.
+ */
+static inline void maybe_schedule_tx_action(unsigned int group)
+{
+       struct xen_netbk *netbk = &xen_netbk[group];
+
+       /* Full barrier before the ring state and the list are sampled. */
+       smp_mb();
+
+       if (nr_pending_reqs(netbk) >= (MAX_PENDING_REQS/2))
+               return;
+       if (list_empty(&netbk->schedule_list))
+               return;
+
+       netbk_schedule(netbk);
+}
+
+/*
+ * Build a new skb that holds a private copy of @skb's data: as much of the
+ * linear part as fits goes into the new head, everything else is copied
+ * into freshly allocated pages, one fragment per page.
+ *
+ * Only the data, gso_size/gso_type and the header offsets are carried
+ * over; any other field the caller needs must be copied by the caller.
+ * @skb is left untouched and must not carry a frag_list.
+ *
+ * Returns the new skb, or NULL if an allocation failed or the data would
+ * need more than MAX_SKB_FRAGS pages.
+ */
+static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
+{
+       struct skb_shared_info *ninfo;
+       struct sk_buff *nskb;
+       unsigned long offset;
+       int ret;
+       int len;
+       int headlen;
+
+       BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+
+       nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN);
+       if (unlikely(!nskb))
+               goto err;
+
+       skb_reserve(nskb, 16 + NET_IP_ALIGN);
+       /* Copy min(room in the new head, source linear length) bytes. */
+       headlen = skb_end_pointer(nskb) - nskb->data;
+       if (headlen > skb_headlen(skb))
+               headlen = skb_headlen(skb);
+       ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
+       BUG_ON(ret);
+
+       ninfo = skb_shinfo(nskb);
+       ninfo->gso_size = skb_shinfo(skb)->gso_size;
+       ninfo->gso_type = skb_shinfo(skb)->gso_type;
+
+       offset = headlen;
+       len = skb->len - headlen;
+
+       /* Account for the paged data up front; the loop below fills it in. */
+       nskb->len = skb->len;
+       nskb->data_len = len;
+       nskb->truesize += len;
+
+       while (len) {
+               struct page *page;
+               int copy;
+               int zero;
+
+               if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) {
+                       dump_stack();
+                       goto err_free;
+               }
+
+               /* A final, partially filled page is allocated zeroed. */
+               copy = len >= PAGE_SIZE ? PAGE_SIZE : len;
+               zero = len >= PAGE_SIZE ? 0 : __GFP_ZERO;
+
+               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero);
+               if (unlikely(!page))
+                       goto err_free;
+
+               ret = skb_copy_bits(skb, offset, page_address(page), copy);
+               BUG_ON(ret);
+
+               __skb_fill_page_desc(nskb, ninfo->nr_frags, page, 0, copy);
+               ninfo->nr_frags++;
+
+               offset += copy;
+               len -= copy;
+       }
+
+       /*
+        * Carry the header positions over. When they are stored as pointers
+        * they are shifted by the distance between the two data areas; when
+        * they are stored as offsets they are copied unchanged.
+        */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+       offset = 0;
+#else
+       offset = nskb->data - skb->data;
+#endif
+
+       nskb->transport_header = skb->transport_header + offset;
+       nskb->network_header   = skb->network_header   + offset;
+       nskb->mac_header       = skb->mac_header       + offset;
+
+       return nskb;
+
+ err_free:
+       kfree_skb(nskb);
+ err:
+       return NULL;
+}
+
+/*
+ * Worst-case number of RX ring slots one packet for @netif can occupy:
+ * a single slot without scatter-gather and GSO, otherwise one per possible
+ * fragment plus the header and the extra-info slot.
+ */
+static inline int netbk_max_required_rx_slots(netif_t *netif)
+{
+       if (!netif->can_sg && !netif->gso)
+               return 1; /* all in one */
+
+       return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */
+}
+
+/*
+ * Non-zero when @netif's RX ring cannot be guaranteed to take a worst-case
+ * packet: either fewer than netbk_max_required_rx_slots() requests are
+ * posted beyond rx_req_cons_peek, or fewer than that many slots lie
+ * between rx_req_cons_peek and rsp_prod_pvt + NET_RX_RING_SIZE.
+ */
+static inline int netbk_queue_full(netif_t *netif)
+{
+       RING_IDX peek   = netif->rx_req_cons_peek;
+       RING_IDX needed = netbk_max_required_rx_slots(netif);
+
+       return ((netif->rx.sring->req_prod - peek) < needed) ||
+              ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed);
+}
+
+/*
+ * Timer callback (@data is the netif_t pointer): wake the device's
+ * transmit queue again if the interface is still schedulable.
+ */
+static void tx_queue_callback(unsigned long data)
+{
+       netif_t *netif = (netif_t *)data;
+
+       if (!netif_schedulable(netif))
+               return;
+
+       netif_wake_queue(netif->dev);
+}
+
+/*
+ * Transmit handler of the backend device: queue @skb for delivery to the
+ * guest through the interface's netback group.
+ *
+ * The packet is dropped (and counted in tx_dropped) if the interface has
+ * no valid group, is not schedulable, the guest has not posted enough
+ * receive buffers, or a required copy cannot be allocated. Otherwise ring
+ * slots are reserved via rx_req_cons_peek, a reference on the interface
+ * is taken, and the skb is appended to the group's rx_queue.
+ *
+ * Always returns NETDEV_TX_OK; the skb is consumed on every path.
+ */
+int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       netif_t *netif = netdev_priv(dev);
+       unsigned int group = GET_GROUP_INDEX(netif);
+       struct xen_netbk *netbk;
+
+       BUG_ON(skb->dev != dev);
+
+       /* UINT_MAX is the only out-of-range group value tolerated here. */
+       if (unlikely(group >= netbk_nr_groups)) {
+               BUG_ON(group != UINT_MAX);
+               goto drop;
+       }
+
+       /* Drop the packet if the target domain has no receive buffers. */
+       if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif)))
+               goto drop;
+
+       /*
+        * Copy the packet here if it's destined for a flipping interface
+        * but isn't flippable (e.g. extra references to data).
+        * XXX For now we also copy skbuffs whose head crosses a page
+        * boundary, because netbk_gop_skb can't handle them.
+        */
+       if (!netif->copying_receiver ||
+           ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) {
+               struct sk_buff *nskb = netbk_copy_skb(skb);
+               if ( unlikely(nskb == NULL) )
+                       goto drop;
+               /* Copy only the header fields we use in this driver. */
+               nskb->dev = skb->dev;
+               nskb->ip_summed = skb->ip_summed;
+               dev_kfree_skb(skb);
+               skb = nskb;
+       }
+
+       /* Reserve one slot per fragment, one for the head, one for GSO. */
+       netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 +
+                                  !!skb_shinfo(skb)->gso_size;
+       netif_get(netif);
+
+       if (netbk_can_queue(dev) && netbk_queue_full(netif)) {
+               netif->rx.sring->req_event = netif->rx_req_cons_peek +
+                       netbk_max_required_rx_slots(netif);
+               mb(); /* request notification /then/ check & stop the queue */
+               if (netbk_queue_full(netif)) {
+                       netif_stop_queue(dev);
+                       /*
+                        * Schedule 500ms timeout to restart the queue, thus
+                        * ensuring that an inactive queue will be drained.
+                        * Packets will be dropped immediately until more
+                        * receive buffers become available (see
+                        * netbk_queue_full() check above).
+                        */
+                       netif->tx_queue_timeout.data = (unsigned long)netif;
+                       netif->tx_queue_timeout.function = tx_queue_callback;
+                       mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2);
+               }
+       }
+
+       netbk = &xen_netbk[group];
+       skb_queue_tail(&netbk->rx_queue, skb);
+       netbk_schedule(netbk);
+
+       return NETDEV_TX_OK;
+
+ drop:
+       dev->stats.tx_dropped++;
+       dev_kfree_skb(skb);
+       return NETDEV_TX_OK;
+}
+
+#if 0
+/*
+ * Compiled out: unfinished NAPI notification hooks. The code below still
+ * has a placeholder argument to napi_schedule() and refers to a bare
+ * rx_queue rather than a per-group queue.
+ */
+static void xen_network_done_notify(void)
+{
+       static struct net_device *eth0_dev = NULL;
+       if (unlikely(eth0_dev == NULL))
+               eth0_dev = __dev_get_by_name(&init_net, "eth0");
+       napi_schedule(???);
+}
+/* 
+ * Add following to poll() function in NAPI driver (Tigon3 is example):
+ *  if ( xen_network_done() )
+ *      tg3_enable_ints(tp);
+ */
+int xen_network_done(void)
+{
+       return skb_queue_empty(&rx_queue);
+}
+#endif
+
+/*
+ * Bookkeeping for one batch of RX work. Each array is filled through its
+ * *_prod index while operations are set up (netbk_gop_frag/netbk_gop_skb)
+ * and read back through its *_cons index when results are checked
+ * (netbk_check_gop).
+ */
+struct netrx_pending_operations {
+       unsigned trans_prod, trans_cons;        /* indices into trans[] */
+       /* mmu_prod indexes mmu[]; NOTE(review): mmu_mcl use not seen here */
+       unsigned mmu_prod, mmu_mcl;
+       unsigned mcl_prod, mcl_cons;            /* indices into mcl[] */
+       unsigned copy_prod, copy_cons;          /* indices into copy[] */
+       unsigned meta_prod, meta_cons;          /* indices into meta[] */
+       mmu_update_t *mmu;              /* machine-to-phys table updates */
+       gnttab_transfer_t *trans;       /* grant transfers (page flipping) */
+       gnttab_copy_t *copy;            /* grant copies */
+       multicall_entry_t *mcl;         /* update_va_mapping multicalls */
+       struct netbk_rx_meta *meta;     /* one entry per ring slot used */
+};
+
+/*
+ * Set up the grant operations for this fragment.  If it's a flipping
+ * interface, we also set up the unmap request from here.
+ *
+ * @netif:  destination interface
+ * @meta:   metadata slot for this fragment; its ->copy flag is set here
+ * @i:      offset from rx.req_cons of the ring request to use
+ * @npo:    batch state that receives the queued operations
+ * @page:   page holding the data
+ * @size:   number of bytes (only used for the copy case)
+ * @offset: offset of the data within @page (only used for the copy case)
+ *
+ * Returns the id of the ring request that was used.
+ */
+static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta,
+                         int i, struct netrx_pending_operations *npo,
+                         struct page *page, unsigned long size,
+                         unsigned long offset)
+{
+       mmu_update_t *mmu;
+       gnttab_transfer_t *gop;
+       gnttab_copy_t *copy_gop;
+       multicall_entry_t *mcl;
+       netif_rx_request_t *req;
+       unsigned long old_mfn, new_mfn;
+       struct xen_netbk *netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
+
+       old_mfn = virt_to_mfn(page_address(page));
+
+       req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i);
+       if (netif->copying_receiver) {
+               unsigned int group, idx;
+
+               /* The fragment needs to be copied rather than
+                  flipped. */
+               meta->copy = 1;
+               copy_gop = npo->copy + npo->copy_prod++;
+               copy_gop->flags = GNTCOPY_dest_gref;
+               /*
+                * A foreign page whose mapping tag decodes to a valid
+                * (group, index) pair is one of the pages in mmap_pages:
+                * copy straight from the grant reference of the pending
+                * TX request recorded for it instead of from local memory.
+                */
+               if (PageForeign(page) &&
+                   page->mapping != NULL &&
+                   (idx = netif_page_index(page)) < MAX_PENDING_REQS &&
+                   (group = netif_page_group(page)) < netbk_nr_groups) {
+                       struct pending_tx_info *src_pend;
+                       unsigned int grp;
+
+                       netbk = &xen_netbk[group];
+                       BUG_ON(netbk->mmap_pages[idx] != page);
+                       src_pend = &netbk->pending_tx_info[idx];
+                       grp = GET_GROUP_INDEX(src_pend->netif);
+                       BUG_ON(group != grp && grp != UINT_MAX);
+                       copy_gop->source.domid = src_pend->netif->domid;
+                       copy_gop->source.u.ref = src_pend->req.gref;
+                       copy_gop->flags |= GNTCOPY_source_gref;
+               } else {
+                       /* Ordinary local page: copy from our own frame. */
+                       copy_gop->source.domid = DOMID_SELF;
+                       copy_gop->source.u.gmfn = old_mfn;
+               }
+               copy_gop->source.offset = offset;
+               copy_gop->dest.domid = netif->domid;
+               /* The data always lands at the start of the guest's page. */
+               copy_gop->dest.offset = 0;
+               copy_gop->dest.u.ref = req->gref;
+               copy_gop->len = size;
+       } else {
+               meta->copy = 0;
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Take a fresh frame to back this page locally. */
+                       new_mfn = alloc_mfn(netbk);
+
+                       /*
+                        * Set the new P2M table entry before
+                        * reassigning the old data page. Heed the
+                        * comment in pgtable-2level.h:pte_page(). :-)
+                        */
+                       set_phys_to_machine(page_to_pfn(page), new_mfn);
+
+                       mcl = npo->mcl + npo->mcl_prod++;
+                       MULTI_update_va_mapping(mcl,
+                                            (unsigned long)page_address(page),
+                                            pfn_pte_ma(new_mfn, PAGE_KERNEL),
+                                            0);
+
+                       mmu = npo->mmu + npo->mmu_prod++;
+                       mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) |
+                               MMU_MACHPHYS_UPDATE;
+                       mmu->val = page_to_pfn(page);
+               }
+
+               /* Queue the transfer of the old frame to the guest. */
+               gop = npo->trans + npo->trans_prod++;
+               gop->mfn = old_mfn;
+               gop->domid = netif->domid;
+               gop->ref = req->gref;
+       }
+       return req->id;
+}
+
+/*
+ * Queue the grant operations for a whole skb in @npo: one metadata entry
+ * and one ring request for the head, one of each per page fragment, and
+ * one further ring request reserved when the packet is GSO. Advances
+ * rx.req_cons past all requests used.
+ */
+static void netbk_gop_skb(struct sk_buff *skb,
+                         struct netrx_pending_operations *npo)
+{
+       netif_t *netif = netdev_priv(skb->dev);
+       int nr_frags = skb_shinfo(skb)->nr_frags;
+       int i;
+       int extra;
+       struct netbk_rx_meta *head_meta, *meta;
+
+       /*
+        * The head's metadata entry has no fragment of its own, so its frag
+        * fields are used to carry the GSO type and size instead.
+        */
+       head_meta = npo->meta + npo->meta_prod++;
+       head_meta->frag.page_offset = skb_shinfo(skb)->gso_type;
+       head_meta->frag.size = skb_shinfo(skb)->gso_size;
+       /* Fragments start after the head slot and, for GSO, one more slot. */
+       extra = !!head_meta->frag.size + 1;
+
+       for (i = 0; i < nr_frags; i++) {
+               meta = npo->meta + npo->meta_prod++;
+               meta->frag = skb_shinfo(skb)->frags[i];
+               meta->id = netbk_gop_frag(netif, meta, i + extra, npo,
+                                         skb_frag_page(&meta->frag),
+                                         skb_frag_size(&meta->frag),
+                                         meta->frag.page_offset);
+       }
+
+       /*
+        * This must occur at the end to ensure that we don't trash skb_shinfo
+        * until we're done. We know that the head doesn't cross a page
+        * boundary because such packets get copied in netif_be_start_xmit.
+        */
+       head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo,
+                                      virt_to_page(skb->data),
+                                      skb_headlen(skb),
+                                      offset_in_page(skb->data));
+
+       netif->rx.req_cons += nr_frags + extra;
+}
+
+/* Drop one page reference for each of the first @nr_frags entries of @meta. */
+static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta)
+{
+       struct netbk_rx_meta *cur;
+
+       for (cur = meta; cur < meta + nr_frags; cur++)
+               put_page(skb_frag_page(&cur->frag));
+}
+
+/*
+ * This is a twin to netbk_gop_skb.  Assume that netbk_gop_skb was
+ * used to set up the operations on the top of
+ * netrx_pending_operations, which have since been done.  Check that
+ * they didn't give any errors and advance over them.
+ *
+ * @nr_frags: number of page fragments of the packet; the head is checked
+ *            as well, so nr_frags + 1 metadata entries are examined
+ * @domid:    destination domain, used for diagnostics only
+ * @npo:      batch state; the copy/mcl/trans consumer indices are advanced,
+ *            meta_cons is only read
+ *
+ * Returns XEN_NETIF_RSP_OKAY if every operation succeeded, otherwise
+ * XEN_NETIF_RSP_ERROR.
+ */
+static int netbk_check_gop(int nr_frags, domid_t domid, struct netrx_pending_operations *npo)
+{
+       multicall_entry_t *mcl;
+       gnttab_transfer_t *gop;
+       gnttab_copy_t     *copy_op;
+       int status = XEN_NETIF_RSP_OKAY;
+       int i;
+
+       /* "<=" because entry 0 is the head and 1..nr_frags the fragments. */
+       for (i = 0; i <= nr_frags; i++) {
+               if (npo->meta[npo->meta_cons + i].copy) {
+                       copy_op = npo->copy + npo->copy_cons++;
+                       /* Handle GNTST_eagain before judging the status. */
+                       if (unlikely(copy_op->status == GNTST_eagain))
+                               gnttab_check_GNTST_eagain_while(GNTTABOP_copy, copy_op);
+                       if (unlikely(copy_op->status != GNTST_okay)) {
+                               DPRINTK("Bad status %d from copy to DOM%d.\n",
+                                       copy_op->status, domid);
+                               status = XEN_NETIF_RSP_ERROR;
+                       }
+               } else {
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               mcl = npo->mcl + npo->mcl_cons++;
+                               /* The update_va_mapping() must not fail. */
+                               BUG_ON(mcl->result != 0);
+                       }
+
+                       gop = npo->trans + npo->trans_cons++;
+                       /* Check the reassignment error code. */
+                       if (unlikely(gop->status != GNTST_okay)) {
+                               DPRINTK("Bad status %d from grant transfer to DOM%u\n",
+                                       gop->status, domid);
+                               /*
+                                * Page no longer belongs to us unless
+                                * GNTST_bad_page, but that should be
+                                * a fatal error anyway.
+                                */
+                               BUG_ON(gop->status == GNTST_bad_page);
+                               status = XEN_NETIF_RSP_ERROR;
+                       }
+               }
+       }
+
+       return status;
+}
+
+/*
+ * Emit one RX response per fragment described by @meta[0..@nr_frags - 1],
+ * all carrying @status. Every response except the last is flagged
+ * XEN_NETRXF_more_data. Copied fragments are reported at offset 0,
+ * others at their page offset.
+ */
+static void netbk_add_frag_responses(netif_t *netif, int status,
+                                    struct netbk_rx_meta *meta, int nr_frags)
+{
+       int i;
+
+       for (i = 0; i < nr_frags; i++) {
+               const struct netbk_rx_meta *m = &meta[i];
+               unsigned long offset = 0;
+               int flags = XEN_NETRXF_more_data;
+               int id = m->id;
+
+               if (!m->copy)
+                       offset = m->frag.page_offset;
+               if (i == nr_frags - 1)
+                       flags = 0;
+
+               make_rx_response(netif, id, status, offset,
+                                m->frag.size, flags);
+       }
+}
+
+/*
+ * RX processing for one backend group ('group' indexes xen_netbk[]).
+ *
+ * 1. Drain netbk->rx_queue into a local batch, calling netbk_gop_skb()
+ *    on each skb with the shared 'npo' bookkeeping, until the queue is
+ *    empty, a memory squeeze is detected, or the batch is full.
+ * 2. Append the mmu_update / grant transfer / grant copy multicall
+ *    entries that the npo producer counters call for and issue them in
+ *    a single HYPERVISOR_multicall().
+ * 3. For every batched skb: check the operation status, build and push
+ *    the RX responses (head, optional GSO extra, fragments), update
+ *    stats, possibly wake the device queue, then free the skb.
+ * 4. Notify the frontends whose rings asked for it and reschedule if
+ *    rx_queue is non-empty and net_timer is not pending.
+ */
+static void net_rx_action(unsigned long group)
+{
+       netif_t *netif = NULL;
+       s8 status;
+       u16 id, irq, flags;
+       netif_rx_response_t *resp;
+       multicall_entry_t *mcl;
+       struct sk_buff_head rxq;
+       struct sk_buff *skb;
+       int notify_nr = 0;
+       int ret;
+       int nr_frags;
+       int count;
+       unsigned long offset;
+       struct xen_netbk *netbk = &xen_netbk[group];
+
+       struct netrx_pending_operations npo = {
+               .mmu   = netbk->rx_mmu,
+               .trans = netbk->grant_trans_op,
+               .copy  = netbk->grant_copy_op,
+               .mcl   = netbk->rx_mcl,
+               .meta  = netbk->meta,
+       };
+
+       skb_queue_head_init(&rxq);
+
+       count = 0;
+
+       while ((skb = skb_dequeue(&netbk->rx_queue)) != NULL) {
+               nr_frags = skb_shinfo(skb)->nr_frags;
+               /* Stash the frag count; it is read back in the response loop. */
+               *(int *)skb->cb = nr_frags;
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap) &&
+                   !((netif_t *)netdev_priv(skb->dev))->copying_receiver &&
+                   check_mfn(netbk, nr_frags + 1)) {
+                       /* Memory squeeze? Back off for an arbitrary while. */
+                       if ( net_ratelimit() )
+                               WPRINTK("Memory squeeze in netback "
+                                       "driver.\n");
+                       mod_timer(&netbk->net_timer, jiffies + HZ);
+                       skb_queue_head(&netbk->rx_queue, skb);
+                       break;
+               }
+
+               netbk_gop_skb(skb, &npo);
+
+               count += nr_frags + 1;
+
+               __skb_queue_tail(&rxq, skb);
+
+               /* Filled the batch queue? */
+               if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE)
+                       break;
+       }
+
+       BUG_ON(npo.meta_prod > ARRAY_SIZE(netbk->meta));
+
+       /* Index the mmu_update entry will occupy if one is added below. */
+       npo.mmu_mcl = npo.mcl_prod;
+       if (npo.mcl_prod) {
+               BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
+               BUG_ON(npo.mmu_prod > ARRAY_SIZE(netbk->rx_mmu));
+               mcl = npo.mcl + npo.mcl_prod++;
+
+               BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping);
+               mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
+
+               mcl->op = __HYPERVISOR_mmu_update;
+               mcl->args[0] = (unsigned long)netbk->rx_mmu;
+               mcl->args[1] = npo.mmu_prod;
+               mcl->args[2] = 0;
+               mcl->args[3] = DOMID_SELF;
+       }
+
+       if (npo.trans_prod) {
+               BUG_ON(npo.trans_prod > ARRAY_SIZE(netbk->grant_trans_op));
+               mcl = npo.mcl + npo.mcl_prod++;
+               mcl->op = __HYPERVISOR_grant_table_op;
+               mcl->args[0] = GNTTABOP_transfer;
+               mcl->args[1] = (unsigned long)netbk->grant_trans_op;
+               mcl->args[2] = npo.trans_prod;
+       }
+
+       if (npo.copy_prod) {
+               BUG_ON(npo.copy_prod > ARRAY_SIZE(netbk->grant_copy_op));
+               mcl = npo.mcl + npo.mcl_prod++;
+               mcl->op = __HYPERVISOR_grant_table_op;
+               mcl->args[0] = GNTTABOP_copy;
+               mcl->args[1] = (unsigned long)netbk->grant_copy_op;
+               mcl->args[2] = npo.copy_prod;
+       }
+
+       /* Nothing to do? */
+       if (!npo.mcl_prod)
+               return;
+
+       BUG_ON(npo.mcl_prod > ARRAY_SIZE(netbk->rx_mcl));
+
+       ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod);
+       BUG_ON(ret != 0);
+       /* The mmu_machphys_update() must not fail. */
+       BUG_ON(npo.mmu_mcl && npo.mcl[npo.mmu_mcl].result != 0);
+
+       while ((skb = __skb_dequeue(&rxq)) != NULL) {
+               nr_frags = *(int *)skb->cb;
+
+               netif = netdev_priv(skb->dev);
+
+               status = netbk_check_gop(nr_frags, netif->domid, &npo);
+
+               /* We can't rely on skb_release_data to release the
+                  pages used by fragments for us, since it tries to
+                  touch the pages in the fraglist.  If we're in
+                  flipping mode, that doesn't work.  In copying mode,
+                  we still have access to all of the pages, and so
+                  it's safe to let release_data deal with it. */
+               /* (Freeing the fragments is safe since we copy
+                  non-linear skbs destined for flipping interfaces) */
+               if (!netif->copying_receiver) {
+                       atomic_set(&(skb_shinfo(skb)->dataref), 1);
+                       skb_shinfo(skb)->frag_list = NULL;
+                       skb_shinfo(skb)->nr_frags = 0;
+                       netbk_free_pages(nr_frags, netbk->meta + npo.meta_cons + 1);
+               }
+
+               skb->dev->stats.tx_bytes += skb->len;
+               skb->dev->stats.tx_packets++;
+
+               /* meta[meta_cons] describes the head; the frags follow it. */
+               id = netbk->meta[npo.meta_cons].id;
+               flags = nr_frags ? XEN_NETRXF_more_data : 0;
+
+               switch (skb->ip_summed) {
+               case CHECKSUM_PARTIAL: /* local packet? */
+                       flags |= XEN_NETRXF_csum_blank |
+                                XEN_NETRXF_data_validated;
+                       break;
+               case CHECKSUM_UNNECESSARY: /* remote but checksummed? */
+                       flags |= XEN_NETRXF_data_validated;
+                       break;
+               }
+
+               if (netbk->meta[npo.meta_cons].copy)
+                       offset = 0;
+               else
+                       offset = offset_in_page(skb->data);
+               resp = make_rx_response(netif, id, status, offset,
+                                       skb_headlen(skb), flags);
+
+               /* A non-zero frag.size on the head entry is used as the GSO size. */
+               if (netbk->meta[npo.meta_cons].frag.size) {
+                       struct netif_extra_info *gso =
+                               (struct netif_extra_info *)
+                               RING_GET_RESPONSE(&netif->rx,
+                                                 netif->rx.rsp_prod_pvt++);
+
+                       resp->flags |= XEN_NETRXF_extra_info;
+
+                       gso->u.gso.size = netbk->meta[npo.meta_cons].frag.size;
+                       gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+                       gso->u.gso.pad = 0;
+                       gso->u.gso.features = 0;
+
+                       gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+                       gso->flags = 0;
+               }
+
+               netbk_add_frag_responses(netif, status,
+                                        netbk->meta + npo.meta_cons + 1,
+                                        nr_frags);
+
+               RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret);
+               irq = netif->irq - DYNIRQ_BASE;
+               /* The rx_notify bitmap keeps each irq on notify_list only once. */
+               if (ret && !__test_and_set_bit(irq, netbk->rx_notify))
+                       netbk->notify_list[notify_nr++] = irq;
+
+               if (netif_queue_stopped(netif->dev) &&
+                   netif_schedulable(netif) &&
+                   !netbk_queue_full(netif))
+                       netif_wake_queue(netif->dev);
+
+               netif_put(netif);
+               dev_kfree_skb(skb);
+
+               npo.meta_cons += nr_frags + 1;
+       }
+
+       if (notify_nr == 1) {
+               irq = *netbk->notify_list;
+               __clear_bit(irq, netbk->rx_notify);
+               notify_remote_via_irq(irq + DYNIRQ_BASE);
+       } else {
+               /*
+                * 'ret' is reused as the loop index and 'count' as the
+                * number of multicall entries filled in.  With notify_nr
+                * == 0 the multicall below is issued with a count of 0.
+                */
+               for (count = ret = 0; ret < notify_nr; ++ret) {
+                       irq = netbk->notify_list[ret];
+                       __clear_bit(irq, netbk->rx_notify);
+                       if (!multi_notify_remote_via_irq(netbk->rx_mcl + count,
+                                                        irq + DYNIRQ_BASE))
+                               ++count;
+               }
+               if (HYPERVISOR_multicall(netbk->rx_mcl, count))
+                       BUG();
+       }
+
+       /* More work to do? */
+       if (!skb_queue_empty(&netbk->rx_queue) &&
+           !timer_pending(&netbk->net_timer))
+               netbk_schedule(netbk);
+#if 0
+       else
+               xen_network_done_notify();
+#endif
+}
+
+/*
+ * Report whether netif is currently linked on a schedule list.
+ * A NULL list.next is the "not queued" marker.
+ */
+static int __on_net_schedule_list(netif_t *netif)
+{
+       return !!netif->list.next;
+}
+
+/*
+ * Unlink netif from the schedule list if it is queued, reset its
+ * "queued" marker and call netif_put().
+ * Must be called with netbk->schedule_list_lock held.
+ */
+static void remove_from_net_schedule_list(netif_t *netif)
+{
+       if (unlikely(!__on_net_schedule_list(netif)))
+               return;
+
+       list_del(&netif->list);
+       /* NULL next pointer == not on any list. */
+       netif->list.next = NULL;
+       netif_put(netif);
+}
+
+/*
+ * Take the first netif off netbk->schedule_list under the list lock.
+ * Returns it after a netif_get(), or NULL when the list is empty.
+ */
+static netif_t *poll_net_schedule_list(struct xen_netbk *netbk)
+{
+       netif_t *first;
+
+       spin_lock_irq(&netbk->schedule_list_lock);
+       if (list_empty(&netbk->schedule_list)) {
+               spin_unlock_irq(&netbk->schedule_list_lock);
+               return NULL;
+       }
+
+       first = list_first_entry(&netbk->schedule_list, netif_t, list);
+       /* Taken before the removal, which itself does a netif_put(). */
+       netif_get(first);
+       remove_from_net_schedule_list(first);
+       spin_unlock_irq(&netbk->schedule_list_lock);
+
+       return first;
+}
+
+/*
+ * Append netif to its group's schedule list unless it is already queued
+ * or not schedulable.  netif_get() is called for the new list entry.
+ */
+static void add_to_net_schedule_list_tail(netif_t *netif)
+{
+       struct xen_netbk *group = &xen_netbk[GET_GROUP_INDEX(netif)];
+       unsigned long irqflags;
+
+       /* Unlocked early-out; the test is repeated under the lock. */
+       if (__on_net_schedule_list(netif))
+               return;
+
+       spin_lock_irqsave(&group->schedule_list_lock, irqflags);
+       if (!__on_net_schedule_list(netif)) {
+               if (likely(netif_schedulable(netif))) {
+                       list_add_tail(&netif->list, &group->schedule_list);
+                       netif_get(netif);
+               }
+       }
+       spin_unlock_irqrestore(&group->schedule_list_lock, irqflags);
+}
+
+/*
+ * Queue netif for TX processing if its TX ring has requests pending.
+ *
+ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
+ * If this driver is pipelining transmit requests then we can be very
+ * aggressive in avoiding new-packet notifications -- frontend only needs to
+ * send a notification if there are no outstanding unreceived responses.
+ * If we may be buffering transmit buffers for any reason then we must be
+ * rather more conservative and treat this as the final check for pending
+ * work.
+ */
+void netif_schedule_work(netif_t *netif)
+{
+       int pending;
+
+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
+       pending = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
+#else
+       RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, pending);
+#endif
+
+       if (!pending)
+               return;
+
+       add_to_net_schedule_list_tail(netif);
+       maybe_schedule_tx_action(GET_GROUP_INDEX(netif));
+}
+
+/* Remove netif from its group's schedule list, taking the list lock. */
+void netif_deschedule_work(netif_t *netif)
+{
+       struct xen_netbk *group = &xen_netbk[GET_GROUP_INDEX(netif)];
+
+       spin_lock_irq(&group->schedule_list_lock);
+       remove_from_net_schedule_list(netif);
+       spin_unlock_irq(&group->schedule_list_lock);
+}
+
+
+/*
+ * Add one credit_bytes chunk to netif->remaining_credit, capped at a
+ * burst limit derived from the size of the request at tx.req_cons.
+ */
+static void tx_add_credit(netif_t *netif)
+{
+       unsigned long burst_cap, credit_cap, floor;
+
+       /*
+        * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
+        * Otherwise the interface can seize up due to insufficient credit.
+        */
+       burst_cap = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
+       if (burst_cap > 131072UL)
+               burst_cap = 131072UL;
+       floor = netif->credit_bytes;
+       if (burst_cap < floor)
+               burst_cap = floor;
+
+       /* Take care that adding a new chunk of credit doesn't wrap to zero. */
+       credit_cap = netif->remaining_credit + netif->credit_bytes;
+       if (credit_cap < netif->remaining_credit)
+               credit_cap = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
+
+       netif->remaining_credit =
+               (credit_cap < burst_cap) ? credit_cap : burst_cap;
+}
+
+/*
+ * Credit timer callback: 'data' carries the netif_t pointer.  Adds
+ * credit and then re-checks the TX ring for work.
+ */
+static void tx_credit_callback(unsigned long data)
+{
+       netif_t *vif = (netif_t *)data;
+
+       tx_add_credit(vif);
+       netif_schedule_work(vif);
+}
+
+/*
+ * Pass the grant handle and the mmap_pages slot of pending_idx to
+ * gnttab_copy_grant_page() and return its result.
+ */
+static inline int copy_pending_req(struct xen_netbk *netbk,
+                                  pending_ring_idx_t pending_idx)
+{
+       int rc;
+
+       rc = gnttab_copy_grant_page(netbk->grant_tx_handle[pending_idx],
+                                   netbk->mmap_pages + pending_idx);
+       return rc;
+}
+
+/*
+ * Shuffle the dealloc_ring entries in [dc, dp): each position in turn is
+ * swapped with a pseudo-randomly chosen entry at or after it.  The
+ * generator state is a function-local static multiplied by 68389 after
+ * every step.
+ */
+static void permute_dealloc_ring(u16 *dealloc_ring, pending_ring_idx_t dc,
+                                pending_ring_idx_t dp)
+{
+       static unsigned random_src = 0x12345678;
+
+       for (; dc != dp; dc++, random_src *= 68389) {
+               unsigned skip = (random_src / 256) % (dp - dc);
+               pending_ring_idx_t pick = dc + skip;
+               u16 *chosen = &dealloc_ring[MASK_PEND_IDX(pick)];
+               u16 *current_slot = &dealloc_ring[MASK_PEND_IDX(dc)];
+               u16 saved = *chosen;
+
+               *chosen = *current_slot;
+               *current_slot = saved;
+       }
+}
+
+/*
+ * Release TX pending slots that have been queued on the dealloc ring.
+ *
+ * 1. Drain dealloc_ring from dealloc_cons up to dealloc_prod (re-reading
+ *    the producer until it stops moving), moving each slot's in-use
+ *    entry to a local list and queueing an unmap op for every slot that
+ *    still has a valid phys-to-machine mapping.
+ * 2. Issue the queued unmaps in one GNTTABOP_unmap_grant_ref call.
+ * 3. In NETBK_DELAYED_COPY_SKB mode, try copy_pending_req() on in-use
+ *    entries older than HZ / 2.
+ * 4. For every entry on the local list: send an XEN_NETIF_RSP_OKAY
+ *    response, reset the grant page, return the index to pending_ring
+ *    and call netif_put().
+ */
+inline static void net_tx_action_dealloc(struct xen_netbk *netbk)
+{
+       struct netbk_tx_pending_inuse *inuse, *n;
+       gnttab_unmap_grant_ref_t *gop;
+       u16 pending_idx;
+       pending_ring_idx_t dc, dp;
+       netif_t *netif;
+       LIST_HEAD(list);
+
+       dc = netbk->dealloc_cons;
+       gop = netbk->tx_unmap_ops;
+
+       /*
+        * Free up any grants we have finished using
+        */
+       do {
+               dp = netbk->dealloc_prod;
+
+               /* Ensure we see all indices enqueued by netif_idx_release(). */
+               smp_rmb();
+
+               if (MODPARM_permute_returns && netbk_nr_groups == 1)
+                       permute_dealloc_ring(netbk->dealloc_ring, dc, dp);
+
+               while (dc != dp) {
+                       unsigned long pfn;
+                       struct netbk_tx_pending_inuse *pending_inuse =
+                                       netbk->pending_inuse;
+
+                       pending_idx = netbk->dealloc_ring[MASK_PEND_IDX(dc++)];
+                       list_move_tail(&pending_inuse[pending_idx].list, &list);
+
+                       pfn = idx_to_pfn(netbk, pending_idx);
+                       /* Already unmapped? */
+                       if (!phys_to_machine_mapping_valid(pfn))
+                               continue;
+
+                       gnttab_set_unmap_op(gop, idx_to_kaddr(netbk, pending_idx),
+                                           GNTMAP_host_map,
+                                           netbk->grant_tx_handle[pending_idx]);
+                       gop++;
+               }
+
+       } while (dp != netbk->dealloc_prod);
+
+       netbk->dealloc_cons = dc;
+
+       /* Issued unconditionally, even when no unmap ops were queued. */
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                     netbk->tx_unmap_ops,
+                                     gop - netbk->tx_unmap_ops))
+               BUG();
+
+       /* Copy any entries that have been pending for too long. */
+       if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+           !list_empty(&netbk->pending_inuse_head)) {
+               list_for_each_entry_safe(inuse, n, &netbk->pending_inuse_head, list) {
+                       struct pending_tx_info *pending_tx_info
+                               = netbk->pending_tx_info;
+
+                       /* Stop at the first entry younger than HZ / 2. */
+                       if (time_after(inuse->alloc_time + HZ / 2, jiffies))
+                               break;
+
+                       pending_idx = inuse - netbk->pending_inuse;
+
+                       pending_tx_info[pending_idx].netif->nr_copied_skbs++;
+
+                       /*
+                        * 0: copied, complete it below.  -EBUSY: take it off
+                        * the in-use list.  -ENOENT: leave it in place.  Any
+                        * other result ends the scan.
+                        */
+                       switch (copy_pending_req(netbk, pending_idx)) {
+                       case 0:
+                               list_move_tail(&inuse->list, &list);
+                               continue;
+                       case -EBUSY:
+                               list_del_init(&inuse->list);
+                               continue;
+                       case -ENOENT:
+                               continue;
+                       }
+
+                       break;
+               }
+       }
+
+       list_for_each_entry_safe(inuse, n, &list, list) {
+               struct pending_tx_info *pending_tx_info =
+                       netbk->pending_tx_info;
+
+               pending_idx = inuse - netbk->pending_inuse;
+               netif = pending_tx_info[pending_idx].netif;
+
+               make_tx_response(netif, &pending_tx_info[pending_idx].req, 
+                                XEN_NETIF_RSP_OKAY);
+
+               /* Ready for next use. */
+               gnttab_reset_grant_page(netbk->mmap_pages[pending_idx]);
+
+               netbk->pending_ring[MASK_PEND_IDX(netbk->pending_prod++)] =
+                       pending_idx;
+
+               netif_put(netif);
+
+               list_del_init(&inuse->list);
+       }
+}
+
+/*
+ * Fail a TX packet: send XEN_NETIF_RSP_ERROR for *txp and for every
+ * further request slot from the current consumer index up to (but not
+ * including) 'end'.  Afterwards req_cons is advanced to the last index
+ * reached, the ring is re-checked for work via netif_schedule_work()
+ * and netif_put() is called on netif.
+ *
+ * 'end' must be reachable from tx.req_cons by incrementing.
+ */
+static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end)
+{
+       RING_IDX cons = netif->tx.req_cons;
+
+       do {
+               make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
+               /*
+                * Ring indices are free-running and wrap, so test for
+                * equality: an ordered '>=' comparison stops early once
+                * 'end' has wrapped past zero while 'cons' has not,
+                * leaving requests without a response.
+                */
+               if (cons == end)
+                       break;
+               txp = RING_GET_REQUEST(&netif->tx, cons++);
+       } while (1);
+       netif->tx.req_cons = cons;
+       netif_schedule_work(netif);
+       netif_put(netif);
+}
+
+/*
+ * Copy the follow-on requests of a multi-slot TX packet into txp[].
+ *
+ * Starting at ring index tx.req_cons, requests are copied for as long
+ * as the previously copied one carries XEN_NETTXF_more_data.  The size
+ * of each copied request is subtracted from first->size.
+ *
+ * Returns 0 if 'first' does not have XEN_NETTXF_more_data set, the
+ * number of requests copied on success, or the negated running count
+ * on error (more requests needed than work_to_do, MAX_SKB_FRAGS
+ * reached, a request larger than what is left of first->size, or
+ * offset + size beyond PAGE_SIZE).
+ *
+ * NOTE(review): an error hit while the running count is still 0
+ * returns -0, i.e. 0, which a caller testing 'ret < 0' cannot tell
+ * apart from success.
+ */
+static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first,
+                               netif_tx_request_t *txp, int work_to_do)
+{
+       RING_IDX cons = netif->tx.req_cons;
+       int frags = 0;
+
+       if (!(first->flags & XEN_NETTXF_more_data))
+               return 0;
+
+       do {
+               if (frags >= work_to_do) {
+                       DPRINTK("Need more frags\n");
+                       return -frags;
+               }
+
+               if (unlikely(frags >= MAX_SKB_FRAGS)) {
+                       DPRINTK("Too many frags\n");
+                       return -frags;
+               }
+
+               /* Work on a private copy of the ring slot. */
+               memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags),
+                      sizeof(*txp));
+               if (txp->size > first->size) {
+                       DPRINTK("Frags galore\n");
+                       return -frags;
+               }
+
+               first->size -= txp->size;
+               /* Counted before the page check, so that error includes it. */
+               frags++;
+
+               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+                       DPRINTK("txp->offset: %x, size: %u\n",
+                               txp->offset, txp->size);
+                       return -frags;
+               }
+       } while ((txp++)->flags & XEN_NETTXF_more_data);
+
+       return frags;
+}
+
+/*
+ * Queue grant-map operations for the fragment requests of one TX skb.
+ *
+ * For each frag slot (slot 0 is skipped when it already holds the
+ * pending index read from skb->data) a pending index is taken from
+ * netbk->pending_ring, a read-only host-map op for txp->gref is written
+ * at *mop, the request and netif are recorded in pending_tx_info[]
+ * (with a netif_get()), and the index is stored in the frag slot.
+ *
+ * Returns the advanced map-op pointer.
+ */
+static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
+                                                 struct sk_buff *skb,
+                                                 netif_tx_request_t *txp,
+                                                 gnttab_map_grant_ref_t *mop)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       skb_frag_t *frags = shinfo->frags;
+       u16 pending_idx = *(u16 *)skb->data;
+       int i, start;
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = (frag_get_pending_idx(frags) == pending_idx);
+
+       for (i = start; i < shinfo->nr_frags; i++, txp++) {
+               struct xen_netbk *netbk = &xen_netbk[GET_GROUP_INDEX(netif)];
+               pending_ring_idx_t index = MASK_PEND_IDX(netbk->pending_cons++);
+               struct pending_tx_info *pending_tx_info =
+                       netbk->pending_tx_info;
+
+               pending_idx = netbk->pending_ring[index];
+
+               gnttab_set_map_op(mop++, idx_to_kaddr(netbk, pending_idx),
+                                 GNTMAP_host_map | GNTMAP_readonly,
+                                 txp->gref, netif->domid);
+
+               memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp));
+               netif_get(netif);
+               pending_tx_info[pending_idx].netif = netif;
+               frag_set_pending_idx(&frags[i], pending_idx);
+       }
+
+       return mop;
+}
+
+/*
+ * Check the grant-map results for one TX skb (header first, then each
+ * fragment), consuming the matching entries starting at *mopp.
+ *
+ * A successful map records the foreign frame via set_phys_to_machine()
+ * and saves the grant handle.  A failed map gets an XEN_NETIF_RSP_ERROR
+ * response, its pending index is put back on pending_ring and
+ * netif_put() is called.  Once any map of the skb has failed, every
+ * successfully mapped slot of it is passed to netif_idx_release().
+ *
+ * On return *mopp points past the last entry consumed.  Returns
+ * GNTST_okay, or the first non-okay status encountered.
+ */
+static int netbk_tx_check_mop(struct xen_netbk *netbk, struct sk_buff *skb,
+                             gnttab_map_grant_ref_t **mopp)
+{
+       gnttab_map_grant_ref_t *mop = *mopp;
+       u16 pending_idx = *(u16 *)skb->data;
+       struct pending_tx_info *pending_tx_info = netbk->pending_tx_info;
+       netif_t *netif = pending_tx_info[pending_idx].netif;
+       netif_tx_request_t *txp;
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int nr_frags = shinfo->nr_frags;
+       int i, err, start;
+
+       /* Check status of header. */
+       err = mop->status;
+       if (unlikely(err != GNTST_okay)) {
+               pending_ring_idx_t index = MASK_PEND_IDX(netbk->pending_prod++);
+
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
+               netbk->pending_ring[index] = pending_idx;
+               netif_put(netif);
+       } else {
+               set_phys_to_machine(idx_to_pfn(netbk, pending_idx),
+                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
+               netbk->grant_tx_handle[pending_idx] = mop->handle;
+       }
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = (frag_get_pending_idx(shinfo->frags) == pending_idx);
+
+       for (i = start; i < nr_frags; i++) {
+               int j, newerr;
+               pending_ring_idx_t index;
+
+               pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
+
+               /* Check error status: if okay then remember grant handle. */
+               newerr = (++mop)->status;
+               if (likely(newerr == GNTST_okay)) {
+                       set_phys_to_machine(idx_to_pfn(netbk, pending_idx),
+                               FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+                       netbk->grant_tx_handle[pending_idx] = mop->handle;
+                       /* Had a previous error? Invalidate this fragment. */
+                       if (unlikely(err != GNTST_okay))
+                               netif_idx_release(netbk, pending_idx);
+                       continue;
+               }
+
+               /* Error on this fragment: respond to client with an error. */
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp, XEN_NETIF_RSP_ERROR);
+               index = MASK_PEND_IDX(netbk->pending_prod++);
+               netbk->pending_ring[index] = pending_idx;
+               netif_put(netif);
+
+               /* Not the first error? Preceding frags already invalidated. */
+               if (err != GNTST_okay)
+                       continue;
+
+               /* First error: invalidate header and preceding fragments. */
+               pending_idx = *((u16 *)skb->data);
+               netif_idx_release(netbk, pending_idx);
+               for (j = start; j < i; j++) {
+                       pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+                       netif_idx_release(netbk, pending_idx);
+               }
+
+               /* Remember the error: invalidate all subsequent fragments. */
+               err = newerr;
+       }
+
+       /* mop still points at the last entry examined; step past it. */
+       *mopp = mop + 1;
+       return err;
+}
+
+/*
+ * Replace the pending indices stored in the skb's frag slots with real
+ * page descriptors.  Each slot's pending entry is timestamped and put
+ * on pending_inuse_head, and the skb length counters grow by the size
+ * of the request recorded for that slot.
+ */
+static void netbk_fill_frags(struct xen_netbk *netbk, struct sk_buff *skb)
+{
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int total = shinfo->nr_frags;
+       int slot;
+
+       for (slot = 0; slot < total; slot++) {
+               /* Read the index first: the slot is overwritten below. */
+               u16 pending_idx = frag_get_pending_idx(&shinfo->frags[slot]);
+               netif_tx_request_t *req;
+
+               netbk->pending_inuse[pending_idx].alloc_time = jiffies;
+               list_add_tail(&netbk->pending_inuse[pending_idx].list,
+                             &netbk->pending_inuse_head);
+
+               req = &netbk->pending_tx_info[pending_idx].req;
+               __skb_fill_page_desc(skb, slot, netbk->mmap_pages[pending_idx],
+                                    req->offset, req->size);
+
+               skb->len += req->size;
+               skb->data_len += req->size;
+               skb->truesize += req->size;
+       }
+}
+
+/*
+ * Consume the chain of extra-info slots at tx.req_cons.
+ *
+ * Each slot is copied into extras[type - 1] and req_cons is advanced
+ * past it; the chain ends at the first slot without
+ * XEN_NETIF_EXTRA_FLAG_MORE.
+ *
+ * Returns the remaining work_to_do budget, -EBADR if the budget runs
+ * out before the chain ends, or -EINVAL for a slot whose type is 0 or
+ * >= XEN_NETIF_EXTRA_TYPE_MAX (that slot is still consumed).
+ */
+int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
+                    int work_to_do)
+{
+       struct netif_extra_info slot;
+       RING_IDX cons = netif->tx.req_cons;
+
+       for (;;) {
+               if (unlikely(work_to_do-- <= 0)) {
+                       DPRINTK("Missing extra info\n");
+                       return -EBADR;
+               }
+
+               /* Work on a private copy of the ring slot. */
+               memcpy(&slot, RING_GET_REQUEST(&netif->tx, cons),
+                      sizeof(slot));
+               if (unlikely(slot.type == 0 ||
+                            slot.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+                       netif->tx.req_cons = ++cons;
+                       DPRINTK("Invalid extra type: %d\n", slot.type);
+                       return -EINVAL;
+               }
+
+               memcpy(&extras[slot.type - 1], &slot, sizeof(slot));
+               netif->tx.req_cons = ++cons;
+
+               if (!(slot.flags & XEN_NETIF_EXTRA_FLAG_MORE))
+                       break;
+       }
+
+       return work_to_do;
+}
+
+/*
+ * Apply a GSO extra-info descriptor to skb.
+ *
+ * Returns 0 on success, or -EINVAL when the GSO size is zero or the
+ * type is not XEN_NETIF_GSO_TYPE_TCPV4 (the skb is left untouched).
+ */
+static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
+{
+       struct skb_shared_info *shinfo;
+
+       if (gso->u.gso.size == 0) {
+               DPRINTK("GSO size must not be zero.\n");
+               return -EINVAL;
+       }
+
+       /* Currently only TCPv4 S.O. is supported. */
+       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+               DPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+               return -EINVAL;
+       }
+
+       shinfo = skb_shinfo(skb);
+       shinfo->gso_size = gso->u.gso.size;
+
+       /* Header must be checked, and gso_segs computed. */
+       shinfo->gso_type = SKB_GSO_TCPV4 | SKB_GSO_DODGY;
+       shinfo->gso_segs = 0;
+
+       return 0;
+}
+
+/* Called after netfront has transmitted */
+static void net_tx_action(unsigned long group)
+{
+       struct xen_netbk *netbk = &xen_netbk[group];
+       struct sk_buff *skb;
+       netif_t *netif;
+       netif_tx_request_t txreq;
+       netif_tx_request_t txfrags[MAX_SKB_FRAGS];
+       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+       u16 pending_idx;
+       RING_IDX i;
+       gnttab_map_grant_ref_t *mop;
+       unsigned int data_len;
+       int ret, work_to_do;
+
+       net_tx_action_dealloc(netbk);
+
+       mop = netbk->tx_map_ops;
+       BUILD_BUG_ON(MAX_SKB_FRAGS >= MAX_PENDING_REQS);
+       while (((nr_pending_reqs(netbk) + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
+              !list_empty(&netbk->schedule_list)) {
+               /* Get a netif from the list with work to do. */
+               netif = poll_net_schedule_list(netbk);
+               if (!netif)
+                       continue;
+
+               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
+               if (!work_to_do) {
+                       netif_put(netif);
+                       continue;
+               }
+
+               i = netif->tx.req_cons;
+               rmb(); /* Ensure that we see the request before we copy it. */
+               memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
+
+               /* Credit-based scheduling. */
+               if (txreq.size > netif->remaining_credit) {
+                       unsigned long now = jiffies;
+                       unsigned long next_credit = 
+                               netif->credit_timeout.expires +
+                               msecs_to_jiffies(netif->credit_usec / 1000);
+
+                       /* Timer could already be pending in rare cases. */
+                       if (timer_pending(&netif->credit_timeout)) {
+                               netif_put(netif);
+                               continue;
+                       }
+
+                       /* Passed the point where we can replenish credit? */
+                       if (time_after_eq(now, next_credit)) {
+                               netif->credit_timeout.expires = now;
+                               tx_add_credit(netif);
+                       }
+
+                       /* Still too big to send right now? Set a callback. */
+                       if (txreq.size > netif->remaining_credit) {
+                               netif->credit_timeout.data     =
+                                       (unsigned long)netif;
+                               netif->credit_timeout.function =
+                                       tx_credit_callback;
+                               mod_timer(&netif->credit_timeout, next_credit);
+                               netif_put(netif);
+                               continue;
+                       }
+               }
+               netif->remaining_credit -= txreq.size;
+
+               work_to_do--;
+               netif->tx.req_cons = ++i;
+
+               memset(extras, 0, sizeof(extras));
+               if (txreq.flags & XEN_NETTXF_extra_info) {
+                       work_to_do = netbk_get_extras(netif, extras,
+                                                     work_to_do);
+                       i = netif->tx.req_cons;
+                       if (unlikely(work_to_do < 0)) {
+                               netbk_tx_err(netif, &txreq, i);
+                               continue;
+                       }
+               }
+
+               ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do);
+               if (unlikely(ret < 0)) {
+                       netbk_tx_err(netif, &txreq, i - ret);
+                       continue;
+               }
+               i += ret;
+
+               if (unlikely(txreq.size < ETH_HLEN)) {
+                       DPRINTK("Bad packet size: %d\n", txreq.size);
+                       netbk_tx_err(netif, &txreq, i);
+                       continue;
+               }
+
+               /* No crossing a page as the payload mustn't fragment. */
+               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+                       DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
+                               txreq.offset, txreq.size, 
+                               (txreq.offset &~PAGE_MASK) + txreq.size);
+                       netbk_tx_err(netif, &txreq, i);
+                       continue;
+               }
+
+               pending_idx = netbk->pending_ring[MASK_PEND_IDX(netbk->pending_cons)];
+
+               data_len = (txreq.size > PKT_PROT_LEN &&
+                           ret < MAX_SKB_FRAGS) ?
+                       PKT_PROT_LEN : txreq.size;
+
+               skb = alloc_skb(data_len + 16 + NET_IP_ALIGN,
+                               GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(skb == NULL)) {
+                       DPRINTK("Can't allocate a skb in start_xmit.\n");
+                       netbk_tx_err(netif, &txreq, i);
+                       break;
+               }
+
+               /* Packets passed to netif_rx() must have some headroom. */
+               skb_reserve(skb, 16 + NET_IP_ALIGN);
+
+               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+                       struct netif_extra_info *gso;
+                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+                       if (netbk_set_skb_gso(skb, gso)) {
+                               kfree_skb(skb);
+                               netbk_tx_err(netif, &txreq, i);
+                               continue;
+                       }
+               }
+
+               gnttab_set_map_op(mop, idx_to_kaddr(netbk, pending_idx),
+                                 GNTMAP_host_map | GNTMAP_readonly,
+                                 txreq.gref, netif->domid);
+               mop++;
+
+               memcpy(&netbk->pending_tx_info[pending_idx].req,
+                      &txreq, sizeof(txreq));
+               netbk->pending_tx_info[pending_idx].netif = netif;
+               *((u16 *)skb->data) = pending_idx;
+
+               __skb_put(skb, data_len);
+
+               skb_shinfo(skb)->nr_frags = ret;
+               if (data_len < txreq.size)
+                       skb_shinfo(skb)->nr_frags++;
+               else
+                       pending_idx = INVALID_PENDING_IDX;
+               frag_set_pending_idx(skb_shinfo(skb)->frags, pending_idx);
+
+               __skb_queue_tail(&netbk->tx_queue, skb);
+
+               netbk->pending_cons++;
+
+               mop = netbk_get_requests(netif, skb, txfrags, mop);
+
+               netif->tx.req_cons = i;
+               netif_schedule_work(netif);
+
+               if ((mop - netbk->tx_map_ops) >= ARRAY_SIZE(netbk->tx_map_ops))
+                       break;
+       }
+
+       if (mop == netbk->tx_map_ops)
+               goto out;
+
+    /* NOTE: some maps may fail with GNTST_eagain, which could be successfully
+     * retried in the backend after a delay. However, we can also fail the tx
+     * req and let the frontend resend the relevant packet again. This is fine
+     * because it is unlikely that a network buffer will be paged out or shared,
+     * and therefore it is unlikely to fail with GNTST_eagain. */
+       ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+                                       netbk->tx_map_ops,
+                                       mop - netbk->tx_map_ops);
+       BUG_ON(ret);
+
+       mop = netbk->tx_map_ops;
+       while ((skb = __skb_dequeue(&netbk->tx_queue)) != NULL) {
+               struct net_device *dev;
+               netif_tx_request_t *txp;
+
+               pending_idx = *((u16 *)skb->data);
+               netif       = netbk->pending_tx_info[pending_idx].netif;
+               dev         = netif->dev;
+               txp         = &netbk->pending_tx_info[pending_idx].req;
+
+               /* Check the remap error code. */
+               if (unlikely(netbk_tx_check_mop(netbk, skb, &mop))) {
+                       DPRINTK("netback grant failed.\n");
+                       skb_shinfo(skb)->nr_frags = 0;
+                       kfree_skb(skb);
+                       dev->stats.rx_dropped++;
+                       continue;
+               }
+
+               data_len = skb->len;
+               memcpy(skb->data,
+                      (void *)(idx_to_kaddr(netbk, pending_idx)|txp->offset),
+                      data_len);
+               if (data_len < txp->size) {
+                       /* Append the packet payload as a fragment. */
+                       txp->offset += data_len;
+                       txp->size -= data_len;
+               } else {
+                       /* Schedule a response immediately. */
+                       netif_idx_release(netbk, pending_idx);
+               }
+
+               if (txp->flags & XEN_NETTXF_csum_blank)
+                       skb->ip_summed = CHECKSUM_PARTIAL;
+               else if (txp->flags & XEN_NETTXF_data_validated)
+                       skb->ip_summed = CHECKSUM_UNNECESSARY;
+               else
+                       skb->ip_summed = CHECKSUM_NONE;
+
+               netbk_fill_frags(netbk, skb);
+
+               /*
+                * If the initial fragment was < PKT_PROT_LEN then
+                * pull through some bytes from the other fragments to
+                * increase the linear region to PKT_PROT_LEN bytes.
+                */
+               if (skb_headlen(skb) < PKT_PROT_LEN && skb_is_nonlinear(skb)) {
+                       int target = min_t(int, skb->len, PKT_PROT_LEN);
+                       __pskb_pull_tail(skb, target - skb_headlen(skb));
+               }
+
+               skb->protocol = eth_type_trans(skb, dev);
+
+               if (skb_checksum_setup(skb, &netif->rx_gso_csum_fixups)) {
+                       DPRINTK("Can't setup checksum in net_tx_action\n");
+                       kfree_skb(skb);
+                       continue;
+               }
+
+               if (unlikely(netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB) &&
+                   unlikely(skb_linearize(skb))) {
+                       DPRINTK("Can't linearize skb in net_tx_action.\n");
+                       kfree_skb(skb);
+                       dev->stats.rx_errors++;
+                       continue;
+               }
+
+               dev->stats.rx_bytes += skb->len;
+               dev->stats.rx_packets++;
+
+               if (use_kthreads)
+                       netif_rx_ni(skb);
+               else
+                       netif_rx(skb);
+       }
+
+ out:
+       if (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+           !list_empty(&netbk->pending_inuse_head)) {
+               struct netbk_tx_pending_inuse *oldest;
+
+               oldest = list_entry(netbk->pending_inuse_head.next,
+                                   struct netbk_tx_pending_inuse, list);
+               mod_timer(&netbk->tx_pending_timer, oldest->alloc_time + HZ);
+       }
+}
+
+/*
+ * Queue pending_idx on the group's dealloc ring and kick the group via
+ * netbk_schedule().  The slot write and the producer increment are done
+ * under release_lock with local interrupt state saved and restored.
+ */
+static void netif_idx_release(struct xen_netbk *netbk, u16 pending_idx)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&netbk->release_lock, flags);
+       netbk->dealloc_ring[MASK_PEND_IDX(netbk->dealloc_prod)] = pending_idx;
+       /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
+       smp_wmb();
+       netbk->dealloc_prod++;
+       spin_unlock_irqrestore(&netbk->release_lock, flags);
+
+       netbk_schedule(netbk);
+}
+
+/*
+ * Page release callback installed with SetPageForeign() in netback_init().
+ * Looks up the group and pending slot recorded for the page, sanity-checks
+ * them, and queues the slot for release.  Only order-0 pages are accepted.
+ */
+static void netif_page_release(struct page *page, unsigned int order)
+{
+       unsigned int slot = netif_page_index(page);
+       unsigned int grp = netif_page_group(page);
+       struct xen_netbk *netbk;
+
+       BUG_ON(order);
+       BUG_ON(grp >= netbk_nr_groups);
+       BUG_ON(slot >= MAX_PENDING_REQS);
+
+       /* The page must be the one this group handed out for that slot. */
+       netbk = &xen_netbk[grp];
+       BUG_ON(netbk->mmap_pages[slot] != page);
+
+       netif_idx_release(netbk, slot);
+}
+
+/*
+ * Interrupt handler for a netif; dev_id is the netif_t.  Puts the interface
+ * on its group's schedule list, lets maybe_schedule_tx_action() decide
+ * whether to kick TX processing, and wakes the net device queue when the
+ * interface is schedulable and its queue is not full.  Always returns
+ * IRQ_HANDLED.
+ */
+irqreturn_t netif_be_int(int irq, void *dev_id)
+{
+       netif_t *netif = dev_id;
+       unsigned int group = GET_GROUP_INDEX(netif);
+
+       if (unlikely(group >= netbk_nr_groups)) {
+               /*
+                * Short of having a way to bind the IRQ in disabled mode
+                * (IRQ_NOAUTOEN), we have to ignore the first invocation(s)
+                * (before we got assigned to a group).
+                */
+               /* UINT_MAX is the only out-of-range group value tolerated. */
+               BUG_ON(group != UINT_MAX);
+               return IRQ_HANDLED;
+       }
+
+       add_to_net_schedule_list_tail(netif);
+       maybe_schedule_tx_action(group);
+
+       if (netif_schedulable(netif) && !netbk_queue_full(netif))
+               netif_wake_queue(netif->dev);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Produce the TX response for request txp with status st on netif's TX
+ * ring, push it, and notify the frontend through netif->irq when the ring
+ * macros say a notification is needed.
+ */
+static void make_tx_response(netif_t *netif, 
+                            netif_tx_request_t *txp,
+                            s8       st)
+{
+       RING_IDX i = netif->tx.rsp_prod_pvt;
+       netif_tx_response_t *resp;
+       int notify;
+
+       resp = RING_GET_RESPONSE(&netif->tx, i);
+       resp->id     = txp->id;
+       resp->status = st;
+
+       /* A request flagged extra_info gets a second slot, marked RSP_NULL. */
+       if (txp->flags & XEN_NETTXF_extra_info)
+               RING_GET_RESPONSE(&netif->tx, ++i)->status = XEN_NETIF_RSP_NULL;
+
+       netif->tx.rsp_prod_pvt = ++i;
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
+       if (notify)
+               notify_remote_via_irq(netif->irq);
+
+#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
+       /* Responses caught up with consumed requests: re-check for new ones. */
+       if (i == netif->tx.req_cons) {
+               int more_to_do;
+               RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
+               if (more_to_do)
+                       add_to_net_schedule_list_tail(netif);
+       }
+#endif
+}
+
+/*
+ * Fill in the next private RX response slot of netif and advance
+ * rx.rsp_prod_pvt by one.  The status field carries size, unless st is
+ * negative, in which case it carries st.  Nothing is pushed here; the
+ * filled-in slot is returned to the caller.
+ */
+static netif_rx_response_t *make_rx_response(netif_t *netif,
+                                            u16      id,
+                                            s8       st,
+                                            u16      offset,
+                                            u16      size,
+                                            u16      flags)
+{
+       RING_IDX prod = netif->rx.rsp_prod_pvt;
+       netif_rx_response_t *rsp = RING_GET_RESPONSE(&netif->rx, prod);
+
+       rsp->id     = id;
+       rsp->offset = offset;
+       rsp->flags  = flags;
+       rsp->status = (st < 0) ? (s16)st : (s16)size;
+
+       netif->rx.rsp_prod_pvt = prod + 1;
+
+       return rsp;
+}
+
+#ifdef NETBE_DEBUG_INTERRUPT
+/*
+ * Debug interrupt handler, built only when NETBE_DEBUG_INTERRUPT is defined.
+ * Walks every group's schedule list under schedule_list_lock and prints the
+ * private and shared RX/TX ring indices of each queued netif.  Always
+ * returns IRQ_HANDLED.
+ */
+static irqreturn_t netif_be_dbg(int irq, void *dev_id)
+{
+       netif_t *netif;
+       unsigned int i = 0, group;
+
+       pr_alert("netif_schedule_list:\n");
+
+       for (group = 0; group < netbk_nr_groups; ++group) {
+               struct xen_netbk *netbk = &xen_netbk[group];
+
+               spin_lock_irq(&netbk->schedule_list_lock);
+
+               list_for_each_entry(netif, &netbk->schedule_list, list) {
+                       pr_alert(" %d: private(rx_req_cons=%08x "
+                                "rx_resp_prod=%08x\n", i,
+                                netif->rx.req_cons, netif->rx.rsp_prod_pvt);
+                       pr_alert("   tx_req_cons=%08x tx_resp_prod=%08x)\n",
+                                netif->tx.req_cons, netif->tx.rsp_prod_pvt);
+                       pr_alert("   shared(rx_req_prod=%08x "
+                                "rx_resp_prod=%08x\n",
+                                netif->rx.sring->req_prod,
+                                netif->rx.sring->rsp_prod);
+                       pr_alert("   rx_event=%08x tx_req_prod=%08x\n",
+                                netif->rx.sring->rsp_event,
+                                netif->tx.sring->req_prod);
+                       pr_alert("   tx_resp_prod=%08x, tx_event=%08x)\n",
+                                netif->tx.sring->rsp_prod,
+                                netif->tx.sring->rsp_event);
+                       i++;
+               }
+
+               /* Release the same per-group lock taken before the walk. */
+               spin_unlock_irq(&netbk->schedule_list_lock);
+       }
+
+       pr_alert(" ** End of netif_schedule_list **\n");
+
+       return IRQ_HANDLED;
+}
+
+/* irqaction bound to VIRQ_DEBUG in netback_init(); runs netif_be_dbg(). */
+static struct irqaction netif_be_dbg_action = {
+       .handler = netif_be_dbg,
+       .flags   = IRQF_SHARED,
+       .name    = "net-be-dbg"
+};
+#endif
+
+/* Returns 1 when the group's rx_queue holds at least one skb, else 0. */
+static inline int rx_work_todo(struct xen_netbk *netbk)
+{
+       if (skb_queue_empty(&netbk->rx_queue))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Returns 1 when the group has TX-side work, else 0.  Work exists when
+ *  - the dealloc ring has unconsumed entries, or
+ *  - delayed-copy mode is active and pending_inuse_head is non-empty, or
+ *  - at least MAX_SKB_FRAGS + 1 pending slots are free and some interface
+ *    is on the schedule list.
+ * The conditions are tested in that order and evaluation stops at the first
+ * one that holds.
+ */
+static inline int tx_work_todo(struct xen_netbk *netbk)
+{
+       return netbk->dealloc_cons != netbk->dealloc_prod ||
+              (netbk_copy_skb_mode == NETBK_DELAYED_COPY_SKB &&
+               !list_empty(&netbk->pending_inuse_head)) ||
+              (nr_pending_reqs(netbk) + MAX_SKB_FRAGS < MAX_PENDING_REQS &&
+               !list_empty(&netbk->schedule_list));
+}
+
+/*
+ * Body of the per-group kernel thread; index carries the group number.
+ * Sleeps until there is RX or TX work or the thread is asked to stop, then
+ * runs net_rx_action()/net_tx_action() for the group as needed.  Returns 0
+ * once kthread_should_stop() is true.
+ */
+static int netbk_action_thread(void *index)
+{
+       unsigned long group = (unsigned long)index;
+       struct xen_netbk *netbk = &xen_netbk[group];
+
+       while (!kthread_should_stop()) {
+               wait_event_interruptible(netbk->netbk_action_wq,
+                                        rx_work_todo(netbk) ||
+                                        tx_work_todo(netbk) ||
+                                        kthread_should_stop());
+               cond_resched();
+
+               /* Re-test each condition: the wakeup may be for either. */
+               if (rx_work_todo(netbk))
+                       net_rx_action(group);
+
+               if (tx_work_todo(netbk))
+                       net_tx_action(group);
+       }
+
+       return 0;
+}
+
+
+/*
+ * Module init: size and allocate the per-group xen_netbk array, set up each
+ * group's queues, timers, locks, foreign pages and either a kthread or a
+ * pair of tasklets, choose the skb copy mode, then initialise the
+ * accelerator and xenbus parts.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, -ENOMEM when the
+ * group array or a group's pages cannot be allocated, or the
+ * kthread_create() error.
+ */
+static int __init netback_init(void)
+{
+       unsigned int i, group;
+       int rc;
+       struct page *page;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       /* Keep the configured count for the warning below; 0 means "auto". */
+       group = netbk_nr_groups;
+       if (!netbk_nr_groups)
+               netbk_nr_groups = (num_online_cpus() + 1) / 2;
+       if (netbk_nr_groups > MAX_GROUPS)
+               netbk_nr_groups = MAX_GROUPS;
+
+       /* Halve the group count until the allocation fits or it reaches 0. */
+       do {
+               xen_netbk = vzalloc(netbk_nr_groups * sizeof(*xen_netbk));
+       } while (!xen_netbk && (netbk_nr_groups >>= 1));
+       if (!xen_netbk)
+               return -ENOMEM;
+       if (group && netbk_nr_groups != group)
+               pr_warn("netback: only using %u (instead of %u) groups\n",
+                       netbk_nr_groups, group);
+
+       /* We can increase reservation by this much in net_rx_action(). */
+       balloon_update_driver_allowance(netbk_nr_groups * NET_RX_RING_SIZE);
+
+       for (group = 0; group < netbk_nr_groups; group++) {
+               struct xen_netbk *netbk = &xen_netbk[group];
+
+               skb_queue_head_init(&netbk->rx_queue);
+               skb_queue_head_init(&netbk->tx_queue);
+
+               init_timer(&netbk->net_timer);
+               netbk->net_timer.data = group;
+               netbk->net_timer.function = netbk_schedule_group;
+
+               init_timer(&netbk->tx_pending_timer);
+               netbk->tx_pending_timer.data = group;
+               netbk->tx_pending_timer.function = netbk_schedule_group;
+
+               netbk->pending_prod = MAX_PENDING_REQS;
+
+               INIT_LIST_HEAD(&netbk->pending_inuse_head);
+               INIT_LIST_HEAD(&netbk->schedule_list);
+
+               spin_lock_init(&netbk->schedule_list_lock);
+               spin_lock_init(&netbk->release_lock);
+
+               netbk->mmap_pages =
+                       alloc_empty_pages_and_pagevec(MAX_PENDING_REQS);
+               if (netbk->mmap_pages == NULL) {
+                       pr_err("%s: out of memory\n", __func__);
+                       rc = -ENOMEM;
+                       goto failed_init;
+               }
+
+               /* Tag each page with (group, slot) for netif_page_release(). */
+               for (i = 0; i < MAX_PENDING_REQS; i++) {
+                       page = netbk->mmap_pages[i];
+                       SetPageForeign(page, netif_page_release);
+                       netif_set_page_ext(page, group, i);
+                       netbk->pending_ring[i] = i;
+                       INIT_LIST_HEAD(&netbk->pending_inuse[i].list);
+               }
+
+               if (use_kthreads) {
+                       init_waitqueue_head(&netbk->netbk_action_wq);
+                       netbk->task = kthread_create(netbk_action_thread,
+                                                    (void *)(long)group,
+                                                    "netback/%u", group);
+
+                       if (IS_ERR(netbk->task)) {
+                               pr_err("netback: kthread_create() failed\n");
+                               rc = PTR_ERR(netbk->task);
+                               goto failed_init;
+                       }
+                       if (bind_threads)
+                               kthread_bind(netbk->task,
+                                            group % num_online_cpus());
+                       wake_up_process(netbk->task);
+               } else {
+                       tasklet_init(&netbk->net_tx_tasklet, net_tx_action, group);
+                       tasklet_init(&netbk->net_rx_tasklet, net_rx_action, group);
+               }
+       }
+
+       /*
+        * With copy_skb requested, probe GNTTABOP_unmap_and_replace with an
+        * empty batch: a non-zero result selects always-copy, zero selects
+        * delayed-copy.
+        */
+       netbk_copy_skb_mode = NETBK_DONT_COPY_SKB;
+       if (MODPARM_copy_skb) {
+               if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_and_replace,
+                                             NULL, 0))
+                       netbk_copy_skb_mode = NETBK_ALWAYS_COPY_SKB;
+               else
+                       netbk_copy_skb_mode = NETBK_DELAYED_COPY_SKB;
+       }
+
+       netif_accel_init();
+
+       netif_xenbus_init();
+
+#ifdef NETBE_DEBUG_INTERRUPT
+       (void)bind_virq_to_irqaction(VIRQ_DEBUG,
+                                    0,
+                                    &netif_be_dbg_action);
+#endif
+
+       return 0;
+
+failed_init:
+       /* Unwind from the group that failed down to group 0. */
+       do {
+               struct xen_netbk *netbk = &xen_netbk[group];
+
+               if (use_kthreads && netbk->task && !IS_ERR(netbk->task))
+                       kthread_stop(netbk->task);
+               if (netbk->mmap_pages)
+                       free_empty_pages_and_pagevec(netbk->mmap_pages,
+                                                    MAX_PENDING_REQS);
+       } while (group--);
+       vfree(xen_netbk);
+       /* Give back the allowance added before the group loop. */
+       balloon_update_driver_allowance(-(long)netbk_nr_groups
+                                       * NET_RX_RING_SIZE);
+
+       return rc;
+}
+
+module_init(netback_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vif");
diff --git a/drivers/xen/netback/xenbus.c b/drivers/xen/netback/xenbus.c

new file mode 100644 (file)

index 0000000..b73b422
--- /dev/null
+++ b/drivers/xen/netback/xenbus.c
@@ -0,0 +1,494 @@
+/*  Xenbus code for netif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#include <stdarg.h>
+#include <linux/rwsem.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+#if 0
+#undef DPRINTK
+#define DPRINTK(fmt, args...) \
+    printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args)
+#endif
+
+static DECLARE_RWSEM(teardown_sem);
+
+static int connect_rings(struct backend_info *);
+static void connect(struct backend_info *);
+static void backend_create_netif(struct backend_info *be);
+static void unregister_hotplug_status_watch(struct backend_info *be);
+static void netback_disconnect(struct device *, bool);
+
+/*
+ * xenbus remove callback, also used by netback_probe() to unwind a failed
+ * probe.  Removes accelerators, disconnects the backend (clearing the
+ * device's drvdata), and frees the backend_info.  Always returns 0.
+ */
+static int netback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       netback_remove_accelerators(be, dev);
+
+       netback_disconnect(&dev->dev, true);
+       kfree(be);
+       return 0;
+}
+
+/*
+ * Tear down the backend's connection: drop the hotplug-status watch, emit
+ * KOBJ_OFFLINE if a netif exists, remove the "hotplug-status" xenstore node,
+ * and disconnect the netif.  When clear is true the device's drvdata is
+ * reset to NULL as well.
+ */
+static void netback_disconnect(struct device *xbdev_dev, bool clear)
+{
+       struct backend_info *be = dev_get_drvdata(xbdev_dev);
+
+       unregister_hotplug_status_watch(be);
+       if (be->netif)
+               kobject_uevent(&xbdev_dev->kobj, KOBJ_OFFLINE);
+
+       xenbus_rm(XBT_NIL, be->dev->nodename, "hotplug-status");
+
+       /* Writer side of teardown_sem; netback_uevent() is the reader. */
+       down_write(&teardown_sem);
+       if (be->netif) {
+               netif_disconnect(be);
+               be->netif = NULL;
+       }
+       if (clear)
+               dev_set_drvdata(xbdev_dev, NULL);
+       up_write(&teardown_sem);
+}
+
+/**
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and switch to InitWait.
+ *
+ * Advertises feature-sg, feature-gso-tcpv4, feature-rx-copy and
+ * feature-rx-flip in one xenstore transaction (retried on -EAGAIN), probes
+ * accelerators, switches to InitWait and creates the netif.
+ *
+ * Returns 0 on success or a negative errno; on failure everything set up so
+ * far is undone through netback_remove().
+ */
+static int netback_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       const char *message;
+       struct xenbus_transaction xbt;
+       int err;
+       int sg;
+       struct backend_info *be = kzalloc(sizeof(struct backend_info),
+                                         GFP_KERNEL);
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+
+       /* sg (and with it gso-tcpv4) is advertised unless always copying. */
+       sg = 1;
+       if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
+               sg = 0;
+
+       do {
+               err = xenbus_transaction_start(&xbt);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "starting transaction");
+                       goto fail;
+               }
+
+               err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
+               if (err) {
+                       message = "writing feature-sg";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
+                                   "%d", sg);
+               if (err) {
+                       message = "writing feature-gso-tcpv4";
+                       goto abort_transaction;
+               }
+
+               /* We support rx-copy path. */
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "feature-rx-copy", "%d", 1);
+               if (err) {
+                       message = "writing feature-rx-copy";
+                       goto abort_transaction;
+               }
+
+               /*
+                * We don't support rx-flip path (except old guests who don't
+                * grok this feature flag).
+                */
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "feature-rx-flip", "%d", 0);
+               if (err) {
+                       message = "writing feature-rx-flip";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_transaction_end(xbt, 0);
+       } while (err == -EAGAIN);
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto fail;
+       }
+
+       netback_probe_accelerators(be, dev);
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       /* This kicks hotplug scripts, so do it immediately. */
+       backend_create_netif(be);
+
+       return 0;
+
+abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, err, "%s", message);
+fail:
+       DPRINTK("failed");
+       /* netback_remove() also frees be and clears the drvdata pointer. */
+       netback_remove(dev);
+       return err;
+}
+
+
+/**
+ * Handle the creation of the hotplug script environment.  We add the script
+ * and vif variables to the environment, for the benefit of the vif-* hotplug
+ * scripts.
+ *
+ * Returns 0 on success, or a negative errno when the "script" node cannot
+ * be read or a variable cannot be added to the environment.
+ */
+static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env)
+{
+       struct backend_info *be;
+       char *val;
+       int err;
+
+       DPRINTK("netback_uevent");
+
+       val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL);
+       if (IS_ERR(val)) {
+               err = PTR_ERR(val);
+               xenbus_dev_fatal(xdev, err, "reading script");
+               return err;
+       }
+
+       /* Do not emit an event that is silently missing the script variable. */
+       err = add_uevent_var(env, "script=%s", val);
+       kfree(val);
+       if (err)
+               return err;
+
+       /* Reader side of teardown_sem; netback_disconnect() is the writer. */
+       down_read(&teardown_sem);
+       be = dev_get_drvdata(&xdev->dev);
+       if (be && be->netif)
+               err = add_uevent_var(env, "vif=%s", be->netif->dev->name);
+       up_read(&teardown_sem);
+
+       return err;
+}
+
+
+/*
+ * Create the netif for this backend unless one already exists: read
+ * "handle" from the backend's xenstore node, allocate the interface and
+ * emit KOBJ_ONLINE.  Failures are reported through xenbus_dev_fatal() and
+ * leave be->netif NULL.
+ */
+static void backend_create_netif(struct backend_info *be)
+{
+       int err;
+       long handle;
+       struct xenbus_device *dev = be->dev;
+       netif_t *netif;
+
+       if (be->netif != NULL)
+               return;
+
+       /* Anything other than exactly one converted value is an error. */
+       err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading handle");
+               return;
+       }
+
+       netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
+       if (IS_ERR(netif)) {
+               err = PTR_ERR(netif);
+               xenbus_dev_fatal(dev, err, "creating interface");
+               return;
+       }
+       be->netif = netif;
+
+       kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
+}
+
+
+/**
+ * Callback received when the frontend's state changes.
+ *
+ * Records the new frontend state in the backend_info and reacts to it:
+ * re-enters InitWait on a reconnect, connects once the frontend is
+ * Connected, mirrors Closing/Closed, and unregisters the device when the
+ * frontend is gone (Unknown, or Closed while the device is not online).
+ */
+static void frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("%s", xenbus_strstate(frontend_state));
+
+       be->frontend_state = frontend_state;
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               if (dev->state == XenbusStateClosed) {
+                       pr_info("%s: %s: prepare for reconnect\n",
+                               __FUNCTION__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
+       case XenbusStateInitialised:
+               break;
+
+       case XenbusStateConnected:
+               /* Nothing to do if the backend is already Connected. */
+               if (dev->state == XenbusStateConnected)
+                       break;
+
+               /* backend_create_netif() is idempotent */
+               backend_create_netif(be);
+               if (be->netif)
+                       connect(be);
+               break;
+
+       case XenbusStateClosing:
+               /* Keep drvdata (clear == false): the device stays around. */
+               netback_disconnect(&dev->dev, false);
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               /* implies netback_disconnect() via netback_remove() */
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+
+/*
+ * Read the optional "rate" node ("<bytes>,<usec>", both decimal) of the
+ * backend and store the two values in *bytes and *usec.  When the node is
+ * absent or malformed the outputs keep their defaults (~0UL bytes, 0 usec),
+ * i.e. unlimited bandwidth; a malformed value is also logged.
+ */
+static void xen_net_read_rate(struct xenbus_device *dev,
+                             unsigned long *bytes, unsigned long *usec)
+{
+       char *ratestr, *cur, *end;
+       unsigned long rate_bytes, rate_usec;
+       bool parsed = false;
+
+       /* Default to unlimited bandwidth. */
+       *bytes = ~0UL;
+       *usec = 0;
+
+       ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL);
+       if (IS_ERR(ratestr))
+               return;
+
+       /* First field: byte count, terminated by a comma. */
+       cur = ratestr;
+       rate_bytes = simple_strtoul(cur, &end, 10);
+       if (end != cur && *end == ',') {
+               /* Second field: interval, running to the end of the string. */
+               cur = end + 1;
+               rate_usec = simple_strtoul(cur, &end, 10);
+               if (end != cur && *end == '\0') {
+                       *bytes = rate_bytes;
+                       *usec = rate_usec;
+                       parsed = true;
+               }
+       }
+
+       if (!parsed) {
+               WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n");
+       }
+
+       kfree(ratestr);
+}
+
+/*
+ * Read the backend's "mac" node and parse it as ETH_ALEN hexadecimal
+ * octets separated by ':' into mac[].  Returns 0 on success, the
+ * xenbus_read() error if the node cannot be read, or -ENOENT if the string
+ * is malformed (mac[] may then be partially written).
+ */
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+       char *macstr, *pos, *end;
+       int octet;
+       int rc = 0;
+
+       macstr = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+       if (IS_ERR(macstr))
+               return PTR_ERR(macstr);
+
+       pos = macstr;
+       for (octet = 0; octet < ETH_ALEN; octet++) {
+               /* The last octet must end the string, the others end in ':'. */
+               char sep = (octet == ETH_ALEN - 1) ? '\0' : ':';
+
+               mac[octet] = simple_strtoul(pos, &end, 16);
+               if (end == pos || *end != sep) {
+                       rc = -ENOENT;
+                       break;
+               }
+               pos = end + 1;
+       }
+
+       kfree(macstr);
+       return rc;
+}
+
+/*
+ * Drop the "hotplug-status" watch if one is registered and free its node
+ * string.  Safe to call when no watch is registered.
+ */
+static void unregister_hotplug_status_watch(struct backend_info *be)
+{
+       if (!be->have_hotplug_status_watch)
+               return;
+
+       unregister_xenbus_watch(&be->hotplug_status_watch);
+       kfree(be->hotplug_status_watch.node);
+       be->have_hotplug_status_watch = 0;
+}
+
+/*
+ * Watch callback for the backend's "hotplug-status" node.  Once the node
+ * reads exactly "connected", switch the backend to Connected and drop the
+ * watch.  A failed read is ignored.
+ */
+static void hotplug_status_changed(struct xenbus_watch *watch,
+                                  const char **vec,
+                                  unsigned int vec_size)
+{
+       struct backend_info *be = container_of(watch,
+                                              struct backend_info,
+                                              hotplug_status_watch);
+       char *str;
+       unsigned int len;
+
+       str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
+       if (IS_ERR(str))
+               return;
+       /* Compare length and bytes so only an exact match counts. */
+       if (len == sizeof("connected")-1 && !memcmp(str, "connected", len)) {
+               xenbus_switch_state(be->dev, XenbusStateConnected);
+               /* Not interested in this watch anymore. */
+               unregister_hotplug_status_watch(be);
+       }
+       kfree(str);
+}
+
+/*
+ * Bring the backend up once the frontend is Connected: map the rings, read
+ * the frontend MAC address and the rate limit, then (re)register the
+ * "hotplug-status" watch and wake the device queue.  The switch to
+ * Connected happens from the watch callback, or immediately if the watch
+ * cannot be registered.  Returns silently if the rings or MAC fail.
+ */
+static void connect(struct backend_info *be)
+{
+       int err;
+       struct xenbus_device *dev = be->dev;
+
+       err = connect_rings(be);
+       if (err)
+               return;
+
+       err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
+               return;
+       }
+
+       xen_net_read_rate(dev, &be->netif->credit_bytes,
+                         &be->netif->credit_usec);
+       be->netif->remaining_credit = be->netif->credit_bytes;
+
+       /* Drop any earlier watch before registering a fresh one. */
+       unregister_hotplug_status_watch(be);
+       err = xenbus_watch_path2(dev, dev->nodename, "hotplug-status",
+                                &be->hotplug_status_watch,
+                                hotplug_status_changed);
+       if (err) {
+               /* Switch now, since we can't do a watch. */
+               xenbus_switch_state(dev, XenbusStateConnected);
+       } else {
+               be->have_hotplug_status_watch = 1;
+       }
+
+       netif_wake_queue(be->netif->dev);
+}
+
+
+/*
+ * Read the frontend's ring references, event channel and feature flags
+ * from xenstore, record the negotiated features in the netif, and map the
+ * shared rings via netif_map().  Returns 0 on success or a negative errno
+ * (already reported through xenbus_dev_fatal()).
+ */
+static int connect_rings(struct backend_info *be)
+{
+       netif_t *netif = be->netif;
+       struct xenbus_device *dev = be->dev;
+       unsigned int tx_ring_ref, rx_ring_ref;
+       unsigned int evtchn, rx_copy;
+       int err;
+       int val;
+
+       DPRINTK("");
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                           "tx-ring-ref", "%u", &tx_ring_ref,
+                           "rx-ring-ref", "%u", &rx_ring_ref,
+                           "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "reading %s/ring-ref and event-channel",
+                                dev->otherend);
+               return err;
+       }
+
+       /* A missing request-rx-copy node is not an error: treat it as 0. */
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
+                          &rx_copy);
+       if (err == -ENOENT) {
+               err = 0;
+               rx_copy = 0;
+       }
+       if (err < 0) {
+               xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
+                                dev->otherend);
+               return err;
+       }
+       netif->copying_receiver = !!rx_copy;
+
+       if (netif->dev->tx_queue_len != 0) {
+               if (xenbus_scanf(XBT_NIL, dev->otherend,
+                                "feature-rx-notify", "%d", &val) < 0)
+                       val = 0;
+               if (val)
+                       netif->can_queue = 1;
+               else
+                       /* Must be non-zero for pfifo_fast to work. */
+                       netif->dev->tx_queue_len = 1;
+       }
+
+       /* The optional feature flags below default to 0 when unreadable. */
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
+               val = 0;
+       netif->can_sg = !!val;
+
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
+                        &val) < 0)
+               val = 0;
+       netif->gso = !!val;
+
+       /* Note the inversion: csum is on unless the frontend opts out. */
+       if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
+                        "%d", &val) < 0)
+               val = 0;
+       netif->csum = !val;
+
+       /* Map the shared frame, irq etc. */
+       err = netif_map(be, tx_ring_ref, rx_ring_ref, evtchn);
+       if (err) {
+               xenbus_dev_fatal(dev, err,
+                                "mapping shared-frames %u/%u port %u",
+                                tx_ring_ref, rx_ring_ref, evtchn);
+               return err;
+       }
+       return 0;
+}
+
+
+/* ** Driver Registration ** */
+
+
+/* Device types handled by this backend; the empty entry ends the table. */
+static const struct xenbus_device_id netback_ids[] = {
+       { "vif" },
+       { "" }
+};
+
+/* Backend driver definition wiring up the callbacks defined in this file. */
+static DEFINE_XENBUS_DRIVER(netback, ,
+       .probe = netback_probe,
+       .remove = netback_remove,
+       .uevent = netback_uevent,
+       .otherend_changed = frontend_changed,
+);
+
+
+/*
+ * Register the netback driver as a xenbus backend.  A registration failure
+ * only triggers a warning; nothing is returned to the caller.
+ */
+void netif_xenbus_init(void)
+{
+       int rc = xenbus_register_backend(&netback_driver);
+
+       WARN_ON(rc);
+}
diff --git a/drivers/xen/netfront/Makefile b/drivers/xen/netfront/Makefile

new file mode 100644 (file)

index 0000000..9c0c6ad
--- /dev/null
+++ b/drivers/xen/netfront/Makefile
@@ -0,0 +1,4 @@
+
+# Build xennet.o according to CONFIG_XEN_NETDEV_FRONTEND.
+obj-$(CONFIG_XEN_NETDEV_FRONTEND)      := xennet.o
+
+# xennet is composed of the core frontend and the accelerator support code.
+xennet-objs := netfront.o accel.o
diff --git a/drivers/xen/netfront/accel.c b/drivers/xen/netfront/accel.c

new file mode 100644 (file)

index 0000000..f20e0ca
--- /dev/null
+++ b/drivers/xen/netfront/accel.c
@@ -0,0 +1,830 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+
+#include "netfront.h"
+
+/*
+ * Logging helpers.  DPRINTK emits a debug message tagged with the
+ * calling function and line number; IPRINTK and WPRINTK emit info and
+ * warning messages.  All of them prefix the text with "netfront/accel".
+ */
+#define DPRINTK(fmt, args...)                          \
+       pr_debug("netfront/accel (%s:%d) " fmt,         \
+              __FUNCTION__, __LINE__, ##args)
+#define IPRINTK(fmt, args...) pr_info("netfront/accel: " fmt, ##args)
+#define WPRINTK(fmt, args...) pr_warning("netfront/accel: " fmt, ##args)
+
+/*
+ * Forward declarations: these helpers are defined further down but are
+ * used by the watch handling code that precedes them.
+ */
+static int netfront_remove_accelerator(struct netfront_info *np,
+                                      struct xenbus_device *dev);
+static int netfront_load_accelerator(struct netfront_info *np, 
+                                    struct xenbus_device *dev, 
+                                    const char *frontend);
+
+static void netfront_accelerator_remove_watch(struct netfront_info *np);
+
+/*
+ * List of all netfront accelerator plugin modules known to this
+ * driver.  Each list entry is of type struct netfront_accelerator,
+ * linked through its "link" member.  Initialised in netif_init_accel().
+ */
+static struct list_head accelerators_list;
+
+/*
+ * Workqueue on which acceleration configuration changes (xenbus watch
+ * events on "accel-frontend") are processed.  Created in
+ * netif_init_accel() and destroyed in netif_exit_accel().
+ */
+struct workqueue_struct *accel_watch_workqueue;
+
+/*
+ * Mutex to prevent concurrent loads and suspends, etc.  It is held
+ * whenever accelerators_list or an accelerator's vif_states list is
+ * walked or modified outside of module teardown.
+ */
+DEFINE_MUTEX(accelerator_mutex);
+
+/*
+ * Module-level setup for accelerator support: initialise the (empty)
+ * list of known accelerators and create the workqueue on which
+ * accelerator configuration changes are processed.
+ */
+void netif_init_accel(void)
+{
+       INIT_LIST_HEAD(&accelerators_list);
+
+       accel_watch_workqueue = create_workqueue("net_accel");
+
+       /*
+        * This function cannot report failure, yet the queue is used
+        * unconditionally later on, so at least make a failed
+        * allocation visible instead of ignoring it silently.
+        */
+       WARN_ON(accel_watch_workqueue == NULL);
+}
+
+/*
+ * Module teardown: drain and destroy the watch workqueue, then free
+ * every accelerator record together with its frontend name.  Each
+ * accelerator must already have an empty vif_states list.
+ */
+void netif_exit_accel(void)
+{
+       struct netfront_accelerator *cur, *next;
+
+       flush_workqueue(accel_watch_workqueue);
+       destroy_workqueue(accel_watch_workqueue);
+
+       /* Everything else should be quiet by now, so no lock is taken */
+       list_for_each_entry_safe(cur, next, &accelerators_list, link) {
+               BUG_ON(!list_empty(&cur->vif_states));
+
+               list_del(&cur->link);
+               kfree(cur->frontend);
+               kfree(cur);
+       }
+}
+
+
+/*
+ * Work item run when the backend's "accel-frontend" xenstore node may
+ * have changed.  Reads the currently configured accelerator name and:
+ *  - if the node cannot be read, detaches any accelerator from the vif;
+ *  - if no name was recorded before, requests the named accelerator;
+ *  - if the name differs from the recorded one, detaches the old
+ *    accelerator and requests the new one.
+ * The name just read (or NULL) then replaces the recorded name, and if
+ * the accelerator was set up successfully the plugin module of that
+ * name is requested.
+ *
+ * Takes the accelerator mutex.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+static void accel_watch_work(struct work_struct *context)
+#else
+static void accel_watch_work(void *context)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       struct netfront_accel_vif_state *vif_state =
+               container_of(context, struct netfront_accel_vif_state,
+                            accel_work);
+#else
+       struct netfront_accel_vif_state *vif_state =
+               (struct netfront_accel_vif_state *)context;
+#endif
+       struct netfront_info *np = vif_state->np;
+       char *accel_frontend;
+       int accel_len, rc = -1;
+
+       mutex_lock(&accelerator_mutex);
+
+       accel_frontend = xenbus_read(XBT_NIL, np->xbdev->otherend,
+                                    "accel-frontend", &accel_len);
+       if (IS_ERR(accel_frontend)) {
+               accel_frontend = NULL;
+               netfront_remove_accelerator(np, np->xbdev);
+       } else if (vif_state->accel_frontend == NULL) {
+               /* First time through: simply request the accelerator */
+               rc = netfront_load_accelerator(np, np->xbdev,
+                                              accel_frontend);
+       } else if (strcmp(vif_state->accel_frontend, accel_frontend) != 0) {
+               /*
+                * The name changed.  Whole strings are compared: a
+                * comparison bounded by the length of the new name
+                * would treat a new name that is a prefix of the old
+                * one as "unchanged" and keep the old accelerator.
+                */
+               netfront_remove_accelerator(np, np->xbdev);
+               rc = netfront_load_accelerator(np, np->xbdev,
+                                              accel_frontend);
+       }
+
+       /*
+        * Get rid of the previous name and replace it with the new one
+        * (kfree(NULL) is a no-op, so no guard is needed).
+        */
+       kfree(vif_state->accel_frontend);
+       vif_state->accel_frontend = accel_frontend;
+
+       mutex_unlock(&accelerator_mutex);
+
+       if (rc == 0) {
+               DPRINTK("requesting module %s\n", accel_frontend);
+               request_module("%s", accel_frontend);
+               /*
+                * Module should now call netfront_accelerator_loaded() once
+                * it's up and running, and we can continue from there
+                */
+       }
+}
+
+
+/*
+ * xenbus watch callback for the "accel-frontend" node.  Does no work
+ * itself; it queues the vif's accel_work item (accel_watch_work) on
+ * the accelerator workqueue.
+ */
+static void accel_watch_changed(struct xenbus_watch *watch,
+                               const char **vec, unsigned int len)
+{
+       struct netfront_accel_vif_state *state;
+
+       state = container_of(watch, struct netfront_accel_vif_state,
+                            accel_watch);
+       queue_work(accel_watch_workqueue, &state->accel_work);
+}
+
+
+/*
+ * (Re)install the xenbus watch on the backend's "accel-frontend" node
+ * for this vif.  A watch that is already registered (e.g. one left
+ * over from before suspend/resume) is removed first.  If registration
+ * fails, the watch node is cleared so the watch counts as not
+ * registered.
+ */
+void netfront_accelerator_add_watch(struct netfront_info *np)
+{
+       struct xenbus_watch *watch = &np->accel_vif_state.accel_watch;
+       int err;
+
+       /* Drop a stale watch, if any, before registering a new one */
+       netfront_accelerator_remove_watch(np);
+
+       /* Get a watch on the accelerator plugin */
+       err = xenbus_watch_path2(np->xbdev, np->xbdev->otherend,
+                                "accel-frontend", watch,
+                                accel_watch_changed);
+       if (!err)
+               return;
+
+       DPRINTK("%s: Failed to register accel watch: %d\n",
+                __FUNCTION__, err);
+       watch->node = NULL;
+}
+
+
+/*
+ * Wait for queued watch work to finish, then forget the accelerator
+ * name recorded by the watch handler.
+ */
+static void
+netfront_accelerator_purge_watch(struct netfront_accel_vif_state *vif_state)
+{
+       flush_workqueue(accel_watch_workqueue);
+
+       /* Clean up any state left from watch; kfree(NULL) is a no-op */
+       kfree(vif_state->accel_frontend);
+       vif_state->accel_frontend = NULL;
+}
+
+
+/*
+ * Unregister the "accel-frontend" watch of a vif, if one is
+ * registered, free its node string and purge the state the watch
+ * handler left behind.
+ */
+static
+void netfront_accelerator_remove_watch(struct netfront_info *np)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+       struct xenbus_watch *watch = &state->accel_watch;
+
+       /* A NULL node means there is no watch to get rid of */
+       if (watch->node == NULL)
+               return;
+
+       unregister_xenbus_watch(watch);
+       kfree(watch->node);
+       watch->node = NULL;
+
+       netfront_accelerator_purge_watch(state);
+}
+
+
+/*
+ * Initialise the accelerator-related fields of a netfront device: no
+ * accelerator attached, back-pointers to the netfront state and the
+ * xenbus device, and the work item that runs accel_watch_work().
+ */
+void init_accelerator_vif(struct netfront_info *np,
+                         struct xenbus_device *dev)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+
+       np->accelerator = NULL;
+
+       /* These back-pointers are assumed never to change afterwards */
+       state->np = np;
+       state->dev = dev;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       INIT_WORK(&state->accel_work, accel_watch_work);
+#else
+       INIT_WORK(&state->accel_work, accel_watch_work, state);
+#endif
+}
+
+
+/*
+ * Tell whether an accelerator record belongs to the given frontend
+ * description string.  Returns non-zero on an exact string match and
+ * zero otherwise.  It would ultimately be nice to replace the string
+ * with a unique numeric identifier for each accelerator.
+ */
+static int match_accelerator(const char *frontend,
+                            struct netfront_accelerator *accelerator)
+{
+       return !strcmp(accelerator->frontend, frontend);
+}
+
+
+/*
+ * Put a frontend vif on the list of vifs using a netfront accelerator
+ * plugin module.  If the vif is already attached, it must be attached
+ * to this very accelerator.
+ *
+ * Must be called with the accelerator mutex held.
+ */
+static void add_accelerator_vif(struct netfront_accelerator *accelerator,
+                               struct netfront_info *np)
+{
+       if (np->accelerator != NULL) {
+               /*
+                * Legitimate after a suspend_cancel, but then the
+                * configuration should not have changed
+                */
+               BUG_ON(np->accelerator != accelerator);
+               return;
+       }
+
+       np->accelerator = accelerator;
+       list_add(&np->accel_vif_state.link, &accelerator->vif_states);
+}
+
+
+/*
+ * Allocate and initialise the state that tracks one accelerator plugin
+ * module, and add it to accelerators_list.
+ *
+ * frontend: name of the accelerator; a private copy is stored.
+ * result:   set to the new accelerator on success, untouched on failure.
+ * hooks:    plugin hooks, or NULL if the plugin has not loaded yet.
+ *
+ * Returns 0 on success or -ENOMEM if an allocation fails.
+ *
+ * Must be called with the accelerator mutex held.
+ */
+static int init_accelerator(const char *frontend,
+                           struct netfront_accelerator **result,
+                           struct netfront_accel_hooks *hooks)
+{
+       struct netfront_accelerator *accelerator;
+
+       accelerator = kmalloc(sizeof(*accelerator), GFP_KERNEL);
+       if (!accelerator) {
+               DPRINTK("no memory for accelerator\n");
+               return -ENOMEM;
+       }
+
+       /* kstrdup() sizes, allocates and copies the name in one step */
+       accelerator->frontend = kstrdup(frontend, GFP_KERNEL);
+       if (!accelerator->frontend) {
+               DPRINTK("no memory for accelerator name\n");
+               kfree(accelerator);
+               return -ENOMEM;
+       }
+
+       INIT_LIST_HEAD(&accelerator->vif_states);
+       spin_lock_init(&accelerator->vif_states_lock);
+
+       accelerator->hooks = hooks;
+
+       list_add(&accelerator->link, &accelerators_list);
+
+       *result = accelerator;
+
+       return 0;
+}
+
+
+/*
+ * Copy the hooks of the vif's accelerator into the per-vif state.  The
+ * data path is quiesced (NAPI disabled, tx lock held) around the
+ * update.
+ *
+ * Takes the vif_states_lock spinlock and may sleep.
+ */
+static void
+accelerator_set_vif_state_hooks(struct netfront_accel_vif_state *vif_state)
+{
+       struct netfront_info *np = vif_state->np;
+       struct netfront_accelerator *accel;
+       unsigned long irq_flags;
+
+       DPRINTK("%p\n",vif_state);
+
+       /* Make sure there are no data path operations going on */
+       napi_disable(&np->napi);
+       netif_tx_lock_bh(np->netdev);
+
+       accel = np->accelerator;
+       spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
+       vif_state->hooks = accel->hooks;
+       spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
+
+       netif_tx_unlock_bh(np->netdev);
+       napi_enable(&np->napi);
+}
+
+
+/*
+ * Attach a vif to an accelerator that is already on the list.  If the
+ * plugin's hooks are available and its new_device hook accepts the
+ * device (returns 0), the hooks are wired into the vif state.
+ *
+ * Must be called with the accelerator mutex held.  Takes the
+ * vif_states_lock spinlock.
+ */
+static void accelerator_probe_new_vif(struct netfront_info *np,
+                                     struct xenbus_device *dev,
+                                     struct netfront_accelerator *accelerator)
+{
+       struct netfront_accel_hooks *plugin_hooks;
+
+       DPRINTK("\n");
+
+       /* Include this frontend device on the accelerator's list */
+       add_accelerator_vif(accelerator, np);
+
+       plugin_hooks = accelerator->hooks;
+       if (!plugin_hooks)
+               return;
+
+       if (plugin_hooks->new_device(np->netdev, dev) != 0)
+               return;
+
+       accelerator_set_vif_state_hooks(&np->accel_vif_state);
+}
+
+
+/*
+ * Request that a particular netfront accelerator plugin is used for a
+ * vif.  Usually called as a result of the vif configuration specifying
+ * which one to use.  If the accelerator is already on the list the vif
+ * is probed against it; otherwise a new record without hooks is
+ * created and the vif is attached to it.
+ *
+ * Returns 0 on success or a negative error from init_accelerator().
+ *
+ * Must be called with accelerator_mutex held.  Takes the
+ * vif_states_lock spinlock.
+ */
+static int netfront_load_accelerator(struct netfront_info *np,
+                                    struct xenbus_device *dev,
+                                    const char *frontend)
+{
+       struct netfront_accelerator *accel;
+       int rc;
+
+       DPRINTK(" %s\n", frontend);
+
+       /* Is the requested accelerator already on the list? */
+       list_for_each_entry(accel, &accelerators_list, link) {
+               if (!match_accelerator(frontend, accel))
+                       continue;
+
+               accelerator_probe_new_vif(np, dev, accel);
+               return 0;
+       }
+
+       /* Couldn't find it, so create a new one and load the module */
+       rc = init_accelerator(frontend, &accel, NULL);
+       if (rc < 0)
+               return rc;
+
+       /* Include this frontend device on the accelerator's list */
+       add_accelerator_vif(accel, np);
+
+       return rc;
+}
+
+
+/*
+ * Offer every vif that requested this accelerator to the plugin.
+ * Called when an accelerator plugin module is first loaded and
+ * connects to netfront.  The hooks are stored in the accelerator, and
+ * each vif whose new_device hook returns 0 gets them wired into its
+ * vif state.
+ *
+ * Must be called with accelerator_mutex held.  Takes the
+ * vif_states_lock spinlock.
+ */
+static void
+accelerator_probe_vifs(struct netfront_accelerator *accelerator,
+                      struct netfront_accel_hooks *hooks)
+{
+       struct netfront_accel_vif_state *state, *next;
+
+       DPRINTK("%p\n", accelerator);
+
+       /*
+        * Remember the hooks: they are needed for probing devices later
+        * on and for wiring into each vif_state once the plugin is
+        * ready to accelerate that vif
+        */
+       BUG_ON(hooks == NULL);
+       accelerator->hooks = hooks;
+
+       /* accelerator_mutex is held, so the list may be walked */
+       list_for_each_entry_safe(state, next, &accelerator->vif_states,
+                                link) {
+               if (hooks->new_device(state->np->netdev, state->dev) != 0)
+                       continue;
+
+               accelerator_set_vif_state_hooks(state);
+       }
+}
+
+
+/*
+ * Called by the netfront accelerator plugin module when it has
+ * loaded.
+ *
+ * version:  accelerator interface version the plugin was built for.
+ * frontend: name identifying the accelerator.
+ * hooks:    the plugin's hook table.
+ *
+ * Returns:
+ *   0                       on success;
+ *   NETFRONT_ACCEL_VERSION  if the caller's version is newer than ours
+ *                           (the caller may retry with a lower one);
+ *   -EPROTO                 if the caller's version is older than ours;
+ *   -EINVAL                 when running in the initial Xen domain;
+ *   a negative error from init_accelerator() if the accelerator was
+ *   not on the list yet and could not be recorded.
+ *
+ * Takes the accelerator_mutex and vif_states_lock spinlock.
+ */
+int netfront_accelerator_loaded(int version, const char *frontend,
+                               struct netfront_accel_hooks *hooks)
+{
+       struct netfront_accelerator *accelerator;
+       int rc = 0;
+
+       if (is_initial_xendomain())
+               return -EINVAL;
+
+       if (version != NETFRONT_ACCEL_VERSION) {
+               if (version > NETFRONT_ACCEL_VERSION) {
+                       /* Caller has higher version number, leave it
+                          up to them to decide whether to continue.
+                          They can re-call with a lower number if
+                          they're happy to be compatible with us */
+                       return NETFRONT_ACCEL_VERSION;
+               } else {
+                       /* We have a more recent version than caller.
+                          Currently reject, but may in future be able
+                          to be backwardly compatible */
+                       return -EPROTO;
+               }
+       }
+
+       mutex_lock(&accelerator_mutex);
+
+       /*
+        * Look through list of accelerators to see if it has already
+        * been requested
+        */
+       list_for_each_entry(accelerator, &accelerators_list, link) {
+               if (match_accelerator(frontend, accelerator)) {
+                       accelerator_probe_vifs(accelerator, hooks);
+                       goto out;
+               }
+       }
+
+       /*
+        * If it wasn't in the list, add it now so that when it is
+        * requested the caller will find it
+        */
+       DPRINTK("Couldn't find matching accelerator (%s)\n",
+               frontend);
+
+       /*
+        * Report a failure to record the accelerator instead of
+        * claiming success: otherwise the plugin would believe it is
+        * registered although no vif could ever find it.
+        */
+       rc = init_accelerator(frontend, &accelerator, hooks);
+
+ out:
+       mutex_unlock(&accelerator_mutex);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(netfront_accelerator_loaded);
+
+
+/*
+ * Clear the hooks of a single vif state.  The data path is quiesced
+ * (NAPI disabled, tx lock held) around the update.
+ *
+ * Takes the vif_states_lock spinlock and may sleep.
+ */
+static void
+accelerator_remove_single_hook(struct netfront_accelerator *accelerator,
+                              struct netfront_accel_vif_state *vif_state)
+{
+       struct netfront_info *np = vif_state->np;
+       unsigned long irq_flags;
+
+       /* Make sure there are no data path operations going on */
+       napi_disable(&np->napi);
+       netif_tx_lock_bh(np->netdev);
+
+       spin_lock_irqsave(&accelerator->vif_states_lock, irq_flags);
+
+       /*
+        * Only the hooks go away.  The vif_state stays on the
+        * accelerator's list, which signifies that this vif is
+        * interested in using that accelerator if it becomes
+        * available again
+        */
+       vif_state->hooks = NULL;
+
+       spin_unlock_irqrestore(&accelerator->vif_states_lock, irq_flags);
+
+       netif_tx_unlock_bh(np->netdev);
+       napi_enable(&np->napi);
+}
+
+
+/*
+ * Safely remove the accelerator function hooks from every vif that is
+ * using them, then from the accelerator itself.  For each hooked vif
+ * the statistics are collected one last time, the vif's hooks are
+ * cleared and the plugin's remove hook is called for the device.
+ *
+ * Must be called with the accelerator mutex held.  Takes the
+ * vif_states_lock spinlock.
+ */
+static void accelerator_remove_hooks(struct netfront_accelerator *accelerator)
+{
+       struct netfront_accel_vif_state *state, *next;
+       unsigned long irq_flags;
+
+       /* The mutex is held, so the list may be walked */
+       list_for_each_entry_safe(state, next, &accelerator->vif_states,
+                                link) {
+               struct netfront_info *np;
+
+               if (!state->hooks)
+                       continue;
+
+               np = state->np;
+
+               spin_lock_irqsave(&accelerator->vif_states_lock, irq_flags);
+
+               /* Last chance to get statistics from the accelerator */
+               state->hooks->get_stats(np->netdev, &np->netdev->stats,
+                                       this_cpu_ptr(np->stats));
+
+               spin_unlock_irqrestore(&accelerator->vif_states_lock,
+                                      irq_flags);
+
+               accelerator_remove_single_hook(accelerator, state);
+
+               accelerator->hooks->remove(state->dev);
+       }
+
+       accelerator->hooks = NULL;
+}
+
+
+/*
+ * Called by a netfront accelerator when it is unloaded.  This safely
+ * removes the hooks into the plugin and blocks until all devices have
+ * finished using it, so on return it is safe to unload.  Only the
+ * first accelerator matching the name is processed.
+ *
+ * Takes the accelerator mutex, and vif_states_lock spinlock.
+ */
+void netfront_accelerator_stop(const char *frontend)
+{
+       struct netfront_accelerator *accel;
+
+       mutex_lock(&accelerator_mutex);
+
+       list_for_each_entry(accel, &accelerators_list, link) {
+               if (!match_accelerator(frontend, accel))
+                       continue;
+
+               accelerator_remove_hooks(accel);
+               break;
+       }
+
+       mutex_unlock(&accelerator_mutex);
+}
+EXPORT_SYMBOL_GPL(netfront_accelerator_stop);
+
+
+/*
+ * Helper for call_remove and do_suspend.  If the vif currently has
+ * hooks installed: collect the statistics one last time, clear the
+ * vif's hooks and call the plugin's remove hook.  Returns the result
+ * of that hook, or 0 if the vif had no hooks.
+ *
+ * Must be called with the accelerator mutex held.  Takes the
+ * vif_states_lock spinlock.
+ */
+static int do_remove(struct netfront_info *np, struct xenbus_device *dev)
+{
+       struct netfront_accelerator *accelerator = np->accelerator;
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+       unsigned long irq_flags;
+
+       if (!state->hooks)
+               return 0;
+
+       spin_lock_irqsave(&accelerator->vif_states_lock, irq_flags);
+
+       /* Last chance to get statistics from the accelerator */
+       state->hooks->get_stats(np->netdev, &np->netdev->stats,
+                               this_cpu_ptr(np->stats));
+
+       spin_unlock_irqrestore(&accelerator->vif_states_lock, irq_flags);
+
+       /*
+        * Try and do the opposite of accelerator_probe_new_vif
+        * to ensure there's no state pointing back at the
+        * netdev
+        */
+       accelerator_remove_single_hook(accelerator, state);
+
+       return accelerator->hooks->remove(dev);
+}
+
+
+/*
+ * Detach a vif from its accelerator: take its vif_state off the
+ * accelerator's list, run do_remove() and clear np->accelerator.
+ * Returns 0 if the vif had no accelerator, otherwise the result of
+ * do_remove().
+ *
+ * Must be called with the accelerator mutex held.  Takes the
+ * vif_states_lock spinlock
+ */
+static int netfront_remove_accelerator(struct netfront_info *np,
+                                      struct xenbus_device *dev)
+{
+       struct netfront_accelerator *accel = np->accelerator;
+       struct netfront_accel_vif_state *iter;
+       int rc;
+
+       /* Check that we've got a device that was accelerated */
+       if (accel == NULL)
+               return 0;
+
+       /* Unlink our vif_state only if it really is on the list */
+       list_for_each_entry(iter, &accel->vif_states, link) {
+               if (iter != &np->accel_vif_state)
+                       continue;
+
+               list_del(&iter->link);
+               break;
+       }
+
+       rc = do_remove(np, dev);
+
+       np->accelerator = NULL;
+
+       return rc;
+}
+
+
+/*
+ * Remove the vif's accelerator watch and detach the vif from its
+ * accelerator.  Returns the result of netfront_remove_accelerator().
+ *
+ * No lock pre-requisites.  Takes the accelerator mutex and the
+ * vif_states_lock spinlock.
+ */
+int netfront_accelerator_call_remove(struct netfront_info *np,
+                                    struct xenbus_device *dev)
+{
+       int ret;
+
+       /* The watch goes first, before the mutex is taken */
+       netfront_accelerator_remove_watch(np);
+
+       mutex_lock(&accelerator_mutex);
+       ret = netfront_remove_accelerator(np, dev);
+       mutex_unlock(&accelerator_mutex);
+
+       return ret;
+}
+
+
+/*
+ * Suspend handling: if the vif is attached to an accelerator, run
+ * do_remove() for it.  Returns the result of do_remove(), or 0 if the
+ * vif was not accelerated.
+ *
+ * No lock pre-requisites.  Takes the accelerator mutex and the
+ * vif_states_lock spinlock.
+ */
+int netfront_accelerator_suspend(struct netfront_info *np,
+                                struct xenbus_device *dev)
+{
+       int ret = 0;
+
+       mutex_lock(&accelerator_mutex);
+
+       /*
+        * Only devices that were accelerated need work.  The remove
+        * hook is called, but the vif_state stays on the accelerator's
+        * list in case there is a suspend_cancel.
+        */
+       if (np->accelerator != NULL)
+               ret = do_remove(np, dev);
+
+       mutex_unlock(&accelerator_mutex);
+       return ret;
+}
+  
+  
+/*
+ * Suspend was cancelled: purge the state left by the watch handler
+ * and, if the device is connected, queue the watch work again.
+ * Always returns 0.
+ */
+int netfront_accelerator_suspend_cancel(struct netfront_info *np,
+                                       struct xenbus_device *dev)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+
+       netfront_accelerator_purge_watch(state);
+
+       /*
+        * Gratuitously fire the watch handler to reinstate the
+        * configured accelerator
+        */
+       if (dev->state == XenbusStateConnected)
+               queue_work(accel_watch_workqueue, &state->accel_work);
+
+       return 0;
+}
+
+
+/*
+ * Resume handling: if the vif is attached to an accelerator and its
+ * vif_state is on that accelerator's list, unlink it and clear
+ * np->accelerator.
+ *
+ * No lock pre-requisites.  Takes the accelerator mutex
+ */
+void netfront_accelerator_resume(struct netfront_info *np,
+                                struct xenbus_device *dev)
+{
+       struct netfront_accel_vif_state *state;
+
+       mutex_lock(&accelerator_mutex);
+
+       /* Only a device that was accelerated needs any work */
+       if (np->accelerator != NULL) {
+               /* Find the vif_state from the accelerator's list */
+               list_for_each_entry(state, &np->accelerator->vif_states,
+                                   link) {
+                       if (state->dev != dev)
+                               continue;
+
+                       BUG_ON(state != &np->accel_vif_state);
+
+                       /*
+                        * Remove it from the accelerator's list so
+                        * state is consistent for probing new vifs
+                        * when they get connected
+                        */
+                       list_del(&state->link);
+                       np->accelerator = NULL;
+
+                       break;
+               }
+       }
+
+       mutex_unlock(&accelerator_mutex);
+}
+
+
+/*
+ * Call the accelerator's check_ready hook for a device.  Returns the
+ * hook's result, or 1 if the vif has no hooks or no accelerator.
+ *
+ * No lock pre-requisites.  Takes the vif_states_lock spinlock
+ */
+int netfront_check_accelerator_queue_ready(struct net_device *dev,
+                                          struct netfront_info *np)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+       struct netfront_accelerator *accel = np->accelerator;
+       unsigned long irq_flags;
+       int ready = 1;
+
+       /* Unlocked check first; nothing to call without hooks */
+       if (!state->hooks || !accel)
+               return ready;
+
+       spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
+       /* Check again now that the lock is held */
+       if (state->hooks && np->accelerator == accel)
+               ready = state->hooks->check_ready(dev);
+       spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
+
+       return ready;
+}
+
+
+/*
+ * Call the accelerator's stop_napi_irq hook for a device, if the vif
+ * has hooks and an accelerator.
+ *
+ * No lock pre-requisites.  Takes the vif_states_lock spinlock
+ */
+void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
+                                            struct net_device *dev)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+       struct netfront_accelerator *accel = np->accelerator;
+       unsigned long irq_flags;
+
+       /* Unlocked check first; nothing to call without hooks */
+       if (!state->hooks || accel == NULL)
+               return;
+
+       spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
+       /* Check again now that the lock is held */
+       if (state->hooks && np->accelerator == accel)
+               state->hooks->stop_napi_irq(dev);
+       spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
+}
+
+
+/*
+ * Call the accelerator's get_stats hook for a device, passing the
+ * device statistics and this CPU's per-cpu stats.  Returns the hook's
+ * result, or 0 if the vif has no hooks or no accelerator.
+ *
+ * No lock pre-requisites.  Takes the vif_states_lock spinlock
+ */
+int netfront_accelerator_call_get_stats(struct netfront_info *np,
+                                       struct net_device *dev)
+{
+       struct netfront_accel_vif_state *state = &np->accel_vif_state;
+       struct netfront_accelerator *accel = np->accelerator;
+       unsigned long irq_flags;
+       int ret = 0;
+
+       /* Unlocked check first; nothing to call without hooks */
+       if (!state->hooks || accel == NULL)
+               return ret;
+
+       spin_lock_irqsave(&accel->vif_states_lock, irq_flags);
+       /* Check again now that the lock is held */
+       if (state->hooks && np->accelerator == accel)
+               ret = state->hooks->get_stats(dev, &dev->stats,
+                                             this_cpu_ptr(np->stats));
+       spin_unlock_irqrestore(&accel->vif_states_lock, irq_flags);
+
+       return ret;
+}
+
diff --git a/drivers/xen/netfront/netfront.c b/drivers/xen/netfront/netfront.c

new file mode 100644 (file)

index 0000000..03da3e9
--- /dev/null
+++ b/drivers/xen/netfront/netfront.c
@@ -0,0 +1,2280 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/ethtool.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/io.h>
+#include <linux/moduleparam.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+#include <net/route.h>
+#include <asm/uaccess.h>
+#include <xen/evtchn.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/netif.h>
+#include <xen/interface/memory.h>
+#include <xen/balloon.h>
+#include <asm/page.h>
+#include <asm/maddr.h>
+#include <asm/uaccess.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+#include <xen/net-util.h>
+
+struct netfront_cb {           /* overlaid on skb->cb, see NETFRONT_SKB_CB() */
+       struct page *page;
+       unsigned offset;
+};
+/* Reinterpret an skb's control buffer (skb->cb) as a struct netfront_cb. */
+#define NETFRONT_SKB_CB(skb)   ((struct netfront_cb *)((skb)->cb))
+
+#include "netfront.h"
+
+/*
+ * Mutually-exclusive module options to select receive data path:
+ *  rx_copy : Packets are copied by network backend into local memory
+ *  rx_flip : Page containing packet data is transferred to our ownership
+ * For fully-virtualised guests there is no option - copying must be used.
+ * For paravirtualised guests, flipping is the default.
+ */
+#ifdef CONFIG_XEN
+static bool MODPARM_rx_copy;
+module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
+MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
+static bool MODPARM_rx_flip;
+module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
+MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
+#else /* !CONFIG_XEN: no module options, copying is fixed at build time */
+# define MODPARM_rx_copy true
+# define MODPARM_rx_flip false
+#endif /* CONFIG_XEN */
+/* Linear bytes requested per rx skb; headroom is added at allocation. */
+#define RX_COPY_THRESHOLD 256
+
+/* If we don't have GSO, fake things up so that we never try to use it. */
+#if defined(NETIF_F_GSO)
+#define HAVE_GSO                       1
+#define HAVE_TSO                       1 /* TSO is a subset of GSO */
+#define HAVE_CSUM_OFFLOAD              1
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+       /* Turn off all GSO bits except ROBUST. */
+       dev->features &= ~NETIF_F_GSO_MASK;
+       dev->features |= NETIF_F_GSO_ROBUST;
+}
+#elif defined(NETIF_F_TSO) /* TSO only: provide GSO-style helpers on top */
+#define HAVE_GSO                      0
+#define HAVE_TSO                       1
+
+/* Some older kernels cannot cope with incorrect checksums,
+ * particularly in netfilter. I'm not sure there is 100% correlation
+ * with the presence of NETIF_F_TSO but it appears to be a good first
+ * approximation.
+ */
+#define HAVE_CSUM_OFFLOAD              0
+/* Let code written against the GSO field names use the TSO ones. */
+#define gso_size tso_size
+#define gso_segs tso_segs
+static inline void dev_disable_gso_features(struct net_device *dev)
+{
+       /* Turn off all TSO bits. */
+       dev->features &= ~NETIF_F_TSO;
+}
+static inline int skb_is_gso(const struct sk_buff *skb)
+{
+        return skb_shinfo(skb)->tso_size;
+}
+static inline int skb_gso_ok(struct sk_buff *skb, int features)
+{
+        return (features & NETIF_F_TSO);
+}
+/* Per-skb feature lookup falls back to the device's feature word. */
+#define netif_skb_features(skb) ((skb)->dev->features)
+static inline int netif_needs_gso(struct sk_buff *skb, int features)
+{
+        return skb_is_gso(skb) &&
+               (!skb_gso_ok(skb, features) ||
+                unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
+}
+#else /* neither GSO nor TSO: stub the offload helpers out */
+#define HAVE_GSO                       0
+#define HAVE_TSO                       0
+#define HAVE_CSUM_OFFLOAD              0
+#define netif_needs_gso(skb, feat)     0
+#define dev_disable_gso_features(dev)  ((void)0)
+#define ethtool_op_set_tso(dev, data)  (-ENOSYS)
+#endif /* NETIF_F_GSO / NETIF_F_TSO */
+
+struct netfront_rx_info {      /* one rx response plus its extra-info slots */
+       struct netif_rx_response rx;
+       struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
+};
+
+/*
+ * Private carrier flag, kept in the ->carrier field: the network stack's
+ * version causes delays when the carrier is re-enabled (in particular,
+ * dev_activate() may not immediately be called, which can cause packet loss).
+ */
+#define netfront_carrier_on(netif)     ((netif)->carrier = 1)
+#define netfront_carrier_off(netif)    ((netif)->carrier = 0)
+#define netfront_carrier_ok(netif)     ((netif)->carrier)
+
+/*
+ * Free-slot list threaded through tx_skbs[]: slot 0 holds the next free id.
+ */
+static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id)
+{
+       struct sk_buff *prev_head = list[0];
+       list[id] = prev_head;
+       list[0]  = (void *)(unsigned long)id;
+}
+
+static inline unsigned short get_id_from_freelist(struct sk_buff **list)
+{
+       unsigned int head = (unsigned int)(unsigned long)list[0];
+       list[0] = list[head];
+       return head;
+}
+
+static inline int xennet_rxidx(RING_IDX idx)
+{
+       return (NET_RX_RING_SIZE - 1) & idx;    /* mask down to an rx slot */
+}
+
+static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np,
+                                               RING_IDX ri)
+{
+       struct sk_buff **slot = &np->rx_skbs[xennet_rxidx(ri)];
+       struct sk_buff *taken = *slot;
+       *slot = NULL;           /* the slot is emptied on the way out */
+       return taken;
+}
+
+static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np,
+                                           RING_IDX ri)
+{
+       const int slot = xennet_rxidx(ri);
+       grant_ref_t taken = np->grant_rx_ref[slot];
+       np->grant_rx_ref[slot] = GRANT_INVALID_REF;     /* slot now empty */
+       return taken;
+}
+
+#define DPRINTK(fmt, args...)                          \
+       pr_debug("netfront (%s:%d) " fmt,               \
+                __FUNCTION__, __LINE__, ##args) /* debug, with function:line */
+#define IPRINTK(fmt, args...) pr_info("netfront: " fmt, ##args)
+#define WPRINTK(fmt, args...) pr_warning("netfront: " fmt, ##args)
+
+static int setup_device(struct xenbus_device *, struct netfront_info *);
+static struct net_device *create_netdev(struct xenbus_device *);
+
+static void end_access(int, void *);
+static void netif_release_rings(struct netfront_info *);
+static void netif_disconnect_backend(struct netfront_info *);
+
+static int network_connect(struct net_device *);
+static void network_tx_buf_gc(struct net_device *);
+static void network_alloc_rx_buffers(struct net_device *);
+
+static irqreturn_t netif_int(int irq, void *dev_id);
+/* Without CONFIG_SYSFS the sysfs helpers collapse to no-op macros. */
+#ifdef CONFIG_SYSFS
+static int xennet_sysfs_addif(struct net_device *netdev);
+static void xennet_sysfs_delif(struct net_device *netdev);
+#else /* !CONFIG_SYSFS */
+#define xennet_sysfs_addif(dev) (0)
+#define xennet_sysfs_delif(dev) do { } while(0)
+#endif /* CONFIG_SYSFS */
+
+static inline bool xennet_can_sg(struct net_device *dev)
+{
+       return (dev->features & NETIF_F_SG) != 0;       /* NETIF_F_SG set? */
+}
+
+/*
+ * Force ARP_NOTIFY on, as net.ipv4.conf.*.arp_notify is off by default.
+ */
+static void __devinit netfront_enable_arp_notify(struct netfront_info *info)
+{
+#ifdef CONFIG_INET
+       struct in_device *idev;
+
+       rtnl_lock();
+       idev = __in_dev_get_rtnl(info->netdev);
+       if (idev != NULL && IN_DEV_CONF_GET(idev, ARP_NOTIFY) == 0)
+               IN_DEV_CONF_SET(idev, ARP_NOTIFY, 1);
+       rtnl_unlock();
+       if (idev == NULL)
+               pr_warn("Cannot enable ARP notification on %s\n",
+                       info->xbdev->nodename);
+#endif
+}
+
+/**
+ * Entry point to this code when a new device is created.  Create and register
+ * the net_device, enable ARP notification and add the sysfs entries.  On
+ * failure what was set up here is undone and the error code is returned.
+ */
+static int __devinit netfront_probe(struct xenbus_device *dev,
+                                   const struct xenbus_device_id *id)
+{
+       int err;
+       struct net_device *netdev;
+       struct netfront_info *info;
+
+       netdev = create_netdev(dev);
+       if (IS_ERR(netdev)) {
+               err = PTR_ERR(netdev);
+               xenbus_dev_fatal(dev, err, "creating netdev");
+               return err;
+       }
+
+       info = netdev_priv(netdev);
+       dev_set_drvdata(&dev->dev, info);
+
+       err = register_netdev(info->netdev);
+       if (err) {
+               pr_warning("%s: register_netdev err=%d\n", __FUNCTION__, err);
+               goto fail;
+       }
+
+       netfront_enable_arp_notify(info);
+
+       err = xennet_sysfs_addif(info->netdev);
+       if (err) {
+               unregister_netdev(info->netdev);
+               pr_warning("%s: add sysfs failed err=%d\n", __FUNCTION__, err);
+               goto fail;
+       }
+
+       return 0;
+
+ fail:
+       /* info is netdev's private area: clear drvdata before freeing it. */
+       dev_set_drvdata(&dev->dev, NULL);
+       free_percpu(info->stats);       /* as netfront_remove(); NULL is fine */
+       free_netdev(netdev);
+       return err;
+}
+
+static int __devexit netfront_remove(struct xenbus_device *dev)
+{
+       struct netfront_info *np = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("%s\n", dev->nodename);
+
+       /* Detach from the accelerator and the backend first. */
+       netfront_accelerator_call_remove(np, dev);
+       netif_disconnect_backend(np);
+
+       /* Make sure the rx refill timer is neither pending nor running. */
+       del_timer_sync(&np->rx_refill_timer);
+
+       /* Withdraw the interface from sysfs, then from the network stack. */
+       xennet_sysfs_delif(np->netdev);
+       unregister_netdev(np->netdev);
+
+       /* Release the per-cpu stats, then the net_device itself. */
+       free_percpu(np->stats);
+       free_netdev(np->netdev);
+       return 0;
+}
+
+
+static int netfront_suspend(struct xenbus_device *dev)
+{
+       /* Suspend handling is delegated entirely to the accelerator layer. */
+       return netfront_accelerator_suspend(dev_get_drvdata(&dev->dev), dev);
+}
+
+
+static int netfront_suspend_cancel(struct xenbus_device *dev)
+{
+       /* Likewise, a cancelled suspend only concerns the accelerator layer. */
+       return netfront_accelerator_suspend_cancel(dev_get_drvdata(&dev->dev), dev);
+}
+
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  Here we only notify the accelerator and tear down the old
+ * backend connection; the device-layer structures are left intact so that
+ * this is transparent to the rest of the kernel.
+ */
+static int netfront_resume(struct xenbus_device *dev)
+{
+       struct netfront_info *np = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("%s\n", dev->nodename);
+
+       netfront_accelerator_resume(np, dev);
+       netif_disconnect_backend(np);
+
+       return 0;
+}
+
+static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+       char *s, *e, *macstr;
+       int i, err = 0;
+
+       macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+       if (IS_ERR(macstr))
+               return PTR_ERR(macstr);
+
+       /* ETH_ALEN hex octets (each <= 0xff), ':'-separated, NUL-terminated. */
+       for (i = 0; i < ETH_ALEN && !err; i++) {
+               unsigned long octet = simple_strtoul(s, &e, 16);
+               if ((s == e) || (octet > 0xff) ||
+                   (*e != ((i == ETH_ALEN-1) ? '\0' : ':')))
+                       err = -ENOENT;  /* malformed: the loop stops here */
+               mac[i] = octet;
+               s = e+1;
+       }
+       kfree(macstr);          /* the string returned by xenbus_read() */
+       return err;
+}
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_backend(struct xenbus_device *dev,
+                          struct netfront_info *info)
+{
+       const char *message;
+       struct xenbus_transaction xbt;
+       int err;
+
+       /* Read mac only in the first setup. */
+       if (!is_valid_ether_addr(info->mac)) {
+               err = xen_net_read_mac(dev, info->mac);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "parsing %s/mac",
+                                        dev->nodename);
+                       goto out;
+               }
+       }
+
+       /* Create shared ring, alloc event channel. */
+       err = setup_device(dev, info);
+       if (err)
+               goto out;
+
+       /* This will load an accelerator if one is configured when the
+        * watch fires */
+       netfront_accelerator_add_watch(info);
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               goto destroy_ring;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u",
+                           info->tx_ring_ref);
+       if (err) {
+               message = "writing tx ring-ref";
+               goto abort_transaction;
+       }
+       err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u",
+                           info->rx_ring_ref);
+       if (err) {
+               message = "writing rx ring-ref";
+               goto abort_transaction;
+       }
+       err = xenbus_printf(xbt, dev->nodename,
+                           "event-channel", "%u",
+                           irq_to_evtchn_port(info->irq));
+       if (err) {
+               message = "writing event-channel";
+               goto abort_transaction;
+       }
+       /* Publish the requested receive mode and the feature flags. */
+       err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u",
+                           info->copying_receiver);
+       if (err) {
+               message = "writing request-rx-copy";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1);
+       if (err) {
+               message = "writing feature-rx-notify";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload",
+                           "%d", !HAVE_CSUM_OFFLOAD);
+       if (err) {
+               message = "writing feature-no-csum-offload";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+       if (err) {
+               message = "writing feature-sg";
+               goto abort_transaction;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d",
+                           HAVE_TSO);
+       if (err) {
+               message = "writing feature-gso-tcpv4";
+               goto abort_transaction;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err) {
+               if (err == -EAGAIN)
+                       goto again;     /* redo the whole transaction */
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto destroy_ring;
+       }
+
+       return 0;
+       /* Error unwinding: each label also runs everything below it. */
+ abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_ring:
+       netfront_accelerator_call_remove(info, dev);
+       netif_disconnect_backend(info);
+ out:
+       return err;
+}
+
+static int setup_device(struct xenbus_device *dev, struct netfront_info *info)
+{
+       struct netif_tx_sring *txs;
+       struct netif_rx_sring *rxs;
+       int err;
+       struct net_device *netdev = info->netdev;
+       /* Start from a clean slate: no grant refs, no rings, no irq. */
+       info->tx_ring_ref = GRANT_INVALID_REF;
+       info->rx_ring_ref = GRANT_INVALID_REF;
+       info->rx.sring = NULL;
+       info->tx.sring = NULL;
+       info->irq = 0;
+       /* Tx ring: allocate one zeroed page, initialise it, then grant it. */
+       txs = (struct netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
+       if (!txs) {
+               err = -ENOMEM;
+               xenbus_dev_fatal(dev, err, "allocating tx ring page");
+               goto fail;
+       }
+       SHARED_RING_INIT(txs);
+       FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+       if (err < 0) {
+               free_page((unsigned long)txs);
+               goto fail;
+       }
+       info->tx_ring_ref = err;        /* non-negative: the grant reference */
+       /* Rx ring: same sequence as for the tx ring. */
+       rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
+       if (!rxs) {
+               err = -ENOMEM;
+               xenbus_dev_fatal(dev, err, "allocating rx ring page");
+               goto fail;
+       }
+       SHARED_RING_INIT(rxs);
+       FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+       if (err < 0) {
+               free_page((unsigned long)rxs);
+               goto fail;
+       }
+       info->rx_ring_ref = err;        /* non-negative: the grant reference */
+
+       memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
+       /* Bind a listening port for the other end, handled by netif_int(). */
+       err = bind_listening_port_to_irqhandler(
+               dev->otherend_id, netif_int, 0, netdev->name, netdev);
+       if (err < 0)
+               goto fail;
+       info->irq = err;
+
+       return 0;
+
+ fail:
+       netif_release_rings(info);
+       return err;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void backend_changed(struct xenbus_device *dev,
+                           enum xenbus_state backend_state)
+{
+       struct netfront_info *info = dev_get_drvdata(&dev->dev);
+       struct net_device *ndev = info->netdev;
+
+       DPRINTK("%s\n", xenbus_strstate(backend_state));
+
+       switch (backend_state) {
+       case XenbusStateInitWait:
+               /* Connect only while we are still initialising ourselves. */
+               if (dev->state == XenbusStateInitialising &&
+                   network_connect(ndev) == 0)
+                       xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateConnected:
+               netif_notify_peers(ndev);
+               break;
+
+       case XenbusStateClosing:
+               xenbus_frontend_closed(dev);
+               break;
+
+       /* Nothing to do for any of the remaining states. */
+       case XenbusStateInitialising:
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               break;
+       }
+}
+
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+       RING_IDX in_flight = np->tx.req_prod_pvt - np->tx.rsp_cons;
+       return in_flight < (TX_MAX_TARGET - MAX_SKB_FRAGS - 2);
+}
+
+
+static inline void network_maybe_wake_tx(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+
+       if (likely(!netif_queue_stopped(dev)))
+               return;
+       if (netfront_tx_slot_available(np) && likely(netif_running(dev)) &&
+           netfront_check_accelerator_queue_ready(dev, np))
+               netif_wake_queue(dev);
+}
+
+
+int netfront_check_queue_ready(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+
+       if (likely(!netif_queue_stopped(dev)))
+               return 0;
+       return netfront_tx_slot_available(np) && likely(netif_running(dev));
+}
+EXPORT_SYMBOL(netfront_check_queue_ready);
+
+static int network_open(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+
+       napi_enable(&np->napi);
+       /* With rx_lock held: refill the rx ring, poll if work is pending. */
+       spin_lock_bh(&np->rx_lock);
+       if (netfront_carrier_ok(np)) {
+               network_alloc_rx_buffers(dev);
+               np->rx.sring->rsp_event = np->rx.rsp_cons + 1;
+               if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){
+                       netfront_accelerator_call_stop_napi_irq(np, dev);
+
+                       napi_schedule(&np->napi);
+               }
+       }
+       spin_unlock_bh(&np->rx_lock);
+
+       netif_start_queue(dev);
+
+       return 0;
+}
+
+static void network_tx_buf_gc(struct net_device *dev)
+{
+       RING_IDX cons, prod;
+       unsigned short id;
+       struct netfront_info *np = netdev_priv(dev);
+       struct sk_buff *skb;
+
+       BUG_ON(!netfront_carrier_ok(np));
+       /* Reap tx responses: release the grant, the slot and the skb of each. */
+       do {
+               prod = np->tx.sring->rsp_prod;
+               rmb(); /* Ensure we see responses up to 'prod'. */
+
+               for (cons = np->tx.rsp_cons; cons != prod; cons++) {
+                       struct netif_tx_response *txrsp;
+
+                       txrsp = RING_GET_RESPONSE(&np->tx, cons);
+                       if (txrsp->status == XEN_NETIF_RSP_NULL)
+                               continue;
+
+                       id  = txrsp->id;
+                       skb = np->tx_skbs[id];
+                       if (unlikely(gnttab_query_foreign_access(
+                               np->grant_tx_ref[id]) != 0)) {
+                               pr_alert("network_tx_buf_gc: grant still"
+                                        " in use by backend domain\n");
+                               BUG();
+                       }
+                       gnttab_end_foreign_access_ref(np->grant_tx_ref[id]);
+                       gnttab_release_grant_reference(
+                               &np->gref_tx_head, np->grant_tx_ref[id]);
+                       np->grant_tx_ref[id] = GRANT_INVALID_REF;
+                       add_id_to_freelist(np->tx_skbs, id);
+                       dev_kfree_skb_irq(skb);
+               }
+
+               np->tx.rsp_cons = prod;
+
+               /*
+                * Set a new event, then check for race with update of rsp_prod.
+                * Note that it is essential to schedule a callback, no matter
+                * how few buffers are pending. Even if there is space in the
+                * transmit ring, higher layers may be blocked because too much
+                * data is outstanding: in such cases notification from Xen is
+                * likely to be the only kick that we'll get.
+                */
+               np->tx.sring->rsp_event =
+                       prod + ((np->tx.sring->req_prod - prod) >> 1) + 1;
+               mb();
+       } while ((cons == prod) && (prod != np->tx.sring->rsp_prod));
+
+       network_maybe_wake_tx(dev);
+}
+
+static void rx_refill_timeout(unsigned long data)
+{
+       struct net_device *netdev = (struct net_device *)data;
+       struct netfront_info *info = netdev_priv(netdev);
+
+       /* Run the accelerator's stop_napi_irq hook, then schedule a poll. */
+       netfront_accelerator_call_stop_napi_irq(info, netdev);
+       napi_schedule(&info->napi);
+}
+
+static void network_alloc_rx_buffers(struct net_device *dev)
+{
+       unsigned short id;
+       struct netfront_info *np = netdev_priv(dev);
+       struct sk_buff *skb;
+       struct page *page;
+       int i, batch_target, notify;
+       RING_IDX req_prod = np->rx.req_prod_pvt;
+       struct xen_memory_reservation reservation;
+       grant_ref_t ref;
+       unsigned long pfn;
+       void *vaddr;
+       int nr_flips;
+       netif_rx_request_t *req;
+       /* Nothing is posted while our private carrier flag is off. */
+       if (unlikely(!netfront_carrier_ok(np)))
+               return;
+
+       /*
+        * Allocate skbuffs greedily, even though we batch updates to the
+        * receive ring. This creates a less bursty demand on the memory
+        * allocator, so should reduce the chance of failed allocation requests
+        * both for ourself and for other kernel subsystems.
+        */
+       batch_target = np->rx_target - (req_prod - np->rx.rsp_cons);
+       for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) {
+               /*
+                * Allocate an skb and a page. Do not use __dev_alloc_skb as
+                * that will allocate page-sized buffers which is not
+                * necessary here.
+                * 16 bytes added as necessary headroom for netif_receive_skb.
+                */
+               skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN,
+                               GFP_ATOMIC | __GFP_NOWARN);
+               if (unlikely(!skb))
+                       goto no_skb;
+
+               page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
+               if (!page) {
+                       kfree_skb(skb);
+no_skb:
+                       /* Any skbuffs queued for refill? Force them out. */
+                       if (i != 0)
+                               goto refill;
+                       /* Could not allocate any skbuffs. Try again later. */
+                       mod_timer(&np->rx_refill_timer,
+                                 jiffies + (HZ/10));
+                       break;
+               }
+
+               skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */
+               __skb_fill_page_desc(skb, 0, page, 0, 0);
+               skb_shinfo(skb)->nr_frags = 1;
+               __skb_queue_tail(&np->rx_batch, skb);
+       }
+
+       /* Is the batch large enough to be worthwhile? */
+       if (i < (np->rx_target/2)) {
+               if (req_prod > np->rx.sring->req_prod)
+                       goto push;
+               return;
+       }
+
+       /* Adjust our fill target if we risked running out of buffers. */
+       if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
+           ((np->rx_target *= 2) > np->rx_max_target))
+               np->rx_target = np->rx_max_target;
+
+ refill:
+       for (nr_flips = i = 0; ; i++) {
+               if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
+                       break;
+
+               skb->dev = dev;
+
+               id = xennet_rxidx(req_prod + i);
+
+               BUG_ON(np->rx_skbs[id]);
+               np->rx_skbs[id] = skb;
+
+               ref = gnttab_claim_grant_reference(&np->gref_rx_head);
+               BUG_ON((signed short)ref < 0);
+               np->grant_rx_ref[id] = ref;
+
+               page = skb_frag_page(skb_shinfo(skb)->frags);
+               pfn = page_to_pfn(page);
+               vaddr = page_address(page);
+
+               req = RING_GET_REQUEST(&np->rx, req_prod + i);
+               if (!np->copying_receiver) {
+                       gnttab_grant_foreign_transfer_ref(ref,
+                                                         np->xbdev->otherend_id,
+                                                         pfn);
+                       np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /* Remove this page before passing
+                                * back to Xen. */
+                               set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+                               MULTI_update_va_mapping(np->rx_mcl+i,
+                                                       (unsigned long)vaddr,
+                                                       __pte(0), 0);
+                       }
+                       nr_flips++;
+               } else {
+                       gnttab_grant_foreign_access_ref(ref,
+                                                       np->xbdev->otherend_id,
+                                                       pfn_to_mfn(pfn),
+                                                       0);
+               }
+
+               req->id = id;
+               req->gref = ref;
+       }
+
+       if ( nr_flips != 0 ) {
+               /* Tell the balloon driver what is going on. */
+               balloon_update_driver_allowance(i);
+
+               set_xen_guest_handle(reservation.extent_start,
+                                    np->rx_pfn_array);
+               reservation.nr_extents   = nr_flips;
+               reservation.extent_order = 0;
+               reservation.address_bits = 0;
+               reservation.domid        = DOMID_SELF;
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* After all PTEs have been zapped, flush the TLB. */
+                       np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
+                               UVMF_TLB_FLUSH|UVMF_ALL;
+
+                       /* Give away a batch of pages. */
+                       np->rx_mcl[i].op = __HYPERVISOR_memory_op;
+                       np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
+                       np->rx_mcl[i].args[1] = (unsigned long)&reservation;
+
+                       /* Zap PTEs and give away pages in one big
+                        * multicall. */
+                       if (unlikely(HYPERVISOR_multicall(np->rx_mcl, i+1)))
+                               BUG();
+
+                       /* Check return status of HYPERVISOR_memory_op(). */
+                       if (unlikely(np->rx_mcl[i].result != i))
+                               panic("Unable to reduce memory reservation\n");
+                       while (nr_flips--)
+                               BUG_ON(np->rx_mcl[nr_flips].result);
+               } else {
+                       if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
+                                                &reservation) != i)
+                               panic("Unable to reduce memory reservation\n");
+               }
+       } else {
+               wmb();
+       }
+
+       /* Above is a suitable barrier to ensure backend will see requests. */
+       np->rx.req_prod_pvt = req_prod + i;
+ push:
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify);
+       if (notify)
+               notify_remote_via_irq(np->irq);
+}
+
+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+                             struct netif_tx_request *tx)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       char *data = skb->data;
+       unsigned long mfn;
+       RING_IDX prod = np->tx.req_prod_pvt;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+       unsigned int id;
+       grant_ref_t ref;
+       int i;
+       /* Linear area: one more request for each further page it spans. */
+       while (len > PAGE_SIZE - offset) {
+               tx->size = PAGE_SIZE - offset;
+               tx->flags |= XEN_NETTXF_more_data;
+               len -= tx->size;
+               data += tx->size;
+               offset = 0;
+
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb); /* extra skb ref per slot */
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+
+               mfn = virt_to_mfn(data);
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GTF_readonly);
+
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = offset;
+               tx->size = len;
+               tx->flags = 0;
+       }
+       /* Page fragments: one request each, chained via more_data. */
+       for (i = 0; i < frags; i++) {
+               skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+
+               tx->flags |= XEN_NETTXF_more_data;
+
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb); /* extra skb ref per slot */
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+
+               mfn = pfn_to_mfn(page_to_pfn(skb_frag_page(frag)));
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GTF_readonly);
+
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = frag->page_offset;
+               tx->size = skb_frag_size(frag);
+               tx->flags = 0;
+       }
+       /* Record the new private producer index; no push happens here. */
+       np->tx.req_prod_pvt = prod;
+}
+
+/*
+ * Transmit entry point: queue one skb on the shared TX ring.
+ *
+ * If accelerator hooks are installed and their start_xmit() returns
+ * non-zero, the packet is treated as already sent.  Otherwise the start
+ * of the skb's linear area is granted read-only to the backend and
+ * described by the first request, an optional GSO extra-info slot
+ * follows, and xennet_make_frags() fills in the remaining requests.
+ *
+ * Always returns NETDEV_TX_OK; a packet that cannot be sent is counted
+ * in dev->stats.tx_dropped and freed.
+ */
+static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+       unsigned short id;
+       struct netfront_info *np = netdev_priv(dev);
+       struct netfront_stats *stats = this_cpu_ptr(np->stats);
+       struct netif_tx_request *tx;
+       struct netif_extra_info *extra;
+       char *data = skb->data;
+       RING_IDX i;
+       grant_ref_t ref;
+       unsigned long mfn, flags;
+       int notify;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+
+       /* Check the fast path, if hooks are available */
+       if (np->accel_vif_state.hooks && 
+           np->accel_vif_state.hooks->start_xmit(skb, dev)) { 
+               /* Fast path has sent this packet */ 
+               return NETDEV_TX_OK;
+       } 
+
+       /* Add the pages spanned by the linear area to the page frag count. */
+       frags += DIV_ROUND_UP(offset + len, PAGE_SIZE);
+       if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
+               pr_alert("xennet: skb rides the rocket: %d frags\n", frags);
+               dump_stack();
+               goto drop;
+       }
+
+       spin_lock_irqsave(&np->tx_lock, flags);
+
+       /*
+        * Drop if the carrier is down, if the packet needs more than one
+        * slot but scatter-gather is unavailable, or if it still needs
+        * software GSO.
+        */
+       if (unlikely(!netfront_carrier_ok(np) ||
+                    (frags > 1 && !xennet_can_sg(dev)) ||
+                    netif_needs_gso(skb, netif_skb_features(skb)))) {
+               spin_unlock_irqrestore(&np->tx_lock, flags);
+               goto drop;
+       }
+
+       i = np->tx.req_prod_pvt;
+
+       id = get_id_from_freelist(np->tx_skbs);
+       np->tx_skbs[id] = skb;
+
+       tx = RING_GET_REQUEST(&np->tx, i);
+
+       /* First request: grant the page holding the start of the head. */
+       tx->id   = id;
+       ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+       BUG_ON((signed short)ref < 0);
+       mfn = virt_to_mfn(data);
+       gnttab_grant_foreign_access_ref(
+               ref, np->xbdev->otherend_id, mfn, GTF_readonly);
+       tx->gref = np->grant_tx_ref[id] = ref;
+       tx->offset = offset;
+       tx->size = len;
+
+       tx->flags = 0;
+       extra = NULL;
+
+       if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
+               tx->flags |= XEN_NETTXF_csum_blank | XEN_NETTXF_data_validated;
+       else if (skb->ip_summed == CHECKSUM_UNNECESSARY)
+               tx->flags |= XEN_NETTXF_data_validated;
+
+#if HAVE_TSO
+       if (skb_shinfo(skb)->gso_size) {
+               /* The GSO descriptor occupies the ring slot after 'tx'. */
+               struct netif_extra_info *gso = (struct netif_extra_info *)
+                       RING_GET_REQUEST(&np->tx, ++i);
+
+               /* 'extra' is still NULL here, so the else branch is taken. */
+               if (extra)
+                       extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
+               else
+                       tx->flags |= XEN_NETTXF_extra_info;
+
+               gso->u.gso.size = skb_shinfo(skb)->gso_size;
+               gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+               gso->u.gso.pad = 0;
+               gso->u.gso.features = 0;
+
+               gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+               gso->flags = 0;
+               extra = gso;
+       }
+#endif
+
+       /* Account for the first request and the GSO slot, if one was used. */
+       np->tx.req_prod_pvt = i + 1;
+
+       xennet_make_frags(skb, dev, tx);
+       /* The first request's size is overwritten with the whole packet length. */
+       tx->size = skb->len;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
+       if (notify)
+               notify_remote_via_irq(np->irq);
+
+       u64_stats_update_begin(&stats->syncp);
+       stats->tx_bytes += skb->len;
+       stats->tx_packets++;
+       u64_stats_update_end(&stats->syncp);
+       dev->trans_start = jiffies;
+
+       /* Note: It is not safe to access skb after network_tx_buf_gc()! */
+       network_tx_buf_gc(dev);
+
+       if (!netfront_tx_slot_available(np))
+               netif_stop_queue(dev);
+
+       spin_unlock_irqrestore(&np->tx_lock, flags);
+
+       return NETDEV_TX_OK;
+
+       /* Reached with tx_lock not held. */
+ drop:
+       dev->stats.tx_dropped++;
+       dev_kfree_skb(skb);
+       return NETDEV_TX_OK;
+}
+
+/*
+ * Interrupt handler: with tx_lock held, reap completed TX buffers and
+ * schedule NAPI when the RX ring has unconsumed responses.  Does nothing
+ * while the carrier is down.  Always reports IRQ_HANDLED.
+ */
+static irqreturn_t netif_int(int irq, void *dev_id)
+{
+       struct net_device *dev = dev_id;
+       struct netfront_info *np = netdev_priv(dev);
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&np->tx_lock, irqflags);
+
+       if (unlikely(!netfront_carrier_ok(np)))
+               goto out;
+
+       network_tx_buf_gc(dev);
+
+       /* Under tx_lock: protects access to rx shared-ring indexes. */
+       if (!RING_HAS_UNCONSUMED_RESPONSES(&np->rx))
+               goto out;
+
+       netfront_accelerator_call_stop_napi_irq(np, dev);
+       napi_schedule(&np->napi);
+
+out:
+       spin_unlock_irqrestore(&np->tx_lock, irqflags);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Re-post an RX buffer: store @skb and its grant @ref in the table slot
+ * that matches the private request producer, write the request and
+ * advance the producer by one.  The target slot must be empty.
+ */
+static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb,
+                               grant_ref_t ref)
+{
+       RING_IDX prod = np->rx.req_prod_pvt;
+       netif_rx_request_t *req = RING_GET_REQUEST(&np->rx, prod);
+       int slot = xennet_rxidx(prod);
+
+       BUG_ON(np->rx_skbs[slot]);
+
+       np->rx_skbs[slot] = skb;
+       np->grant_rx_ref[slot] = ref;
+
+       req->id = slot;
+       req->gref = ref;
+
+       np->rx.req_prod_pvt = prod + 1;
+}
+
+/*
+ * Copy the chain of extra-info slots that follows the response at
+ * np->rx.rsp_cons into extras[], indexed by (type - 1).
+ *
+ * The skb and grant reference attached to each extra slot are put back
+ * on the ring with xennet_move_rx_slot().  On return np->rx.rsp_cons is
+ * the index of the last slot examined.
+ *
+ * Returns 0, -EBADR if the chain would run into 'rp', or -EINVAL if a
+ * slot carried an invalid type (the walk continues past such a slot).
+ */
+int xennet_get_extras(struct netfront_info *np,
+                     struct netif_extra_info *extras, RING_IDX rp)
+
+{
+       struct netif_extra_info *extra;
+       RING_IDX cons = np->rx.rsp_cons;
+       int err = 0;
+
+       do {
+               struct sk_buff *skb;
+               grant_ref_t ref;
+
+               /* The next slot would be at 'rp', i.e. not produced yet. */
+               if (unlikely(cons + 1 == rp)) {
+                       if (net_ratelimit())
+                               WPRINTK("Missing extra info\n");
+                       err = -EBADR;
+                       break;
+               }
+
+               extra = (struct netif_extra_info *)
+                       RING_GET_RESPONSE(&np->rx, ++cons);
+
+               if (unlikely(!extra->type ||
+                            extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
+                       if (net_ratelimit())
+                               WPRINTK("Invalid extra type: %d\n",
+                                       extra->type);
+                       err = -EINVAL;
+               } else {
+                       memcpy(&extras[extra->type - 1], extra,
+                              sizeof(*extra));
+               }
+
+               /* Recycle the buffer that was posted for this slot. */
+               skb = xennet_get_rx_skb(np, cons);
+               ref = xennet_get_rx_ref(np, cons);
+               xennet_move_rx_slot(np, skb, ref);
+       } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
+
+       np->rx.rsp_cons = cons;
+       return err;
+}
+
+/*
+ * Collect the skbs backing one received packet, starting at rsp_cons.
+ *
+ * Extra-info slots, if flagged, are consumed first through
+ * xennet_get_extras().  Then each response of the packet (chained by
+ * XEN_NETRXF_more_data) is validated, its grant is ended (transfer or
+ * access, depending on np->copying_receiver) and released, and its skb
+ * is appended to 'list'.  In flipping mode the remap operations are
+ * queued in np->rx_mcl / np->rx_mmu and counted via *pages_flipped_p.
+ *
+ * Returns 0 on success or a negative errno.  On error np->rx.rsp_cons is
+ * moved just past the responses examined.
+ */
+static int xennet_get_responses(struct netfront_info *np,
+                               struct netfront_rx_info *rinfo, RING_IDX rp,
+                               struct sk_buff_head *list,
+                               int *pages_flipped_p)
+{
+       int pages_flipped = *pages_flipped_p;
+       struct mmu_update *mmu;
+       struct multicall_entry *mcl;
+       struct netif_rx_response *rx = &rinfo->rx;
+       struct netif_extra_info *extras = rinfo->extras;
+       RING_IDX cons = np->rx.rsp_cons;
+       struct sk_buff *skb = xennet_get_rx_skb(np, cons);
+       grant_ref_t ref = xennet_get_rx_ref(np, cons);
+       /* One more response is allowed when the first fits the copy threshold. */
+       int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD);
+       int frags = 1;
+       int err = 0;
+       unsigned long ret;
+
+       if (rx->flags & XEN_NETRXF_extra_info) {
+               err = xennet_get_extras(np, extras, rp);
+               /* xennet_get_extras() has moved rsp_cons over the extras. */
+               cons = np->rx.rsp_cons;
+       }
+
+       for (;;) {
+               unsigned long mfn;
+
+               if (unlikely(rx->status < 0 ||
+                            rx->offset + rx->status > PAGE_SIZE)) {
+                       if (net_ratelimit())
+                               WPRINTK("rx->offset: %x, size: %u\n",
+                                       rx->offset, rx->status);
+                       xennet_move_rx_slot(np, skb, ref);
+                       err = -EINVAL;
+                       goto next;
+               }
+
+               /*
+                * This definitely indicates a bug, either in this driver or in
+                * the backend driver. In future this should flag the bad
+                * situation to the system controller to reboot the backend.
+                */
+               if (ref == GRANT_INVALID_REF) {
+                       if (net_ratelimit())
+                               WPRINTK("Bad rx response id %d.\n", rx->id);
+                       err = -EINVAL;
+                       goto next;
+               }
+
+               if (!np->copying_receiver) {
+                       /* Memory pressure, insufficient buffer
+                        * headroom, ... */
+                       if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
+                               if (net_ratelimit())
+                                       WPRINTK("Unfulfilled rx req "
+                                               "(id=%d, st=%d).\n",
+                                               rx->id, rx->status);
+                               xennet_move_rx_slot(np, skb, ref);
+                               err = -ENOMEM;
+                               goto next;
+                       }
+
+                       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                               /* Remap the page. */
+                               const struct page *page =
+                                       skb_frag_page(skb_shinfo(skb)->frags);
+                               unsigned long pfn = page_to_pfn(page);
+                               void *vaddr = page_address(page);
+
+                               mcl = np->rx_mcl + pages_flipped;
+                               mmu = np->rx_mmu + pages_flipped;
+
+                               MULTI_update_va_mapping(mcl,
+                                                       (unsigned long)vaddr,
+                                                       pfn_pte_ma(mfn,
+                                                                  PAGE_KERNEL),
+                                                       0);
+                               mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                                       | MMU_MACHPHYS_UPDATE;
+                               mmu->val = pfn;
+
+                               set_phys_to_machine(pfn, mfn);
+                       }
+                       pages_flipped++;
+               } else {
+                       ret = gnttab_end_foreign_access_ref(ref);
+                       BUG_ON(!ret);
+               }
+
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+
+               __skb_queue_tail(list, skb);
+
+next:
+               if (!(rx->flags & XEN_NETRXF_more_data))
+                       break;
+
+               /* More data was promised but nothing further has been produced. */
+               if (cons + frags == rp) {
+                       if (net_ratelimit())
+                               WPRINTK("Need more frags\n");
+                       err = -ENOENT;
+                       break;
+               }
+
+               rx = RING_GET_RESPONSE(&np->rx, cons + frags);
+               skb = xennet_get_rx_skb(np, cons + frags);
+               ref = xennet_get_rx_ref(np, cons + frags);
+               frags++;
+       }
+
+       if (unlikely(frags > max)) {
+               if (net_ratelimit())
+                       WPRINTK("Too many frags\n");
+               err = -E2BIG;
+       }
+
+       if (unlikely(err))
+               np->rx.rsp_cons = cons + frags;
+
+       *pages_flipped_p = pages_flipped;
+
+       return err;
+}
+
+/*
+ * Attach the page of every skb queued on @list to @skb as a further page
+ * fragment, using the offset and length from the matching RX responses
+ * that follow np->rx.rsp_cons.  Each donor skb is freed with its
+ * fragment count cleared first.
+ *
+ * Returns the ring index of the last response consumed.
+ */
+static RING_IDX xennet_fill_frags(struct netfront_info *np,
+                                 struct sk_buff *skb,
+                                 struct sk_buff_head *list)
+{
+       struct skb_shared_info *info = skb_shinfo(skb);
+       RING_IDX idx = np->rx.rsp_cons;
+       struct sk_buff *donor;
+       int slot;
+
+       for (slot = info->nr_frags;
+            (donor = __skb_dequeue(list)) != NULL;
+            slot++) {
+               struct netif_rx_response *rsp;
+
+               idx++;
+               rsp = RING_GET_RESPONSE(&np->rx, idx);
+
+               __skb_fill_page_desc(skb, slot,
+                                    skb_frag_page(skb_shinfo(donor)->frags),
+                                    rsp->offset, rsp->status);
+               skb->data_len += rsp->status;
+
+               /* Clear the donor's fragment count before freeing it. */
+               skb_shinfo(donor)->nr_frags = 0;
+               kfree_skb(donor);
+       }
+
+       info->nr_frags = slot;
+
+       return idx;
+}
+
+static int xennet_set_skb_gso(struct sk_buff *skb,
+                             struct netif_extra_info *gso)
+{
+       if (!gso->u.gso.size) {
+               if (net_ratelimit())
+                       WPRINTK("GSO size must not be zero.\n");
+               return -EINVAL;
+       }
+
+       /* Currently only TCPv4 S.O. is supported. */
+       if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
+               if (net_ratelimit())
+                       WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
+               return -EINVAL;
+       }
+
+#if HAVE_TSO
+       skb_shinfo(skb)->gso_size = gso->u.gso.size;
+#if HAVE_GSO
+       skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
+
+       /* Header must be checked, and gso_segs computed. */
+       skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+#endif
+       skb_shinfo(skb)->gso_segs = 0;
+
+       return 0;
+#else
+       if (net_ratelimit())
+               WPRINTK("GSO unsupported by this kernel.\n");
+       return -EINVAL;
+#endif
+}
+
+/*
+ * NAPI poll handler for the receive ring.
+ *
+ * Under rx_lock, walks the RX responses until the ring is drained or
+ * 'budget' packets have been accepted, building one skb per packet
+ * (head plus page fragments) on a local queue.  Any page-flip remapping
+ * is then completed with a single multicall, after which the packet
+ * heads are copied into the skbs' linear areas and the skbs are handed
+ * to the stack.  Finally the RX ring is refilled and, if budget is left,
+ * the accelerator's netdev_poll hook (when present) gets the remainder.
+ *
+ * Returns the amount of work done (0 if the carrier is down).
+ */
+static int netif_poll(struct napi_struct *napi, int budget)
+{
+       struct netfront_info *np = container_of(napi, struct netfront_info, napi);
+       struct netfront_stats *stats = this_cpu_ptr(np->stats);
+       struct net_device *dev = np->netdev;
+       struct sk_buff *skb;
+       struct netfront_rx_info rinfo;
+       struct netif_rx_response *rx = &rinfo.rx;
+       struct netif_extra_info *extras = rinfo.extras;
+       RING_IDX i, rp;
+       struct multicall_entry *mcl;
+       int work_done, more_to_do = 1, accel_more_to_do = 1;
+       struct sk_buff_head rxq;
+       struct sk_buff_head errq;
+       struct sk_buff_head tmpq;
+       unsigned long flags;
+       unsigned int len;
+       int pages_flipped = 0;
+       int err;
+
+       spin_lock(&np->rx_lock); /* no need for spin_lock_bh() in ->poll() */
+
+       if (unlikely(!netfront_carrier_ok(np))) {
+               spin_unlock(&np->rx_lock);
+               return 0;
+       }
+
+       /*
+        * rxq:  completed packets, delivered after the remap multicall.
+        * errq: skbs of failed packets, purged further down.
+        * tmpq: skbs of the packet currently being assembled.
+        */
+       skb_queue_head_init(&rxq);
+       skb_queue_head_init(&errq);
+       skb_queue_head_init(&tmpq);
+
+       rp = np->rx.sring->rsp_prod;
+       rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+       i = np->rx.rsp_cons;
+       work_done = 0;
+       while ((i != rp) && (work_done < budget)) {
+               /* Work on a private copy of the first response. */
+               memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
+               memset(extras, 0, sizeof(rinfo.extras));
+
+               err = xennet_get_responses(np, &rinfo, rp, &tmpq,
+                                          &pages_flipped);
+
+               if (unlikely(err)) {
+err:   
+                       /* Park the skbs and resume at the updated rsp_cons. */
+                       while ((skb = __skb_dequeue(&tmpq)))
+                               __skb_queue_tail(&errq, skb);
+                       dev->stats.rx_errors++;
+                       i = np->rx.rsp_cons;
+                       continue;
+               }
+
+               skb = __skb_dequeue(&tmpq);
+
+               if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
+                       struct netif_extra_info *gso;
+                       gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
+
+                       if (unlikely(xennet_set_skb_gso(skb, gso))) {
+                               /* Skip every response belonging to this packet. */
+                               __skb_queue_head(&tmpq, skb);
+                               np->rx.rsp_cons += skb_queue_len(&tmpq);
+                               goto err;
+                       }
+               }
+
+               /* Remember where the head data lives; it is copied later. */
+               NETFRONT_SKB_CB(skb)->page =
+                       skb_frag_page(skb_shinfo(skb)->frags);
+               NETFRONT_SKB_CB(skb)->offset = rx->offset;
+
+               /* At most RX_COPY_THRESHOLD bytes go into the linear area. */
+               len = rx->status;
+               if (len > RX_COPY_THRESHOLD)
+                       len = RX_COPY_THRESHOLD;
+               skb_put(skb, len);
+
+               if (rx->status > len) {
+                       /* The rest of the first page stays as fragment 0. */
+                       skb_shinfo(skb)->frags[0].page_offset =
+                               rx->offset + len;
+                       skb_frag_size_set(skb_shinfo(skb)->frags,
+                                         rx->status - len);
+                       skb->data_len = rx->status - len;
+               } else {
+                       /* Everything fits in the head: detach the page. */
+                       __skb_fill_page_desc(skb, 0, NULL, 0, 0);
+                       skb_shinfo(skb)->nr_frags = 0;
+               }
+
+               i = xennet_fill_frags(np, skb, &tmpq);
+
+               /*
+                * Truesize must approximate the size of true data plus
+                * any supervisor overheads. Adding hypervisor overheads
+                * has been shown to significantly reduce achievable
+                * bandwidth with the default receive buffer size. It is
+                * therefore not wise to account for it here.
+                *
+                * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
+                * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
+                * add the size of the data pulled in xennet_fill_frags().
+                *
+                * We also adjust for any unused space in the main data
+                * area by subtracting (RX_COPY_THRESHOLD - len). This is
+                * especially important with drivers which split incoming
+                * packets into header and data, using only 66 bytes of
+                * the main data area (see the e1000 driver for example.)
+                * On such systems, without this last adjustment, our
+                * achievable receive throughput using the standard receive
+                * buffer size was cut by 25%(!!!).
+                */
+               skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
+               skb->len += skb->data_len;
+
+               if (rx->flags & XEN_NETRXF_csum_blank)
+                       skb->ip_summed = CHECKSUM_PARTIAL;
+               else if (rx->flags & XEN_NETRXF_data_validated)
+                       skb->ip_summed = CHECKSUM_UNNECESSARY;
+               else
+                       skb->ip_summed = CHECKSUM_NONE;
+
+               u64_stats_update_begin(&stats->syncp);
+               stats->rx_packets++;
+               stats->rx_bytes += skb->len;
+               u64_stats_update_end(&stats->syncp);
+
+               __skb_queue_tail(&rxq, skb);
+
+               np->rx.rsp_cons = ++i;
+               work_done++;
+       }
+
+       if (pages_flipped) {
+               /* Some pages are no longer absent... */
+               balloon_update_driver_allowance(-pages_flipped);
+
+               /* Do all the remapping work and M2P updates. */
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* The mmu_update call goes after the va-mapping entries. */
+                       mcl = np->rx_mcl + pages_flipped;
+                       mcl->op = __HYPERVISOR_mmu_update;
+                       mcl->args[0] = (unsigned long)np->rx_mmu;
+                       mcl->args[1] = pages_flipped;
+                       mcl->args[2] = 0;
+                       mcl->args[3] = DOMID_SELF;
+                       err = HYPERVISOR_multicall_check(np->rx_mcl,
+                                                        pages_flipped + 1,
+                                                        NULL);
+                       BUG_ON(err);
+               }
+       }
+
+       __skb_queue_purge(&errq);
+
+       while ((skb = __skb_dequeue(&rxq)) != NULL) {
+               struct page *page = NETFRONT_SKB_CB(skb)->page;
+               void *vaddr = page_address(page);
+               unsigned offset = NETFRONT_SKB_CB(skb)->offset;
+
+               /* Copy the packet head from the first page into the linear area. */
+               memcpy(skb->data, vaddr + offset, skb_headlen(skb));
+
+               /* Free the first page unless it is still attached as fragment 0. */
+               if (page != skb_frag_page(skb_shinfo(skb)->frags))
+                       __free_page(page);
+
+               /* Ethernet work: Delayed to here as it peeks the header. */
+               skb->protocol = eth_type_trans(skb, dev);
+
+               if (skb_checksum_setup(skb, &np->rx_gso_csum_fixups)) {
+                       kfree_skb(skb);
+                       continue;
+               }
+
+               /* Pass it up. */
+               netif_receive_skb(skb);
+       }
+
+       /* If we get a callback with very few responses, reduce fill target. */
+       /* NB. Note exponential increase, linear decrease. */
+       if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) >
+            ((3*np->rx_target) / 4)) &&
+           (--np->rx_target < np->rx_min_target))
+               np->rx_target = np->rx_min_target;
+
+       network_alloc_rx_buffers(dev);
+
+       if (work_done < budget) {
+               /* there's some spare capacity, try the accelerated path */
+               int accel_budget = budget - work_done;
+               int accel_budget_start = accel_budget;
+
+               if (np->accel_vif_state.hooks) { 
+                       accel_more_to_do =  
+                               np->accel_vif_state.hooks->netdev_poll 
+                               (dev, &accel_budget); 
+                       work_done += (accel_budget_start - accel_budget); 
+               } else
+                       accel_more_to_do = 0;
+       }
+
+       if (work_done < budget) {
+               local_irq_save(flags);
+
+               RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do);
+
+               if (!more_to_do && !accel_more_to_do && 
+                   np->accel_vif_state.hooks) {
+                       /* 
+                        *  Slow path has nothing more to do, see if
+                        *  fast path is likewise
+                        */
+                       accel_more_to_do = 
+                               np->accel_vif_state.hooks->start_napi_irq(dev);
+               }
+
+               /* Leave polling mode only when both paths report no more work. */
+               if (!more_to_do && !accel_more_to_do)
+                       __napi_complete(napi);
+
+               local_irq_restore(flags);
+       }
+
+       spin_unlock(&np->rx_lock);
+       
+       return work_done;
+}
+
+/*
+ * Drop every skb still recorded in the TX table: end and release its
+ * grant, hand the slot back to the freelist and free the skb.
+ */
+static void netif_release_tx_bufs(struct netfront_info *np)
+{
+       int id;
+
+       for (id = 1; id <= NET_TX_RING_SIZE; id++) {
+               struct sk_buff *skb = np->tx_skbs[id];
+               grant_ref_t gref;
+
+               /* Entries below PAGE_OFFSET are not skb pointers; skip them. */
+               if ((unsigned long)skb < PAGE_OFFSET)
+                       continue;
+
+               gref = np->grant_tx_ref[id];
+               gnttab_end_foreign_access_ref(gref);
+               gnttab_release_grant_reference(&np->gref_tx_head, gref);
+               np->grant_tx_ref[id] = GRANT_INVALID_REF;
+
+               add_id_to_freelist(np->tx_skbs, id);
+               dev_kfree_skb_irq(skb);
+       }
+}
+
+/*
+ * Reclaim all posted RX buffers whose grants are page-transfer grants.
+ *
+ * Under rx_lock, every slot with a valid grant has its transfer ended
+ * and its grant released.  If the transfer yielded no frame (mfn == 0)
+ * the page goes to balloon_release_driver_page() and the skb is freed
+ * with its fragment count cleared.  Otherwise the page is remapped
+ * (unless the guest is auto-translated) and the skb is freed once the
+ * batched multicall has completed.
+ */
+static void netif_release_rx_bufs_flip(struct netfront_info *np)
+{
+       struct mmu_update      *mmu = np->rx_mmu;
+       struct multicall_entry *mcl = np->rx_mcl;
+       struct sk_buff_head free_list;
+       struct sk_buff *skb;
+       unsigned long mfn;
+       int xfer = 0, noxfer = 0, unused = 0;
+       int id, ref, rc;
+
+       skb_queue_head_init(&free_list);
+
+       spin_lock_bh(&np->rx_lock);
+
+       for (id = 0; id < NET_RX_RING_SIZE; id++) {
+               struct page *page;
+
+               if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) {
+                       unused++;
+                       continue;
+               }
+
+               skb = np->rx_skbs[id];
+               mfn = gnttab_end_foreign_transfer_ref(ref);
+               gnttab_release_grant_reference(&np->gref_rx_head, ref);
+               np->grant_rx_ref[id] = GRANT_INVALID_REF;
+               add_id_to_freelist(np->rx_skbs, id);
+
+               page = skb_frag_page(skb_shinfo(skb)->frags);
+
+               /* No frame came back for this slot. */
+               if (0 == mfn) {
+                       balloon_release_driver_page(page);
+                       skb_shinfo(skb)->nr_frags = 0;
+                       dev_kfree_skb(skb);
+                       noxfer++;
+                       continue;
+               }
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Remap the page. */
+                       unsigned long pfn = page_to_pfn(page);
+                       void *vaddr = page_address(page);
+
+                       MULTI_update_va_mapping(mcl, (unsigned long)vaddr,
+                                               pfn_pte_ma(mfn, PAGE_KERNEL),
+                                               0);
+                       mcl++;
+                       mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE;
+                       mmu->val = pfn;
+                       mmu++;
+
+                       set_phys_to_machine(pfn, mfn);
+               }
+               /* Freed only after the multicall below has run. */
+               __skb_queue_tail(&free_list, skb);
+               xfer++;
+       }
+
+       DPRINTK("%s: %d xfer, %d noxfer, %d unused\n",
+               __FUNCTION__, xfer, noxfer, unused);
+
+       if (xfer) {
+               /* Some pages are no longer absent... */
+               balloon_update_driver_allowance(-xfer);
+
+               if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+                       /* Do all the remapping work and M2P updates. */
+                       mcl->op = __HYPERVISOR_mmu_update;
+                       mcl->args[0] = (unsigned long)np->rx_mmu;
+                       mcl->args[1] = mmu - np->rx_mmu;
+                       mcl->args[2] = 0;
+                       mcl->args[3] = DOMID_SELF;
+                       mcl++;
+                       rc = HYPERVISOR_multicall_check(
+                               np->rx_mcl, mcl - np->rx_mcl, NULL);
+                       BUG_ON(rc);
+               }
+       }
+
+       __skb_queue_purge(&free_list);
+
+       spin_unlock_bh(&np->rx_lock);
+}
+
+/*
+ * Reclaim posted RX buffers whose grants are foreign-access grants.
+ *
+ * Under rx_lock, each slot with a valid grant has its access ended.  On
+ * success the grant is released, the slot returns to the freelist and
+ * the skb is freed.  If ending access fails, the slot is left untouched
+ * and only counted as busy.
+ */
+static void netif_release_rx_bufs_copy(struct netfront_info *np)
+{
+       int id;
+       int busy = 0, inuse = 0;
+
+       spin_lock_bh(&np->rx_lock);
+
+       for (id = 0; id < NET_RX_RING_SIZE; id++) {
+               struct sk_buff *skb;
+               int ref = np->grant_rx_ref[id];
+
+               if (ref == GRANT_INVALID_REF)
+                       continue;
+
+               inuse++;
+               skb = np->rx_skbs[id];
+
+               if (gnttab_end_foreign_access_ref(ref)) {
+                       gnttab_release_grant_reference(&np->gref_rx_head, ref);
+                       np->grant_rx_ref[id] = GRANT_INVALID_REF;
+                       add_id_to_freelist(np->rx_skbs, id);
+
+                       dev_kfree_skb(skb);
+               } else {
+                       busy++;
+               }
+       }
+
+       if (busy)
+               DPRINTK("%s: Unable to release %d of %d inuse grant references out of %ld total.\n",
+                       __FUNCTION__, busy, inuse, NET_RX_RING_SIZE);
+
+       spin_unlock_bh(&np->rx_lock);
+}
+
+/* Stop the interface: halt the TX queue and disable NAPI. Returns 0. */
+static int network_close(struct net_device *dev)
+{
+       struct netfront_info *np;
+
+       np = netdev_priv(dev);
+
+       netif_stop_queue(np->netdev);
+       napi_disable(&np->napi);
+
+       return 0;
+}
+
+
+/*
+ * Set a new MAC address, given as a struct sockaddr in @p.
+ *
+ * Returns -EBUSY while the interface is running, -EADDRNOTAVAIL for an
+ * invalid Ethernet address, and 0 once both dev->dev_addr and np->mac
+ * have been updated.
+ */
+static int xennet_set_mac_address(struct net_device *dev, void *p)
+{
+       struct netfront_info *np;
+       struct sockaddr *sa;
+
+       if (netif_running(dev))
+               return -EBUSY;
+
+       sa = p;
+       if (!is_valid_ether_addr(sa->sa_data))
+               return -EADDRNOTAVAIL;
+
+       np = netdev_priv(dev);
+       memcpy(dev->dev_addr, sa->sa_data, dev->addr_len);
+       memcpy(np->mac, sa->sa_data, ETH_ALEN);
+
+       return 0;
+}
+
+/*
+ * Change the MTU.  The upper limit is 65535 - ETH_HLEN when
+ * scatter-gather can be used and ETH_DATA_LEN otherwise.  Returns
+ * -EINVAL above the limit, 0 on success.
+ */
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+       int limit;
+
+       if (xennet_can_sg(dev))
+               limit = 65535 - ETH_HLEN;
+       else
+               limit = ETH_DATA_LEN;
+
+       if (mtu > limit)
+               return -EINVAL;
+
+       dev->mtu = mtu;
+
+       return 0;
+}
+
+/*
+ * Fill @tot with the device statistics: the per-CPU packet and byte
+ * counters are summed over all possible CPUs (each CPU's set is read
+ * under its u64_stats sequence), while rx_errors and tx_dropped are
+ * taken from dev->stats.  Returns @tot.
+ */
+static struct rtnl_link_stats64 *xennet_get_stats64(struct net_device *dev,
+                                                   struct rtnl_link_stats64 *tot)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       int cpu;
+
+       netfront_accelerator_call_get_stats(np, dev);
+
+       for_each_possible_cpu(cpu) {
+               struct netfront_stats *pcpu = per_cpu_ptr(np->stats, cpu);
+               u64 rxp, rxb, txp, txb;
+               unsigned int seq;
+
+               /* Re-read until a consistent snapshot has been taken. */
+               do {
+                       seq = u64_stats_fetch_begin_bh(&pcpu->syncp);
+
+                       rxp = pcpu->rx_packets;
+                       rxb = pcpu->rx_bytes;
+                       txp = pcpu->tx_packets;
+                       txb = pcpu->tx_bytes;
+               } while (u64_stats_fetch_retry_bh(&pcpu->syncp, seq));
+
+               tot->rx_packets += rxp;
+               tot->rx_bytes   += rxb;
+               tot->tx_packets += txp;
+               tot->tx_bytes   += txb;
+       }
+
+       tot->rx_errors  = dev->stats.rx_errors;
+       tot->tx_dropped = dev->stats.tx_dropped;
+
+       return tot;
+}
+
+/*
+ * Driver-private ethtool statistics.  'offset' is the position of the
+ * counter inside struct netfront_info, expressed in units of long.
+ */
+static const struct xennet_stat {
+       char name[ETH_GSTRING_LEN];
+       u16 offset;
+} xennet_stats[] = {
+       {
+               .name   = "rx_gso_csum_fixups",
+               .offset = offsetof(struct netfront_info, rx_gso_csum_fixups)
+                         / sizeof(long),
+       },
+};
+
+/*
+ * Report the number of entries in string set @sset: the size of
+ * xennet_stats[] for ETH_SS_STATS, -EOPNOTSUPP for any other set.
+ */
+static int xennet_get_sset_count(struct net_device *dev, int sset)
+{
+       if (sset == ETH_SS_STATS)
+               return ARRAY_SIZE(xennet_stats);
+
+       return -EOPNOTSUPP;
+}
+
+/*
+ * Copy each statistic listed in xennet_stats[] into @data.  The private
+ * area is viewed as an array of unsigned long and indexed by the
+ * long-sized offsets stored in the table.
+ */
+static void xennet_get_ethtool_stats(struct net_device *dev,
+                                    struct ethtool_stats *stats, u64 *data)
+{
+       unsigned long *words = netdev_priv(dev);
+       const struct xennet_stat *desc = xennet_stats;
+       unsigned int n;
+
+       for (n = 0; n < ARRAY_SIZE(xennet_stats); n++, desc++)
+               data[n] = words[desc->offset];
+}
+
+/*
+ * Copy the statistic names for ETH_SS_STATS into @data, one
+ * ETH_GSTRING_LEN-sized record per entry.  Other string sets are
+ * ignored.
+ */
+static void xennet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+       unsigned int i;
+
+       if (stringset != ETH_SS_STATS)
+               return;
+
+       for (i = 0; i < ARRAY_SIZE(xennet_stats); i++, data += ETH_GSTRING_LEN)
+               memcpy(data, xennet_stats[i].name, ETH_GSTRING_LEN);
+}
+
+/*
+ * ethtool get_drvinfo: report the driver name and the name of the
+ * parent (bus) device.  Both strings are copied with strlcpy() bounded
+ * by the destination array, so neither field can be overrun and both
+ * are always NUL-terminated.
+ */
+static void netfront_get_drvinfo(struct net_device *dev,
+                                struct ethtool_drvinfo *info)
+{
+       strlcpy(info->driver, "netfront", ARRAY_SIZE(info->driver));
+       strlcpy(info->bus_info, dev_name(dev->dev.parent),
+               ARRAY_SIZE(info->bus_info));
+}
+
+/*
+ * (Re)connect the device to its backend.
+ *
+ * Reads the backend's feature-rx-copy / feature-rx-flip keys to choose
+ * the receive mode and runs talk_to_backend().  Then, with rx_lock and
+ * tx_lock held, it drops all pending TX buffers, compacts and re-grants
+ * the outstanding RX buffers, turns the carrier on and notifies the
+ * backend.
+ *
+ * Returns 0 on success or the error from talk_to_backend().
+ */
+static int network_connect(struct net_device *dev)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       int i, requeue_idx, err;
+       struct sk_buff *skb;
+       grant_ref_t ref;
+       netif_rx_request_t *req;
+       unsigned int feature_rx_copy, feature_rx_flip;
+
+       /* A key that cannot be read means: copy unsupported, flip supported. */
+       err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+                          "feature-rx-copy", "%u", &feature_rx_copy);
+       if (err != 1)
+               feature_rx_copy = 0;
+       err = xenbus_scanf(XBT_NIL, np->xbdev->otherend,
+                          "feature-rx-flip", "%u", &feature_rx_flip);
+       if (err != 1)
+               feature_rx_flip = 1;
+
+       /*
+        * Copy packets on receive path if:
+        *  (a) This was requested by user, and the backend supports it; or
+        *  (b) Flipping was requested, but this is unsupported by the backend.
+        */
+       np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) ||
+                               (MODPARM_rx_flip && !feature_rx_flip));
+
+       err = talk_to_backend(np->xbdev, np);
+       if (err)
+               return err;
+
+       rtnl_lock();
+       netdev_update_features(dev);
+       rtnl_unlock();
+
+       DPRINTK("device %s has %sing receive path.\n",
+               dev->name, np->copying_receiver ? "copy" : "flipp");
+
+       spin_lock_bh(&np->rx_lock);
+       spin_lock_irq(&np->tx_lock);
+
+       /*
+        * Recovery procedure:
+        *  NB. Freelist index entries are always going to be less than
+        *  PAGE_OFFSET, whereas pointers to skbs will always be equal or
+        *  greater than PAGE_OFFSET: we use this property to distinguish
+        *  them.
+        */
+
+       /* Step 1: Discard all pending TX packet fragments. */
+       netif_release_tx_bufs(np);
+
+       /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */
+       for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) {
+               unsigned long pfn;
+
+               if (!np->rx_skbs[i])
+                       continue;
+
+               /* Move the buffer from slot i down to slot requeue_idx. */
+               skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i);
+               ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i);
+               req = RING_GET_REQUEST(&np->rx, requeue_idx);
+               pfn = page_to_pfn(skb_frag_page(skb_shinfo(skb)->frags));
+
+               /* Re-grant the page in the mode chosen above. */
+               if (!np->copying_receiver) {
+                       gnttab_grant_foreign_transfer_ref(
+                               ref, np->xbdev->otherend_id, pfn);
+               } else {
+                       gnttab_grant_foreign_access_ref(
+                               ref, np->xbdev->otherend_id,
+                               pfn_to_mfn(pfn), 0);
+               }
+               req->gref = ref;
+               req->id   = requeue_idx;
+
+               requeue_idx++;
+       }
+
+       np->rx.req_prod_pvt = requeue_idx;
+
+       /*
+        * Step 3: All public and private state should now be sane.  Get
+        * ready to start sending and receiving packets and give the driver
+        * domain a kick because we've probably just requeued some
+        * packets.
+        */
+       netfront_carrier_on(np);
+       notify_remote_via_irq(np->irq);
+       network_tx_buf_gc(dev);
+       network_alloc_rx_buffers(dev);
+
+       spin_unlock_irq(&np->tx_lock);
+       spin_unlock_bh(&np->rx_lock);
+
+       return 0;
+}
+
+/*
+ * ndo_uninit hook: release every TX and RX buffer still held by the
+ * driver, then free the grant references kept for both rings.
+ */
+static void netif_uninit(struct net_device *dev)
+{
+       struct netfront_info *info = netdev_priv(dev);
+
+       netif_release_tx_bufs(info);
+
+       /* RX teardown depends on the receive mode that was in use. */
+       if (!info->copying_receiver)
+               netif_release_rx_bufs_flip(info);
+       else
+               netif_release_rx_bufs_copy(info);
+
+       gnttab_free_grant_references(info->gref_tx_head);
+       gnttab_free_grant_references(info->gref_rx_head);
+}
+
+/*
+ * ethtool operations for netfront devices; installed on each device by
+ * create_netdev() via SET_ETHTOOL_OPS().
+ */
+static const struct ethtool_ops network_ethtool_ops =
+{
+       .get_drvinfo = netfront_get_drvinfo,
+       .get_link = ethtool_op_get_link,
+
+       /* Driver-specific statistics (count, values, names). */
+       .get_sset_count = xennet_get_sset_count,
+       .get_ethtool_stats = xennet_get_ethtool_stats,
+       .get_strings = xennet_get_strings,
+};
+
+#ifdef CONFIG_SYSFS
+/* sysfs "rxbuf_min" read handler: print the device's rx_min_target. */
+static ssize_t show_rxbuf_min(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       struct netfront_info *np = netdev_priv(netdev);
+
+       return sprintf(buf, "%u\n", np->rx_min_target);
+}
+
+/*
+ * sysfs "rxbuf_min" write handler.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise).  The parsed number is forced
+ * into [RX_MIN_TARGET, RX_MAX_TARGET] and becomes the new rx_min_target;
+ * rx_max_target and rx_target are raised to it if they were lower.
+ * Returns len on success, -EBADMSG if no number could be parsed.
+ */
+static ssize_t store_rxbuf_min(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t len)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       struct netfront_info *np = netdev_priv(netdev);
+       unsigned long val;
+       char *end;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       val = simple_strtoul(buf, &end, 0);
+       if (end == buf)
+               return -EBADMSG;
+
+       if (val < RX_MIN_TARGET)
+               val = RX_MIN_TARGET;
+       if (val > RX_MAX_TARGET)
+               val = RX_MAX_TARGET;
+
+       spin_lock_bh(&np->rx_lock);
+
+       np->rx_min_target = val;
+       /* Keep max and the current target at or above the new minimum. */
+       if (np->rx_max_target < val)
+               np->rx_max_target = val;
+       if (np->rx_target < val)
+               np->rx_target = val;
+
+       network_alloc_rx_buffers(netdev);
+
+       spin_unlock_bh(&np->rx_lock);
+
+       return len;
+}
+
+/* sysfs "rxbuf_max" read handler: print the device's rx_max_target. */
+static ssize_t show_rxbuf_max(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       struct netfront_info *np = netdev_priv(netdev);
+
+       return sprintf(buf, "%u\n", np->rx_max_target);
+}
+
+/*
+ * sysfs "rxbuf_max" write handler.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise).  The parsed number is forced
+ * into [RX_MIN_TARGET, RX_MAX_TARGET] and becomes the new rx_max_target;
+ * rx_min_target and rx_target are lowered to it if they were higher.
+ * Returns len on success, -EBADMSG if no number could be parsed.
+ */
+static ssize_t store_rxbuf_max(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t len)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       struct netfront_info *np = netdev_priv(netdev);
+       unsigned long val;
+       char *end;
+
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       val = simple_strtoul(buf, &end, 0);
+       if (end == buf)
+               return -EBADMSG;
+
+       if (val < RX_MIN_TARGET)
+               val = RX_MIN_TARGET;
+       if (val > RX_MAX_TARGET)
+               val = RX_MAX_TARGET;
+
+       spin_lock_bh(&np->rx_lock);
+
+       np->rx_max_target = val;
+       /* Keep min and the current target at or below the new maximum. */
+       if (np->rx_min_target > val)
+               np->rx_min_target = val;
+       if (np->rx_target > val)
+               np->rx_target = val;
+
+       network_alloc_rx_buffers(netdev);
+
+       spin_unlock_bh(&np->rx_lock);
+
+       return len;
+}
+
+/* sysfs "rxbuf_cur" read handler: print the device's current rx_target. */
+static ssize_t show_rxbuf_cur(struct device *dev,
+                             struct device_attribute *attr, char *buf)
+{
+       struct net_device *netdev = to_net_dev(dev);
+       struct netfront_info *np = netdev_priv(netdev);
+
+       return sprintf(buf, "%u\n", np->rx_target);
+}
+
+/*
+ * Per-device sysfs attributes, created by xennet_sysfs_addif() and removed
+ * by xennet_sysfs_delif().  rxbuf_min and rxbuf_max are world-readable and
+ * owner-writable; rxbuf_cur is read-only (no store handler).
+ */
+static struct device_attribute xennet_attrs[] = {
+       __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min),
+       __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max),
+       __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL),
+};
+
+/*
+ * Create every attribute in xennet_attrs[] on the net device.
+ *
+ * If one creation fails, the attributes created so far are removed again
+ * (most recent first) and the error from device_create_file() is returned.
+ * Returns 0 when all attributes were created.
+ */
+static int xennet_sysfs_addif(struct net_device *netdev)
+{
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(xennet_attrs); idx++) {
+               int rc = device_create_file(&netdev->dev,
+                                           &xennet_attrs[idx]);
+
+               if (!rc)
+                       continue;
+
+               /* Roll back entries idx-1 .. 0. */
+               while (idx-- > 0)
+                       device_remove_file(&netdev->dev,
+                                          &xennet_attrs[idx]);
+               return rc;
+       }
+
+       return 0;
+}
+
+/* Remove every attribute in xennet_attrs[] from the net device. */
+static void xennet_sysfs_delif(struct net_device *netdev)
+{
+       int idx = 0;
+
+       while (idx < ARRAY_SIZE(xennet_attrs)) {
+               device_remove_file(&netdev->dev, &xennet_attrs[idx]);
+               idx++;
+       }
+}
+
+#endif /* CONFIG_SYSFS */
+
+
+/*
+ * ndo_set_rx_mode hook (see xennet_netdev_ops); intentionally empty.
+ *
+ * Nothing to do here. Virtual interface is point-to-point and the
+ * physical interface is probably promiscuous anyway.
+ */
+static void network_set_multicast_list(struct net_device *dev)
+{
+}
+
+/*
+ * Read an integer feature node from the other end's xenstore directory.
+ * A failed read is reported as 0.
+ */
+static int xennet_backend_feature(struct netfront_info *np, const char *node)
+{
+       int enabled;
+
+       if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, node,
+                        "%d", &enabled) < 0)
+               enabled = 0;
+
+       return enabled;
+}
+
+/*
+ * ndo_fix_features hook: drop NETIF_F_SG and NETIF_F_TSO from the requested
+ * set unless the other end advertises "feature-sg" / "feature-gso-tcpv4".
+ * Xenstore is only consulted for a feature that was actually requested.
+ */
+static netdev_features_t xennet_fix_features(struct net_device *dev,
+                                            netdev_features_t features)
+{
+       struct netfront_info *np = netdev_priv(dev);
+
+       if ((features & NETIF_F_SG) &&
+           !xennet_backend_feature(np, "feature-sg"))
+               features &= ~NETIF_F_SG;
+
+       if ((features & NETIF_F_TSO) &&
+           !xennet_backend_feature(np, "feature-gso-tcpv4"))
+               features &= ~NETIF_F_TSO;
+
+       return features;
+}
+
+/*
+ * ndo_set_features hook.
+ *
+ * When scatter-gather is not among the features and the MTU is above
+ * ETH_DATA_LEN, the MTU is reduced to ETH_DATA_LEN.  Always returns 0.
+ */
+static int xennet_set_features(struct net_device *dev,
+                              netdev_features_t features)
+{
+       if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN) {
+               /* Terminate the message so it is emitted as a complete line. */
+               netdev_info(dev, "Reducing MTU because no SG offload\n");
+               dev->mtu = ETH_DATA_LEN;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/*
+ * ndo_poll_controller hook: invoke the driver's interrupt handler
+ * netif_int() directly, passing 0 as the irq number.
+ */
+static void xennet_poll_controller(struct net_device *dev)
+{
+       netif_int(0, dev);
+}
+#endif
+
+/*
+ * net_device operations for netfront devices; installed on each device by
+ * create_netdev().
+ */
+static const struct net_device_ops xennet_netdev_ops = {
+       .ndo_uninit             = netif_uninit,
+       .ndo_open               = network_open,
+       .ndo_stop               = network_close,
+       .ndo_start_xmit         = network_start_xmit,
+       /* Empty handler: see network_set_multicast_list(). */
+       .ndo_set_rx_mode        = network_set_multicast_list,
+       .ndo_set_mac_address    = xennet_set_mac_address,
+       .ndo_validate_addr      = eth_validate_addr,
+       .ndo_fix_features       = xennet_fix_features,
+       .ndo_set_features       = xennet_set_features,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+       .ndo_poll_controller = xennet_poll_controller,
+#endif
+       .ndo_change_mtu         = xennet_change_mtu,
+       .ndo_get_stats64        = xennet_get_stats64,
+};
+
+/*
+ * Allocate and initialise a net_device (with its netfront_info private
+ * area) for the given xenbus device.
+ *
+ * Sets up locks, RX target defaults, the RX refill timer, per-CPU
+ * statistics, the TX/RX bookkeeping arrays and the grant reference pools,
+ * then installs the netdev/ethtool operations and initial feature flags.
+ * The device is left with carrier off.
+ *
+ * Returns the new net_device, or ERR_PTR(-ENOMEM) on any allocation
+ * failure (everything allocated so far is released again).
+ */
+static struct net_device * __devinit create_netdev(struct xenbus_device *dev)
+{
+       int i, err = 0;
+       struct net_device *netdev = NULL;
+       struct netfront_info *np = NULL;
+
+       netdev = alloc_etherdev(sizeof(struct netfront_info));
+       if (!netdev)
+               return ERR_PTR(-ENOMEM);
+
+       np                   = netdev_priv(netdev);
+       np->xbdev            = dev;
+
+       spin_lock_init(&np->tx_lock);
+       spin_lock_init(&np->rx_lock);
+
+       init_accelerator_vif(np, dev);
+
+       skb_queue_head_init(&np->rx_batch);
+       np->rx_target     = RX_DFL_MIN_TARGET;
+       np->rx_min_target = RX_DFL_MIN_TARGET;
+       np->rx_max_target = RX_MAX_TARGET;
+
+       /* The refill timer calls rx_refill_timeout() with the netdev as data. */
+       init_timer(&np->rx_refill_timer);
+       np->rx_refill_timer.data = (unsigned long)netdev;
+       np->rx_refill_timer.function = rx_refill_timeout;
+
+       err = -ENOMEM;
+       np->stats = alloc_percpu(struct netfront_stats);
+       if (np->stats == NULL)
+               goto exit;
+
+       /*
+        * Initialise tx_skbs as a free chain containing every entry: slot i
+        * holds the index i+1, stored as a pointer-sized integer.  Note the
+        * arrays have NET_TX_RING_SIZE+1 elements, hence the <= bound.
+        */
+       for (i = 0; i <= NET_TX_RING_SIZE; i++) {
+               np->tx_skbs[i] = (void *)((unsigned long) i+1);
+               np->grant_tx_ref[i] = GRANT_INVALID_REF;
+       }
+
+       /* rx_skbs starts out empty. */
+       for (i = 0; i < NET_RX_RING_SIZE; i++) {
+               np->rx_skbs[i] = NULL;
+               np->grant_rx_ref[i] = GRANT_INVALID_REF;
+       }
+
+       /* A grant for every tx ring slot */
+       if (gnttab_alloc_grant_references(TX_MAX_TARGET,
+                                         &np->gref_tx_head) < 0) {
+               pr_alert("#### netfront can't alloc tx grant refs\n");
+               err = -ENOMEM;
+               goto exit_free_stats;
+       }
+       /* A grant for every rx ring slot */
+       if (gnttab_alloc_grant_references(RX_MAX_TARGET,
+                                         &np->gref_rx_head) < 0) {
+               pr_alert("#### netfront can't alloc rx grant refs\n");
+               err = -ENOMEM;
+               goto exit_free_tx;
+       }
+
+       netdev->netdev_ops      = &xennet_netdev_ops;
+       netif_napi_add(netdev, &np->napi, netif_poll, 64);
+       netdev->features        = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
+                                 NETIF_F_GSO_ROBUST;
+       netdev->hw_features     = NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO;
+
+       /*
+        * Assume that all hw features are available for now. This set
+        * will be adjusted by the call to netdev_update_features() in
+        * network_connect() which is the earliest point where we can
+        * negotiate with the backend regarding supported features.
+        */
+       netdev->features |= netdev->hw_features;
+
+       SET_ETHTOOL_OPS(netdev, &network_ethtool_ops);
+       SET_NETDEV_DEV(netdev, &dev->dev);
+
+       np->netdev = netdev;
+
+       netfront_carrier_off(np);
+
+       return netdev;
+
+       /* Error unwinding, in reverse order of acquisition. */
+ exit_free_tx:
+       gnttab_free_grant_references(np->gref_tx_head);
+ exit_free_stats:
+       free_percpu(np->stats);
+ exit:
+       free_netdev(netdev);
+       return ERR_PTR(err);
+}
+
+/*
+ * End foreign access to the TX and RX shared ring pages and reset the
+ * corresponding ring references and ring pointers.
+ */
+static void netif_release_rings(struct netfront_info *info)
+{
+       /* Transmit ring. */
+       end_access(info->tx_ring_ref, info->tx.sring);
+       info->tx_ring_ref = GRANT_INVALID_REF;
+       info->tx.sring = NULL;
+
+       /* Receive ring. */
+       end_access(info->rx_ring_ref, info->rx.sring);
+       info->rx_ring_ref = GRANT_INVALID_REF;
+       info->rx.sring = NULL;
+}
+
+/*
+ * Detach from the backend: mark the carrier off under both ring locks,
+ * unbind the interrupt handler if one is bound, and release the rings.
+ */
+static void netif_disconnect_backend(struct netfront_info *info)
+{
+       /* Stop old i/f to prevent errors whilst we rebuild the state. */
+       spin_lock_bh(&info->rx_lock);
+       spin_lock_irq(&info->tx_lock);
+
+       netfront_carrier_off(info);
+
+       spin_unlock_irq(&info->tx_lock);
+       spin_unlock_bh(&info->rx_lock);
+
+       /* irq == 0 means no handler is bound. */
+       if (info->irq) {
+               unbind_from_irqhandler(info->irq, info->netdev);
+               info->irq = 0;
+       }
+
+       netif_release_rings(info);
+}
+
+
+/*
+ * End foreign access to a granted page; a no-op when ref is
+ * GRANT_INVALID_REF.
+ */
+static void end_access(int ref, void *page)
+{
+       if (ref == GRANT_INVALID_REF)
+               return;
+
+       gnttab_end_foreign_access(ref, (unsigned long)page);
+}
+
+
+/* ** Driver registration ** */
+
+
+/* xenbus device types handled by this driver; the list ends with "". */
+static const struct xenbus_device_id netfront_ids[] = {
+       { "vif" },
+       { "" }
+};
+MODULE_ALIAS("xen:vif");
+
+/*
+ * xenbus driver definition for netfront: wires the probe/remove,
+ * suspend/resume and backend state-change callbacks.  Presumably this macro
+ * defines the netfront_driver object that netif_init() registers --
+ * confirm against the DEFINE_XENBUS_DRIVER definition.
+ */
+static DEFINE_XENBUS_DRIVER(netfront, ,
+       .probe = netfront_probe,
+       .remove = __devexit_p(netfront_remove),
+       .suspend = netfront_suspend,
+       .suspend_cancel = netfront_suspend_cancel,
+       .resume = netfront_resume,
+       .otherend_changed = backend_changed,
+);
+
+
+/*
+ * Module init.
+ *
+ * Returns -ENODEV when not running on Xen, -EINVAL when both rx_copy and
+ * rx_flip were requested (CONFIG_XEN builds), otherwise the result of
+ * registering the xenbus frontend driver.
+ */
+static int __init netif_init(void)
+{
+       int err;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+#ifdef CONFIG_XEN
+       if (MODPARM_rx_flip && MODPARM_rx_copy) {
+               WPRINTK("Cannot specify both rx_copy and rx_flip.\n");
+               return -EINVAL;
+       }
+
+       if (!MODPARM_rx_flip && !MODPARM_rx_copy)
+               MODPARM_rx_copy = true; /* Default is to copy. */
+#endif
+
+       netif_init_accel();
+
+       IPRINTK("Initialising virtual ethernet driver.\n");
+
+       err = xenbus_register_frontend(&netfront_driver);
+       if (err) {
+               /*
+                * Undo netif_init_accel() so a failed load does not leave
+                * accelerator state behind; netif_exit() pairs the two
+                * calls the same way.
+                * NOTE(review): assumes netif_exit_accel() is safe to call
+                * from init code (i.e. not placed in an __exit section) --
+                * confirm in accel.c.
+                */
+               netif_exit_accel();
+       }
+
+       return err;
+}
+module_init(netif_init);
+
+
+/*
+ * Module exit: unregister the xenbus driver first, then tear down the
+ * accelerator support -- the reverse of the order used in netif_init().
+ */
+static void __exit netif_exit(void)
+{
+       xenbus_unregister_driver(&netfront_driver);
+
+       netif_exit_accel();
+}
+module_exit(netif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/netfront/netfront.h b/drivers/xen/netfront/netfront.h

new file mode 100644 (file)

index 0000000..410a881
--- /dev/null
+++ b/drivers/xen/netfront/netfront.h
@@ -0,0 +1,288 @@
+/******************************************************************************
+ * Virtual network driver for conversing with remote driver backends.
+ *
+ * Copyright (c) 2002-2005, K A Fraser
+ * Copyright (c) 2005, XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NETFRONT_H
+#define NETFRONT_H
+
+#include <xen/interface/io/netif.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE)
+
+#include <xen/xenbus.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+/*
+ * Packet and byte counters.  netfront_info holds these as a __percpu
+ * pointer; syncp is the u64_stats_sync object kept alongside the 64-bit
+ * counters.
+ */
+struct netfront_stats {
+       u64                     rx_packets;
+       u64                     tx_packets;
+       u64                     rx_bytes;
+       u64                     tx_bytes;
+       struct u64_stats_sync   syncp;
+};
+
+/*
+ * Function pointer table for hooks into a network acceleration
+ * plugin.  These are called at appropriate points from the netfront
+ * driver.
+ */
+struct netfront_accel_hooks {
+       /*
+        * new_device: Accelerator hook to ask the plugin to support a
+        * new network interface
+        */
+       int (*new_device)(struct net_device *net_dev, struct xenbus_device *dev);
+       /*
+        * remove: Opposite of new_device
+        */
+       int (*remove)(struct xenbus_device *dev);
+       /*
+        * netdev_poll: The net_device is being polled, check the
+        * accelerated hardware for any pending packets
+        */
+       int (*netdev_poll)(struct net_device *dev, int *pbudget);
+       /*
+        * start_xmit: Used to give the accelerated plugin the option
+        * of sending a packet.  Returns non-zero if has done so, or
+        * zero to decline and force the packet onto normal send
+        * path
+        */
+       int (*start_xmit)(struct sk_buff *skb, struct net_device *dev);
+       /*
+        * start_napi_irq/stop_napi_irq: Used by netfront to indicate
+        * when napi interrupts should be enabled or disabled
+        */
+       int (*start_napi_irq)(struct net_device *dev);
+       void (*stop_napi_irq)(struct net_device *dev);
+       /*
+        * check_ready: Called before re-enabling the TX queue to check
+        * the fast path has slots too
+        */
+       int (*check_ready)(struct net_device *dev);
+       /*
+        * get_stats: Get the fastpath network statistics
+        */
+       int (*get_stats)(struct net_device *dev,
+                        struct net_device_stats *dev_stats,
+                        struct netfront_stats *link_stats);
+};
+
+
+/* Version of API/protocol for communication between netfront and
+   acceleration plugin supported */
+#define NETFRONT_ACCEL_VERSION 0x00010003
+
+/*
+ * Per-netfront device state for the accelerator.  This is used to
+ * allow efficient per-netfront device access to the accelerator
+ * hooks.  An instance is embedded in struct netfront_info as
+ * accel_vif_state.
+ */
+struct netfront_accel_vif_state {
+       /*
+        * List linkage; presumably an entry in an accelerator's
+        * vif_states list -- confirm in the accelerator code.
+        */
+       struct list_head link;
+
+       struct xenbus_device *dev;
+       struct netfront_info *np;
+       /* Hook table of the accelerator plugin (see netfront_accel_hooks) */
+       struct netfront_accel_hooks *hooks;
+
+       /* Watch on the accelerator configuration value */
+       struct xenbus_watch accel_watch;
+       /* Work item to process change in accelerator */
+       struct work_struct accel_work;
+       /* The string from xenbus last time accel_watch fired */
+       char *accel_frontend;
+}; 
+
+/*
+ * Per-accelerator state stored in netfront.  These form a list that
+ * is used to track which devices are accelerated by which plugins,
+ * and what plugins are available/have been requested.
+ */
+struct netfront_accelerator {
+       /* Used to make a list */
+       struct list_head link;
+       /* ID of the accelerator */
+       int id;
+       /*
+        * String describing the accelerator.  Currently this is the
+        * name of the accelerator module.  This is provided by the
+        * backend accelerator through xenstore.
+        */
+       char *frontend;
+       /* The hooks into the accelerator plugin module */
+       struct netfront_accel_hooks *hooks;
+
+       /*
+        * List of per-netfront device state (struct
+        * netfront_accel_vif_state) for each netfront device that is
+        * using this accelerator
+        */
+       struct list_head vif_states;
+       /* Presumably protects vif_states -- confirm in the accelerator code */
+       spinlock_t vif_states_lock;
+};
+
+/*
+ * Per-device private state of a netfront interface (the netdev_priv()
+ * area of the net_device allocated in create_netdev()).
+ */
+struct netfront_info {
+       struct list_head list;
+       /* Back-pointer to the owning net_device */
+       struct net_device *netdev;
+
+       /* Front ends of the shared transmit and receive rings */
+       struct netif_tx_front_ring tx;
+       struct netif_rx_front_ring rx;
+
+       /*
+        * Where both locks are needed, rx_lock is taken first (with
+        * spin_lock_bh) and tx_lock second (with spin_lock_irq).
+        */
+       spinlock_t   tx_lock;
+       spinlock_t   rx_lock;
+
+       struct napi_struct      napi;
+
+       /* Bound interrupt; 0 when no handler is bound */
+       unsigned int irq;
+       /* Non-zero when the receive path copies rather than flips pages */
+       unsigned int copying_receiver;
+       unsigned int carrier;
+
+       /* Receive-ring batched refills. */
+#define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
+#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
+       /* Exposed through the sysfs attributes rxbuf_min/rxbuf_max/rxbuf_cur */
+       unsigned rx_min_target, rx_max_target, rx_target;
+       struct sk_buff_head rx_batch;
+
+       /* Fires rx_refill_timeout() with the net_device as its argument */
+       struct timer_list rx_refill_timer;
+
+       /*
+        * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
+        * is an index into a chain of free entries.
+        */
+       struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
+       struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
+
+#define TX_MAX_TARGET min_t(int, NET_TX_RING_SIZE, 256)
+       /*
+        * gref_{tx,rx}_head: grant references reserved up front with
+        * gnttab_alloc_grant_references().  grant_{tx,rx}_ref[]: per-slot
+        * grant references, GRANT_INVALID_REF when unused.
+        */
+       grant_ref_t gref_tx_head;
+       grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
+       grant_ref_t gref_rx_head;
+       grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
+
+       /* The xenbus device this interface belongs to */
+       struct xenbus_device *xbdev;
+       /* Grant references of the shared ring pages; GRANT_INVALID_REF if none */
+       int tx_ring_ref;
+       int rx_ring_ref;
+       u8 mac[ETH_ALEN];
+
+       unsigned long rx_pfn_array[NET_RX_RING_SIZE];
+       struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
+       struct mmu_update rx_mmu[NET_RX_RING_SIZE];
+
+       /* Statistics */
+       struct netfront_stats __percpu *stats;
+       unsigned long rx_gso_csum_fixups;
+
+       /* Private pointer to state internal to accelerator module */
+       void *accel_priv;
+       /* The accelerator used by this netfront device */
+       struct netfront_accelerator *accelerator;
+       /* The accelerator state for this netfront device */
+       struct netfront_accel_vif_state accel_vif_state;
+};
+
+
+/* Exported Functions */
+
+/*
+ * Called by an accelerator plugin module when it has loaded.
+ *
+ * frontend: the string describing the accelerator, currently the module name 
+ * hooks: the hooks for netfront to use to call into the accelerator
+ * version: the version of API between frontend and plugin requested
+ * 
+ * return: 0 on success, <0 on error, >0 (with version supported) on
+ * version mismatch
+ */
+extern int netfront_accelerator_loaded(int version, const char *frontend, 
+                                      struct netfront_accel_hooks *hooks);
+
+/* 
+ * Called by an accelerator plugin module when it is about to unload.
+ *
+ * frontend: the string describing the accelerator.  Must match the
+ * one passed to netfront_accelerator_loaded()
+ */ 
+extern void netfront_accelerator_stop(const char *frontend);
+
+/* 
+ * Called by an accelerator before waking the net device's TX queue to
+ * ensure the slow path has available slots.  Returns true if OK to
+ * wake, false if still busy 
+ */
+extern int netfront_check_queue_ready(struct net_device *net_dev);
+
+
+/* Internal-to-netfront Functions */
+
+/* 
+ * Call into accelerator and check to see if it has tx space before we
+ * wake the net device's TX queue.  Returns true if OK to wake, false
+ * if still busy
+ */ 
+extern 
+int netfront_check_accelerator_queue_ready(struct net_device *dev,
+                                          struct netfront_info *np);
+extern
+int netfront_accelerator_call_remove(struct netfront_info *np,
+                                    struct xenbus_device *dev);
+extern
+int netfront_accelerator_suspend(struct netfront_info *np,
+                                struct xenbus_device *dev);
+extern
+int netfront_accelerator_suspend_cancel(struct netfront_info *np,
+                                       struct xenbus_device *dev);
+extern
+void netfront_accelerator_resume(struct netfront_info *np,
+                                struct xenbus_device *dev);
+extern
+void netfront_accelerator_call_stop_napi_irq(struct netfront_info *np,
+                                            struct net_device *dev);
+extern
+int netfront_accelerator_call_get_stats(struct netfront_info *np,
+                                       struct net_device *dev);
+extern
+void netfront_accelerator_add_watch(struct netfront_info *np);
+
+extern
+void netif_init_accel(void);
+extern
+void netif_exit_accel(void);
+
+extern
+void init_accelerator_vif(struct netfront_info *np,
+                         struct xenbus_device *dev);
+#endif /* NETFRONT_H */
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c

index b84bf0b..19f694b 100644 (file)
--- a/drivers/xen/pci.c
+++ b/drivers/xen/pci.c
@@ -23,11 +23,20 @@
  #include <xen/interface/physdev.h>
  #include <xen/interface/xen.h>
  
+#ifdef CONFIG_PARAVIRT_XEN
+#define CONFIG_XEN_COMPAT 0x040000
  #include <asm/xen/hypervisor.h>
  #include <asm/xen/hypercall.h>
+#else
+#include <asm/hypervisor.h>
+#endif
  #include "../pci/pci.h"
  
+#if CONFIG_XEN_COMPAT < 0x040200
  static bool __read_mostly pci_seg_supported = true;
+#else
+#define pci_seg_supported true
+#endif
  
  static int xen_add_device(struct device *dev)
  {
@@ -86,7 +95,9 @@ static int xen_add_device(struct device *dev)
                 r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add);
                 if (r != -ENOSYS)
                         return r;
+#if CONFIG_XEN_COMPAT < 0x040200
                 pci_seg_supported = false;
+#endif
         }
  
         if (pci_domain_nr(pci_dev->bus))
diff --git a/drivers/xen/pcifront/Makefile b/drivers/xen/pcifront/Makefile

new file mode 100644 (file)

index 0000000..4ceb18a
--- /dev/null
+++ b/drivers/xen/pcifront/Makefile
@@ -0,0 +1,5 @@
+obj-y += pcifront.o
+
+pcifront-y := pci_op.o xenbus.o pci.o
+
+ccflags-$(CONFIG_XEN_PCIDEV_FE_DEBUG) += -DDEBUG
diff --git a/drivers/xen/pcifront/pci.c b/drivers/xen/pcifront/pci.c

new file mode 100644 (file)

index 0000000..198d65f
--- /dev/null
+++ b/drivers/xen/pcifront/pci.c
@@ -0,0 +1,44 @@
+/*
+ * PCI Frontend Operations - ensure only one PCI frontend runs at a time
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "pcifront.h"
+
+/* Held by pcifront_connect()/pcifront_disconnect() around pcifront_dev. */
+DEFINE_SPINLOCK(pcifront_dev_lock);
+/* The currently installed PCI frontend, or NULL when none is installed. */
+static struct pcifront_device *pcifront_dev = NULL;
+
+/*
+ * Install pdev as the single active PCI frontend.
+ *
+ * Returns 0 on success, or -EEXIST if a frontend is already installed
+ * (the installed frontend is left in place).
+ */
+int pcifront_connect(struct pcifront_device *pdev)
+{
+       int rc;
+
+       spin_lock(&pcifront_dev_lock);
+
+       if (pcifront_dev) {
+               dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n");
+               rc = -EEXIST;
+       } else {
+               dev_info(&pdev->xdev->dev, "Installing PCI frontend\n");
+               pcifront_dev = pdev;
+               rc = 0;
+       }
+
+       spin_unlock(&pcifront_dev_lock);
+
+       return rc;
+}
+
+/*
+ * Uninstall pdev if it is the currently installed PCI frontend; any other
+ * pdev is ignored.
+ */
+void pcifront_disconnect(struct pcifront_device *pdev)
+{
+       spin_lock(&pcifront_dev_lock);
+
+       if (pcifront_dev == pdev) {
+               dev_info(&pdev->xdev->dev,
+                        "Disconnecting PCI Frontend Buses\n");
+               pcifront_dev = NULL;
+       }
+
+       spin_unlock(&pcifront_dev_lock);
+}
diff --git a/drivers/xen/pcifront/pci_op.c b/drivers/xen/pcifront/pci_op.c

new file mode 100644 (file)

index 0000000..a901a35
--- /dev/null
+++ b/drivers/xen/pcifront/pci_op.c
@@ -0,0 +1,657 @@
+/*
+ * PCI Frontend Operations - Communicates with frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <asm/bitops.h>
+#include <linux/time.h>
+#include <xen/evtchn.h>
+#include "pcifront.h"
+
+/*
+ * Module parameter: when non-zero, config-space reads and writes issued
+ * through this frontend are logged with dev_info().
+ */
+static int verbose_request = 0;
+module_param(verbose_request, int, 0644);
+
+/*
+ * Fill in the per-root-bus sysdata for a frontend bus.
+ *
+ * On ia64 this zeroes *sd, records the segment and owning frontend, and then
+ * searches the backend's xenbus directory for the "root-N" entry matching
+ * domain:bus.  When one is found and "root-resource-magic" equals the local
+ * encoding size, every "root-N-resource-M" string is decoded from ASCII hex
+ * into a struct acpi_resource and handed to xen_add_resource().  Lookup or
+ * allocation failures end the import early; the function has no return value
+ * with which to report them.
+ *
+ * On every other architecture only sd->domain and sd->pdev are set.
+ */
+static void pcifront_init_sd(struct pcifront_sd *sd,
+                            unsigned int domain, unsigned int bus,
+                            struct pcifront_device *pdev)
+{
+#ifdef __ia64__
+       int err, i, j, k, len, root_num, res_count;
+       struct acpi_resource res;
+       unsigned int d, b, byte;
+       unsigned long magic;
+       char str[64], tmp[3];
+       unsigned char *buf, *bufp;
+       u8 *ptr;
+
+       memset(sd, 0, sizeof(*sd));
+
+       sd->segment = domain;
+       sd->node = -1;  /* Revisit for NUMA */
+       sd->platform_data = pdev;
+
+       /* Look for resources for this controller in xenbus. */
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "root_num",
+                          "%d", &root_num);
+       if (err != 1)
+               return;
+
+       /* Find the index i of the root entry describing domain:bus. */
+       for (i = 0; i < root_num; i++) {
+               len = snprintf(str, sizeof(str), "root-%d", i);
+               if (unlikely(len >= (sizeof(str) - 1)))
+                       return;
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+                                  str, "%x:%x", &d, &b);
+               if (err != 2)
+                       return;
+
+               if (d == domain && b == bus)
+                       break;
+       }
+
+       if (i == root_num)
+               return;
+
+       len = snprintf(str, sizeof(str), "root-resource-magic");
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+                          str, "%lx", &magic);
+
+       if (err != 1)
+               return; /* No resources, nothing to do */
+
+       /* Two hex digits per byte of the structure plus the terminator. */
+       if (magic != (sizeof(res) * 2) + 1) {
+               dev_warn(&pdev->xdev->dev,
+                        "pcifront: resource magic mismatch\n");
+               return;
+       }
+
+       len = snprintf(str, sizeof(str), "root-%d-resources", i);
+       if (unlikely(len >= (sizeof(str) - 1)))
+               return;
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+                          str, "%d", &res_count);
+
+       if (err != 1)
+               return; /* No resources, nothing to do */
+
+       /*
+        * kcalloc() zeroes the array like kzalloc() did and additionally
+        * rejects a count whose product with the element size would overflow.
+        */
+       sd->window = kcalloc(res_count, sizeof(*sd->window), GFP_KERNEL);
+       if (!sd->window)
+               return;
+
+       /* magic is also the size of the byte stream in xenbus */
+       buf = kmalloc(magic, GFP_KERNEL);
+       if (!buf) {
+               kfree(sd->window);
+               sd->window = NULL;
+               return;
+       }
+
+       /* Read the resources out of xenbus */
+       for (j = 0; j < res_count; j++) {
+               memset(&res, 0, sizeof(res));
+               memset(buf, 0, magic);
+
+               len = snprintf(str, sizeof(str), "root-%d-resource-%d", i, j);
+               /*
+                * Leave the loop rather than the function so that buf is
+                * released below; returning from here used to leak it.
+                */
+               if (unlikely(len >= (sizeof(str) - 1)))
+                       break;
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+                                  "%s", buf);
+               if (err != 1) {
+                       dev_warn(&pdev->xdev->dev,
+                                "pcifront: error reading resource %d on bus %04x:%02x\n",
+                                j, domain, bus);
+                       continue;
+               }
+
+               bufp = buf;
+               ptr = (u8 *)&res;
+               memset(tmp, 0, sizeof(tmp));
+
+               /* Copy ASCII byte stream into structure */
+               for (k = 0; k < magic - 1; k += 2) {
+                       memcpy(tmp, bufp, 2);
+                       bufp += 2;
+
+                       sscanf(tmp, "%02x", &byte);
+                       *ptr = byte;
+                       ptr++;
+               }
+
+               xen_add_resource(sd, domain, bus, &res);
+               sd->windows++;
+       }
+       kfree(buf);
+#else
+       sd->domain = domain;
+       sd->pdev = pdev;
+#endif
+}
+
+/*
+ * Translate a XEN_PCI_ERR_* status into the matching PCIBIOS_* code.
+ * A value that is none of the listed XEN_PCI_ERR_* codes is handed back
+ * unchanged.
+ */
+static int errno_to_pcibios_err(int errno)
+{
+       int rc = errno;
+
+       switch (errno) {
+       case XEN_PCI_ERR_success:
+               rc = PCIBIOS_SUCCESSFUL;
+               break;
+       case XEN_PCI_ERR_dev_not_found:
+               rc = PCIBIOS_DEVICE_NOT_FOUND;
+               break;
+       case XEN_PCI_ERR_invalid_offset:
+       case XEN_PCI_ERR_op_failed:
+               rc = PCIBIOS_BAD_REGISTER_NUMBER;
+               break;
+       case XEN_PCI_ERR_not_implemented:
+               rc = PCIBIOS_FUNC_NOT_SUPPORTED;
+               break;
+       case XEN_PCI_ERR_access_denied:
+               rc = PCIBIOS_SET_FAILED;
+               break;
+       default:
+               break;
+       }
+
+       return rc;
+}
+
+/*
+ * Queue the AER work item if _XEN_PCIB_active is set in the shared page and
+ * no earlier caller has already claimed _PDEVB_op_active.
+ */
+static inline void schedule_pcifront_aer_op(struct pcifront_device *pdev)
+{
+       /* Nothing pending in the shared page. */
+       if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags))
+               return;
+
+       /* The work item has already been claimed by someone else. */
+       if (test_and_set_bit(_PDEVB_op_active, &pdev->flags))
+               return;
+
+       dev_dbg(&pdev->xdev->dev, "schedule aer frontend job\n");
+       schedule_work(&pdev->op_work);
+}
+
+/*
+ * Hand one operation to the backend through the shared page and wait for
+ * the reply.
+ *
+ * The request in *op is copied into the shared page, _XEN_PCIF_active is set
+ * and the event channel is notified; the function then polls until that flag
+ * reads clear (presumably cleared by the backend).  The shared-page contents
+ * are then copied back into *op and op->err is returned.  If about two
+ * seconds elapse without the flag clearing, the flag is cleared locally and
+ * XEN_PCI_ERR_dev_not_found is returned without copying anything back.
+ *
+ * The whole exchange runs under pdev->sh_info_lock with interrupts disabled.
+ */
+static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op)
+{
+       int err = 0;
+       struct xen_pci_op *active_op = &pdev->sh_info->op;
+       unsigned long irq_flags;
+       evtchn_port_t port = pdev->evtchn;
+       s64 ns, ns_timeout;
+       struct timeval tv;
+
+       spin_lock_irqsave(&pdev->sh_info_lock, irq_flags);
+
+       memcpy(active_op, op, sizeof(struct xen_pci_op));
+
+       /* Go */
+       /* Make the request contents visible before the active flag is set. */
+       wmb();
+       set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
+       notify_remote_via_evtchn(port);
+
+       /*
+        * We set a poll timeout of 3 seconds but give up on return after
+        * 2 seconds. It is better to time out too late rather than too early
+        * (in the latter case we end up continually re-executing poll() with a
+        * timeout in the past). 1s difference gives plenty of slack for error.
+        */
+       do_gettimeofday(&tv);
+       ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC;
+
+       clear_evtchn(port);
+
+       while (test_bit(_XEN_PCIF_active,
+                       (unsigned long *)&pdev->sh_info->flags)) {
+               if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ))
+                       BUG();
+               clear_evtchn(port);
+               do_gettimeofday(&tv);
+               ns = timeval_to_ns(&tv);
+               if (ns > ns_timeout) {
+                       dev_err(&pdev->xdev->dev,
+                               "pciback not responding!!!\n");
+                       clear_bit(_XEN_PCIF_active,
+                                 (unsigned long *)&pdev->sh_info->flags);
+                       err = XEN_PCI_ERR_dev_not_found;
+                       goto out;
+               }
+       }
+
+       /*
+        * The same event channel carries both config-op responses and
+        * backend service requests, and the events were cleared while
+        * polling above, so a backend request may have been missed.
+        * Re-schedule the AER service if one is flagged as pending.
+        */
+       if (test_bit(_XEN_PCIB_active, 
+                       (unsigned long*)&pdev->sh_info->flags)) {
+               dev_info(&pdev->xdev->dev, "schedule aer pcifront service\n");
+               schedule_pcifront_aer_op(pdev);
+       }
+
+       /* Return the shared-page contents, including the result, to the caller. */
+       memcpy(op, active_op, sizeof(struct xen_pci_op));
+
+       err = op->err;
+      out:
+       spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags);
+       return err;
+}
+
+/*
+ * pci_ops.read hook: forward a config-space read to the backend.
+ *
+ * On success *val receives the value reported by the backend.  An -ENODEV
+ * result is turned into a successful read of 0.  For any other failure *val
+ * is left untouched.  The return value is the do_pci_op() status passed
+ * through errno_to_pcibios_err().
+ *
+ * Access to this function is spinlocked in drivers/pci/access.c
+ */
+static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn,
+                            int where, int size, u32 *val)
+{
+       int err;
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_conf_read,
+               .domain = pci_domain_nr(bus),
+               .bus    = bus->number,
+               .devfn  = devfn,
+               .offset = where,
+               .size   = size,
+       };
+       struct pcifront_sd *sd = bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+       unsigned int slot = PCI_SLOT(devfn);
+       unsigned int func = PCI_FUNC(devfn);
+
+       if (verbose_request)
+               dev_info(&pdev->xdev->dev, "read %02x.%u offset %x size %d\n",
+                        slot, func, where, size);
+
+       err = do_pci_op(pdev, &op);
+
+       switch (err) {
+       case 0:
+               if (verbose_request)
+                       dev_info(&pdev->xdev->dev, "read %02x.%u = %x\n",
+                                slot, func, op.value);
+               *val = op.value;
+               break;
+       case -ENODEV:
+               /* No device here, pretend that it just returned 0 */
+               err = 0;
+               *val = 0;
+               break;
+       default:
+               if (verbose_request)
+                       dev_info(&pdev->xdev->dev, "read %02x.%u -> %d\n",
+                                slot, func, err);
+               break;
+       }
+
+       return errno_to_pcibios_err(err);
+}
+
+/*
+ * pci_ops.write hook: forward a config-space write to the backend and
+ * return its status translated by errno_to_pcibios_err().
+ *
+ * Access to this function is spinlocked in drivers/pci/access.c
+ */
+static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn,
+                             int where, int size, u32 val)
+{
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_conf_write,
+               .domain = pci_domain_nr(bus),
+               .bus    = bus->number,
+               .devfn  = devfn,
+               .offset = where,
+               .size   = size,
+               .value  = val,
+       };
+       struct pcifront_sd *sd = bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+       int status;
+
+       if (verbose_request)
+               dev_info(&pdev->xdev->dev,
+                        "write %02x.%u offset %x size %d val %x\n",
+                        PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val);
+
+       status = do_pci_op(pdev, &op);
+
+       return errno_to_pcibios_err(status);
+}
+
+/* Config-space accessors handed to the PCI core for frontend root buses. */
+static struct pci_ops pcifront_bus_ops = {
+       .read = pcifront_bus_read,
+       .write = pcifront_bus_write,
+};
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Ask the backend to enable MSI-X for dev with the nvec entries supplied.
+ *
+ * On success (returns 0) each entries[i].vector is overwritten with the
+ * vector reported by the backend.  Returns -EINVAL when nvec is outside
+ * 0..SH_INFO_MAX_VEC or when the operation itself fails, and the backend's
+ * non-zero op.value when the backend reports one.
+ */
+int pci_frontend_enable_msix(struct pci_dev *dev,
+               struct msix_entry *entries,
+               int nvec)
+{
+       int err;
+       int i;
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_enable_msix,
+               .domain = pci_domain_nr(dev->bus),
+               .bus = dev->bus->number,
+               .devfn = dev->devfn,
+               .value = nvec,
+       };
+       struct pcifront_sd *sd = dev->bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+       if (nvec < 0 || nvec > SH_INFO_MAX_VEC) {
+               dev_err(&dev->dev, "too many (%d) vectors for pci frontend\n",
+                       nvec);
+               return -EINVAL;
+       }
+
+       /* Pass the requested entries to the backend. */
+       for (i = 0; i < nvec; i++) {
+               op.msix_entries[i].entry = entries[i].entry;
+               op.msix_entries[i].vector = entries[i].vector;
+       }
+
+       err = do_pci_op(pdev, &op);
+       if (err) {
+               dev_err(&dev->dev, "enable MSI-X -> %d\n", err);
+               return -EINVAL;
+       }
+
+       if (op.value) {
+               dev_err(&dev->dev, "enable MSI-X => %#x\n", op.value);
+               return op.value;
+       }
+
+       /* we get the result */
+       for (i = 0; i < nvec; i++)
+               entries[i].vector = op.msix_entries[i].vector;
+
+       return 0;
+}
+
+/*
+ * Ask the backend to disable MSI-X for dev.  A failure is only logged;
+ * there is no return value to carry it.
+ */
+void pci_frontend_disable_msix(struct pci_dev *dev)
+{
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_disable_msix,
+               .domain = pci_domain_nr(dev->bus),
+               .bus = dev->bus->number,
+               .devfn = dev->devfn,
+       };
+       struct pcifront_sd *sd = dev->bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+       int rc = do_pci_op(pdev, &op);
+
+       /* What should do for error ? */
+       if (rc)
+               dev_err(&dev->dev, "disable MSI-X -> %d\n", rc);
+}
+
+/*
+ * Ask the backend to enable MSI for dev.
+ *
+ * On success dev->irq is set to the value returned by the backend and 0 is
+ * returned; on failure the error is logged and -EINVAL is returned.
+ */
+int pci_frontend_enable_msi(struct pci_dev *dev)
+{
+       int err;
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_enable_msi,
+               .domain = pci_domain_nr(dev->bus),
+               .bus = dev->bus->number,
+               .devfn = dev->devfn,
+       };
+       struct pcifront_sd *sd = dev->bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+       err = do_pci_op(pdev, &op);
+       if (unlikely(err)) {
+               dev_err(&dev->dev, "enable MSI -> %d\n", err);
+               return -EINVAL;
+       }
+
+       dev->irq = op.value;
+       return 0;
+}
+
+/*
+ * Ask the backend to disable MSI for dev.  On success dev->irq is set to
+ * the value the backend hands back; a failure is only logged.
+ */
+void pci_frontend_disable_msi(struct pci_dev *dev)
+{
+       int err;
+       struct xen_pci_op op = {
+               .cmd    = XEN_PCI_OP_disable_msi,
+               .domain = pci_domain_nr(dev->bus),
+               .bus = dev->bus->number,
+               .devfn = dev->devfn,
+       };
+       struct pcifront_sd *sd = dev->bus->sysdata;
+       struct pcifront_device *pdev = pcifront_get_pdev(sd);
+
+       err = do_pci_op(pdev, &op);
+       if (unlikely(err)) {
+               dev_err(&dev->dev, "disable MSI -> %d\n", err);
+               return;
+       }
+
+       dev->irq = op.value;
+}
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * pci_walk_bus() callback: claim each resource of dev exactly as it is,
+ * since the backend won't allow changes.  Resources that already have a
+ * parent, or whose start or flags are zero, are skipped.  Always returns 0.
+ */
+static int __devinit pcifront_claim_resource(struct pci_dev *dev, void *data)
+{
+       struct pcifront_device *pdev = data;
+       int idx;
+
+       for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
+               struct resource *res = &dev->resource[idx];
+
+               if (res->parent || !res->start || !res->flags)
+                       continue;
+
+               dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n",
+                       pci_name(dev), idx);
+               pci_claim_resource(dev, idx);
+       }
+
+       return 0;
+}
+
+/*
+ * Create and scan the frontend root bus domain:bus and make its devices
+ * live.
+ *
+ * Allocates the bookkeeping entry and sysdata, scans the bus through
+ * pcifront_bus_ops, records the bus on pdev->root_buses, claims device
+ * resources and finally adds the devices.
+ *
+ * Returns 0 on success, -EINVAL for a non-zero domain when the kernel lacks
+ * CONFIG_PCI_DOMAINS, or -ENOMEM when an allocation or the bus scan fails.
+ */
+int __devinit pcifront_scan_root(struct pcifront_device *pdev,
+                                unsigned int domain, unsigned int bus)
+{
+       struct pci_bus *b;
+       struct pcifront_sd *sd;
+       struct pci_bus_entry *bus_entry;
+       int err = 0;
+
+#ifndef CONFIG_PCI_DOMAINS
+       if (domain != 0) {
+               dev_err(&pdev->xdev->dev,
+                       "PCI root in non-zero domain %x!\n", domain);
+               dev_err(&pdev->xdev->dev,
+                       "Please compile with CONFIG_PCI_DOMAINS\n");
+               return -EINVAL;
+       }
+#endif
+
+       dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n",
+                domain, bus);
+
+       /* Both results are checked together; err_out frees whichever succeeded. */
+       bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL);
+       sd = kmalloc(sizeof(*sd), GFP_KERNEL);
+       if (!bus_entry || !sd) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+       pcifront_init_sd(sd, domain, bus, pdev);
+
+       b = pci_scan_bus_parented(&pdev->xdev->dev, bus,
+                                 &pcifront_bus_ops, sd);
+       if (!b) {
+               dev_err(&pdev->xdev->dev,
+                       "Error creating PCI Frontend Bus!\n");
+               err = -ENOMEM;
+               goto err_out;
+       }
+
+       pcifront_setup_root_resources(b, sd);
+       bus_entry->bus = b;
+
+       /* Remember the bus so that pcifront_free_roots() can tear it down. */
+       list_add(&bus_entry->list, &pdev->root_buses);
+
+       /* Claim resources before going "live" with our devices */
+       pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+       pci_bus_add_devices(b);
+
+       return 0;
+
+      err_out:
+       kfree(bus_entry);
+       kfree(sd);
+
+       return err;
+}
+
+/*
+ * Rescan the root bus domain:bus for functions that appeared since the last
+ * scan.
+ *
+ * If the bus is not yet known it is created through pcifront_scan_root().
+ * Otherwise every devfn in 0..0xff that has no pci_dev yet is probed, and
+ * resources are claimed and devices added for the whole bus.
+ *
+ * Returns 0, -EINVAL for a non-zero domain without CONFIG_PCI_DOMAINS, or
+ * whatever pcifront_scan_root() returns when the bus had to be created.
+ */
+int __devinit pcifront_rescan_root(struct pcifront_device *pdev,
+                                  unsigned int domain, unsigned int bus)
+{
+       struct pci_bus *b;
+       struct pci_dev *d;
+       unsigned int devfn;
+
+#ifndef CONFIG_PCI_DOMAINS
+       if (domain != 0) {
+               dev_err(&pdev->xdev->dev,
+                       "PCI root in non-zero domain %x\n", domain);
+               dev_err(&pdev->xdev->dev,
+                       "Please compile with CONFIG_PCI_DOMAINS\n");
+               return -EINVAL;
+       }
+#endif
+
+       dev_info(&pdev->xdev->dev, "Rescanning PCI Frontend Bus %04x:%02x\n",
+                domain, bus);
+
+       b = pci_find_bus(domain, bus);
+       if(!b)
+               /* If the bus is unknown, create it. */
+               return pcifront_scan_root(pdev, domain, bus);
+
+       /* Rescan the bus for newly attached functions and add.
+        * We omit handling of PCI bridge attachment because pciback prevents
+        * bridges from being exported.
+        */
+       for (devfn = 0; devfn < 0x100; devfn++) {
+               d = pci_get_slot(b, devfn);
+               if(d) {
+                       /* Device is already known. */
+                       /* Drop the reference obtained from pci_get_slot(). */
+                       pci_dev_put(d);
+                       continue;
+               }
+
+               d = pci_scan_single_device(b, devfn);
+               if (d)
+                       dev_info(&pdev->xdev->dev,
+                                "New device on %04x:%02x:%02x.%u\n",
+                                domain, bus,
+                                PCI_SLOT(devfn), PCI_FUNC(devfn));
+       }
+
+       /* Claim resources before going "live" with our devices */
+       pci_walk_bus(b, pcifront_claim_resource, pdev);
+
+       /* Create SysFS and notify udev of the devices. Aka: "going live" */
+       pci_bus_add_devices(b);
+
+       return 0;
+}
+
+/*
+ * Stop and remove every device on bus, always taking the current head of
+ * bus->devices until the list is empty.
+ */
+static void free_root_bus_devs(struct pci_bus *bus)
+{
+       struct list_head *head = &bus->devices;
+
+       for (;;) {
+               struct pci_dev *victim;
+
+               if (list_empty(head))
+                       break;
+
+               victim = container_of(head->next, struct pci_dev, bus_list);
+               dev_dbg(&victim->dev, "removing device\n");
+               pci_stop_and_remove_bus_device(victim);
+       }
+}
+
+/*
+ * Tear down every root bus recorded on pdev->root_buses: remove its
+ * devices, free its sysdata, unregister the bridge device, remove the bus
+ * and free the list entry.
+ */
+void pcifront_free_roots(struct pcifront_device *pdev)
+{
+       struct pci_bus_entry *bus_entry, *t;
+
+       dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n");
+
+       /* _safe iteration: the current entry is unlinked and freed in the body. */
+       list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) {
+               list_del(&bus_entry->list);
+
+               free_root_bus_devs(bus_entry->bus);
+
+               /* sysdata is the pcifront_sd allocated in pcifront_scan_root(). */
+               kfree(bus_entry->bus->sysdata);
+
+               device_unregister(bus_entry->bus->bridge);
+               pci_remove_bus(bus_entry->bus);
+
+               kfree(bus_entry);
+       }
+}
+
+/*
+ * Dispatch one AER request to the error handlers of the driver bound to
+ * the affected device.
+ *
+ * The device is looked up from the bus/devfn stored in the shared page's
+ * aer_op.  cmd selects which err_handler callback runs and state is passed
+ * to error_detected().
+ *
+ * Returns the callback's result, or PCI_ERS_RESULT_NONE when the device or
+ * its driver is missing, the driver has no matching handler, cmd is
+ * XEN_PCI_OP_aer_resume (whose callback returns nothing) or cmd is unknown.
+ */
+static pci_ers_result_t pcifront_common_process(int cmd,
+                                               struct pcifront_device *pdev,
+                                               pci_channel_state_t state)
+{
+       pci_ers_result_t result = PCI_ERS_RESULT_NONE;
+       struct pci_driver *pdrv;
+       int bus = pdev->sh_info->aer_op.bus;
+       int devfn = pdev->sh_info->aer_op.devfn;
+       struct pci_dev *pcidev;
+
+       dev_dbg(&pdev->xdev->dev,
+               "pcifront AER process: cmd %x (bus %x devfn %x)\n",
+               cmd, bus, devfn);
+
+       /* Takes a reference on the device; every exit below must drop it. */
+       pcidev = pci_get_bus_and_slot(bus, devfn);
+       if (!pcidev || !pcidev->driver) {
+               pci_dev_put(pcidev);
+               dev_err(&pdev->xdev->dev, "AER device or driver is NULL\n");
+               return result;
+       }
+       pdrv = pcidev->driver;
+
+       if (pdrv->err_handler) {
+               dev_dbg(&pcidev->dev, "trying to call AER service\n");
+               switch(cmd) {
+               case XEN_PCI_OP_aer_detected:
+                       if (pdrv->err_handler->error_detected)
+                               result = pdrv->err_handler->error_detected(pcidev, state);
+                       break;
+               case XEN_PCI_OP_aer_mmio:
+                       if (pdrv->err_handler->mmio_enabled)
+                               result = pdrv->err_handler->mmio_enabled(pcidev);
+                       break;
+               case XEN_PCI_OP_aer_slotreset:
+                       if (pdrv->err_handler->slot_reset)
+                               result = pdrv->err_handler->slot_reset(pcidev);
+                       break;
+               case XEN_PCI_OP_aer_resume:
+                       if (pdrv->err_handler->resume)
+                               pdrv->err_handler->resume(pcidev);
+                       break;
+               default:
+                       dev_err(&pdev->xdev->dev,
+                               "bad request %x in aer recovery operation!\n",
+                               cmd);
+                       break;
+               }
+       }
+
+       /* Release the reference from pci_get_bus_and_slot(); it used to leak. */
+       pci_dev_put(pcidev);
+
+       return result;
+}
+
+
+/*
+ * Work handler for AER requests: run the request found in the shared page's
+ * aer_op, store the result in aer_op.err, clear _XEN_PCIB_active, notify the
+ * other end, and then release _PDEVB_op_active so that a further request can
+ * be scheduled.
+ */
+void pcifront_do_aer(struct work_struct *data)
+{
+       struct pcifront_device *pdev = container_of(data, struct pcifront_device, op_work);
+       int cmd = pdev->sh_info->aer_op.cmd;
+       pci_channel_state_t state = 
+               (pci_channel_state_t)pdev->sh_info->aer_op.err;
+
+       /*
+        * Original note: if a pci_conf op is in progress, we have to wait
+        * until it is done before servicing the AER op.
+        * NOTE(review): no explicit wait is visible in this function --
+        * confirm where that ordering is enforced.
+        */
+       dev_dbg(&pdev->xdev->dev, 
+               "pcifront service aer bus %x devfn %x\n", pdev->sh_info->aer_op.bus,
+               pdev->sh_info->aer_op.devfn);
+
+       pdev->sh_info->aer_op.err = pcifront_common_process(cmd, pdev, state);
+
+       /* Publish the result before the active flag is cleared. */
+       wmb();
+       clear_bit(_XEN_PCIB_active, (unsigned long*)&pdev->sh_info->flags);
+       notify_remote_via_evtchn(pdev->evtchn);
+
+       /*
+        * A new AER request may have arrived while the lines above ran and
+        * been dropped because _PDEVB_op_active was still set; release the
+        * bit and check once more.
+        */
+       smp_mb__before_clear_bit();
+       clear_bit( _PDEVB_op_active, &pdev->flags);
+       smp_mb__after_clear_bit();
+
+       schedule_pcifront_aer_op(pdev);
+
+}
+
+/*
+ * Event-channel interrupt handler: dev is the pcifront_device registered
+ * with the handler.  Defers any pending AER request to the work item and
+ * always reports the interrupt as handled.
+ */
+irqreturn_t pcifront_handler_aer(int irq, void *dev)
+{
+       schedule_pcifront_aer_op(dev);
+
+       return IRQ_HANDLED;
+}
diff --git a/drivers/xen/pcifront/pcifront.h b/drivers/xen/pcifront/pcifront.h

new file mode 100644 (file)

index 0000000..2732ffd
--- /dev/null
+++ b/drivers/xen/pcifront/pcifront.h
@@ -0,0 +1,57 @@
+/*
+ * PCI Frontend - Common data structures & function declarations
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_PCIFRONT_H__
+#define __XEN_PCIFRONT_H__
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/pciif.h>
+#include <linux/interrupt.h>
+#include <xen/pcifront.h>
+#include <linux/atomic.h>
+#include <linux/workqueue.h>
+
+/* One root bus created by this frontend, linked on pcifront_device.root_buses. */
+struct pci_bus_entry {
+       struct list_head list;  /* link in pcifront_device.root_buses */
+       struct pci_bus *bus;    /* the root bus itself */
+};
+
+/*
+ * Bit number, and the matching mask, in pcifront_device.flags that is set
+ * while the AER work item is claimed.
+ */
+#define _PDEVB_op_active               (0)
+#define PDEVB_op_active                (1 << (_PDEVB_op_active))
+
+/* Per-frontend state, stored as the xenbus device's driver data. */
+struct pcifront_device {
+       struct xenbus_device *xdev;     /* owning xenbus device */
+       struct list_head root_buses;    /* list of struct pci_bus_entry */
+       spinlock_t dev_lock;            /* held across connect/disconnect/attach/detach */
+
+       int evtchn;     /* event channel shared with the backend */
+       int gnt_ref;    /* grant reference covering sh_info */
+       int irq;        /* irq bound to evtchn, -1 until bound */
+
+       /* Lock this when doing any operations in sh_info */
+       spinlock_t sh_info_lock;
+       struct xen_pci_sharedinfo *sh_info;     /* page shared with the backend */
+       struct work_struct op_work;     /* runs pcifront_do_aer() */
+       unsigned long flags;            /* _PDEVB_* bits */
+
+};
+
+int pcifront_connect(struct pcifront_device *pdev);
+void pcifront_disconnect(struct pcifront_device *pdev);
+
+int pcifront_scan_root(struct pcifront_device *pdev,
+                      unsigned int domain, unsigned int bus);
+int pcifront_rescan_root(struct pcifront_device *pdev,
+                        unsigned int domain, unsigned int bus);
+void pcifront_free_roots(struct pcifront_device *pdev);
+
+void pcifront_do_aer(struct work_struct *data);
+
+irqreturn_t pcifront_handler_aer(int irq, void *dev);
+
+#endif /* __XEN_PCIFRONT_H__ */
diff --git a/drivers/xen/pcifront/xenbus.c b/drivers/xen/pcifront/xenbus.c

new file mode 100644 (file)

index 0000000..8adbf9b
--- /dev/null
+++ b/drivers/xen/pcifront/xenbus.c
@@ -0,0 +1,475 @@
+/*
+ * PCI Frontend Xenbus Setup - handles setup with backend (imports page/evtchn)
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include "pcifront.h"
+
+#define INVALID_EVTCHN    (-1)
+
+/*
+ * Allocate and initialise the frontend state for xdev, including the page
+ * that will be shared with the backend, and store it as xdev's driver data.
+ *
+ * Returns the new device, or NULL when either allocation fails.
+ */
+static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev)
+{
+       struct pcifront_device *pdev;
+
+       pdev = kzalloc(sizeof(*pdev), GFP_KERNEL);
+       if (!pdev)
+               return NULL;
+
+       pdev->sh_info =
+           (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL);
+       if (!pdev->sh_info) {
+               kfree(pdev);
+               return NULL;
+       }
+       pdev->sh_info->flags = 0;
+
+       /* Flag for registering PV AER handler */
+       set_bit(_XEN_PCIB_AERHANDLER, (void*)&pdev->sh_info->flags);
+
+       dev_set_drvdata(&xdev->dev, pdev);
+       pdev->xdev = xdev;
+
+       INIT_LIST_HEAD(&pdev->root_buses);
+
+       spin_lock_init(&pdev->dev_lock);
+       spin_lock_init(&pdev->sh_info_lock);
+
+       /* Nothing is published yet; free_pdev() keys off these sentinels. */
+       pdev->evtchn = INVALID_EVTCHN;
+       pdev->gnt_ref = GRANT_INVALID_REF;
+       pdev->irq = -1;
+
+       INIT_WORK(&pdev->op_work, pcifront_do_aer);
+
+       dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n",
+               pdev, pdev->sh_info);
+
+       return pdev;
+}
+
+/*
+ * Release everything owned by pdev: root buses, pending AER work, the irq
+ * binding, the event channel, the shared page and finally pdev itself.
+ * Resources that were never set up are recognised by their sentinel values
+ * and skipped.
+ */
+static void free_pdev(struct pcifront_device *pdev)
+{
+       dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev);
+
+       pcifront_free_roots(pdev);
+
+       /*For PCIE_AER error handling job*/
+       flush_work_sync(&pdev->op_work);
+
+       if (pdev->irq > 0)
+               unbind_from_irqhandler(pdev->irq, pdev);
+
+       if (pdev->evtchn != INVALID_EVTCHN)
+               xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
+
+       /*
+        * With a grant in place the page is handed to
+        * gnttab_end_foreign_access() together with the reference; otherwise
+        * it is freed directly.
+        */
+       if (pdev->gnt_ref != GRANT_INVALID_REF)
+               gnttab_end_foreign_access(pdev->gnt_ref,
+                                         (unsigned long)pdev->sh_info);
+       else
+               free_page((unsigned long)pdev->sh_info);
+
+       dev_set_drvdata(&pdev->xdev->dev, NULL);
+
+       kfree(pdev);
+}
+
+/*
+ * Grant the shared page, allocate and bind the event channel, and publish
+ * "pci-op-ref", "event-channel" and "magic" under the frontend's xenbus
+ * node, then switch to XenbusStateInitialised.
+ *
+ * Returns 0 on success or a negative error.  Resources acquired before a
+ * failure are not released here; they stay recorded in pdev.
+ */
+static int pcifront_publish_info(struct pcifront_device *pdev)
+{
+       int err = 0;
+       struct xenbus_transaction trans;
+
+       /* A non-negative return is the grant reference. */
+       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+       if (err < 0)
+               goto out;
+
+       pdev->gnt_ref = err;
+
+       err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
+       if (err)
+               goto out;
+
+       /* A non-negative return is the irq number. */
+       err = bind_caller_port_to_irqhandler(pdev->evtchn,
+                                            pcifront_handler_aer,
+                                            IRQF_SAMPLE_RANDOM,
+                                            "pcifront", pdev);
+       if (err < 0) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Failed to bind event channel");
+               goto out;
+       }
+       pdev->irq = err;
+
+       /* The transaction is restarted from here when ending it yields -EAGAIN. */
+      do_publish:
+       err = xenbus_transaction_start(&trans);
+       if (err) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error writing configuration for backend "
+                                "(start transaction)");
+               goto out;
+       }
+
+       err = xenbus_printf(trans, pdev->xdev->nodename,
+                           "pci-op-ref", "%u", pdev->gnt_ref);
+       if (!err)
+               err = xenbus_printf(trans, pdev->xdev->nodename,
+                                   "event-channel", "%u", pdev->evtchn);
+       if (!err)
+               err = xenbus_printf(trans, pdev->xdev->nodename,
+                                   "magic", XEN_PCI_MAGIC);
+
+       if (err) {
+               /* Abort the transaction. */
+               xenbus_transaction_end(trans, 1);
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error writing configuration for backend");
+               goto out;
+       } else {
+               err = xenbus_transaction_end(trans, 0);
+               if (err == -EAGAIN)
+                       goto do_publish;
+               else if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error completing transaction "
+                                        "for backend");
+                       goto out;
+               }
+       }
+
+       xenbus_switch_state(pdev->xdev, XenbusStateInitialised);
+
+       dev_dbg(&pdev->xdev->dev, "publishing successful!\n");
+
+      out:
+       return err;
+}
+
+/*
+ * Connect the frontend: register it as the active frontend, scan every PCI
+ * root advertised by the backend ("root_num" / "root-N"), and switch to
+ * XenbusStateConnected.
+ *
+ * When the backend publishes no "root_num", root 0000:00 is tried instead.
+ * Returns -EFAULT if the device is not in XenbusStateInitialised, a negative
+ * error when registration, a xenbus read or a root scan fails, and otherwise
+ * the result of the final state switch.
+ *
+ * NOTE(review): dev_lock is a spinlock, yet xenbus accesses and the
+ * GFP_KERNEL allocations in pcifront_scan_root() happen while it is held --
+ * confirm this cannot sleep in atomic context.
+ */
+static int __devinit pcifront_try_connect(struct pcifront_device *pdev)
+{
+       int err = -EFAULT;
+       int i, num_roots, len;
+       char str[64];
+       unsigned int domain, bus;
+
+       spin_lock(&pdev->dev_lock);
+
+       /* Only connect once */
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateInitialised)
+               goto out;
+
+       err = pcifront_connect(pdev);
+       if (err) {
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error connecting PCI Frontend");
+               goto out;
+       }
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+                          "root_num", "%d", &num_roots);
+       if (err == -ENOENT) {
+               xenbus_dev_error(pdev->xdev, err,
+                                "No PCI Roots found, trying 0000:00");
+               err = pcifront_scan_root(pdev, 0, 0);
+               /*
+                * Without this check the failure would be overwritten by the
+                * state switch below and the device reported as connected.
+                */
+               if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error scanning PCI root 0000:00");
+                       goto out;
+               }
+               num_roots = 0;
+       } else if (err != 1) {
+               if (err == 0)
+                       err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading number of PCI roots");
+               goto out;
+       }
+
+       for (i = 0; i < num_roots; i++) {
+               len = snprintf(str, sizeof(str), "root-%d", i);
+               if (unlikely(len >= (sizeof(str) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+                                  "%x:%x", &domain, &bus);
+               if (err != 2) {
+                       if (err >= 0)
+                               err = -EINVAL;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error reading PCI root %d", i);
+                       goto out;
+               }
+
+               err = pcifront_scan_root(pdev, domain, bus);
+               if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error scanning PCI root %04x:%02x",
+                                        domain, bus);
+                       goto out;
+               }
+       }
+
+       err = xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+      out:
+       spin_unlock(&pdev->dev_lock);
+       return err;
+}
+
+/*
+ * Move the frontend to XenbusStateClosed unless it is already Closing or
+ * beyond.  A frontend that was Connected first has its root buses freed and
+ * is unregistered.
+ *
+ * Returns the result of the state switch, or 0 when nothing was done.
+ */
+static int pcifront_try_disconnect(struct pcifront_device *pdev)
+{
+       enum xenbus_state prev_state;
+       int err = 0;
+
+       spin_lock(&pdev->dev_lock);
+
+       prev_state = xenbus_read_driver_state(pdev->xdev->nodename);
+
+       if (prev_state < XenbusStateClosing) {
+               if (prev_state == XenbusStateConnected) {
+                       pcifront_free_roots(pdev);
+                       pcifront_disconnect(pdev);
+               }
+
+               err = xenbus_switch_state(pdev->xdev, XenbusStateClosed);
+       }
+
+       spin_unlock(&pdev->dev_lock);
+
+       return err;
+}
+
+/*
+ * Handle a reconfiguration that adds devices: rescan every PCI root
+ * advertised by the backend and switch back to XenbusStateConnected.
+ *
+ * When the backend publishes no "root_num", root 0000:00 is rescanned
+ * instead.  Returns -EFAULT if the device is not in
+ * XenbusStateReconfiguring, or a negative error when a xenbus read or a
+ * rescan fails; in those cases the state is left unchanged.
+ *
+ * NOTE(review): dev_lock is a spinlock, yet xenbus accesses happen while it
+ * is held -- confirm this cannot sleep in atomic context.
+ */
+static int __devinit pcifront_attach_devices(struct pcifront_device *pdev)
+{
+       int err = -EFAULT;
+       int i, num_roots, len;
+       unsigned int domain, bus;
+       char str[64];
+
+       spin_lock(&pdev->dev_lock);
+
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateReconfiguring)
+               goto out;
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend,
+                          "root_num", "%d", &num_roots);
+       if (err == -ENOENT) {
+               xenbus_dev_error(pdev->xdev, err,
+                                "No PCI Roots found, trying 0000:00");
+               err = pcifront_rescan_root(pdev, 0, 0);
+               /*
+                * Stop here on failure; previously the device was still
+                * switched to Connected after a failed fallback rescan.
+                */
+               if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error scanning PCI root 0000:00");
+                       goto out;
+               }
+               num_roots = 0;
+       } else if (err != 1) {
+               if (err == 0)
+                       err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading number of PCI roots");
+               goto out;
+       }
+
+       for (i = 0; i < num_roots; i++) {
+               len = snprintf(str, sizeof(str), "root-%d", i);
+               if (unlikely(len >= (sizeof(str) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str,
+                                  "%x:%x", &domain, &bus);
+               if (err != 2) {
+                       if (err >= 0)
+                               err = -EINVAL;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error reading PCI root %d", i);
+                       goto out;
+               }
+
+               err = pcifront_rescan_root(pdev, domain, bus);
+               if (err) {
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error scanning PCI root %04x:%02x",
+                                        domain, bus);
+                       goto out;
+               }
+       }
+
+       xenbus_switch_state(pdev->xdev, XenbusStateConnected);
+
+      out:
+       spin_unlock(&pdev->dev_lock);
+       return err;
+}
+
+/*
+ * Remove every PCI device whose backend "state-<i>" node reads
+ * XenbusStateClosing, then switch the frontend to XenbusStateReconfiguring.
+ *
+ * Does nothing (and returns 0) unless our own xenbus state is
+ * XenbusStateConnected.  Returns a negative errno value when a xenstore
+ * node cannot be parsed, otherwise the result of xenbus_switch_state().
+ */
+static int pcifront_detach_devices(struct pcifront_device *pdev)
+{
+       unsigned int domain, bus, slot, func;
+       struct pci_bus *root_bus;
+       struct pci_dev *victim;
+       char node[64];
+       int idx, num_devs;
+       int err = 0;
+
+       spin_lock(&pdev->dev_lock);
+
+       if (xenbus_read_driver_state(pdev->xdev->nodename) !=
+           XenbusStateConnected)
+               goto out;
+
+       err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, "num_devs", "%d",
+                          &num_devs);
+       if (err != 1) {
+               if (err >= 0)
+                       err = -EINVAL;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Error reading number of PCI devices");
+               goto out;
+       }
+
+       /* Walk all device slots; only those marked Closing are removed. */
+       for (idx = 0; idx < num_devs; idx++) {
+               int n, state;
+
+               n = snprintf(node, sizeof(node), "state-%d", idx);
+               if (unlikely(n >= (sizeof(node) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, node, "%d",
+                                  &state);
+               /* An unreadable state node counts as "not closing". */
+               if (err != 1 || state != XenbusStateClosing)
+                       continue;
+
+               /* Look up which device this slot describes. */
+               n = snprintf(node, sizeof(node), "vdev-%d", idx);
+               if (unlikely(n >= (sizeof(node) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, node,
+                                  "%x:%x:%x.%x", &domain, &bus, &slot, &func);
+               if (err != 4) {
+                       if (err >= 0)
+                               err = -EINVAL;
+                       xenbus_dev_fatal(pdev->xdev, err,
+                                        "Error reading PCI device %d", idx);
+                       goto out;
+               }
+
+               root_bus = pci_find_bus(domain, bus);
+               if (!root_bus) {
+                       dev_dbg(&pdev->xdev->dev, "Cannot get bus %04x:%02x\n",
+                               domain, bus);
+                       continue;
+               }
+
+               victim = pci_get_slot(root_bus, PCI_DEVFN(slot, func));
+               if (!victim) {
+                       dev_dbg(&pdev->xdev->dev,
+                               "Cannot get PCI device %04x:%02x:%02x.%u\n",
+                               domain, bus, slot, func);
+                       continue;
+               }
+
+               pci_stop_and_remove_bus_device(victim);
+               pci_dev_put(victim);
+
+               dev_dbg(&pdev->xdev->dev,
+                       "PCI device %04x:%02x:%02x.%u removed.\n",
+                       domain, bus, slot, func);
+       }
+
+       err = xenbus_switch_state(pdev->xdev, XenbusStateReconfiguring);
+
+      out:
+       spin_unlock(&pdev->dev_lock);
+       return err;
+}
+
+/*
+ * xenbus callback for backend state changes: dispatch to the matching
+ * connect / disconnect / detach / attach helper.  All other backend states
+ * are ignored.
+ */
+static void __init_refok pcifront_backend_changed(struct xenbus_device *xdev,
+                                                 enum xenbus_state be_state)
+{
+       struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+       if (be_state == XenbusStateConnected) {
+               pcifront_try_connect(pdev);
+       } else if (be_state == XenbusStateClosing) {
+               dev_warn(&xdev->dev, "backend going away!\n");
+               pcifront_try_disconnect(pdev);
+       } else if (be_state == XenbusStateReconfiguring) {
+               pcifront_detach_devices(pdev);
+       } else if (be_state == XenbusStateReconfigured) {
+               pcifront_attach_devices(pdev);
+       }
+       /*
+        * Unknown, Initialising, InitWait, Initialised and Closed need no
+        * action from the frontend.
+        */
+}
+
+/*
+ * xenbus probe callback: allocate the per-device state and publish it.
+ * Returns -ENOMEM when the allocation fails, otherwise the result of
+ * pcifront_publish_info(); the state is freed again if publishing fails.
+ */
+static int pcifront_xenbus_probe(struct xenbus_device *xdev,
+                                const struct xenbus_device_id *id)
+{
+       struct pcifront_device *pdev;
+       int rc;
+
+       pdev = alloc_pdev(xdev);
+       if (!pdev) {
+               xenbus_dev_fatal(xdev, -ENOMEM,
+                                "Error allocating pcifront_device struct");
+               return -ENOMEM;
+       }
+
+       rc = pcifront_publish_info(pdev);
+       if (rc)
+               free_pdev(pdev);
+
+       return rc;
+}
+
+/* xenbus remove callback: free the per-device state, if any.  Returns 0. */
+static int pcifront_xenbus_remove(struct xenbus_device *xdev)
+{
+       struct pcifront_device *pdev = dev_get_drvdata(&xdev->dev);
+
+       if (pdev)
+               free_pdev(pdev);
+
+       return 0;
+}
+
+/*
+ * xenbus device types claimed by this driver.  The trailing zeroed entry
+ * presumably terminates the table -- confirm against the xenbus core.
+ */
+static const struct xenbus_device_id pcifront_ids[] = {
+       {"pci"},
+       {{0}},
+};
+MODULE_ALIAS("xen:pci");
+
+/*
+ * Driver definition wiring the probe, remove and backend-state callbacks.
+ * NOTE(review): the macro presumably emits the pcifront_driver object that
+ * pcifront_init() registers and picks up pcifront_ids by name -- confirm
+ * against the DEFINE_XENBUS_DRIVER definition.
+ */
+static DEFINE_XENBUS_DRIVER(pcifront, "pcifront",
+       .probe                  = pcifront_xenbus_probe,
+       .remove                 = pcifront_xenbus_remove,
+       .otherend_changed       = pcifront_backend_changed,
+);
+
+/*
+ * Register the frontend driver with xenbus.  Returns -ENODEV when not
+ * running on Xen, otherwise the result of xenbus_register_frontend().
+ */
+static int __init pcifront_init(void)
+{
+       return is_running_on_xen()
+               ? xenbus_register_frontend(&pcifront_driver)
+               : -ENODEV;
+}
+
+/* Initialize after the Xen PCI Frontend Stub is initialized */
+subsys_initcall(pcifront_init);
diff --git a/drivers/xen/privcmd/Makefile b/drivers/xen/privcmd/Makefile

new file mode 100644 (file)

index 0000000..507245a
--- /dev/null
+++ b/drivers/xen/privcmd/Makefile
@@ -0,0 +1,3 @@
+# compat_privcmd.o lands in priv-y only when CONFIG_COMPAT=y ...
+priv-$(CONFIG_COMPAT) := compat_privcmd.o
+# ... privcmd.o is always built in ...
+obj-y := privcmd.o
+# ... and the priv-y objects are added under CONFIG_XEN_PRIVILEGED_GUEST.
+obj-$(CONFIG_XEN_PRIVILEGED_GUEST) += $(priv-y)
diff --git a/drivers/xen/privcmd/compat_privcmd.c b/drivers/xen/privcmd/compat_privcmd.c

new file mode 100644 (file)

index 0000000..3e5c077
--- /dev/null
+++ b/drivers/xen/privcmd/compat_privcmd.c
@@ -0,0 +1,140 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Copyright (C) IBM Corp. 2006
+ *
+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
+ */
+
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+#include <linux/syscalls.h>
+#include <asm/hypervisor.h>
+#include <asm/uaccess.h>
+#include <xen/public/privcmd.h>
+#include <xen/compat_ioctl.h>
+
+/*
+ * Translate the 32-bit layouts of the privcmd mmap ioctls into the native
+ * layouts and re-issue the request through sys_ioctl() on the same fd.
+ *
+ * The native argument structure is built in memory obtained from
+ * compat_alloc_user_space() and filled field by field with put_user().
+ *
+ * Returns -EFAULT when a user-space access fails, -EINVAL for a cmd that is
+ * not handled here, and otherwise the value returned by sys_ioctl() (or
+ * -ERANGE / -EFAULT from the MMAPBATCH copy-back loop).
+ *
+ * NOTE(review): n32.num is taken from user space and used unchecked both as
+ * a loop bound and to size the second compat_alloc_user_space() request --
+ * confirm that this is bounded elsewhere.
+ */
+int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg)
+{
+       int ret;
+
+       switch (cmd) {
+       case IOCTL_PRIVCMD_MMAP_32: {
+               struct privcmd_mmap __user *p;
+               struct privcmd_mmap_32 __user *p32 = arg;
+               struct privcmd_mmap_32 n32;
+
+               /* Build the native struct; only the entry pointer is widened. */
+               p = compat_alloc_user_space(sizeof(*p));
+               if (copy_from_user(&n32, p32, sizeof(n32)) ||
+                   put_user(n32.num, &p->num) ||
+                   put_user(n32.dom, &p->dom) ||
+                   put_user(compat_ptr(n32.entry), &p->entry))
+                       return -EFAULT;
+               
+               ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAP, (unsigned long)p);
+       }
+               break;
+       case IOCTL_PRIVCMD_MMAPBATCH_32: {
+               struct privcmd_mmapbatch __user *p;
+               struct privcmd_mmapbatch_32 __user *p32 = arg;
+               struct privcmd_mmapbatch_32 n32;
+#ifdef xen_pfn32_t
+               xen_pfn_t *__user arr;
+               xen_pfn32_t *__user arr32;
+               unsigned int i;
+#endif
+
+               p = compat_alloc_user_space(sizeof(*p));
+               if (copy_from_user(&n32, p32, sizeof(n32)) ||
+                   put_user(n32.num, &p->num) ||
+                   put_user(n32.dom, &p->dom) ||
+                   put_user(n32.addr, &p->addr))
+                       return -EFAULT;
+#ifdef xen_pfn32_t
+               /*
+                * Frame numbers differ in width: widen the caller's array
+                * into a native one.  The request adds sizeof(*p) on top of
+                * the array size -- presumably so the array does not overlap
+                * the struct allocated above; confirm against the
+                * compat_alloc_user_space() implementation.
+                */
+               arr = compat_alloc_user_space(n32.num * sizeof(*arr)
+                                             + sizeof(*p));
+               arr32 = compat_ptr(n32.arr);
+               for (i = 0; i < n32.num; ++i) {
+                       xen_pfn32_t mfn;
+
+                       if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i))
+                               return -EFAULT;
+               }
+
+               if (put_user(arr, &p->arr))
+                       return -EFAULT;
+#else
+               /* Same element width: hand the caller's array through. */
+               if (put_user(compat_ptr(n32.arr), &p->arr))
+                       return -EFAULT;
+#endif
+               
+               ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, (unsigned long)p);
+
+#ifdef xen_pfn32_t
+               /*
+                * On success copy the (possibly modified) entries back into
+                * the 32-bit array; an entry that does not survive the
+                * narrowing yields -ERANGE.
+                */
+               for (i = 0; !ret && i < n32.num; ++i) {
+                       xen_pfn_t mfn;
+
+                       if (get_user(mfn, arr + i) || put_user(mfn, arr32 + i))
+                               ret = -EFAULT;
+                       else if (mfn != (xen_pfn32_t)mfn)
+                               ret = -ERANGE;
+               }
+#endif
+       }
+               break;
+       case IOCTL_PRIVCMD_MMAPBATCH_V2_32: {
+               struct privcmd_mmapbatch_v2 __user *p;
+               struct privcmd_mmapbatch_v2_32 __user *p32 = arg;
+               struct privcmd_mmapbatch_v2_32 n32;
+#ifdef xen_pfn32_t
+               xen_pfn_t *__user arr;
+               const xen_pfn32_t *__user arr32;
+               unsigned int i;
+#endif
+
+               p = compat_alloc_user_space(sizeof(*p));
+               if (copy_from_user(&n32, p32, sizeof(n32)) ||
+                   put_user(n32.num, &p->num) ||
+                   put_user(n32.dom, &p->dom) ||
+                   put_user(n32.addr, &p->addr) ||
+                   put_user(compat_ptr(n32.err), &p->err))
+                       return -EFAULT;
+#ifdef xen_pfn32_t
+               /* Widen the frame array as in the MMAPBATCH case above. */
+               arr = compat_alloc_user_space(n32.num * sizeof(*arr)
+                                             + sizeof(*p));
+               arr32 = compat_ptr(n32.arr);
+               for (i = 0; i < n32.num; ++i) {
+                       xen_pfn32_t mfn;
+
+                       if (get_user(mfn, arr32 + i) || put_user(mfn, arr + i))
+                               return -EFAULT;
+               }
+
+               if (put_user(arr, &p->arr))
+                       return -EFAULT;
+#else
+               if (put_user(compat_ptr(n32.arr), &p->arr))
+                       return -EFAULT;
+#endif
+
+               /* No copy-back here: the frame array is not written back. */
+               ret = sys_ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, (unsigned long)p);
+       }
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       return ret;
+}
diff --git a/drivers/xen/privcmd/privcmd.c b/drivers/xen/privcmd/privcmd.c

new file mode 100644 (file)

index 0000000..e14e4e2
--- /dev/null
+++ b/drivers/xen/privcmd/privcmd.c
@@ -0,0 +1,470 @@
+/******************************************************************************
+ * privcmd.c
+ * 
+ * Interface to privileged domain-0 commands.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <asm/hypervisor.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/tlb.h>
+#include <asm/hypervisor.h>
+#include <xen/public/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/xen_proc.h>
+#include <xen/features.h>
+
+static struct proc_dir_entry *privcmd_intf;
+static struct proc_dir_entry *capabilities_intf;
+
+#ifndef CONFIG_XEN_PRIVILEGED_GUEST
+#define HAVE_ARCH_PRIVCMD_MMAP
+#endif
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+/* apply_to_page_range() callback: -EBUSY for any PTE that is not pte_none. */
+static int enforce_singleshot_mapping_fn(pte_t *pte, struct page *pmd_page,
+                                        unsigned long addr, void *data)
+{
+       if (pte_none(*pte))
+               return 0;
+
+       return -EBUSY;
+}
+
+/*
+ * Returns 1 when apply_to_page_range() succeeds with the callback above
+ * over npages pages starting at addr, 0 otherwise.
+ */
+static inline int enforce_singleshot_mapping(struct vm_area_struct *vma,
+                                            unsigned long addr,
+                                            unsigned long npages)
+{
+       int rc = apply_to_page_range(vma->vm_mm, addr, npages << PAGE_SHIFT,
+                                    enforce_singleshot_mapping_fn, NULL);
+
+       return !rc;
+}
+#else
+#define enforce_singleshot_mapping(vma, addr, npages) \
+       privcmd_enforce_singleshot_mapping(vma)
+#endif
+
+/*
+ * ioctl entry point of the privcmd file.
+ *
+ * IOCTL_PRIVCMD_HYPERCALL copies a privcmd_hypercall_t from user space and
+ * issues it.  With CONFIG_XEN_PRIVILEGED_GUEST the three mmap requests
+ * (MMAP, MMAPBATCH, MMAPBATCH_V2) are handled as well; each is refused with
+ * -EPERM unless is_initial_xendomain() is true.  Any other cmd yields
+ * -EINVAL.
+ *
+ * The mmap requests stage the user-supplied arrays in a list of free pages
+ * ("pagelist"): each page starts with a struct list_head and the remainder
+ * holds array elements.  All staged pages are freed before returning.
+ */
+static long privcmd_ioctl(struct file *file,
+                         unsigned int cmd, unsigned long data)
+{
+       long ret;
+       void __user *udata = (void __user *) data;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       unsigned long i, addr, nr, nr_pages;
+       int paged_out;
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       LIST_HEAD(pagelist);
+       struct list_head *l, *l2;
+#endif
+
+       switch (cmd) {
+       case IOCTL_PRIVCMD_HYPERCALL: {
+               privcmd_hypercall_t hypercall;
+
+               if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+                       return -EFAULT;
+
+#ifdef CONFIG_X86
+               ret = -ENOSYS;
+               /* Reject hypercall numbers at or above PAGE_SIZE / 32. */
+               if (hypercall.op >= (PAGE_SIZE >> 5))
+                       break;
+               ret = _hypercall(long, (unsigned int)hypercall.op,
+                                (unsigned long)hypercall.arg[0],
+                                (unsigned long)hypercall.arg[1],
+                                (unsigned long)hypercall.arg[2],
+                                (unsigned long)hypercall.arg[3],
+                                (unsigned long)hypercall.arg[4]);
+#else
+               ret = privcmd_hypercall(&hypercall);
+#endif
+       }
+       break;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+
+       case IOCTL_PRIVCMD_MMAP: {
+#define MMAP_NR_PER_PAGE \
+       (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*msg))
+               privcmd_mmap_t mmapcmd;
+               privcmd_mmap_entry_t *msg;
+               privcmd_mmap_entry_t __user *p;
+
+               if (!is_initial_xendomain())
+                       return -EPERM;
+
+               if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+                       return -EFAULT;
+
+               if (mmapcmd.num <= 0)
+                       return -EINVAL;
+
+               /*
+                * Stage all entries in kernel pages.  mmap_sem is not held
+                * yet, so failures here must bypass the up_write() at
+                * mmap_out and go straight to mmap_free.
+                */
+               p = mmapcmd.entry;
+               for (i = 0; i < mmapcmd.num;) {
+                       nr = min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
+
+                       ret = -ENOMEM;
+                       l = (struct list_head *) __get_free_page(GFP_KERNEL);
+                       if (l == NULL)
+                               goto mmap_free;
+
+                       INIT_LIST_HEAD(l);
+                       list_add_tail(l, &pagelist);
+                       msg = (privcmd_mmap_entry_t*)(l + 1);
+
+                       ret = -EFAULT;
+                       if (copy_from_user(msg, p, nr*sizeof(*msg)))
+                               goto mmap_free;
+                       i += nr;
+                       p += nr;
+               }
+
+               l = pagelist.next;
+               msg = (privcmd_mmap_entry_t*)(l + 1);
+
+               down_write(&mm->mmap_sem);
+
+               /* The first entry must start exactly at a vma's start. */
+               vma = find_vma(mm, msg->va);
+               ret = -EINVAL;
+               if (!vma || (msg->va != vma->vm_start))
+                       goto mmap_out;
+
+               addr = vma->vm_start;
+
+               /* First pass: validate every entry before mapping anything. */
+               i = 0;
+               list_for_each(l, &pagelist) {
+                       nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
+
+                       msg = (privcmd_mmap_entry_t*)(l + 1);
+                       while (i<nr) {
+
+                               /* Do not allow range to wrap the address space. */
+                               if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+                                   (((unsigned long)msg->npages << PAGE_SHIFT) >= -addr))
+                                       goto mmap_out;
+
+                               /* Range chunks must be contiguous in va space. */
+                               if ((msg->va != addr) ||
+                                   ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+                                       goto mmap_out;
+
+                               addr += msg->npages << PAGE_SHIFT;
+                               msg++;
+                               i++;
+                       }
+               }
+
+               if (!enforce_singleshot_mapping(vma, vma->vm_start,
+                                               (addr - vma->vm_start) >> PAGE_SHIFT))
+                       goto mmap_out;
+
+               /* Second pass: establish the mappings. */
+               addr = vma->vm_start;
+               i = 0;
+               list_for_each(l, &pagelist) {
+                       nr = i + min(mmapcmd.num - i, MMAP_NR_PER_PAGE);
+
+                       msg = (privcmd_mmap_entry_t*)(l + 1);
+                       while (i < nr) {
+                               if ((ret = direct_remap_pfn_range(
+                                            vma,
+                                            msg->va & PAGE_MASK,
+                                            msg->mfn,
+                                            msg->npages << PAGE_SHIFT,
+                                            vma->vm_page_prot,
+                                            mmapcmd.dom)) < 0)
+                                       goto mmap_out;
+
+                               addr += msg->npages << PAGE_SHIFT;
+                               msg++;
+                               i++;
+                       }
+               }
+
+               ret = 0;
+
+       mmap_out:
+               up_write(&mm->mmap_sem);
+       mmap_free:
+               list_for_each_safe(l,l2,&pagelist)
+                       free_page((unsigned long)l);
+       }
+#undef MMAP_NR_PER_PAGE
+       break;
+
+       case IOCTL_PRIVCMD_MMAPBATCH: {
+#define MMAPBATCH_NR_PER_PAGE \
+       (unsigned long)((PAGE_SIZE - sizeof(*l)) / sizeof(*mfn))
+               privcmd_mmapbatch_t m;
+               xen_pfn_t __user *p;
+               xen_pfn_t *mfn;
+
+               if (!is_initial_xendomain())
+                       return -EPERM;
+
+               if (copy_from_user(&m, udata, sizeof(m)))
+                       return -EFAULT;
+
+               /* Reject counts and addresses that would wrap or truncate. */
+               nr_pages = m.num;
+               addr = m.addr;
+               if (m.num <= 0 || nr_pages > (LONG_MAX >> PAGE_SHIFT) ||
+                   addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT))
+                       return -EINVAL;
+
+               /* Stage the frame array; mmap_sem is not held in this loop. */
+               p = m.arr;
+               for (i=0; i<nr_pages; ) {
+                       nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+
+                       ret = -ENOMEM;
+                       l = (struct list_head *)__get_free_page(GFP_KERNEL);
+                       if (l == NULL)
+                               goto mmapbatch_out;
+
+                       INIT_LIST_HEAD(l);
+                       list_add_tail(l, &pagelist);
+
+                       mfn = (unsigned long*)(l + 1);
+                       ret = -EFAULT;
+                       if (copy_from_user(mfn, p, nr*sizeof(*mfn)))
+                               goto mmapbatch_out;
+
+                       i += nr; p+= nr;
+               }
+
+               down_write(&mm->mmap_sem);
+
+               vma = find_vma(mm, addr);
+               ret = -EINVAL;
+               if (!vma ||
+                   addr < vma->vm_start ||
+                   addr + (nr_pages << PAGE_SHIFT) > vma->vm_end ||
+                   !enforce_singleshot_mapping(vma, addr, nr_pages)) {
+                       up_write(&mm->mmap_sem);
+                       goto mmapbatch_out;
+               }
+
+               /*
+                * Map page by page.  Failures do not abort the batch: the
+                * staged frame number is tagged instead and ret counts the
+                * failed pages.
+                */
+               i = 0;
+               ret = 0;
+               paged_out = 0;
+               list_for_each(l, &pagelist) {
+                       nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+                       mfn = (unsigned long *)(l + 1);
+
+                       while (i<nr) {
+                               int rc;
+
+                               rc = direct_remap_pfn_range(vma, addr & PAGE_MASK,
+                                                           *mfn, PAGE_SIZE,
+                                                           vma->vm_page_prot, m.dom);
+                               if(rc < 0) {
+                                       if (rc == -ENOENT)
+                                       {
+                                               *mfn |= 0x80000000U;
+                                               paged_out = 1;
+                                       }
+                                       else
+                                               *mfn |= 0xf0000000U;
+                                       ret++;
+                               }
+                               mfn++; i++; addr += PAGE_SIZE;
+                       }
+               }
+
+               up_write(&mm->mmap_sem);
+               /* Hand the tagged array back only if something failed. */
+               if (ret > 0) {
+                       p = m.arr;
+                       i = 0;
+                       if (paged_out)
+                               ret = -ENOENT;
+                       else
+                               ret = 0;
+                       list_for_each(l, &pagelist) {
+                               nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+                               mfn = (unsigned long *)(l + 1);
+                               if (copy_to_user(p, mfn, nr*sizeof(*mfn)))
+                                       ret = -EFAULT;
+                               i += nr; p += nr;
+                       }
+               }
+       mmapbatch_out:
+               list_for_each_safe(l,l2,&pagelist)
+                       free_page((unsigned long)l);
+       }
+       break;
+
+       case IOCTL_PRIVCMD_MMAPBATCH_V2: {
+               privcmd_mmapbatch_v2_t m;
+               const xen_pfn_t __user *p;
+               xen_pfn_t *mfn;
+               int *err;
+
+               if (!is_initial_xendomain())
+                       return -EPERM;
+
+               if (copy_from_user(&m, udata, sizeof(m)))
+                       return -EFAULT;
+
+               nr_pages = m.num;
+               addr = m.addr;
+               if (m.num <= 0 || nr_pages > (ULONG_MAX >> PAGE_SHIFT) ||
+                   addr != m.addr || nr_pages > (-addr >> PAGE_SHIFT))
+                       return -EINVAL;
+
+               /* Stage the frame array; mmap_sem is not held in this loop. */
+               p = m.arr;
+               for (i = 0; i < nr_pages; i += nr, p += nr) {
+                       nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+
+                       ret = -ENOMEM;
+                       l = (struct list_head *)__get_free_page(GFP_KERNEL);
+                       if (l == NULL)
+                               goto mmapbatch_v2_out;
+
+                       INIT_LIST_HEAD(l);
+                       list_add_tail(l, &pagelist);
+
+                       mfn = (void *)(l + 1);
+                       ret = -EFAULT;
+                       if (copy_from_user(mfn, p, nr * sizeof(*mfn)))
+                               goto mmapbatch_v2_out;
+               }
+
+               down_write(&mm->mmap_sem);
+
+               vma = find_vma(mm, addr);
+               ret = -EINVAL;
+               if (!vma ||
+                   addr < vma->vm_start ||
+                   addr + (nr_pages << PAGE_SHIFT) > vma->vm_end ||
+                   !enforce_singleshot_mapping(vma, addr, nr_pages)) {
+                       up_write(&mm->mmap_sem);
+                       goto mmapbatch_v2_out;
+               }
+
+               i = 0;
+               ret = 0;
+               paged_out = 0;
+               list_for_each(l, &pagelist) {
+                       nr = i + min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+                       mfn = (void *)(l + 1);
+                       /*
+                        * The per-page result codes overwrite the staged
+                        * frame numbers in place; this is only safe while an
+                        * int is no larger than a xen_pfn_t, so each write
+                        * lands on storage that has already been read.
+                        */
+                       err = (void *)(l + 1);
+                       BUILD_BUG_ON(sizeof(*err) > sizeof(*mfn));
+
+                       while (i < nr) {
+                               int rc;
+
+                               rc = direct_remap_pfn_range(vma, addr & PAGE_MASK,
+                                                           *mfn, PAGE_SIZE,
+                                                           vma->vm_page_prot, m.dom);
+                               if (rc < 0) {
+                                       if (rc == -ENOENT)
+                                               paged_out = 1;
+                                       ret++;
+                               } else
+                                       BUG_ON(rc > 0);
+                               *err++ = rc;
+                               mfn++; i++; addr += PAGE_SIZE;
+                       }
+               }
+
+               up_write(&mm->mmap_sem);
+
+               /*
+                * Report per-page results through m.err: copy the codes out
+                * if anything failed, otherwise just zero the user array.
+                */
+               if (ret > 0) {
+                       int __user *p = m.err;
+
+                       ret = paged_out ? -ENOENT : 0;
+                       i = 0;
+                       list_for_each(l, &pagelist) {
+                               nr = min(nr_pages - i, MMAPBATCH_NR_PER_PAGE);
+                               err = (void *)(l + 1);
+                               if (copy_to_user(p, err, nr * sizeof(*err)))
+                                       ret = -EFAULT;
+                               i += nr; p += nr;
+                       }
+               } else if (clear_user(m.err, nr_pages * sizeof(*m.err)))
+                       ret = -EFAULT;
+
+       mmapbatch_v2_out:
+               list_for_each_safe(l, l2, &pagelist)
+                       free_page((unsigned long)l);
+#undef MMAPBATCH_NR_PER_PAGE
+       }
+       break;
+
+#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+/* Fault handler for privcmd vmas: always reports VM_FAULT_SIGBUS. */
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+       .fault = privcmd_fault,
+};
+
+/*
+ * mmap handler: tag the vma and install privcmd_vm_ops.  Returns -ENOSYS
+ * for auto-translated guests, 0 otherwise.
+ */
+static int privcmd_mmap(struct file * file, struct vm_area_struct * vma)
+{
+       /* Unsupported for auto-translate guests. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return -ENOSYS;
+
+       vma->vm_private_data = NULL;
+       vma->vm_ops = &privcmd_vm_ops;
+
+       /*
+        * VM_DONTCOPY is essential for Xen because copy_page_range doesn't
+        * know how to recreate these mappings.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_PFNMAP | VM_IO | VM_RESERVED;
+
+       return 0;
+}
+#endif
+
+/*
+ * File operations installed on the "privcmd" proc entry by privcmd_init().
+ * The mmap handler is only wired up for CONFIG_XEN_PRIVILEGED_GUEST builds.
+ */
+static const struct file_operations privcmd_file_ops = {
+       .open = nonseekable_open,
+       .llseek = no_llseek,
+       .unlocked_ioctl = privcmd_ioctl,
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       .mmap = privcmd_mmap,
+#endif
+};
+
+/*
+ * read_proc handler for the "capabilities" entry: emits "control_d\n" in
+ * the initial Xen domain and an empty string otherwise.  Always sets *eof
+ * and returns the number of characters written.
+ */
+static int capabilities_read(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+       int written;
+
+       page[0] = 0;
+       written = is_initial_xendomain() ? sprintf(page, "control_d\n") : 0;
+
+       *eof = 1;
+       return written;
+}
+
+/*
+ * Create the "privcmd" and "capabilities" Xen proc entries and attach their
+ * handlers.  Returns -ENODEV when not running on Xen and 0 otherwise, even
+ * if an entry could not be created.
+ */
+static int __init privcmd_init(void)
+{
+       struct proc_dir_entry *entry;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       entry = create_xen_proc_entry("privcmd", 0400);
+       privcmd_intf = entry;
+       if (entry)
+               entry->proc_fops = &privcmd_file_ops;
+
+       entry = create_xen_proc_entry("capabilities", 0400 );
+       capabilities_intf = entry;
+       if (entry)
+               entry->read_proc = capabilities_read;
+
+       return 0;
+}
+
+__initcall(privcmd_init);
diff --git a/drivers/xen/scsiback/Makefile b/drivers/xen/scsiback/Makefile

new file mode 100644 (file)

index 0000000..56271df
--- /dev/null
+++ b/drivers/xen/scsiback/Makefile
@@ -0,0 +1,4 @@
+# Build the SCSI backend as the composite object xen-scsibk when enabled.
+obj-$(CONFIG_XEN_SCSI_BACKEND) := xen-scsibk.o
+
+# Objects linked into xen-scsibk.
+xen-scsibk-y   := interface.o scsiback.o xenbus.o translate.o emulate.o
+
diff --git a/drivers/xen/scsiback/common.h b/drivers/xen/scsiback/common.h

new file mode 100644 (file)

index 0000000..ee481d1
--- /dev/null
+++ b/drivers/xen/scsiback/common.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __SCSIIF__BACKEND__COMMON_H__
+#define __SCSIIF__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+#include <linux/blkdev.h>
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_eh.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/vscsiif.h>
+
+
+/* pr_debug() wrapper that prefixes the message with source file and line. */
+#define DPRINTK(_f, _a...)                     \
+       pr_debug("(file=%s, line=%d) " _f,      \
+                __FILE__ , __LINE__ , ## _a )
+
+/* A SCSI address expressed as host / channel / target / LUN numbers. */
+struct ids_tuple {
+       unsigned int hst;               /* host    */
+       unsigned int chn;               /* channel */
+       unsigned int tgt;               /* target  */
+       unsigned int lun;               /* LUN     */
+};
+
+/*
+ * One translation entry: maps the address tuple 'v' to a scsi_device.
+ * Entries are chained through 'l' on vscsibk_info.v2p_entry_lists.
+ */
+struct v2p_entry {
+       struct ids_tuple v;             /* translate from */
+       struct scsi_device *sdev;       /* translate to   */
+       struct list_head l;
+};
+
+/*
+ * Per-connection backend state.  Instances are allocated by
+ * vscsibk_info_alloc() and released by scsiback_free().
+ */
+struct vscsibk_info {
+       struct xenbus_device *dev;      /* used to map/unmap the shared ring */
+
+       domid_t domid;                  /* set by vscsibk_info_alloc() */
+       unsigned int evtchn;
+       unsigned int irq;               /* non-zero once the event channel is bound */
+
+       int feature;
+
+       struct vscsiif_back_ring  ring; /* initialised by scsiback_init_sring() */
+       struct vm_struct *ring_area;    /* mapping backing 'ring' */
+
+       spinlock_t ring_lock;
+       atomic_t nr_unreplied_reqs;     /* see scsiback_get()/scsiback_put() */
+
+       spinlock_t v2p_lock;            /* held while walking v2p_entry_lists */
+       struct list_head v2p_entry_lists;       /* list of struct v2p_entry */
+
+       struct task_struct *kthread;    /* stopped by scsiback_disconnect() */
+       wait_queue_head_t waiting_to_free;      /* woken when nr_unreplied_reqs hits 0 */
+       wait_queue_head_t wq;
+       unsigned int waiting_reqs;
+       struct page **mmap_pages;
+
+};
+
+/*
+ * State of one request while the backend is processing it: the command as
+ * received, its data buffer description and the result to send back.
+ */
+typedef struct {
+       unsigned char act;
+       struct vscsibk_info *info;      /* owning backend instance */
+       struct scsi_device *sdev;
+
+       uint16_t rqid;
+       
+       uint16_t v_chn, v_tgt;          /* channel / target the request addresses */
+
+       uint8_t nr_segments;            /* number of entries in 'sgl' */
+       uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* CDB; cmnd[0] is the opcode */
+       uint8_t cmd_len;
+
+       uint8_t sc_data_direction;
+       uint16_t timeout_per_command;
+       
+       uint32_t request_bufflen;
+       struct scatterlist *sgl;        /* data buffer segments */
+       grant_ref_t gref[VSCSIIF_SG_TABLESIZE];
+
+       int32_t rslt;                   /* result code for the response */
+       uint32_t resid;                 /* residual byte count for the response */
+       uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+
+       struct list_head free_list;
+} pending_req_t;
+
+
+
+/* Increment the nr_unreplied_reqs counter of backend _b. */
+#define scsiback_get(_b) (atomic_inc(&(_b)->nr_unreplied_reqs))
+/*
+ * Decrement nr_unreplied_reqs of backend _b and, when it drops to zero,
+ * wake up whoever sleeps on waiting_to_free.
+ */
+#define scsiback_put(_b)                               \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->nr_unreplied_reqs))      \
+                       wake_up(&(_b)->waiting_to_free);\
+       } while (0)
+
+#define VSCSIIF_TIMEOUT                (900*HZ)
+
+#define VSCSI_TYPE_HOST                1
+
+irqreturn_t scsiback_intr(int, void *);
+int scsiback_init_sring(struct vscsibk_info *, grant_ref_t, evtchn_port_t);
+int scsiback_schedule(void *data);
+
+
+struct vscsibk_info *vscsibk_info_alloc(domid_t domid);
+void scsiback_free(struct vscsibk_info *info);
+void scsiback_disconnect(struct vscsibk_info *);
+int __init scsiback_interface_init(void);
+void scsiback_interface_exit(void);
+int scsiback_xenbus_init(void);
+void scsiback_xenbus_unregister(void);
+
+void scsiback_init_translation_table(struct vscsibk_info *info);
+
+int scsiback_add_translation_entry(struct vscsibk_info *info,
+                       struct scsi_device *sdev, struct ids_tuple *v);
+
+int scsiback_del_translation_entry(struct vscsibk_info *info,
+                               struct ids_tuple *v);
+struct scsi_device *scsiback_do_translation(struct vscsibk_info *info,
+                       struct ids_tuple *v);
+void scsiback_release_translation_entry(struct vscsibk_info *info);
+
+
+void scsiback_cmd_exec(pending_req_t *pending_req);
+void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+                       uint32_t resid, pending_req_t *pending_req);
+void scsiback_fast_flush_area(pending_req_t *req);
+
+void scsiback_rsp_emulation(pending_req_t *pending_req);
+void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req);
+void scsiback_emulation_init(void);
+
+
+#endif /* __SCSIIF__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/scsiback/emulate.c b/drivers/xen/scsiback/emulate.c

new file mode 100644 (file)

index 0000000..67113a7
--- /dev/null
+++ b/drivers/xen/scsiback/emulate.c
@@ -0,0 +1,480 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+* Patched to support >2TB drives + allow tape & autoloader operations
+* 2010, Samuel Kvasnica, IMS Nanofabrication AG
+*/
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include "common.h"
+
+/* Following SCSI commands are not defined in scsi/scsi.h */
+#define EXTENDED_COPY          0x83    /* EXTENDED COPY command        */
+#define REPORT_ALIASES         0xa3    /* REPORT ALIASES command       */
+#define CHANGE_ALIASES         0xa4    /* CHANGE ALIASES command       */
+#define SET_PRIORITY           0xa4    /* SET PRIORITY command         */
+
+
+/*
+  The bitmap in order to control emulation.
+  (Bit 3 to 7 are reserved for future use.)
+*/
+#define VSCSIIF_NEED_CMD_EXEC          0x01    /* If this bit is set, cmd exec */
+                                               /* is required.                 */
+#define VSCSIIF_NEED_EMULATE_REQBUF    0x02    /* If this bit is set, the      */
+                                               /* request buffer needs         */
+                                               /* emulation before cmd exec.   */
+#define VSCSIIF_NEED_EMULATE_RSPBUF    0x04    /* If this bit is set, the      */
+                                               /* response buffer needs        */
+                                               /* emulation after cmd exec.    */
+
+/* Additional Sense Code (ASC) used */
+#define NO_ADDITIONAL_SENSE            0x0
+#define LOGICAL_UNIT_NOT_READY         0x4
+#define UNRECOVERED_READ_ERR           0x11
+#define PARAMETER_LIST_LENGTH_ERR      0x1a
+#define INVALID_OPCODE                 0x20
+#define ADDR_OUT_OF_RANGE              0x21
+#define INVALID_FIELD_IN_CDB           0x24
+#define INVALID_FIELD_IN_PARAM_LIST    0x26
+#define POWERON_RESET                  0x29
+#define SAVING_PARAMS_UNSUP            0x39
+#define THRESHOLD_EXCEEDED             0x5d
+#define LOW_POWER_COND_ON              0x5e
+
+
+
+/* Number of SCSI op_codes (one table slot per possible opcode byte) */
+#define VSCSI_MAX_SCSI_OP_CODE         256
+/* Per-opcode VSCSIIF_NEED_* flags, filled in by scsiback_emulation_init(). */
+static unsigned char bitmap[VSCSI_MAX_SCSI_OP_CODE];
+
+/*
+ * Mark opcode @cmd as pass-through: it only needs native command execution
+ * and has neither a pre- nor a post-emulation handler.
+ *
+ * The body is wrapped in do { } while (0) so that the three assignments
+ * always act as a single statement (e.g. under an unbraced if/else), and
+ * the argument is parenthesized so that any expression may be passed.
+ */
+#define NO_EMULATE(cmd)                                        \
+       do {                                            \
+               bitmap[(cmd)] = VSCSIIF_NEED_CMD_EXEC;  \
+               pre_function[(cmd)] = NULL;             \
+               post_function[(cmd)] = NULL;            \
+       } while (0)
+
+
+
+/*
+  Emulation routines for each SCSI op_code, indexed by opcode.
+  pre_function[] is consulted by __pre_do_emulation() and post_function[]
+  by scsiback_rsp_emulation(); a NULL slot means "no handler".
+*/
+static void (*pre_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *);
+static void (*post_function[VSCSI_MAX_SCSI_OP_CODE])(pending_req_t *, void *);
+
+
+/*
+ * Result code stored in pending_req->rslt by the emulation handlers on
+ * failure: DRIVER_SENSE in bits 24 and up, CHECK CONDITION status below.
+ */
+static const int check_condition_result =
+               (DRIVER_SENSE << 24) | SAM_STAT_CHECK_CONDITION;
+
+/*
+ * Fill in a fixed-format sense buffer.
+ *
+ * Only bytes 0, 2, 7, 12 and 13 of @data are written; every other byte is
+ * left as the caller provided it.
+ */
+static void scsiback_mk_sense_buffer(uint8_t *data, uint8_t key,
+                       uint8_t asc, uint8_t asq)
+{
+       uint8_t *const sense = data;
+
+       sense[0]  = 0x70;       /* fixed, current */
+       sense[2]  = key;
+       sense[7]  = 0x0a;       /* implies 18 byte sense buffer */
+       sense[12] = asc;
+       sense[13] = asq;
+}
+
+/*
+ * Emulation handler for opcodes the backend does not support: completes
+ * the request with a CHECK CONDITION result, ILLEGAL_REQUEST /
+ * INVALID_OPCODE sense data and a zero residual.  @data is unused.
+ */
+static void resp_not_supported_cmd(pending_req_t *pending_req, void *data)
+{
+       pending_req->rslt  = check_condition_result;
+       pending_req->resid = 0;
+       scsiback_mk_sense_buffer(pending_req->sense_buffer,
+                                ILLEGAL_REQUEST, INVALID_OPCODE, 0);
+}
+
+
+/*
+ * Copy @buflen bytes from @buf into the pages described by the first
+ * @nr_sg entries of @sgl, filling each entry up to its length.
+ *
+ * Returns 0 as soon as all bytes have been copied, or -ENOMEM when an
+ * entry has no page or the entries run out before the data does.
+ */
+static int __copy_to_sg(struct scatterlist *sgl, unsigned int nr_sg,
+              void *buf, unsigned int buflen)
+{
+       struct scatterlist *sg;
+       const char *src = buf;
+       unsigned int remaining = buflen;
+       unsigned int idx;
+
+       for_each_sg (sgl, sg, nr_sg, idx) {
+               struct page *pg = sg_page(sg);
+               unsigned int chunk;
+               void *dst;
+
+               if (pg == NULL) {
+                       pr_warning("%s: inconsistent length field in "
+                                  "scatterlist\n", __FUNCTION__);
+                       return -ENOMEM;
+               }
+
+               /* never write more than this entry can hold */
+               chunk = min_t(unsigned int, sg->length, remaining);
+               dst = pfn_to_kaddr(page_to_pfn(pg)) + sg->offset;
+               memcpy(dst, src, chunk);
+
+               remaining -= chunk;
+               if (remaining == 0)
+                       return 0;
+
+               src += chunk;
+       }
+
+       pr_warning("%s: no space in scatterlist\n", __FUNCTION__);
+       return -ENOMEM;
+}
+
+/*
+ * Gather the contents of the first @nr_sg entries of @sgl into @buf, which
+ * can hold @buflen bytes.  Every entry is copied in full.
+ *
+ * Returns 0 on success, or -ENOMEM when an entry has no page or does not
+ * fit into what is left of @buf.
+ */
+static int __maybe_unused __copy_from_sg(struct scatterlist *sgl,
+                                        unsigned int nr_sg, void *buf,
+                                        unsigned int buflen)
+{
+       struct scatterlist *sg;
+       char *dst = buf;
+       unsigned int room = buflen;
+       unsigned int idx;
+
+       for_each_sg (sgl, sg, nr_sg, idx) {
+               struct page *pg = sg_page(sg);
+               unsigned int seg_len;
+
+               if (pg == NULL) {
+                       pr_warning("%s: inconsistent length field in "
+                                  "scatterlist\n", __FUNCTION__);
+                       return -ENOMEM;
+               }
+
+               /* an empty entry can never exceed the (unsigned) room left */
+               seg_len = sg->length;
+               if (seg_len > room) {
+                       pr_warning("%s: no space in destination buffer\n",
+                                  __FUNCTION__);
+                       return -ENOMEM;
+               }
+
+               memcpy(dst, pfn_to_kaddr(page_to_pfn(pg)) + sg->offset,
+                      seg_len);
+
+               room -= seg_len;
+               dst += seg_len;
+       }
+
+       return 0;
+}
+
+/*
+ * Count the entries on @info's translation list (regardless of their
+ * channel or target) while holding v2p_lock.
+ */
+static int __nr_luns_under_host(struct vscsibk_info *info)
+{
+       struct list_head *pos;
+       unsigned long flags;
+       int count = 0;
+
+       spin_lock_irqsave(&info->v2p_lock, flags);
+       list_for_each(pos, &info->v2p_entry_lists)
+               count++;
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+       return count;
+}
+
+
+/* REPORT LUNS Define*/
+#define VSCSI_REPORT_LUNS_HEADER       8
+#define VSCSI_REPORT_LUNS_RETRY                3
+
+/*
+ * Emulate REPORT LUNS (quoted scsi_debug.c/resp_report_luns()).
+ *
+ * Builds the LUN list from the translation entries of @pending_req->info
+ * that match the request's channel and target, and copies as much of it
+ * as fits into the request's scatterlist.  On success rslt is 0 and resid
+ * holds the unused part of the scatterlist; on any failure the request is
+ * completed with CHECK CONDITION / ILLEGAL_REQUEST / INVALID_FIELD_IN_CDB.
+ * @data is unused.
+ *
+ * The response buffer is sized from a snapshot of the translation list
+ * taken without the lock held across the allocation; if the list grew in
+ * the meantime the whole sequence (count, allocate, fill) is redone, at
+ * most VSCSI_REPORT_LUNS_RETRY times.
+ */
+static void __report_luns(pending_req_t *pending_req, void *data)
+{
+       struct vscsibk_info *info   = pending_req->info;
+       unsigned int        channel = pending_req->v_chn;
+       unsigned int        target  = pending_req->v_tgt;
+       unsigned int        nr_seg  = pending_req->nr_segments;
+       unsigned char *cmd = (unsigned char *)pending_req->cmnd;
+
+       unsigned char *buff = NULL;
+       /* must be wide enough for 8 bytes per LUN: a char wraps at 31 LUNs */
+       unsigned int alloc_len;
+       unsigned int alloc_luns = 0;
+       unsigned int req_bufflen = 0;
+       unsigned int actual_len = 0;
+       unsigned int retry_cnt = 0;
+       unsigned int lun_cnt = 0;
+       unsigned int i;
+       int select_report = (int)cmd[2];
+       int lun, upper, err = 0;
+
+       struct v2p_entry *entry;
+       struct list_head *head = &(info->v2p_entry_lists);
+       unsigned long flags;
+
+       struct scsi_lun *one_lun;
+
+       /* allocation length from the CDB; widen before shifting into bit 31 */
+       req_bufflen = (unsigned int)cmd[9] + ((unsigned int)cmd[8] << 8) +
+                     ((unsigned int)cmd[7] << 16) +
+                     ((unsigned int)cmd[6] << 24);
+       if ((req_bufflen < 4) || (select_report != 0))
+               goto fail;
+
+retry:
+       /* (re)take the snapshot so that a retry really gets a bigger buffer */
+       alloc_luns = __nr_luns_under_host(info);
+       alloc_len  = sizeof(struct scsi_lun) * alloc_luns
+                               + VSCSI_REPORT_LUNS_HEADER;
+
+       if ((buff = kzalloc(alloc_len, GFP_KERNEL)) == NULL) {
+               pr_err("scsiback:%s kmalloc err\n", __FUNCTION__);
+               goto fail;
+       }
+
+       lun_cnt = 0;
+       one_lun = (struct scsi_lun *) &buff[VSCSI_REPORT_LUNS_HEADER];
+       spin_lock_irqsave(&info->v2p_lock, flags);
+       list_for_each_entry(entry, head, l) {
+               if ((entry->v.chn != channel) ||
+                   (entry->v.tgt != target))
+                       continue;
+
+               /* check overflow: the list grew after it was counted */
+               if (lun_cnt >= alloc_luns) {
+                       spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+                       kfree(buff);
+                       buff = NULL;
+
+                       if (retry_cnt < VSCSI_REPORT_LUNS_RETRY) {
+                               retry_cnt++;
+                               goto retry;
+                       }
+
+                       goto fail;
+               }
+
+               /* buff is zeroed, so only non-zero bytes need storing */
+               lun = entry->v.lun;
+               upper = (lun >> 8) & 0x3f;
+               if (upper)
+                       one_lun[lun_cnt].scsi_lun[0] = upper;
+               one_lun[lun_cnt].scsi_lun[1] = lun & 0xff;
+               lun_cnt++;
+       }
+
+       spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+       /* LUN list length field of the header */
+       buff[2] = ((sizeof(struct scsi_lun) * lun_cnt) >> 8) & 0xff;
+       buff[3] = (sizeof(struct scsi_lun) * lun_cnt) & 0xff;
+
+       actual_len = lun_cnt * sizeof(struct scsi_lun)
+                               + VSCSI_REPORT_LUNS_HEADER;
+
+       /* from here on req_bufflen is the capacity of the scatterlist */
+       req_bufflen = 0;
+       for (i = 0; i < nr_seg; i++)
+               req_bufflen += pending_req->sgl[i].length;
+
+       err = __copy_to_sg(pending_req->sgl, nr_seg, buff,
+                               min(req_bufflen, actual_len));
+       if (err)
+               goto fail;
+
+       memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
+       pending_req->rslt = 0x00;
+       pending_req->resid = req_bufflen - min(req_bufflen, actual_len);
+
+       kfree(buff);
+       return;
+
+fail:
+       scsiback_mk_sense_buffer(pending_req->sense_buffer, ILLEGAL_REQUEST,
+               INVALID_FIELD_IN_CDB, 0);
+       pending_req->rslt  = check_condition_result;
+       pending_req->resid = 0;
+       kfree(buff);
+       return;
+}
+
+
+
+/*
+ * Run the pre-emulation handler registered for the request's opcode, if
+ * the opcode is flagged VSCSIIF_NEED_EMULATE_REQBUF and has a handler.
+ *
+ * Returns:
+ *   0: no need for a native driver call, the caller should reply
+ *      immediately;
+ *   1: no emulation, or the native driver should be called after the
+ *      request buffer has been modified.
+ */
+int __pre_do_emulation(pending_req_t *pending_req, void *data)
+{
+       const uint8_t opcode = pending_req->cmnd[0];
+       void (*handler)(pending_req_t *, void *) = pre_function[opcode];
+
+       if (handler != NULL &&
+           (bitmap[opcode] & VSCSIIF_NEED_EMULATE_REQBUF))
+               handler(pending_req, data);
+
+       return (bitmap[opcode] & VSCSIIF_NEED_CMD_EXEC) ? 1 : 0;
+}
+
+/*
+ * Run the post-emulation handler registered for the request's opcode, if
+ * the opcode is flagged VSCSIIF_NEED_EMULATE_RSPBUF and has a handler.
+ */
+void scsiback_rsp_emulation(pending_req_t *pending_req)
+{
+       const uint8_t opcode = pending_req->cmnd[0];
+       void (*handler)(pending_req_t *, void *) = post_function[opcode];
+
+       if (handler == NULL)
+               return;
+
+       if (bitmap[opcode] & VSCSIIF_NEED_EMULATE_RSPBUF)
+               handler(pending_req, NULL);
+}
+
+
+/*
+ * Entry point for a new request: apply pre-emulation, then either hand
+ * the command to scsiback_cmd_exec() or, when no native execution is
+ * needed, flush the request's area and send the response right away.
+ */
+void scsiback_req_emulation_or_cmdexec(pending_req_t *pending_req)
+{
+       if (__pre_do_emulation(pending_req, NULL)) {
+               scsiback_cmd_exec(pending_req);
+               return;
+       }
+
+       /* fully emulated: the result is already in pending_req */
+       scsiback_fast_flush_area(pending_req);
+       scsiback_do_resp_with_sense(pending_req->sense_buffer,
+                                   pending_req->rslt,
+                                   pending_req->resid,
+                                   pending_req);
+}
+
+
+/*
+  Following are not customizable functions.
+*/
+/*
+ * Fill the per-opcode emulation tables (bitmap[], pre_function[] and
+ * post_function[]).  Every opcode starts out rejected through
+ * resp_not_supported_cmd(); the opcodes listed with NO_EMULATE() are then
+ * switched to plain native execution, and REPORT_LUNS is routed to
+ * __report_luns().
+ */
+void scsiback_emulation_init(void)
+{
+       int i;
+
+       /* Initialize to default state */
+       for (i = 0; i < VSCSI_MAX_SCSI_OP_CODE; i++) {
+               bitmap[i]        = (VSCSIIF_NEED_EMULATE_REQBUF | 
+                                       VSCSIIF_NEED_EMULATE_RSPBUF);
+               pre_function[i]  = resp_not_supported_cmd;
+               post_function[i] = NULL;
+               /* means,
+                  - pre-emulation answers with "not supported"
+                  - no post-emulation handler
+                  - the native driver is not called
+                    (VSCSIIF_NEED_CMD_EXEC is not set)
+               */
+       }
+
+       /*
+         Register appropriate functions below as you need.
+         (See scsi/scsi.h for definition of SCSI op_code.)
+       */
+
+       /*
+         Following commands do not require emulation.
+       */
+       NO_EMULATE(TEST_UNIT_READY);       /*0x00*/ /* sd,st */
+       NO_EMULATE(REZERO_UNIT);           /*0x01*/ /* st */
+       NO_EMULATE(REQUEST_SENSE);         /*0x03*/
+       NO_EMULATE(FORMAT_UNIT);           /*0x04*/
+       NO_EMULATE(READ_BLOCK_LIMITS);     /*0x05*/ /* st */
+       /*NO_EMULATE(REASSIGN_BLOCKS);       *//*0x07*/
+       NO_EMULATE(INITIALIZE_ELEMENT_STATUS); /*0x07*/ /* ch */
+       NO_EMULATE(READ_6);                /*0x08*/ /* sd,st */
+       NO_EMULATE(WRITE_6);               /*0x0a*/ /* sd,st */
+       NO_EMULATE(SEEK_6);                /*0x0b*/
+       /*NO_EMULATE(READ_REVERSE);          *//*0x0f*/
+       NO_EMULATE(WRITE_FILEMARKS);       /*0x10*/ /* st */
+       NO_EMULATE(SPACE);                 /*0x11*/ /* st */
+       NO_EMULATE(INQUIRY);               /*0x12*/
+       /*NO_EMULATE(RECOVER_BUFFERED_DATA); *//*0x14*/
+       NO_EMULATE(MODE_SELECT);           /*0x15*/ /* st */
+       NO_EMULATE(RESERVE);               /*0x16*/
+       NO_EMULATE(RELEASE);               /*0x17*/
+       /*NO_EMULATE(COPY);                  *//*0x18*/
+       NO_EMULATE(ERASE);                 /*0x19*/ /* st */
+       NO_EMULATE(MODE_SENSE);            /*0x1a*/ /* st */
+       NO_EMULATE(START_STOP);            /*0x1b*/ /* sd,st */
+       NO_EMULATE(RECEIVE_DIAGNOSTIC);    /*0x1c*/
+       NO_EMULATE(SEND_DIAGNOSTIC);       /*0x1d*/
+       NO_EMULATE(ALLOW_MEDIUM_REMOVAL);  /*0x1e*/
+
+       /*NO_EMULATE(SET_WINDOW);            *//*0x24*/
+       NO_EMULATE(READ_CAPACITY);         /*0x25*/ /* sd */
+       NO_EMULATE(READ_10);               /*0x28*/ /* sd */
+       NO_EMULATE(WRITE_10);              /*0x2a*/ /* sd */
+       NO_EMULATE(SEEK_10);               /*0x2b*/ /* st */
+       NO_EMULATE(POSITION_TO_ELEMENT);   /*0x2b*/ /* ch */
+       /*NO_EMULATE(WRITE_VERIFY);          *//*0x2e*/
+       /*NO_EMULATE(VERIFY);                *//*0x2f*/
+       /*NO_EMULATE(SEARCH_HIGH);           *//*0x30*/
+       /*NO_EMULATE(SEARCH_EQUAL);          *//*0x31*/
+       /*NO_EMULATE(SEARCH_LOW);            *//*0x32*/
+       NO_EMULATE(SET_LIMITS);            /*0x33*/
+       NO_EMULATE(PRE_FETCH);             /*0x34*/ /* st! */
+       NO_EMULATE(READ_POSITION);          /*0x34*/ /* st */
+       NO_EMULATE(SYNCHRONIZE_CACHE);      /*0x35*/ /* sd */
+       NO_EMULATE(LOCK_UNLOCK_CACHE);     /*0x36*/
+       NO_EMULATE(READ_DEFECT_DATA);      /*0x37*/
+       NO_EMULATE(MEDIUM_SCAN);           /*0x38*/
+       /*NO_EMULATE(COMPARE);               *//*0x39*/
+       /*NO_EMULATE(COPY_VERIFY);           *//*0x3a*/
+       NO_EMULATE(WRITE_BUFFER);          /*0x3b*/
+       NO_EMULATE(READ_BUFFER);           /*0x3c*/ /* osst */
+       /*NO_EMULATE(UPDATE_BLOCK);          *//*0x3d*/
+       /*NO_EMULATE(READ_LONG);             *//*0x3e*/
+       /*NO_EMULATE(WRITE_LONG);            *//*0x3f*/
+       /*NO_EMULATE(CHANGE_DEFINITION);     *//*0x40*/
+       /*NO_EMULATE(WRITE_SAME);            *//*0x41*/
+       NO_EMULATE(READ_TOC);              /*0x43*/ /* sr */
+       NO_EMULATE(LOG_SELECT);            /*0x4c*/
+       NO_EMULATE(LOG_SENSE);             /*0x4d*/ /* st! */
+       /*NO_EMULATE(MODE_SELECT_10);        *//*0x55*/
+       /*NO_EMULATE(RESERVE_10);            *//*0x56*/
+       /*NO_EMULATE(RELEASE_10);            *//*0x57*/
+       NO_EMULATE(MODE_SENSE_10);         /*0x5a*/ /* scsi_lib */
+       /*NO_EMULATE(PERSISTENT_RESERVE_IN); *//*0x5e*/
+       /*NO_EMULATE(PERSISTENT_RESERVE_OUT); *//*0x5f*/
+       /*           REPORT_LUNS             *//*0xa0*//*Full emulation*/
+#ifdef MAINTENANCE_IN
+       NO_EMULATE(MAINTENANCE_IN);           /*0xa3*/ /* IFT alua */
+       NO_EMULATE(MAINTENANCE_OUT);       /*0xa4*/ /* IFT alua */
+#endif
+       NO_EMULATE(MOVE_MEDIUM);           /*0xa5*/ /* ch */
+       NO_EMULATE(EXCHANGE_MEDIUM);       /*0xa6*/ /* ch */
+       /*NO_EMULATE(READ_12);               *//*0xa8*/
+       /*NO_EMULATE(WRITE_12);              *//*0xaa*/
+       /*NO_EMULATE(WRITE_VERIFY_12);       *//*0xae*/
+       /*NO_EMULATE(SEARCH_HIGH_12);        *//*0xb0*/
+       /*NO_EMULATE(SEARCH_EQUAL_12);       *//*0xb1*/
+       /*NO_EMULATE(SEARCH_LOW_12);         *//*0xb2*/
+       NO_EMULATE(READ_ELEMENT_STATUS);   /*0xb8*/ /* ch */
+       NO_EMULATE(SEND_VOLUME_TAG);       /*0xb6*/ /* ch */
+       /*NO_EMULATE(WRITE_LONG_2);          *//*0xea*/
+       NO_EMULATE(READ_16);               /*0x88*/ /* sd >2TB */
+       NO_EMULATE(WRITE_16);              /*0x8a*/ /* sd >2TB */
+       NO_EMULATE(VERIFY_16);             /*0x8f*/
+       NO_EMULATE(SERVICE_ACTION_IN);     /*0x9e*/ /* sd >2TB */
+
+/* st: QFA_REQUEST_BLOCK, QFA_SEEK_BLOCK might be needed ? */
+       /*
+         Following commands require emulation.
+       */
+       pre_function[REPORT_LUNS] = __report_luns;
+       bitmap[REPORT_LUNS] = (VSCSIIF_NEED_EMULATE_REQBUF | 
+                                       VSCSIIF_NEED_EMULATE_RSPBUF);
+
+       return;
+}
diff --git a/drivers/xen/scsiback/interface.c b/drivers/xen/scsiback/interface.c

new file mode 100644 (file)

index 0000000..9098a3c
--- /dev/null
+++ b/drivers/xen/scsiback/interface.c
@@ -0,0 +1,141 @@
+/*
+ * interface management.
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include "common.h"
+
+#include <xen/evtchn.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+
+
+/* Slab cache that struct vscsibk_info objects are allocated from. */
+static struct kmem_cache *scsiback_cachep;
+
+/*
+ * Allocate a zeroed vscsibk_info for domain @domid and initialise its
+ * ring lock, request counter and wait queues.
+ *
+ * Returns the new object, or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+struct vscsibk_info *vscsibk_info_alloc(domid_t domid)
+{
+       struct vscsibk_info *be;
+
+       be = kmem_cache_zalloc(scsiback_cachep, GFP_KERNEL);
+       if (be == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       be->domid = domid;
+
+       atomic_set(&be->nr_unreplied_reqs, 0);
+       spin_lock_init(&be->ring_lock);
+
+       init_waitqueue_head(&be->waiting_to_free);
+       init_waitqueue_head(&be->wq);
+
+       return be;
+}
+
+/*
+ * Connect @info to the frontend: map the shared ring granted through
+ * @ring_ref, initialise the back ring on it and bind @evtchn to
+ * scsiback_intr().
+ *
+ * Returns 0 on success, -1 when @info already has an irq bound, or the
+ * negative error from mapping the ring or binding the event channel.  On
+ * failure no mapping is left behind and info->ring.sring / info->ring_area
+ * are cleared, so that a later scsiback_disconnect() does not unmap the
+ * ring a second time.
+ */
+int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref,
+                       evtchn_port_t evtchn)
+{
+       struct vm_struct *area;
+       struct vscsiif_sring *sring;
+       int err;
+
+       if (info->irq) {
+               pr_err("scsiback: Already connected through?\n");
+               return -1;
+       }
+
+       area = xenbus_map_ring_valloc(info->dev, ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       info->ring_area = area;
+
+       sring = (struct vscsiif_sring *)area->addr;
+       BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+       err = bind_interdomain_evtchn_to_irqhandler(
+                       info->domid, evtchn,
+                       scsiback_intr, 0, "vscsiif-backend", info);
+
+       if (err < 0)
+               goto unmap_page;
+
+       /* on success the return value is the irq number */
+       info->irq = err;
+
+       return 0;
+
+unmap_page:
+       xenbus_unmap_ring_vfree(info->dev, area);
+       /* drop the stale references to the mapping that was just released */
+       info->ring.sring = NULL;
+       info->ring_area = NULL;
+
+       return err;
+}
+
+/*
+ * Tear down the connection set up for @info, in this order: stop the
+ * kthread (if any), sleep until nr_unreplied_reqs has dropped to zero,
+ * unbind the irq (if bound) and unmap the shared ring (if mapped).
+ */
+void scsiback_disconnect(struct vscsibk_info *info)
+{
+       if (info->kthread) {
+               kthread_stop(info->kthread);
+               info->kthread = NULL;
+       }
+
+       /* woken by scsiback_put() when the counter reaches zero */
+       wait_event(info->waiting_to_free, 
+               atomic_read(&info->nr_unreplied_reqs) == 0);
+
+       if (info->irq) {
+               unbind_from_irqhandler(info->irq, info);
+               info->irq = 0;
+       }
+
+       if (info->ring.sring) {
+               xenbus_unmap_ring_vfree(info->dev, info->ring_area);
+               info->ring.sring = NULL;
+       }
+}
+
+/* Return @info to the slab cache it was allocated from. */
+void scsiback_free(struct vscsibk_info *info)
+{
+       kmem_cache_free(scsiback_cachep, info);
+}
+
+/*
+ * Create the slab cache used for vscsibk_info objects.
+ *
+ * Returns 0 on success or -ENOMEM when the cache cannot be created.
+ */
+int __init scsiback_interface_init(void)
+{
+       scsiback_cachep = kmem_cache_create("vscsiif_cache",
+                                           sizeof(struct vscsibk_info),
+                                           0, 0, NULL);
+       if (scsiback_cachep)
+               return 0;
+
+       pr_err("scsiback: can't init scsi cache\n");
+       return -ENOMEM;
+}
+
+/* Destroy the slab cache created by scsiback_interface_init(). */
+void scsiback_interface_exit(void)
+{
+       kmem_cache_destroy(scsiback_cachep);
+}
diff --git a/drivers/xen/scsiback/scsiback.c b/drivers/xen/scsiback/scsiback.c

new file mode 100644 (file)

index 0000000..726723d
--- /dev/null
+++ b/drivers/xen/scsiback/scsiback.c
@@ -0,0 +1,730 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <asm/hypervisor.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_eh.h>
+
+#include "common.h"
+
+
+/*
+ * Pool of unused pending_req_t slots.  alloc_req() takes entries from the
+ * head of pending_free and free_req() puts them back; both do so under
+ * pending_free_lock.  pending_free_wq is woken by free_req() when the list
+ * goes from empty to non-empty.
+ */
+struct list_head pending_free;
+DEFINE_SPINLOCK(pending_free_lock);
+DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+/* Number of pending_req_t slots allocated by scsiback_init() ("reqs="). */
+int vscsiif_reqs = VSCSIIF_BACK_MAX_PENDING_REQS;
+module_param_named(reqs, vscsiif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of scsiback requests to allocate");
+
+/*
+ * When non-zero, scsiback_cmd_done() logs failed commands through
+ * scsiback_print_status().
+ * NOTE(review): the variable is unsigned int but is registered with the
+ * "int" parameter type; "uint" would match the declaration.
+ */
+static unsigned int log_print_stat = 0;
+module_param(log_print_stat, int, 0644);
+
+/* Marks a slot of pending_grant_handles[] that holds no mapped grant. */
+#define SCSIBACK_INVALID_HANDLE (~0)
+
+/*
+ * Arrays allocated in scsiback_init().  pending_reqs has vscsiif_reqs
+ * entries; pending_pages and pending_grant_handles have
+ * vscsiif_reqs * VSCSIIF_SG_TABLESIZE entries and are indexed with
+ * vaddr_pagenr().
+ */
+static pending_req_t *pending_reqs;
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
+/*
+ * Index into pending_pages[] / pending_grant_handles[] for segment @seg
+ * of @req.  Each request owns VSCSIIF_SG_TABLESIZE consecutive slots.
+ */
+static int vaddr_pagenr(pending_req_t *req, int seg)
+{
+       const ptrdiff_t req_index = req - pending_reqs;
+
+       return req_index * VSCSIIF_SG_TABLESIZE + seg;
+}
+
+/*
+ * Kernel virtual address of the page reserved for segment @seg of @req.
+ */
+static unsigned long vaddr(pending_req_t *req, int seg)
+{
+       struct page *pg = pending_pages[vaddr_pagenr(req, seg)];
+
+       return (unsigned long)pfn_to_kaddr(page_to_pfn(pg));
+}
+
+/*
+ * Grant handle slot for segment _seg of request _req.  Expands to an
+ * lvalue, so it can be both read and assigned.
+ */
+#define pending_handle(_req, _seg) \
+       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+
+/*
+ * Unmap every grant that is still mapped for @req and free the request's
+ * scatterlist.
+ *
+ * Segments whose handle is SCSIBACK_INVALID_HANDLE (never mapped, or
+ * already unmapped) are skipped.  The unmap operations are packed densely
+ * at the front of unmap[] so that exactly the first invcount entries
+ * passed to the hypervisor are initialised; indexing unmap[] by segment
+ * number would leave holes whenever a segment is skipped.
+ *
+ * Does nothing when the request carries no segments.
+ */
+void scsiback_fast_flush_area(pending_req_t *req)
+{
+       struct gnttab_unmap_grant_ref unmap[VSCSIIF_SG_TABLESIZE];
+       unsigned int i, invcount = 0;
+       grant_handle_t handle;
+       int err;
+
+       if (!req->nr_segments)
+               return;
+
+       for (i = 0; i < req->nr_segments; i++) {
+               handle = pending_handle(req, i);
+               if (handle == SCSIBACK_INVALID_HANDLE)
+                       continue;
+               gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+                                   GNTMAP_host_map, handle);
+               pending_handle(req, i) = SCSIBACK_INVALID_HANDLE;
+               invcount++;
+       }
+
+       /* Nothing to hand to the hypervisor if no segment was mapped. */
+       if (invcount) {
+               err = HYPERVISOR_grant_table_op(
+                       GNTTABOP_unmap_grant_ref, unmap, invcount);
+               BUG_ON(err);
+       }
+
+       /* Allocated in scsiback_gnttab_data_map(); clear to avoid reuse. */
+       kfree(req->sgl);
+       req->sgl = NULL;
+}
+
+
+/*
+ * Take one request slot from the pending_free pool.
+ *
+ * Returns the slot, or NULL when the pool is empty.  @info is not used.
+ */
+static pending_req_t * alloc_req(struct vscsibk_info *info)
+{
+       pending_req_t *slot;
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&pending_free_lock, irq_state);
+       if (list_empty(&pending_free)) {
+               slot = NULL;
+       } else {
+               slot = list_entry(pending_free.next, pending_req_t, free_list);
+               list_del(&slot->free_list);
+       }
+       spin_unlock_irqrestore(&pending_free_lock, irq_state);
+
+       return slot;
+}
+
+
+/*
+ * Put @req back at the head of the pending_free pool.  If the pool was
+ * empty beforehand, wake the waiters on pending_free_wq.
+ */
+static void free_req(pending_req_t *req)
+{
+       unsigned long irq_state;
+       int pool_was_empty;
+
+       spin_lock_irqsave(&pending_free_lock, irq_state);
+       pool_was_empty = list_empty(&pending_free);
+       list_add(&req->free_list, &pending_free);
+       spin_unlock_irqrestore(&pending_free_lock, irq_state);
+
+       if (!pool_was_empty)
+               return;
+
+       wake_up(&pending_free_wq);
+}
+
+
+/*
+ * Flag that @info has ring work pending and wake sleepers on info->wq
+ * (scsiback_schedule() waits there for waiting_reqs to become non-zero).
+ */
+static void scsiback_notify_work(struct vscsibk_info *info)
+{
+       info->waiting_reqs = 1;
+       wake_up(&info->wq);
+}
+
+/*
+ * Queue a response for @pending_req on the shared ring, notify the
+ * frontend if required, and return @pending_req to the free pool.
+ *
+ * @sense_buffer: sense data to copy into the response, or NULL for none.
+ *                Assumed to be at least VSCSIIF_SENSE_BUFFERSIZE bytes, as
+ *                is the case for pending_req->sense_buffer -- TODO confirm
+ *                for callers outside this file.
+ * @result:       SCSI result word stored in the response.
+ * @resid:        residual byte count stored in the response.
+ * @pending_req:  request being completed; must not be used by the caller
+ *                after this function returns.
+ *
+ * sense_len is always written: it is 0 when there is no sense buffer or
+ * the buffer does not hold valid sense data, so a stale value left in the
+ * ring slot is never exposed to the frontend.
+ */
+void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+                       uint32_t resid, pending_req_t *pending_req)
+{
+       vscsiif_response_t *ring_res;
+       struct vscsibk_info *info = pending_req->info;
+       int notify;
+       int more_to_do = 1;
+       struct scsi_sense_hdr sshdr;
+       unsigned long flags;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       spin_lock_irqsave(&info->ring_lock, flags);
+
+       ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
+       info->ring.rsp_prod_pvt++;
+
+       ring_res->rslt   = result;
+       ring_res->rqid   = pending_req->rqid;
+
+       /*
+        * Pass the real buffer length: sizeof(sense_buffer) would only be
+        * the size of the pointer.
+        */
+       if (sense_buffer != NULL &&
+           scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE,
+                                &sshdr)) {
+               /*
+                * Byte 7 is the additional sense length.  Read it as
+                * unsigned so values >= 0x80 cannot yield a negative
+                * (i.e. huge) copy length.
+                */
+               unsigned int len = 8 + (unsigned char)sense_buffer[7];
+
+               if (len > VSCSIIF_SENSE_BUFFERSIZE)
+                       len = VSCSIIF_SENSE_BUFFERSIZE;
+
+               memcpy(ring_res->sense_buffer, sense_buffer, len);
+               ring_res->sense_len = len;
+       } else {
+               ring_res->sense_len = 0;
+       }
+
+       ring_res->residual_len = resid;
+
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
+       if (info->ring.rsp_prod_pvt == info->ring.req_cons) {
+               RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
+       } else if (RING_HAS_UNCONSUMED_REQUESTS(&info->ring)) {
+               more_to_do = 1;
+       }
+
+       spin_unlock_irqrestore(&info->ring_lock, flags);
+
+       if (more_to_do)
+               scsiback_notify_work(info);
+
+       if (notify)
+               notify_remote_via_irq(info->irq);
+
+       free_req(pending_req);
+}
+
+/*
+ * Log a failed command: the physical device address, the decoded bytes of
+ * the result word @errors, the first CDB byte, and -- when the status
+ * byte has CHECK_CONDITION set -- the decoded sense data.
+ */
+static void scsiback_print_status(char *sense_buffer, int errors,
+                                       pending_req_t *pending_req)
+{
+       struct scsi_device *sdev = pending_req->sdev;
+       
+       /*
+        * NOTE(review): the first pr_err() has no trailing newline and is
+        * continued by a second pr_err(); the two pieces may end up on
+        * separate log lines.
+        */
+       pr_err("scsiback: %d:%d:%d:%d ",
+              sdev->host->host_no, sdev->channel, sdev->id, sdev->lun);
+       pr_err("status = 0x%02x, message = 0x%02x, host = 0x%02x,"
+              " driver = 0x%02x\n",
+              status_byte(errors), msg_byte(errors),
+              host_byte(errors), driver_byte(errors));
+
+       pr_err("scsiback: cmnd[0]=0x%02X\n", pending_req->cmnd[0]);
+
+       if (CHECK_CONDITION & status_byte(errors))
+               __scsi_print_sense("scsiback", sense_buffer, SCSI_SENSE_BUFFERSIZE);
+}
+
+
+/*
+ * Completion callback for requests submitted by scsiback_cmd_exec().
+ *
+ * Optionally logs the failure, runs response emulation for non-host-mode
+ * backends, unmaps the data grants, sends the response to the frontend,
+ * drops the reference taken in scsiback_cmd_exec() and releases the block
+ * request.  @uptodate is not used.
+ */
+static void scsiback_cmd_done(struct request *req, int uptodate)
+{
+       pending_req_t *pending_req = req->end_io_data;
+       /*
+        * Remember the owner now: scsiback_do_resp_with_sense() hands
+        * pending_req back to the free pool, after which its fields may be
+        * rewritten for another request.
+        */
+       struct vscsibk_info *info = pending_req->info;
+       unsigned char *sense_buffer;
+       unsigned int resid;
+       int errors;
+
+       sense_buffer = req->sense;
+       resid        = blk_rq_bytes(req);
+       errors       = req->errors;
+
+       if (errors != 0) {
+               if (log_print_stat)
+                       scsiback_print_status(sense_buffer, errors, pending_req);
+       }
+
+       /* Host mode passes results through without emulation. */
+       if (info->feature != VSCSI_TYPE_HOST)
+               scsiback_rsp_emulation(pending_req);
+
+       scsiback_fast_flush_area(pending_req);
+       scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req);
+       scsiback_put(info);
+
+       __blk_put_request(req->q, req);
+}
+
+
+/*
+ * Map the frontend's data pages for @pending_req and build its
+ * scatterlist.
+ *
+ * For each of pending_req->nr_segments segments the grant reference from
+ * @ring_req is mapped at the request's reserved page (read-only when the
+ * data direction is DMA_TO_DEVICE), the grant handle is recorded with
+ * pending_handle(), and a scatterlist entry is filled from the segment's
+ * offset and length.  The total length is stored in
+ * pending_req->request_bufflen.
+ *
+ * Returns 0 on success (also when there are no segments) and -ENOMEM on
+ * any failure: scatterlist allocation, a grant that could not be mapped,
+ * or a segment that does not fit inside one page.  On the latter two the
+ * mappings made so far are undone via scsiback_fast_flush_area().
+ */
+static int scsiback_gnttab_data_map(vscsiif_request_t *ring_req,
+                                       pending_req_t *pending_req)
+{
+       u32 flags;
+       int write;
+       int i, err = 0;
+       unsigned int data_len = 0;
+       struct gnttab_map_grant_ref map[VSCSIIF_SG_TABLESIZE];
+       struct vscsibk_info *info   = pending_req->info;
+
+       int data_dir = (int)pending_req->sc_data_direction;
+       unsigned int nr_segments = (unsigned int)pending_req->nr_segments;
+
+       write = (data_dir == DMA_TO_DEVICE);
+
+       if (nr_segments) {
+               struct scatterlist *sg;
+
+               /* sgl is freed in scsiback_fast_flush_area() */
+               pending_req->sgl = kmalloc(sizeof(struct scatterlist) * nr_segments,
+                                               GFP_KERNEL);
+               if (!pending_req->sgl) {
+                       pr_err("scsiback: %s: kmalloc() error\n", __FUNCTION__);
+                       return -ENOMEM;
+               }
+
+               sg_init_table(pending_req->sgl, nr_segments);
+
+               /* Data written to the device only needs to be read here. */
+               flags = GNTMAP_host_map;
+               if (write)
+                       flags |= GNTMAP_readonly;
+
+               for (i = 0; i < nr_segments; i++)
+                       gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
+                                               ring_req->seg[i].gref,
+                                               info->domid);
+
+               err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nr_segments);
+               BUG_ON(err);
+
+               for_each_sg (pending_req->sgl, sg, nr_segments, i) {
+                       struct page *pg;
+
+                       /* Retry maps with GNTST_eagain */
+                       if (unlikely(map[i].status == GNTST_eagain))
+                               gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
+                       if (unlikely(map[i].status != GNTST_okay)) {
+                               pr_err("scsiback: invalid buffer -- could not remap it\n");
+                               map[i].handle = SCSIBACK_INVALID_HANDLE;
+                               err |= 1;
+                       }
+
+                       /*
+                        * Record every handle, valid or not, so that the
+                        * flush on the failure path unmaps exactly the
+                        * segments that were mapped.
+                        */
+                       pending_handle(pending_req, i) = map[i].handle;
+
+                       /* After the first failure only handles are recorded. */
+                       if (err)
+                               continue;
+
+                       pg = pending_pages[vaddr_pagenr(pending_req, i)];
+
+                       set_phys_to_machine(page_to_pfn(pg),
+                               FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+
+                       sg_set_page(sg, pg, ring_req->seg[i].length,
+                                   ring_req->seg[i].offset);
+                       data_len += sg->length;
+
+                       /*
+                        * Validate the values stored in sg rather than
+                        * re-reading them from the ring request.
+                        */
+                       barrier();
+                       if (sg->offset >= PAGE_SIZE ||
+                           sg->length > PAGE_SIZE ||
+                           sg->offset + sg->length > PAGE_SIZE)
+                               err |= 1;
+
+               }
+
+               if (err)
+                       goto fail_flush;
+       }
+       
+       pending_req->request_bufflen = data_len;
+       
+       return 0;
+       
+fail_flush:
+       scsiback_fast_flush_area(pending_req);
+       return -ENOMEM;
+}
+
+/*
+ * bio completion handler (modelled on scsi_bi_endio() in scsi_lib.c):
+ * drops the reference on @bio.  @error is ignored.
+ */
+static void scsiback_bi_endio(struct bio *bio, int error)
+{
+       bio_put(bio);
+}
+
+
+
+/*
+ * Build a chain of bios covering the scatterlist of @pending_req
+ * (modelled on scsi_req_map_sg() in scsi_lib.c).
+ *
+ * Every bio is linked into the bio_first..bio_last chain as soon as it is
+ * allocated, so on failure the free_bios loop is the single place that
+ * releases them.  The failing bio must not be put separately before
+ * jumping there, otherwise it would be released twice.
+ *
+ * Returns the first bio of the chain (NULL if no data bytes were mapped),
+ * or ERR_PTR(-ENOMEM / -EINVAL) on failure.
+ */
+static struct bio *request_map_sg(pending_req_t *pending_req)
+{
+       struct request_queue *q = pending_req->sdev->request_queue;
+       unsigned int nsegs = (unsigned int)pending_req->nr_segments;
+       unsigned int i, len, bytes, off, nr_pages, nr_vecs = 0;
+       struct scatterlist *sg;
+       struct page *page;
+       struct bio *bio = NULL, *bio_first = NULL, *bio_last = NULL;
+       int err;
+
+       for_each_sg (pending_req->sgl, sg, nsegs, i) {
+               page = sg_page(sg);
+               off = sg->offset;
+               len = sg->length;
+
+               nr_pages = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+               while (len > 0) {
+                       bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+
+                       if (!bio) {
+                               nr_vecs = min_t(unsigned int, BIO_MAX_PAGES,
+                                               nr_pages);
+                               nr_pages -= nr_vecs;
+                               bio = bio_alloc(GFP_KERNEL, nr_vecs);
+                               if (!bio) {
+                                       err = -ENOMEM;
+                                       goto free_bios;
+                               }
+                               bio->bi_end_io = scsiback_bi_endio;
+                               /* Append to the chain right away. */
+                               if (bio_last)
+                                       bio_last->bi_next = bio;
+                               else
+                                       bio_first = bio;
+                               bio_last = bio;
+                       }
+
+                       if (bio_add_pc_page(q, bio, page, bytes, off) !=
+                                               bytes) {
+                               /* bio is on the chain; free_bios puts it. */
+                               err = -EINVAL;
+                               goto free_bios;
+                       }
+
+                       /* This bio is full: finalise it and start anew. */
+                       if (bio->bi_vcnt >= nr_vecs) {
+                               bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+                               if (pending_req->sc_data_direction == WRITE)
+                                       bio->bi_rw |= REQ_WRITE;
+                               bio = NULL;
+                       }
+
+                       page++;
+                       len -= bytes;
+                       off = 0;
+               }
+       }
+
+       return bio_first;
+
+free_bios:
+       while ((bio = bio_first) != NULL) {
+               bio_first = bio->bi_next;
+               bio_put(bio);
+       }
+
+       return ERR_PTR(err);
+}
+
+
+/*
+ * Complete a command that could not be handed to the block layer: undo
+ * the grant mappings and answer the frontend with a driver error, which
+ * also returns the request slot to the free pool.
+ */
+static void scsiback_cmd_exec_fail(pending_req_t *pending_req)
+{
+       scsiback_fast_flush_area(pending_req);
+       scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24), 0,
+                                   pending_req);
+}
+
+/*
+ * Submit the SCSI command described by @pending_req to the physical
+ * device's request queue.  Completion is reported asynchronously through
+ * scsiback_cmd_done().
+ *
+ * If the block request cannot be built, the command is failed back to
+ * the frontend immediately instead of being dropped silently, so that
+ * neither the request slot nor the grant mappings are leaked.
+ */
+void scsiback_cmd_exec(pending_req_t *pending_req)
+{
+       int cmd_len  = (int)pending_req->cmd_len;
+       int data_dir = (int)pending_req->sc_data_direction;
+       unsigned int timeout;
+       struct request *rq;
+       int write;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       /* The backend must not time out earlier than the frontend. */
+       if (pending_req->timeout_per_command)
+               timeout = pending_req->timeout_per_command * HZ;
+       else
+               timeout = VSCSIIF_TIMEOUT;
+
+       write = (data_dir == DMA_TO_DEVICE);
+       if (pending_req->nr_segments) {
+               struct bio *bio = request_map_sg(pending_req);
+
+               if (IS_ERR(bio)) {
+                       pr_err("scsiback: SG Request Map Error\n");
+                       scsiback_cmd_exec_fail(pending_req);
+                       return;
+               }
+
+               rq = blk_make_request(pending_req->sdev->request_queue, bio,
+                                     GFP_KERNEL);
+               if (IS_ERR(rq)) {
+                       pr_err("scsiback: Make Request Error\n");
+                       /*
+                        * NOTE(review): the bio chain built above is not
+                        * released on this path -- confirm ownership after
+                        * a failed blk_make_request().
+                        */
+                       scsiback_cmd_exec_fail(pending_req);
+                       return;
+               }
+
+               rq->buffer = NULL;
+       } else {
+               rq = blk_get_request(pending_req->sdev->request_queue, write,
+                                    GFP_KERNEL);
+               if (unlikely(!rq)) {
+                       pr_err("scsiback: Get Request Error\n");
+                       scsiback_cmd_exec_fail(pending_req);
+                       return;
+               }
+       }
+
+       rq->cmd_type = REQ_TYPE_BLOCK_PC;
+       rq->cmd_len = cmd_len;
+       memcpy(rq->cmd, pending_req->cmnd, cmd_len);
+
+       memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
+       rq->sense       = pending_req->sense_buffer;
+       rq->sense_len = 0;
+
+       /* Retries are left to the frontend. */
+       rq->retries   = 0;
+       rq->timeout   = timeout;
+       rq->end_io_data = pending_req;
+
+       /* Reference dropped in scsiback_cmd_done(). */
+       scsiback_get(pending_req->info);
+       blk_execute_rq_nowait(rq->q, NULL, rq, 1, scsiback_cmd_done);
+}
+
+
+/*
+ * Handle a reset request from the frontend: reset the physical device
+ * and report the outcome as the response's result.  A reference on the
+ * backend is held for the duration of the call.
+ */
+static void scsiback_device_reset_exec(pending_req_t *pending_req)
+{
+       struct scsi_device *target = pending_req->sdev;
+       struct vscsibk_info *owner = pending_req->info;
+       int reset_result;
+
+       scsiback_get(owner);
+
+       reset_result = scsi_reset_provider(target, SCSI_TRY_RESET_DEVICE);
+       scsiback_do_resp_with_sense(NULL, reset_result, 0, pending_req);
+
+       scsiback_put(owner);
+}
+
+
+/*
+ * Event channel interrupt handler: @dev_id is the vscsibk_info that was
+ * registered with the IRQ; flag pending work and wake its worker.
+ */
+irqreturn_t scsiback_intr(int irq, void *dev_id)
+{
+       struct vscsibk_info *info = dev_id;
+
+       scsiback_notify_work(info);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Copy one request from the shared ring into @pending_req, validating
+ * each field, translate the virtual device address to a physical
+ * scsi_device, and map the data buffers.
+ *
+ * Each frontend-controlled field is first copied into @pending_req and
+ * then range-checked on the private copy (with a compiler barrier in
+ * between).
+ *
+ * Returns 0 on success, -ENODEV when the virtual address has no
+ * translation entry, and -EINVAL for an out-of-range field or a failure
+ * to map the data buffers.
+ */
+static int prepare_pending_reqs(struct vscsibk_info *info,
+               vscsiif_request_t *ring_req, pending_req_t *pending_req)
+{
+       struct scsi_device *sdev;
+       struct ids_tuple vir;
+       int err = -EINVAL;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       pending_req->rqid       = ring_req->rqid;
+       pending_req->act        = ring_req->act;
+
+       pending_req->info       = info;
+
+       pending_req->v_chn = vir.chn = ring_req->channel;
+       pending_req->v_tgt = vir.tgt = ring_req->id;
+       vir.lun = ring_req->lun;
+
+       rmb();
+       /* Look up the physical device behind the virtual chn/tgt/lun. */
+       sdev = scsiback_do_translation(info, &vir);
+       if (!sdev) {
+               pending_req->sdev = NULL;
+               DPRINTK("scsiback: doesn't exist.\n");
+               err = -ENODEV;
+               goto invalid_value;
+       }
+       pending_req->sdev = sdev;
+
+       /* request range check from frontend */
+       pending_req->sc_data_direction = ring_req->sc_data_direction;
+       barrier();
+       if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) &&
+               (pending_req->sc_data_direction != DMA_TO_DEVICE) &&
+               (pending_req->sc_data_direction != DMA_FROM_DEVICE) &&
+               (pending_req->sc_data_direction != DMA_NONE)) {
+               DPRINTK("scsiback: invalid parameter data_dir = %d\n",
+                       pending_req->sc_data_direction);
+               err = -EINVAL;
+               goto invalid_value;
+       }
+
+       /* Bounded so the on-stack grant op arrays cannot be overrun. */
+       pending_req->nr_segments = ring_req->nr_segments;
+       barrier();
+       if (pending_req->nr_segments > VSCSIIF_SG_TABLESIZE) {
+               DPRINTK("scsiback: invalid parameter nr_seg = %d\n",
+                       pending_req->nr_segments);
+               err = -EINVAL;
+               goto invalid_value;
+       }
+
+       /* Bounded before it is used as the CDB copy length below. */
+       pending_req->cmd_len = ring_req->cmd_len;
+       barrier();
+       if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
+               DPRINTK("scsiback: invalid parameter cmd_len = %d\n",
+                       pending_req->cmd_len);
+               err = -EINVAL;
+               goto invalid_value;
+       }
+       memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
+       
+       pending_req->timeout_per_command = ring_req->timeout_per_command;
+
+       /* Any mapping failure is reported to the caller as -EINVAL. */
+       if(scsiback_gnttab_data_map(ring_req, pending_req)) {
+               DPRINTK("scsiback: invalid buffer\n");
+               err = -EINVAL;
+               goto invalid_value;
+       }
+
+       return 0;
+
+invalid_value:
+       return err;
+}
+
+
+/*
+ * Consume requests from the shared ring of @info and dispatch them.
+ *
+ * For each request a pending_req_t is taken from the free pool, filled
+ * in by prepare_pending_reqs() and then executed, emulated or reset
+ * according to its action code.  Requests that fail preparation, or that
+ * carry an unknown action, are answered immediately with an error.
+ *
+ * Returns non-zero when work remains (the free pool ran dry, or the ring
+ * still holds unconsumed requests), 0 otherwise.
+ */
+static int scsiback_do_cmd_fn(struct vscsibk_info *info)
+{
+       struct vscsiif_back_ring *ring = &info->ring;
+       vscsiif_request_t  *ring_req;
+
+       pending_req_t *pending_req;
+       RING_IDX rc, rp;
+       int err, more_to_do = 0;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       /* Snapshot the producer index once; later requests wait. */
+       rc = ring->req_cons;
+       rp = ring->sring->req_prod;
+       rmb();
+
+       while ((rc != rp)) {
+               if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
+                       break;
+               pending_req = alloc_req(info);
+               if (NULL == pending_req) {
+                       /* Pool exhausted: ask to be run again later. */
+                       more_to_do = 1;
+                       break;
+               }
+
+               ring_req = RING_GET_REQUEST(ring, rc);
+               ring->req_cons = ++rc;
+
+               err = prepare_pending_reqs(info, ring_req,
+                                               pending_req);
+               if (err == -EINVAL) {
+                       scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24),
+                               0, pending_req);
+                       continue;
+               } else if (err == -ENODEV) {
+                       scsiback_do_resp_with_sense(NULL, (DID_NO_CONNECT << 16),
+                               0, pending_req);
+                       continue;
+               }
+
+               if (pending_req->act == VSCSIIF_ACT_SCSI_CDB) {
+
+                       /* Host mode executes commands directly, without emulation. */
+                       if (info->feature == VSCSI_TYPE_HOST)
+                               scsiback_cmd_exec(pending_req);
+                       else
+                               scsiback_req_emulation_or_cmdexec(pending_req);
+
+               } else if (pending_req->act == VSCSIIF_ACT_SCSI_RESET) {
+                       scsiback_device_reset_exec(pending_req);
+               } else {
+                       pr_err("scsiback: invalid parameter for request\n");
+                       scsiback_do_resp_with_sense(NULL, (DRIVER_ERROR << 24),
+                               0, pending_req);
+                       continue;
+               }
+       }
+
+       if (RING_HAS_UNCONSUMED_REQUESTS(ring))
+               more_to_do = 1;
+
+       /* Yield point for this unbounded loop. */
+       cond_resched();
+
+       return more_to_do;
+}
+
+
+/*
+ * Worker thread body for one backend instance (@data is its
+ * vscsibk_info).
+ *
+ * Each pass sleeps until ring work has been flagged and until the free
+ * request pool is non-empty, then processes the ring.  The thread exits
+ * with 0 once kthread_stop() has been requested.
+ */
+int scsiback_schedule(void *data)
+{
+       struct vscsibk_info *info = data;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       for (;;) {
+               if (kthread_should_stop())
+                       break;
+
+               /* Sleep until there is something on the ring ... */
+               wait_event_interruptible(
+                       info->wq,
+                       info->waiting_reqs || kthread_should_stop());
+               /* ... and a free request slot to put it in. */
+               wait_event_interruptible(
+                       pending_free_wq,
+                       !list_empty(&pending_free) || kthread_should_stop());
+
+               /* Clear the flag before looking at the ring. */
+               info->waiting_reqs = 0;
+               smp_mb();
+
+               if (scsiback_do_cmd_fn(info))
+                       info->waiting_reqs = 1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * Module initialisation: allocate the global request pool with its page
+ * and grant-handle tables, create the interface slab cache, register
+ * with xenbus and set up command emulation.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, -EINVAL for an
+ * unusable "reqs" module parameter, -ENOMEM on allocation failure, or the
+ * error reported by the failing sub-initialiser.
+ */
+static int __init scsiback_init(void)
+{
+       int i, mmap_pages, err;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       /*
+        * "reqs" comes from the module command line; reject values that
+        * are non-positive or would overflow the page count below.
+        */
+       if (vscsiif_reqs <= 0 ||
+           vscsiif_reqs > INT_MAX / VSCSIIF_SG_TABLESIZE) {
+               pr_err("scsiback: %s: invalid reqs value %d\n",
+                      __FUNCTION__, vscsiif_reqs);
+               return -EINVAL;
+       }
+
+       mmap_pages = vscsiif_reqs * VSCSIIF_SG_TABLESIZE;
+
+       /* kcalloc() checks the count * size multiplication for overflow. */
+       pending_reqs          = kcalloc(vscsiif_reqs,
+                                       sizeof(pending_reqs[0]), GFP_KERNEL);
+       pending_grant_handles = kcalloc(mmap_pages,
+                                       sizeof(pending_grant_handles[0]),
+                                       GFP_KERNEL);
+       pending_pages         = alloc_empty_pages_and_pagevec(mmap_pages);
+
+       if (!pending_reqs || !pending_grant_handles || !pending_pages) {
+               pr_err("scsiback: %s: out of memory\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto out_free;
+       }
+
+       for (i = 0; i < mmap_pages; i++)
+               pending_grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+
+       err = scsiback_interface_init();
+       if (err < 0)
+               goto out_free;
+
+       INIT_LIST_HEAD(&pending_free);
+
+       for (i = 0; i < vscsiif_reqs; i++)
+               list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+       err = scsiback_xenbus_init();
+       if (err) {
+               /* Never report success after tearing everything down. */
+               if (err > 0)
+                       err = -EIO;
+               pr_err("scsiback: %s: xenbus registration failed (%d)\n",
+                      __FUNCTION__, err);
+               goto out_interface;
+       }
+
+       scsiback_emulation_init();
+
+       return 0;
+
+out_interface:
+       scsiback_interface_exit();
+out_free:
+       kfree(pending_reqs);
+       kfree(pending_grant_handles);
+       free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+       return err;
+}
+
+/*
+ * Module teardown, currently compiled out (the matching module_exit()
+ * registration below is disabled as well).  It would unregister from
+ * xenbus, destroy the interface cache and free the tables allocated in
+ * scsiback_init().
+ */
+#if 0
+static void __exit scsiback_exit(void)
+{
+       scsiback_xenbus_unregister();
+       scsiback_interface_exit();
+       kfree(pending_reqs);
+       kfree(pending_grant_handles);
+       free_empty_pages_and_pagevec(pending_pages, (vscsiif_reqs * VSCSIIF_SG_TABLESIZE));
+
+}
+#endif
+
+/* Register scsiback_init() as the module's initialisation entry point. */
+module_init(scsiback_init);
+
+/* No exit handler is registered: scsiback_exit() is compiled out above. */
+#if 0
+module_exit(scsiback_exit);
+#endif
+
+MODULE_DESCRIPTION("Xen SCSI backend driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vscsi");
diff --git a/drivers/xen/scsiback/translate.c b/drivers/xen/scsiback/translate.c

new file mode 100644 (file)

index 0000000..c82e5b8
--- /dev/null
+++ b/drivers/xen/scsiback/translate.c
@@ -0,0 +1,168 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/gfp.h>
+
+#include "common.h"
+
+/*
+ * Set up the (initially empty) virtual-to-physical translation list of
+ * @info together with the spinlock that guards it.
+ */
+void scsiback_init_translation_table(struct vscsibk_info *info)
+{
+       spin_lock_init(&info->v2p_lock);
+       INIT_LIST_HEAD(&info->v2p_entry_lists);
+}
+
+
+/*
+ * Add a translation entry mapping the virtual address @v to @sdev.
+ *
+ * Returns 0 on success, -EEXIST if the virtual chn/tgt/lun triple is
+ * already present in the list, or -ENOMEM if the entry cannot be
+ * allocated.  The whole operation runs under info->v2p_lock, hence the
+ * GFP_ATOMIC allocation.
+ */
+int scsiback_add_translation_entry(struct vscsibk_info *info,
+                       struct scsi_device *sdev, struct ids_tuple *v)
+{
+       struct list_head *table = &info->v2p_entry_lists;
+       struct v2p_entry *cur;
+       struct v2p_entry *fresh;
+       unsigned long irq_state;
+       int rc = 0;
+
+       spin_lock_irqsave(&info->v2p_lock, irq_state);
+
+       /* Refuse to assign the same virtual ID twice. */
+       list_for_each_entry(cur, table, l) {
+               if (cur->v.chn != v->chn ||
+                   cur->v.tgt != v->tgt ||
+                   cur->v.lun != v->lun)
+                       continue;
+
+               pr_warning("scsiback: Virtual ID is already used. "
+                          "Assignment was not performed.\n");
+               rc = -EEXIST;
+               goto unlock;
+       }
+
+       /* Allocate the new entry and append it to the list. */
+       fresh = kmalloc(sizeof(struct v2p_entry), GFP_ATOMIC);
+       if (fresh == NULL) {
+               pr_err("scsiback: %s: kmalloc() error\n", __FUNCTION__);
+               rc = -ENOMEM;
+               goto unlock;
+       }
+       fresh->sdev = sdev;
+       fresh->v = *v;
+       list_add_tail(&fresh->l, table);
+
+unlock:
+       spin_unlock_irqrestore(&info->v2p_lock, irq_state);
+       return rc;
+}
+
+
+/*
+ * Delete the translation entry for the virtual address @v.
+ *
+ * On a match the reference on the entry's scsi_device is dropped, the
+ * entry is unlinked and freed, and 0 is returned.  Returns 1 when no
+ * entry matches.
+ */
+int scsiback_del_translation_entry(struct vscsibk_info *info,
+                               struct ids_tuple *v)
+{
+       struct list_head *table = &info->v2p_entry_lists;
+       struct v2p_entry *cur;
+       struct v2p_entry *match = NULL;
+       unsigned long irq_state;
+       int rc = 1;
+
+       spin_lock_irqsave(&info->v2p_lock, irq_state);
+
+       /* Locate the entry for the requested virtual ID. */
+       list_for_each_entry(cur, table, l) {
+               if (cur->v.chn == v->chn &&
+                   cur->v.tgt == v->tgt &&
+                   cur->v.lun == v->lun) {
+                       match = cur;
+                       break;
+               }
+       }
+
+       if (match != NULL) {
+               /* Release the device reference, then the entry itself. */
+               scsi_device_put(match->sdev);
+               list_del(&match->l);
+               kfree(match);
+               rc = 0;
+       }
+
+       spin_unlock_irqrestore(&info->v2p_lock, irq_state);
+       return rc;
+}
+
+
+/*
+ * Translate the virtual address @v into the physical scsi_device that
+ * backs it.  Returns NULL when no translation entry matches.
+ */
+struct scsi_device *scsiback_do_translation(struct vscsibk_info *info,
+                       struct ids_tuple *v)
+{
+       struct list_head *table = &info->v2p_entry_lists;
+       struct scsi_device *found = NULL;
+       struct v2p_entry *cur;
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&info->v2p_lock, irq_state);
+       list_for_each_entry(cur, table, l) {
+               if (cur->v.chn != v->chn ||
+                   cur->v.tgt != v->tgt ||
+                   cur->v.lun != v->lun)
+                       continue;
+
+               found = cur->sdev;
+               break;
+       }
+       spin_unlock_irqrestore(&info->v2p_lock, irq_state);
+
+       return found;
+}
+
+
+/*
+ * Release every translation entry of @info: drop the reference on each
+ * entry's scsi_device, unlink the entry and free it.
+ */
+void scsiback_release_translation_entry(struct vscsibk_info *info)
+{
+       struct list_head *table = &info->v2p_entry_lists;
+       struct v2p_entry *cur, *next;
+       unsigned long irq_state;
+
+       spin_lock_irqsave(&info->v2p_lock, irq_state);
+
+       /* The _safe iterator is needed because cur is freed in the loop. */
+       list_for_each_entry_safe(cur, next, table, l) {
+               scsi_device_put(cur->sdev);
+               list_del(&cur->l);
+               kfree(cur);
+       }
+
+       spin_unlock_irqrestore(&info->v2p_lock, irq_state);
+}
diff --git a/drivers/xen/scsiback/xenbus.c b/drivers/xen/scsiback/xenbus.c

new file mode 100644 (file)

index 0000000..55f3e5d
--- /dev/null
+++ b/drivers/xen/scsiback/xenbus.c
@@ -0,0 +1,375 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+#include <linux/kthread.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+
+#include "common.h"
+
+/* Per-device backend state; stored as the xenbus device's drvdata. */
+struct backend_info
+{
+       /* xenbus device this backend instance belongs to */
+       struct xenbus_device *dev;
+       /* interface state obtained from vscsibk_info_alloc(); NULL if none */
+       struct vscsibk_info *info;
+};
+
+
+/*
+ * Build the worker thread name "vscsi.<domid>.<id>" for this backend.
+ *
+ * <id> is parsed from the xenbus node name, which is expected to look
+ * like "backend/vscsi/<domid>/<id>"; <domid> is taken from
+ * be->info->domid.  buf must provide at least TASK_COMM_LEN bytes.
+ *
+ * Returns 0 on success, or -EINVAL if the node name does not match the
+ * expected form (in which case buf is left untouched, so an unparsed id
+ * is never printed).
+ */
+static int __vscsiif_name(struct backend_info *be, char *buf)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned int domid, id;
+
+       /* both fields must convert, otherwise id would be uninitialised */
+       if (sscanf(dev->nodename, "backend/vscsi/%u/%u", &domid, &id) != 2)
+               return -EINVAL;
+
+       snprintf(buf, TASK_COMM_LEN, "vscsi.%u.%u", be->info->domid, id);
+
+       return 0;
+}
+
+/*
+ * Connect to the frontend's ring and start the request worker thread.
+ *
+ * Reads "ring-ref" and "event-channel" from the frontend's xenstore
+ * directory, passes them to scsiback_init_sring(), then starts a kthread
+ * running scsiback_schedule() under the name built by __vscsiif_name().
+ *
+ * Returns 0 on success, otherwise the error of the step that failed.
+ */
+static int scsiback_map(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       struct task_struct *task;
+       char name[TASK_COMM_LEN];
+       unsigned int ring_ref, evtchn;
+       int err;
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                       "ring-ref", "%u", &ring_ref,
+                       "event-channel", "%u", &evtchn, NULL);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend);
+               return err;
+       }
+
+       err = scsiback_init_sring(be->info, ring_ref, evtchn);
+       if (err)
+               return err;
+
+       err = __vscsiif_name(be, name);
+       if (err) {
+               xenbus_dev_error(dev, err, "get scsiback dev name");
+               return err;
+       }
+
+       task = kthread_run(scsiback_schedule, be->info, name);
+       if (IS_ERR(task)) {
+               /* never leave an ERR_PTR in info->kthread */
+               err = PTR_ERR(task);
+               be->info->kthread = NULL;
+               xenbus_dev_error(dev, err, "start vscsiif");
+               return err;
+       }
+       be->info->kthread = task;
+
+       return 0;
+}
+
+
+/*
+ * Look up the physical SCSI device addressed by *phy (host:chn:tgt:lun).
+ *
+ * Returns the device found by scsi_device_lookup(), or NULL if either
+ * the host or the device does not exist.  Callers in this file release
+ * the returned device with scsi_device_put().  The host reference taken
+ * for the lookup is dropped before returning.
+ */
+struct scsi_device *scsiback_get_scsi_device(struct ids_tuple *phy)
+{
+       struct Scsi_Host *shost;
+       struct scsi_device *sdev;
+
+       /*
+        * scsi_host_lookup() reports a missing host as NULL on current
+        * kernels (older ones returned an ERR_PTR), so test for both;
+        * an IS_ERR()-only check would pass NULL on to
+        * scsi_device_lookup().
+        */
+       shost = scsi_host_lookup(phy->hst);
+       if (IS_ERR_OR_NULL(shost)) {
+               pr_err("scsiback: host%d doesn't exist\n", phy->hst);
+               return NULL;
+       }
+
+       sdev = scsi_device_lookup(shost, phy->chn, phy->tgt, phy->lun);
+       if (!sdev)
+               pr_err("scsiback: %d:%d:%d:%d doesn't exist\n",
+                      phy->hst, phy->chn, phy->tgt, phy->lun);
+
+       scsi_host_put(shost);
+       return sdev;
+}
+
+#define VSCSIBACK_OP_ADD_OR_DEL_LUN    1
+#define VSCSIBACK_OP_UPDATEDEV_STATE   2
+
+
+/*
+ * Walk the "vscsi-devs" directory below this backend's xenstore node and
+ * act on every LUN entry according to op:
+ *
+ *  VSCSIBACK_OP_ADD_OR_DEL_LUN
+ *     - entry in state Initialising: look up the physical device named
+ *       by "p-dev", add a translation for "v-dev" and set the entry to
+ *       Initialised; on failure the entry is set to Closed.
+ *     - entry in state Closing: delete the translation for "v-dev" and,
+ *       if one was deleted, set the entry to Closed.
+ *  VSCSIBACK_OP_UPDATEDEV_STATE
+ *     - entry in state Initialised: set it to Connected; if that write
+ *       fails the translation is deleted and the entry is set to Closed.
+ *
+ * Entries whose "state" cannot be read are skipped.  Entries whose
+ * "p-dev" or "v-dev" does not yield all four host:chn:tgt:lun fields are
+ * set to Closed, so a partially parsed tuple is never used.
+ */
+static void scsiback_do_lun_hotplug(struct backend_info *be, int op)
+{
+       unsigned int i, dir_n = 0;
+       unsigned int device_state;
+       int err = 0;
+       struct ids_tuple phy, vir;
+       char str[64], state_str[64];
+       char **dir;
+       struct xenbus_device *dev = be->dev;
+       struct scsi_device *sdev;
+
+       dir = xenbus_directory(XBT_NIL, dev->nodename, "vscsi-devs", &dir_n);
+       if (IS_ERR(dir))
+               return;
+
+       for (i = 0; i < dir_n; i++) {
+               /*
+                * read status: xenbus_scanf() returns the number of
+                * converted fields or a negative errno, so anything other
+                * than 1 means device_state was not filled in.
+                */
+               snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->nodename, state_str, "%u",
+                       &device_state);
+               if (err != 1)
+                       continue;
+
+               /* physical SCSI device: all four fields are required */
+               snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->nodename, str,
+                       "%u:%u:%u:%u", &phy.hst, &phy.chn, &phy.tgt, &phy.lun);
+               if (err != 4) {
+                       xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                       "%d", XenbusStateClosed);
+                       continue;
+               }
+
+               /* virtual SCSI device: all four fields are required */
+               snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->nodename, str,
+                       "%u:%u:%u:%u", &vir.hst, &vir.chn, &vir.tgt, &vir.lun);
+               if (err != 4) {
+                       xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                       "%d", XenbusStateClosed);
+                       continue;
+               }
+
+               switch (op) {
+               case VSCSIBACK_OP_ADD_OR_DEL_LUN:
+                       if (device_state == XenbusStateInitialising) {
+                               sdev = scsiback_get_scsi_device(&phy);
+                               if (!sdev)
+                                       xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                           "%d", XenbusStateClosed);
+                               else {
+                                       err = scsiback_add_translation_entry(be->info, sdev, &vir);
+                                       if (!err) {
+                                               /* undo the translation if the state cannot be published */
+                                               if (xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                                   "%d", XenbusStateInitialised)) {
+                                                       pr_err("scsiback: xenbus_printf error %s\n",
+                                                              state_str);
+                                                       scsiback_del_translation_entry(be->info, &vir);
+                                               }
+                                       } else {
+                                               /* entry not added: drop the lookup reference ourselves */
+                                               scsi_device_put(sdev);
+                                               xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                                   "%d", XenbusStateClosed);
+                                       }
+                               }
+                       }
+
+                       if (device_state == XenbusStateClosing) {
+                               if (!scsiback_del_translation_entry(be->info, &vir)) {
+                                       if (xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                           "%d", XenbusStateClosed))
+                                               pr_err("scsiback: xenbus_printf error %s\n",
+                                                      state_str);
+                               }
+                       }
+                       break;
+
+               case VSCSIBACK_OP_UPDATEDEV_STATE:
+                       if (device_state == XenbusStateInitialised) {
+                               /* modify vscsi-devs/dev-x/state */
+                               if (xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                   "%d", XenbusStateConnected)) {
+                                       pr_err("scsiback: xenbus_printf error %s\n",
+                                              state_str);
+                                       scsiback_del_translation_entry(be->info, &vir);
+                                       xenbus_printf(XBT_NIL, dev->nodename, state_str,
+                                                           "%d", XenbusStateClosed);
+                               }
+                       }
+                       break;
+               /* Further operations can be handled here when needed. */
+               default:
+                       break;
+               }
+       }
+
+       kfree(dir);
+}
+
+
+/*
+ * xenbus otherend_changed callback: react to the frontend entering
+ * frontend_state.
+ */
+static void scsiback_frontend_changed(struct xenbus_device *dev,
+                                       enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               /* no action */
+               break;
+
+       case XenbusStateInitialised:
+               /* map the ring and start the worker; do nothing more on failure */
+               if (scsiback_map(be) != 0)
+                       break;
+
+               scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateConnected:
+               scsiback_do_lun_hotplug(be, VSCSIBACK_OP_UPDATEDEV_STATE);
+
+               /* only switch if we are not in Connected already */
+               if (dev->state != XenbusStateConnected)
+                       xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               scsiback_disconnect(be->info);
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               device_unregister(&dev->dev);
+               break;
+
+       case XenbusStateReconfiguring:
+               /* LUNs were added or removed while connected */
+               scsiback_do_lun_hotplug(be, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateReconfigured);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                       frontend_state);
+               break;
+       }
+}
+
+
+/*
+ * xenbus remove callback (also used as the probe failure path): tear
+ * down the interface, if one was created, and free the backend state.
+ * Always returns 0.
+ */
+static int scsiback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       struct vscsibk_info *info = be->info;
+
+       if (info != NULL) {
+               scsiback_disconnect(info);
+               scsiback_release_translation_entry(info);
+               scsiback_free(info);
+               be->info = NULL;
+       }
+
+       kfree(be);
+       dev_set_drvdata(&dev->dev, NULL);
+
+       return 0;
+}
+
+
+/*
+ * xenbus probe callback: allocate and initialise the backend state for
+ * dev, read the optional "feature-host" key and switch to InitWait.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone
+ * through scsiback_remove() and a negative errno is returned.
+ */
+static int scsiback_probe(struct xenbus_device *dev,
+                          const struct xenbus_device_id *id)
+{
+       int err;
+       unsigned int val = 0;
+       struct backend_info *be = kzalloc(sizeof(*be), GFP_KERNEL);
+
+       DPRINTK("%p %d\n", dev, dev->otherend_id);
+
+       if (!be) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+
+       be->info = vscsibk_info_alloc(dev->otherend_id);
+       if (IS_ERR(be->info)) {
+               err = PTR_ERR(be->info);
+               /* scsiback_remove() must not see the ERR_PTR */
+               be->info = NULL;
+               xenbus_dev_fatal(dev, err, "creating scsihost interface");
+               goto fail;
+       }
+
+       be->info->dev = dev;
+       be->info->irq = 0;
+       be->info->feature = 0;  /* default: not HOSTMODE */
+
+       scsiback_init_translation_table(be->info);
+
+       /* val is unsigned, so it is scanned with %u rather than %d */
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                               "feature-host", "%u", &val);
+       if (XENBUS_EXIST_ERR(err))
+               val = 0;
+
+       if (val)
+               be->info->feature = VSCSI_TYPE_HOST;
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       pr_warning("scsiback: %s failed\n", __func__);
+       scsiback_remove(dev);
+
+       return err;
+}
+
+
+/* xenbus device types handled by this driver; "" terminates the list. */
+static const struct xenbus_device_id scsiback_ids[] = {
+       { "vscsi" },
+       { "" }
+};
+
+/* Driver definition tying the callbacks in this file together. */
+static DEFINE_XENBUS_DRIVER(scsiback, ,
+       .probe                  = scsiback_probe,
+       .remove                 = scsiback_remove,
+       .otherend_changed       = scsiback_frontend_changed
+);
+
+/* Register scsiback_driver as a xenbus backend; returns the registration result. */
+int __init scsiback_xenbus_init(void)
+{
+       return xenbus_register_backend(&scsiback_driver);
+}
+
+/* Unregister scsiback_driver from xenbus. */
+void __exit scsiback_xenbus_unregister(void)
+{
+       xenbus_unregister_driver(&scsiback_driver);
+}
diff --git a/drivers/xen/scsifront/Makefile b/drivers/xen/scsifront/Makefile

new file mode 100644 (file)

index 0000000..58ee185
--- /dev/null
+++ b/drivers/xen/scsifront/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_XEN_SCSI_FRONTEND)        := xenscsi.o
+xenscsi-objs := scsifront.o xenbus.o
diff --git a/drivers/xen/scsifront/common.h b/drivers/xen/scsifront/common.h

new file mode 100644 (file)

index 0000000..51afd68
--- /dev/null
+++ b/drivers/xen/scsifront/common.h
@@ -0,0 +1,134 @@
+/*
+ * Xen SCSI frontend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_DRIVERS_SCSIFRONT_H__
+#define __XEN_DRIVERS_SCSIFRONT_H__
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+#include <xen/evtchn.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/vscsiif.h>
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/protocols.h>
+#include <asm/delay.h>
+#include <asm/hypervisor.h>
+#include <asm/maddr.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#define VSCSI_IN_ABORT         1
+#define VSCSI_IN_RESET         2
+
+/* tuning point*/
+#define VSCSIIF_DEFAULT_CMD_PER_LUN 10
+#define VSCSIIF_MAX_TARGET          64
+#define VSCSIIF_MAX_LUN             255
+
+#define VSCSIIF_RING_SIZE      __CONST_RING_SIZE(vscsiif, PAGE_SIZE)
+#define VSCSIIF_MAX_REQS       VSCSIIF_RING_SIZE
+
+/*
+ * Frontend-private bookkeeping for one request id; one entry exists per
+ * ring slot (see vscsifrnt_info.shadow[]).
+ */
+struct vscsifrnt_shadow {
+       /* free-list link: index of the next free shadow entry */
+       uint16_t next_free;
+       
+       /* command between backend and frontend
+        * VSCSIIF_ACT_SCSI_CDB or VSCSIIF_ACT_SCSI_RESET */
+       unsigned char act;
+       
+       /* do reset function */
+       wait_queue_head_t wq_reset;     /* reset work queue           */
+       int wait_reset;                 /* reset work queue condition */
+       int32_t rslt_reset;             /* reset response status      */
+                                       /* (SUCCESS or FAILED)        */
+
+       /* for DMA_TO_DEVICE(1), DMA_FROM_DEVICE(2), DMA_NONE(3)
+          requests */
+       unsigned int sc_data_direction;
+       
+       /* Number of pieces of scatter-gather */
+       unsigned int nr_segments;
+
+       /* the originating struct scsi_cmnd pointer, stored as an integer;
+        * 0 while the entry is on the free list */
+       unsigned long req_scsi_cmnd;
+       /* grant references handed to the backend, one per segment */
+       int gref[VSCSIIF_SG_TABLESIZE];
+};
+
+/* Per-host frontend state; lives in the Scsi_Host private area (shost_priv). */
+struct vscsifrnt_info {
+       struct xenbus_device *dev;
+
+       struct Scsi_Host *host;
+
+       /* held while responses are consumed from the ring */
+       spinlock_t io_lock;
+       /* protects the shadow free list and the reset result fields */
+       spinlock_t shadow_lock;
+       unsigned int evtchn;
+       /* irq used to notify the backend and to receive its notifications */
+       unsigned int irq;
+
+       grant_ref_t ring_ref;
+       struct vscsiif_front_ring ring;
+       struct vscsiif_response ring_res;
+
+       /* one shadow entry per possible request id */
+       struct vscsifrnt_shadow shadow[VSCSIIF_MAX_REQS];
+       /* index of the first free shadow entry (head of the free list) */
+       uint32_t shadow_free;
+
+       /* thread running scsifront_schedule() -- presumably; TODO confirm at creation site */
+       struct task_struct *kthread;
+       /* the response thread sleeps here until waiting_resp is set */
+       wait_queue_head_t wq;
+       unsigned int waiting_resp;
+
+};
+
+/* Debug print via pr_debug(), prefixed with the source file and line. */
+#define DPRINTK(_f, _a...)                             \
+       pr_debug("(file=%s, line=%d) " _f,      \
+                __FILE__ , __LINE__ , ## _a )
+
+int scsifront_xenbus_init(void);
+void scsifront_xenbus_unregister(void);
+int scsifront_schedule(void *data);
+irqreturn_t scsifront_intr(int irq, void *dev_id);
+int scsifront_cmd_done(struct vscsifrnt_info *info);
+
+
+#endif /* __XEN_DRIVERS_SCSIFRONT_H__  */
diff --git a/drivers/xen/scsifront/scsifront.c b/drivers/xen/scsifront/scsifront.c

new file mode 100644 (file)

index 0000000..9d19419
--- /dev/null
+++ b/drivers/xen/scsifront/scsifront.c
@@ -0,0 +1,478 @@
+/*
+ * Xen SCSI frontend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+ 
+
+#include <linux/version.h>
+#include "common.h"
+
+/*
+ * Take the first entry off the shadow free list and return its index.
+ *
+ * Under info->shadow_lock the list head is advanced to the entry's
+ * next_free link, the link itself is overwritten with 0x0fff and the
+ * entry's wait_reset flag is cleared.
+ *
+ * info->shadow[] has VSCSIIF_MAX_REQS entries, so a head index that is
+ * not strictly below that bound is a BUG.
+ */
+static int get_id_from_freelist(struct vscsifrnt_info *info)
+{
+       unsigned long flags;
+       uint32_t free;
+
+       spin_lock_irqsave(&info->shadow_lock, flags);
+
+       free = info->shadow_free;
+       /* valid indices are 0 .. VSCSIIF_MAX_REQS - 1 */
+       BUG_ON(free >= VSCSIIF_MAX_REQS);
+       info->shadow_free = info->shadow[free].next_free;
+       info->shadow[free].next_free = 0x0fff;
+
+       info->shadow[free].wait_reset = 0;
+
+       spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+       return free;
+}
+
+/*
+ * Return shadow entry id to the head of the free list and clear its
+ * stored command pointer.  Runs under info->shadow_lock.
+ */
+static void add_id_to_freelist(struct vscsifrnt_info *info, uint32_t id)
+{
+       struct vscsifrnt_shadow *slot = &info->shadow[id];
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&info->shadow_lock, irqflags);
+
+       slot->next_free = info->shadow_free;
+       slot->req_scsi_cmnd = 0;
+       info->shadow_free = id;
+
+       spin_unlock_irqrestore(&info->shadow_lock, irqflags);
+}
+
+
+/*
+ * Reserve the next private ring slot and a shadow id for a new request.
+ *
+ * Advances ring.req_prod_pvt, stores the id from get_id_from_freelist()
+ * in the request's rqid field and returns the request for the caller to
+ * fill in.  The request is not pushed to the backend here.
+ */
+struct vscsiif_request * scsifront_pre_request(struct vscsifrnt_info *info)
+{
+       struct vscsiif_front_ring *ring = &info->ring;
+       vscsiif_request_t *req;
+
+       req = RING_GET_REQUEST(ring, ring->req_prod_pvt);
+       ring->req_prod_pvt++;
+
+       /* the response carries this id back to us */
+       req->rqid = (uint16_t)get_id_from_freelist(info);
+
+       return req;
+}
+
+
+/* Flag that responses may be pending and wake whoever sleeps on info->wq. */
+static void scsifront_notify_work(struct vscsifrnt_info *info)
+{
+       info->waiting_resp = 1;
+       wake_up(&info->wq);
+}
+
+
+/*
+ * Publish all privately queued requests on the shared ring and notify
+ * the backend through info->irq if the ring macro says it is needed.
+ */
+static void scsifront_do_request(struct vscsifrnt_info *info)
+{
+       int need_kick;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, need_kick);
+       if (!need_kick)
+               return;
+
+       notify_remote_via_irq(info->irq);
+}
+
+/*
+ * Interrupt handler: dev_id is the struct vscsifrnt_info of the host.
+ * Only flags pending work and wakes the waiter; always IRQ_HANDLED.
+ */
+irqreturn_t scsifront_intr(int irq, void *dev_id)
+{
+       struct vscsifrnt_info *info = dev_id;
+
+       scsifront_notify_work(info);
+       return IRQ_HANDLED;
+}
+
+
+/*
+ * End foreign access for every grant reference recorded in shadow entry
+ * s.  Nothing is done for DMA_NONE requests.  A grant that is still in
+ * use by the backend is treated as a BUG.  The id argument is unused.
+ */
+static void scsifront_gnttab_done(struct vscsifrnt_shadow *s, uint32_t id)
+{
+       int seg;
+
+       if (s->sc_data_direction == DMA_NONE)
+               return;
+
+       for (seg = 0; seg < s->nr_segments; seg++) {
+               int gref = s->gref[seg];
+
+               if (unlikely(gnttab_query_foreign_access(gref) != 0)) {
+                       pr_alert("scsifront: "
+                                "grant still in use by backend\n");
+                       BUG();
+               }
+               gnttab_end_foreign_access(gref, 0UL);
+       }
+}
+
+
+/*
+ * Complete a CDB request from its ring response.
+ *
+ * Releases the request's grants and shadow id, copies result, residual
+ * length and (bounded) sense data into the scsi_cmnd recorded in the
+ * shadow entry, then calls the command's scsi_done().  A response whose
+ * shadow entry holds no command is a BUG.
+ */
+static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info,
+                      vscsiif_response_t *ring_res)
+{
+       uint32_t rqid = ring_res->rqid;
+       struct vscsifrnt_shadow *shadow = &info->shadow[rqid];
+       struct scsi_cmnd *sc = (struct scsi_cmnd *)shadow->req_scsi_cmnd;
+       uint8_t copy_len;
+
+       BUG_ON(sc == NULL);
+
+       scsifront_gnttab_done(shadow, rqid);
+       add_id_to_freelist(info, rqid);
+
+       sc->result = ring_res->rslt;
+       scsi_set_resid(sc, ring_res->residual_len);
+
+       /* clamp the reported sense length to VSCSIIF_SENSE_BUFFERSIZE */
+       copy_len = (ring_res->sense_len > VSCSIIF_SENSE_BUFFERSIZE) ?
+                       VSCSIIF_SENSE_BUFFERSIZE : ring_res->sense_len;
+
+       if (copy_len)
+               memcpy(sc->sense_buffer, ring_res->sense_buffer, copy_len);
+
+       sc->scsi_done(sc);
+}
+
+
+/*
+ * Complete a synchronous (non-CDB) request: record the result in the
+ * shadow entry, set its wait_reset condition under info->shadow_lock
+ * and wake the waiter on wq_reset.
+ */
+static void scsifront_sync_cmd_done(struct vscsifrnt_info *info,
+                               vscsiif_response_t *ring_res)
+{
+       struct vscsifrnt_shadow *shadow = &info->shadow[ring_res->rqid];
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&info->shadow_lock, irqflags);
+       shadow->wait_reset = 1;
+       shadow->rslt_reset = ring_res->rslt;
+       spin_unlock_irqrestore(&info->shadow_lock, irqflags);
+
+       wake_up(&shadow->wq_reset);
+}
+
+
+/*
+ * Consume all responses currently on the ring.
+ *
+ * Under info->io_lock, each response between rsp_cons and the producer
+ * index sampled at entry is dispatched by the action recorded in its
+ * shadow entry: CDB requests to scsifront_cdb_cmd_done(), everything
+ * else to scsifront_sync_cmd_done().
+ *
+ * Returns the more_to_do value set by RING_FINAL_CHECK_FOR_RESPONSES(),
+ * or 0 when no requests remain outstanding.
+ */
+int scsifront_cmd_done(struct vscsifrnt_info *info)
+{
+       vscsiif_response_t *ring_res;
+
+       RING_IDX i, rp;
+       int more_to_do = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->io_lock, flags);
+
+       /* sample the producer index, then order the response reads after it */
+       rp = info->ring.sring->rsp_prod;
+       rmb();
+       for (i = info->ring.rsp_cons; i != rp; i++) {
+               
+               ring_res = RING_GET_RESPONSE(&info->ring, i);
+
+               /* NOTE(review): rqid comes from the response and indexes
+                * shadow[] without a range check -- confirm this is intended */
+               if (info->shadow[ring_res->rqid].act == VSCSIIF_ACT_SCSI_CDB)
+                       scsifront_cdb_cmd_done(info, ring_res);
+               else
+                       scsifront_sync_cmd_done(info, ring_res);
+       }
+
+       info->ring.rsp_cons = i;
+
+       /* requests still outstanding: re-check the ring; otherwise just
+        * ask to be notified about the next response */
+       if (i != info->ring.req_prod_pvt) {
+               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+       } else {
+               info->ring.sring->rsp_event = i + 1;
+       }
+
+       spin_unlock_irqrestore(&info->io_lock, flags);
+
+
+       /* Yield point for this unbounded loop. */
+       cond_resched();
+
+       return more_to_do;
+}
+
+
+
+
+/*
+ * Response-processing thread body; data is the struct vscsifrnt_info.
+ *
+ * Sleeps on info->wq until waiting_resp is set (or the thread is asked
+ * to stop), clears the flag and processes responses.  If
+ * scsifront_cmd_done() reports more work, the flag is set again so the
+ * next iteration does not sleep.  Returns 0 when stopped.
+ */
+int scsifront_schedule(void *data)
+{
+       struct vscsifrnt_info *info = (struct vscsifrnt_info *)data;
+
+       while (!kthread_should_stop()) {
+               wait_event_interruptible(
+                       info->wq,
+                       info->waiting_resp || kthread_should_stop());
+
+               /* clear the flag before looking at the ring (barrier below) */
+               info->waiting_resp = 0;
+               smp_mb();
+
+               if (scsifront_cmd_done(info))
+                       info->waiting_resp = 1;
+       }
+
+       return 0;
+}
+
+
+
+/*
+ * Grant the backend access to the data buffer of sc and describe it in
+ * ring_req->seg[]; the grant references are also recorded in
+ * info->shadow[id].gref[].
+ *
+ * Returns the number of segments set up (0 for DMA_NONE or an empty
+ * buffer), -ENOMEM if no grant references could be allocated, or -E2BIG
+ * if the buffer needs more than VSCSIIF_SG_TABLESIZE pages.
+ */
+static int map_data_for_request(struct vscsifrnt_info *info,
+               struct scsi_cmnd *sc, vscsiif_request_t *ring_req, uint32_t id)
+{
+       grant_ref_t gref_head;
+       struct page *page;
+       int err, ref, ref_cnt = 0;
+       int write = (sc->sc_data_direction == DMA_TO_DEVICE);
+       unsigned int i, nr_pages, off, len, bytes;
+       unsigned long buffer_pfn;
+
+       if (sc->sc_data_direction == DMA_NONE)
+               return 0;
+
+       /* reserve the maximum up front; unused references are returned at big_to_sg */
+       err = gnttab_alloc_grant_references(VSCSIIF_SG_TABLESIZE, &gref_head);
+       if (err) {
+               pr_err("scsifront: gnttab_alloc_grant_references() error\n");
+               return -ENOMEM;
+       }
+
+       if (scsi_bufflen(sc)) {
+               /* quoted scsi_lib.c/scsi_req_map_sg . */
+               struct scatterlist *sg, *sgl = scsi_sglist(sc);
+               unsigned int data_len = scsi_bufflen(sc);
+
+               /* page estimate based on the first entry's offset only */
+               nr_pages = (data_len + sgl->offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+               if (nr_pages > VSCSIIF_SG_TABLESIZE) {
+                       pr_err("scsifront: Unable to map request_buffer for command!\n");
+                       ref_cnt = (-E2BIG);
+                       goto big_to_sg;
+               }
+
+               for_each_sg (sgl, sg, scsi_sg_count(sc), i) {
+                       page = sg_page(sg);
+                       off = sg->offset;
+                       len = sg->length;
+
+                       buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
+
+                       /* one grant/segment per page touched by this sg entry */
+                       while (len > 0 && data_len > 0) {
+                               /*
+                                * sg sends a scatterlist that is larger than
+                                * the data_len it wants transferred for certain
+                                * IO sizes
+                                */
+                               bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+                               bytes = min(bytes, data_len);
+                               
+                               ref = gnttab_claim_grant_reference(&gref_head);
+                               BUG_ON(ref == -ENOSPC);
+
+                               gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
+                                       buffer_pfn, write);
+
+                               /* NOTE(review): ref_cnt is not re-checked against
+                                * VSCSIIF_SG_TABLESIZE inside this loop -- confirm the
+                                * nr_pages test above is sufficient */
+                               info->shadow[id].gref[ref_cnt]  = ref;
+                               ring_req->seg[ref_cnt].gref     = ref;
+                               ring_req->seg[ref_cnt].offset   = (uint16_t)off;
+                               ring_req->seg[ref_cnt].length   = (uint16_t)bytes;
+
+                               buffer_pfn++;
+                               len -= bytes;
+                               data_len -= bytes;
+                               off = 0;
+                               ref_cnt++;
+                       }
+               }
+       }
+
+big_to_sg:
+
+       gnttab_free_grant_references(gref_head);
+
+       return ref_cnt;
+}
+
+/*
+ * queuecommand entry point: place one SCSI CDB request on the ring.
+ *
+ * Runs with shost->host_lock held from the ring-full check until the
+ * request has been pushed (or abandoned).
+ *
+ * Returns SCSI_MLQUEUE_HOST_BUSY if the ring is full or no grant
+ * references are available.  Returns 0 otherwise; if the data buffer
+ * could not be mapped for another reason the command is completed
+ * immediately with DID_ERROR.
+ */
+static int scsifront_queuecommand(struct Scsi_Host *shost,
+                                 struct scsi_cmnd *sc)
+{
+       struct vscsifrnt_info *info = shost_priv(shost);
+       vscsiif_request_t *ring_req;
+       unsigned long flags;
+       int ref_cnt;
+       uint16_t rqid;
+
+/* debug printk to identify more missing scsi commands
+       printk(KERN_INFO "scsicmd: len=%i, 0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x,0x%x",sc->cmd_len,
+               sc->cmnd[0],sc->cmnd[1],sc->cmnd[2],sc->cmnd[3],sc->cmnd[4],
+               sc->cmnd[5],sc->cmnd[6],sc->cmnd[7],sc->cmnd[8],sc->cmnd[9]);
+*/
+       spin_lock_irqsave(shost->host_lock, flags);
+       scsi_cmd_get_serial(shost, sc);
+       if (RING_FULL(&info->ring)) {
+               spin_unlock_irqrestore(shost->host_lock, flags);
+               return SCSI_MLQUEUE_HOST_BUSY;
+       }
+
+       sc->result    = 0;
+
+       /* reserves a ring slot (req_prod_pvt++) and a shadow id */
+       ring_req          = scsifront_pre_request(info);
+       rqid              = ring_req->rqid;
+       ring_req->act     = VSCSIIF_ACT_SCSI_CDB;
+
+       ring_req->id      = sc->device->id;
+       ring_req->lun     = sc->device->lun;
+       ring_req->channel = sc->device->channel;
+       ring_req->cmd_len = sc->cmd_len;
+
+       BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
+
+       if ( sc->cmd_len )
+               memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+       else
+               memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
+
+       ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
+       ring_req->timeout_per_command = (sc->request->timeout / HZ);
+
+       info->shadow[rqid].req_scsi_cmnd     = (unsigned long)sc;
+       info->shadow[rqid].sc_data_direction = sc->sc_data_direction;
+       info->shadow[rqid].act               = ring_req->act;
+
+       ref_cnt = map_data_for_request(info, sc, ring_req, rqid);
+       if (ref_cnt < 0) {
+               /*
+                * Nothing will be sent for this command, so give back both
+                * the ring slot and the shadow id reserved above.  Without
+                * stepping req_prod_pvt back, the half-built request would
+                * be pushed to the backend together with the next one.
+                * host_lock has been held since the slot was taken, so the
+                * slot being returned is the most recent one.
+                */
+               info->ring.req_prod_pvt--;
+               add_id_to_freelist(info, rqid);
+               spin_unlock_irqrestore(shost->host_lock, flags);
+               if (ref_cnt == (-ENOMEM))
+                       return SCSI_MLQUEUE_HOST_BUSY;
+               sc->result = (DID_ERROR << 16);
+               sc->scsi_done(sc);
+               return 0;
+       }
+
+       ring_req->nr_segments          = (uint8_t)ref_cnt;
+       info->shadow[rqid].nr_segments = ref_cnt;
+
+       scsifront_do_request(info);
+       spin_unlock_irqrestore(shost->host_lock, flags);
+
+       return 0;
+}
+
+
+/* Abort is not implemented: every abort request is answered with FAILED. */
+static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
+{
+       return FAILED;
+}
+
+/*
+ * Device reset handler.  vscsi supports only device reset, because each
+ * virtual device corresponds to a single LUN.
+ *
+ * Queues a VSCSIIF_ACT_SCSI_RESET request for the device of sc, sleeps
+ * (with host_lock dropped) until the response handler sets wait_reset in
+ * the shadow entry, then returns the result the backend reported.
+ */
+static int scsifront_dev_reset_handler(struct scsi_cmnd *sc)
+{
+       struct Scsi_Host *host = sc->device->host;
+       struct vscsifrnt_info *info = shost_priv(host);
+
+       vscsiif_request_t *ring_req;
+       uint16_t rqid;
+       int err;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
+       spin_lock_irq(host->host_lock);
+#endif
+
+       /* NOTE(review): unlike scsifront_queuecommand() there is no
+        * RING_FULL() check before taking a slot -- confirm */
+       ring_req      = scsifront_pre_request(info);
+       ring_req->act = VSCSIIF_ACT_SCSI_RESET;
+
+       rqid          = ring_req->rqid;
+       info->shadow[rqid].act = VSCSIIF_ACT_SCSI_RESET;
+
+       ring_req->channel = sc->device->channel;
+       ring_req->id      = sc->device->id;
+       ring_req->lun     = sc->device->lun;
+       ring_req->cmd_len = sc->cmd_len;
+
+       if ( sc->cmd_len )
+               memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+       else
+               memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
+
+       ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
+       ring_req->timeout_per_command = (sc->request->timeout / HZ);
+       ring_req->nr_segments         = 0;
+
+       scsifront_do_request(info);     
+
+       /* drop host_lock while sleeping; it is re-taken before the result is read */
+       spin_unlock_irq(host->host_lock);
+       /* NOTE(review): the return value is ignored, so an interrupted
+        * wait would read rslt_reset and free the id before the backend
+        * has answered -- confirm whether that can happen here */
+       wait_event_interruptible(info->shadow[rqid].wq_reset,
+                        info->shadow[rqid].wait_reset);
+       spin_lock_irq(host->host_lock);
+
+       err = info->shadow[rqid].rslt_reset;
+
+       add_id_to_freelist(info, rqid);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
+       spin_unlock_irq(host->host_lock);
+#endif
+       return (err);
+}
+
+
+/* SCSI host template for the frontend; referenced from xenbus.c via extern. */
+struct scsi_host_template scsifront_sht = {
+       .module                 = THIS_MODULE,
+       .name                   = "Xen SCSI frontend driver",
+       .queuecommand           = scsifront_queuecommand,
+       .eh_abort_handler       = scsifront_eh_abort_handler,
+       .eh_device_reset_handler= scsifront_dev_reset_handler,
+       .cmd_per_lun            = VSCSIIF_DEFAULT_CMD_PER_LUN,
+       /* one outstanding command per ring slot */
+       .can_queue              = VSCSIIF_MAX_REQS,
+       .this_id                = -1,
+       /* matches the size of the per-request seg[]/gref[] arrays */
+       .sg_tablesize           = VSCSIIF_SG_TABLESIZE,
+       .use_clustering         = DISABLE_CLUSTERING,
+       .proc_name              = "scsifront",
+};
+
+
+/*
+ * Module init: refuse to load outside Xen (-ENODEV), otherwise return
+ * the result of registering with xenbus.
+ */
+static int __init scsifront_init(void)
+{
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       return scsifront_xenbus_init();
+}
+
+/* Module exit: undo the xenbus registration done in scsifront_init(). */
+static void __exit scsifront_exit(void)
+{
+       scsifront_xenbus_unregister();
+}
+
+module_init(scsifront_init);
+module_exit(scsifront_exit);
+
+MODULE_DESCRIPTION("Xen SCSI frontend driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/scsifront/xenbus.c b/drivers/xen/scsifront/xenbus.c

new file mode 100644 (file)

index 0000000..4d01d1b
--- /dev/null
+++ b/drivers/xen/scsifront/xenbus.c
@@ -0,0 +1,424 @@
+/*
+ * Xen SCSI frontend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/*
+* Patched to support >2TB drives
+* 2010, Samuel Kvasnica, IMS Nanofabrication AG
+*/
+
+#include <linux/version.h>
+#include <linux/slab.h>
+#include "common.h"
+
+/*
+ * Size of the buffer that holds the worker thread name in
+ * scsifront_probe(): a fixed 16 bytes on kernels older than 2.6.11,
+ * TASK_COMM_LEN on everything newer.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
+  #define DEFAULT_TASK_COMM_LEN        16
+#else
+  #define DEFAULT_TASK_COMM_LEN        TASK_COMM_LEN
+#endif
+
+/* Host template; the definition lives in another source file of this driver. */
+extern struct scsi_host_template scsifront_sht;
+
+/*
+ * Tear down a frontend instance: remove the SCSI host unless it is
+ * already in the SHOST_DEL state, end the backend's grant on the shared
+ * ring, unbind the interrupt handler and finally drop one reference on
+ * the SCSI host.
+ *
+ * NOTE(review): this function performs scsi_host_put() itself, so a
+ * caller that also puts the host after calling it (directly or through a
+ * helper that calls it) drops the reference twice - check the probe
+ * error paths.
+ */
+static void scsifront_free(struct vscsifrnt_info *info)
+{
+       struct Scsi_Host *host = info->host;
+
+       /*
+        * The host state is compared directly on kernels >= 2.6.14 and
+        * queried with test_bit() on older ones; both variants open the
+        * same block, which is closed after the #endif.
+        */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,14)
+       if (host->shost_state != SHOST_DEL) {
+#else
+       if (!test_bit(SHOST_DEL, &host->shost_state)) {
+#endif
+               scsi_remove_host(info->host);
+       }
+
+       /* Only end the grant if a ring was actually granted. */
+       if (info->ring_ref != GRANT_INVALID_REF) {
+               gnttab_end_foreign_access(info->ring_ref,
+                                       (unsigned long)info->ring.sring);
+               info->ring_ref = GRANT_INVALID_REF;
+               info->ring.sring = NULL;
+       }
+
+       /* An irq value of 0 means no handler is bound. */
+       if (info->irq)
+               unbind_from_irqhandler(info->irq, info);
+       info->irq = 0;
+
+       /* info lives in the host's hostdata; do not touch it after this put. */
+       scsi_host_put(info->host);
+}
+
+
+/*
+ * Allocate the shared request ring, grant the backend access to it and
+ * bind the interrupt handler for the backend's event channel.
+ *
+ * On success info->ring, info->ring_ref and info->irq are valid and 0 is
+ * returned. On failure everything acquired in this function is released
+ * again and a negative errno is returned. The SCSI host is deliberately
+ * not touched here: it has not been added yet, and its reference is
+ * dropped by scsifront_probe() when ring setup fails, so going through
+ * scsifront_free() at this point would put the host twice.
+ */
+static int scsifront_alloc_ring(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       struct vscsiif_sring *sring;
+       int err = -ENOMEM;
+
+       info->ring_ref = GRANT_INVALID_REF;
+
+       /***** Frontend to Backend ring start *****/
+       sring = (struct vscsiif_sring *) __get_free_page(GFP_KERNEL);
+       if (!sring) {
+               xenbus_dev_fatal(dev, err, "fail to allocate shared ring (Front to Back)");
+               return err;
+       }
+       SHARED_RING_INIT(sring);
+       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+       /* A non-negative result is the grant reference of the ring page. */
+       err = xenbus_grant_ring(dev, virt_to_mfn(sring));
+       if (err < 0) {
+               xenbus_dev_fatal(dev, err, "fail to grant shared ring (Front to Back)");
+               goto release_page;
+       }
+       info->ring_ref = err;
+
+       err = bind_listening_port_to_irqhandler(
+                       dev->otherend_id, scsifront_intr,
+                       IRQF_SAMPLE_RANDOM, "scsifront", info);
+
+       if (err <= 0) {
+               /*
+                * 0 is treated as "no irq" throughout the driver; never
+                * report it to the caller as success.
+                */
+               if (!err)
+                       err = -EINVAL;
+               xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler");
+               goto end_access;
+       }
+       info->irq = err;
+
+       return 0;
+
+end_access:
+       /* Same call scsifront_free() uses to give the ring back. */
+       gnttab_end_foreign_access(info->ring_ref,
+                               (unsigned long)info->ring.sring);
+       info->ring_ref = GRANT_INVALID_REF;
+       info->ring.sring = NULL;
+       return err;
+
+release_page:
+       /* The page was never granted, so it can be freed directly. */
+       free_page((unsigned long) sring);
+       info->ring.sring = NULL;
+       return err;
+}
+
+
+/*
+ * Set up the shared ring and publish its grant reference and event
+ * channel port in xenstore under the frontend's node.
+ *
+ * The two keys are written inside one xenbus transaction, which is
+ * restarted from scratch whenever committing it returns -EAGAIN.
+ *
+ * Returns 0 on success or a negative errno. On failure the ring grant
+ * and the interrupt binding are released here; the SCSI host reference
+ * is left alone because the caller, scsifront_probe(), drops it with
+ * scsi_host_put() when this function fails.
+ */
+static int scsifront_init_ring(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+       struct xenbus_transaction xbt;
+       int err;
+
+       DPRINTK("%s\n",__FUNCTION__);
+
+       err = scsifront_alloc_ring(info);
+       if (err)
+               return err;
+       DPRINTK("%u %u\n", info->ring_ref, info->evtchn);
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               /* No transaction exists, so there is nothing to abort. */
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               goto free_sring;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
+                               info->ring_ref);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "%s", "writing ring-ref");
+               goto fail;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                               irq_to_evtchn_port(info->irq));
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "%s", "writing event-channel");
+               goto fail;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err) {
+               if (err == -EAGAIN)
+                       goto again;
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto free_sring;
+       }
+
+       return 0;
+
+fail:
+       /* Abort the open transaction before releasing the ring. */
+       xenbus_transaction_end(xbt, 1);
+free_sring:
+       /*
+        * Undo scsifront_alloc_ring(). The handler is unbound first so it
+        * cannot run against a ring that has already been given back.
+        */
+       if (info->irq) {
+               unbind_from_irqhandler(info->irq, info);
+               info->irq = 0;
+       }
+       if (info->ring_ref != GRANT_INVALID_REF) {
+               gnttab_end_foreign_access(info->ring_ref,
+                                       (unsigned long)info->ring.sring);
+               info->ring_ref = GRANT_INVALID_REF;
+               info->ring.sring = NULL;
+       }
+
+       return err;
+}
+
+
+/*
+ * xenbus probe callback: create one frontend instance for a "vscsi"
+ * device.
+ *
+ * Allocates the SCSI host (the per-instance vscsifrnt_info lives in its
+ * hostdata), initialises the shadow request table, sets up the shared
+ * ring, starts the worker thread, registers the host with the SCSI
+ * mid-layer and finally moves the xenbus state to Initialised.
+ *
+ * Returns 0 on success or a negative errno; on failure everything that
+ * was set up so far is released again.
+ */
+static int scsifront_probe(struct xenbus_device *dev,
+                               const struct xenbus_device_id *id)
+{
+       struct vscsifrnt_info *info;
+       struct Scsi_Host *host;
+       int i, err = -ENOMEM;
+       char name[DEFAULT_TASK_COMM_LEN];
+
+       host = scsi_host_alloc(&scsifront_sht, sizeof(*info));
+       if (!host) {
+               xenbus_dev_fatal(dev, err, "fail to allocate scsi host");
+               return err;
+       }
+       info = (struct vscsifrnt_info *) host->hostdata;
+       info->host = host;
+
+       dev_set_drvdata(&dev->dev, info);
+       info->dev  = dev;
+
+       /* Chain all shadow entries into a free list: entry i points to i + 1. */
+       for (i = 0; i < VSCSIIF_MAX_REQS; i++) {
+               info->shadow[i].next_free = i + 1;
+               init_waitqueue_head(&(info->shadow[i].wq_reset));
+               info->shadow[i].wait_reset = 0;
+       }
+       /* The last entry gets 0x0fff, presumably the end-of-list marker. */
+       info->shadow[VSCSIIF_MAX_REQS - 1].next_free = 0x0fff;
+
+       /*
+        * scsifront_init_ring() binds the interrupt handler, which may run
+        * as soon as it is bound. Initialise the wait queue and the locks
+        * first so the handler can never see them uninitialised.
+        */
+       init_waitqueue_head(&info->wq);
+       spin_lock_init(&info->io_lock);
+       spin_lock_init(&info->shadow_lock);
+
+       err = scsifront_init_ring(info);
+       if (err) {
+               scsi_host_put(host);
+               return err;
+       }
+
+       snprintf(name, DEFAULT_TASK_COMM_LEN, "vscsiif.%d", info->host->host_no);
+
+       info->kthread = kthread_run(scsifront_schedule, info, name);
+       if (IS_ERR(info->kthread)) {
+               err = PTR_ERR(info->kthread);
+               info->kthread = NULL;
+               pr_err("scsifront: kthread start err %d\n", err);
+               goto free_sring;
+       }
+
+       host->max_id      = VSCSIIF_MAX_TARGET;
+       host->max_channel = 0;
+       host->max_lun     = VSCSIIF_MAX_LUN;
+       host->max_sectors = (VSCSIIF_SG_TABLESIZE - 1) * PAGE_SIZE / 512;
+       host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE;
+
+       err = scsi_add_host(host, &dev->dev);
+       if (err) {
+               pr_err("scsifront: fail to add scsi host %d\n", err);
+               goto stop_kthread;
+       }
+
+       xenbus_switch_state(dev, XenbusStateInitialised);
+
+       return 0;
+
+stop_kthread:
+       /*
+        * The worker operates on info, which is freed together with the
+        * host below, so it must be stopped before the teardown.
+        */
+       kthread_stop(info->kthread);
+       info->kthread = NULL;
+free_sring:
+       /* free resource */
+       scsifront_free(info);
+       return err;
+}
+
+/*
+ * xenbus remove callback. Stops the worker thread if one is running and
+ * then releases the whole frontend instance through scsifront_free().
+ * Always returns 0.
+ */
+static int scsifront_remove(struct xenbus_device *dev)
+{
+       struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("%s: %s removed\n",__FUNCTION__ ,dev->nodename);
+
+       /* Stop the worker before the data it works on goes away. */
+       if (info->kthread) {
+               kthread_stop(info->kthread);
+               info->kthread = NULL;
+       }
+
+       scsifront_free(info);
+       
+       return 0;
+}
+
+
+/*
+ * Handle the backend going into the Closing state: remove the SCSI host
+ * and tell xenbus that the frontend is closed. Always returns 0.
+ */
+static int scsifront_disconnect(struct vscsifrnt_info *info)
+{
+       struct xenbus_device *dev = info->dev;
+
+       DPRINTK("%s: %s disconnect\n",__FUNCTION__ ,dev->nodename);
+
+       /*
+        * By the time this function runs, all devices of the frontend
+        * have already been deleted, so there is no need to block I/O
+        * before removing the host.
+        */
+       scsi_remove_host(info->host);
+       xenbus_frontend_closed(dev);
+
+       return 0;
+}
+
+#define VSCSIFRONT_OP_ADD_LUN  1
+#define VSCSIFRONT_OP_DEL_LUN  2
+
+/*
+ * Walk the backend's "vscsi-devs" directory in xenstore and attach or
+ * detach the listed virtual SCSI devices.
+ *
+ * \param info frontend instance
+ * \param op   VSCSIFRONT_OP_ADD_LUN: attach every device the backend
+ *             reports as Initialised;
+ *             VSCSIFRONT_OP_DEL_LUN: detach every device the backend
+ *             reports as Closing.
+ *
+ * The outcome for each handled device is written to the matching
+ * "vscsi-devs/<dev>/state" node below the frontend's own xenstore node.
+ * Entries whose state or address cannot be read completely are skipped.
+ */
+static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op)
+{
+       struct xenbus_device *dev = info->dev;
+       unsigned int i;
+       int err;
+       char str[64], state_str[64];
+       char **dir;
+       unsigned int dir_n = 0;
+       unsigned int device_state;
+       unsigned int hst, chn, tgt, lun;
+       struct scsi_device *sdev;
+
+       dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n);
+       if (IS_ERR(dir))
+               return;
+
+       for (i = 0; i < dir_n; i++) {
+               /*
+                * The state node has the same relative path on both ends:
+                * it is read from the backend here and written below the
+                * frontend node further down.
+                */
+               snprintf(state_str, sizeof(state_str), "vscsi-devs/%s/state", dir[i]);
+
+               /*
+                * xenbus_scanf() returns the number of converted fields or
+                * a negative errno. Anything but a complete parse would
+                * leave device_state uninitialised, so skip the entry.
+                */
+               err = xenbus_scanf(XBT_NIL, dev->otherend, state_str, "%u",
+                       &device_state);
+               if (err != 1)
+                       continue;
+
+               /* virtual SCSI device address, host:channel:target:lun */
+               snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
+               err = xenbus_scanf(XBT_NIL, dev->otherend, str,
+                       "%u:%u:%u:%u", &hst, &chn, &tgt, &lun);
+               if (err != 4)
+                       continue;
+
+               switch (op) {
+               case VSCSIFRONT_OP_ADD_LUN:
+                       if (device_state != XenbusStateInitialised)
+                               break;
+
+                       sdev = scsi_device_lookup(info->host, chn, tgt, lun);
+                       if (sdev) {
+                               pr_err("scsifront: Device already in use.\n");
+                               scsi_device_put(sdev);
+                               xenbus_printf(XBT_NIL, dev->nodename,
+                                       state_str, "%d", XenbusStateClosed);
+                       } else if (scsi_add_device(info->host, chn, tgt, lun)) {
+                               /* Do not report a device as connected that failed to attach. */
+                               pr_err("scsifront: scsi_add_device failed\n");
+                               xenbus_printf(XBT_NIL, dev->nodename,
+                                       state_str, "%d", XenbusStateClosed);
+                       } else {
+                               xenbus_printf(XBT_NIL, dev->nodename,
+                                       state_str, "%d", XenbusStateConnected);
+                       }
+                       break;
+               case VSCSIFRONT_OP_DEL_LUN:
+                       if (device_state != XenbusStateClosing)
+                               break;
+
+                       sdev = scsi_device_lookup(info->host, chn, tgt, lun);
+                       if (sdev) {
+                               scsi_remove_device(sdev);
+                               scsi_device_put(sdev);
+                               xenbus_printf(XBT_NIL, dev->nodename,
+                                       state_str, "%d", XenbusStateClosed);
+                       }
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       kfree(dir);
+}
+
+
+
+
+/*
+ * xenbus callback invoked when the backend changes state.
+ *
+ * Connected:      attach LUNs if the frontend is still Initialised in
+ *                 xenstore, then move the frontend to Connected unless
+ *                 it already is.
+ * Closing:        disconnect the frontend.
+ * Reconfiguring:  detach LUNs that are closing and acknowledge with
+ *                 Reconfiguring.
+ * Reconfigured:   attach newly initialised LUNs and return to Connected.
+ * All other backend states are ignored.
+ */
+static void scsifront_backend_changed(struct xenbus_device *dev,
+                               enum xenbus_state backend_state)
+{
+       struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+       DPRINTK("%p %u %u\n", dev, dev->state, backend_state);
+
+       switch (backend_state) {
+       case XenbusStateConnected:
+               if (xenbus_read_driver_state(dev->nodename) ==
+                   XenbusStateInitialised)
+                       scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+
+               if (dev->state != XenbusStateConnected)
+                       xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               scsifront_disconnect(info);
+               break;
+
+       case XenbusStateReconfiguring:
+               scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN);
+               xenbus_switch_state(dev, XenbusStateReconfiguring);
+               break;
+
+       case XenbusStateReconfigured:
+               scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       /* Nothing to do for the remaining states. */
+       case XenbusStateUnknown:
+       case XenbusStateInitialising:
+       case XenbusStateInitWait:
+       case XenbusStateInitialised:
+       case XenbusStateClosed:
+               break;
+       }
+}
+
+
+/*
+ * Xenbus device types handled by this driver. The last entry is an empty
+ * string (presumably the table terminator expected by xenbus).
+ */
+static const struct xenbus_device_id scsifront_ids[] = {
+       { "vscsi" },
+       { "" }
+};
+MODULE_ALIAS("xen:vscsi");
+
+/*
+ * Xenbus driver description: probe and remove callbacks plus the handler
+ * for backend state changes. No resume callback is registered; the entry
+ * is commented out.
+ */
+static DEFINE_XENBUS_DRIVER(scsifront, ,
+       .probe                  = scsifront_probe,
+       .remove                 = scsifront_remove,
+/*     .resume                 = scsifront_resume, */
+       .otherend_changed       = scsifront_backend_changed,
+);
+
+/*
+ * Register the frontend driver with xenbus. Returns whatever
+ * xenbus_register_frontend() returns.
+ */
+int scsifront_xenbus_init(void)
+{
+       return xenbus_register_frontend(&scsifront_driver);
+}
+
+/* Unregister the frontend driver from xenbus; counterpart of scsifront_xenbus_init(). */
+void scsifront_xenbus_unregister(void)
+{
+       xenbus_unregister_driver(&scsifront_driver);
+}
+
diff --git a/drivers/xen/sfc_netback/Makefile b/drivers/xen/sfc_netback/Makefile

new file mode 100644 (file)

index 0000000..1286c3a
--- /dev/null
+++ b/drivers/xen/sfc_netback/Makefile
@@ -0,0 +1,12 @@
+# Build rules for the sfc_netback module.
+# Headers are pulled from this directory, sfc_netutil, netback and the
+# sfc network driver; warnings are treated as errors.
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netback -Idrivers/xen/sfc_netutil -Idrivers/xen/netback -Idrivers/net/sfc -Idrivers/net/sfc/sfc_resource
+EXTRA_CFLAGS += -D__ci_driver__ 
+EXTRA_CFLAGS += -DEFX_USE_KCOMPAT
+EXTRA_CFLAGS += -Werror
+
+# Optional coverage build: defining GCOV adds the gcov compiler flags and
+# EFX_GCOV, which the sources test before calling the gcov provider hooks.
+ifdef GCOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_BACKEND) := sfc_netback.o
+
+# Objects linked into sfc_netback.o
+sfc_netback-objs   := accel.o accel_fwd.o accel_msg.o accel_solarflare.o accel_xenbus.o accel_debugfs.o
diff --git a/drivers/xen/sfc_netback/accel.c b/drivers/xen/sfc_netback/accel.c

new file mode 100644 (file)

index 0000000..21367f2
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel.c
@@ -0,0 +1,147 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_solarflare.h"
+
+#include <linux/notifier.h>
+
+#ifdef EFX_GCOV
+#include "gcov.h"
+#endif
+
+/*
+ * Netdevice notifier callback.
+ *
+ * On NETDEV_UP, NETDEV_DOWN and NETDEV_CHANGE the list of backends is
+ * walked under bend_list_mutex, and every backend whose net_dev has the
+ * same ifindex as the device raising the event is told the new
+ * interface state. The interface counts as up when the carrier is ok
+ * and either the event is NETDEV_UP or, for NETDEV_CHANGE, the device
+ * has IFF_UP set. All other events are ignored.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int netback_accel_netdev_event(struct notifier_block *nb,
+                                     unsigned long event, void *ptr)
+{
+       struct net_device *net_dev = (struct net_device *)ptr;
+       struct netback_accel *bend;
+       int admin_up, ok;
+
+       switch (event) {
+       case NETDEV_UP:
+       case NETDEV_DOWN:
+       case NETDEV_CHANGE:
+               break;
+       default:
+               return NOTIFY_DONE;
+       }
+
+       mutex_lock(&bend_list_mutex);
+       for (bend = bend_list; bend != NULL; bend = bend->next_bend) {
+               mutex_lock(&bend->bend_mutex);
+               /*
+                * A NULL shared_page means the shared pages have been
+                * unmapped but the bend is not yet off the list; such
+                * bends are skipped.
+                */
+               if (bend->shared_page != NULL &&
+                   bend->net_dev->ifindex == net_dev->ifindex) {
+                       if (event == NETDEV_CHANGE)
+                               admin_up = (net_dev->flags & IFF_UP) != 0;
+                       else
+                               admin_up = (event == NETDEV_UP);
+
+                       ok = netif_carrier_ok(net_dev) && admin_up;
+                       netback_accel_set_interface_state(bend, ok);
+               }
+               mutex_unlock(&bend->bend_mutex);
+       }
+       mutex_unlock(&bend_list_mutex);
+
+       return NOTIFY_DONE;
+}
+
+
+/* Notifier block that routes netdevice events to netback_accel_netdev_event(). */
+static struct notifier_block netback_accel_netdev_notifier = {
+       .notifier_call = netback_accel_netdev_event,
+};
+
+
+/*
+ * Module parameter "max_pages" (mode 0644), defaulting to
+ * NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES. Declared extern in accel.h.
+ */
+unsigned sfc_netback_max_pages = NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES;
+module_param_named(max_pages, sfc_netback_max_pages, uint, 0644);
+MODULE_PARM_DESC(max_pages, 
+                "The number of buffer pages to enforce on each guest");
+
+/* Initialise subsystems need for the accelerated fast path */
+/*
+ * Module init: bring up the subsystems needed for the accelerated fast
+ * path - the forwarding layer, debugfs, the netback_accel_sf layer and
+ * the netdevice notifier, in that order. If a step fails, everything
+ * set up before it is torn down in reverse order and the error code of
+ * the failing step is returned.
+ */
+static int __init netback_accel_init(void)
+{
+       int rc;
+
+#ifdef EFX_GCOV
+       gcov_provider_init(THIS_MODULE);
+#endif
+
+       rc = netback_accel_init_fwd();
+       if (rc)
+               goto err_gcov;
+
+       netback_accel_debugfs_init();
+
+       rc = netback_accel_sf_init();
+       if (rc)
+               goto err_fwd;
+
+       rc = register_netdevice_notifier(&netback_accel_netdev_notifier);
+       if (rc)
+               goto err_sf;
+
+       return 0;
+
+err_sf:
+       netback_accel_sf_shutdown();
+err_fwd:
+       netback_accel_debugfs_fini();
+       netback_accel_shutdown_fwd();
+err_gcov:
+#ifdef EFX_GCOV
+       gcov_provider_fini(THIS_MODULE);
+#endif
+       return rc;
+}
+
+module_init(netback_accel_init);
+
+/*
+ * Module exit: undo netback_accel_init(). The notifier is unregistered
+ * first, then the netback_accel_sf layer, all remaining backends,
+ * debugfs and finally the forwarding layer are shut down.
+ */
+static void __exit netback_accel_exit(void)
+{
+       unregister_netdevice_notifier(&netback_accel_netdev_notifier);
+
+       netback_accel_sf_shutdown();
+
+       netback_accel_shutdown_bends();
+
+       netback_accel_debugfs_fini();
+
+       netback_accel_shutdown_fwd();
+
+#ifdef EFX_GCOV
+       gcov_provider_fini(THIS_MODULE);
+#endif
+}
+
+module_exit(netback_accel_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/sfc_netback/accel.h b/drivers/xen/sfc_netback/accel.h

new file mode 100644 (file)

index 0000000..f371a3e
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel.h
@@ -0,0 +1,392 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_H
+#define NETBACK_ACCEL_H
+
+#include <linux/version.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+
+#include <xen/xenbus.h>
+
+#include "accel_shared_fifo.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+
+/**************************************************************************
+ * Datatypes
+ **************************************************************************/
+
+/* Default values for the resource limits (see struct netback_accel_limits). */
+#define NETBACK_ACCEL_DEFAULT_MAX_FILTERS (8)
+#define NETBACK_ACCEL_DEFAULT_MAX_MCASTS (8)
+#define NETBACK_ACCEL_DEFAULT_MAX_BUF_PAGES (384)
+/* Variable to store module parameter for max_buf_pages */
+extern unsigned sfc_netback_max_pages;
+
+/* Set to 0 to compile out statistics gathering. */
+#define NETBACK_ACCEL_STATS 1
+
+/*
+ * Wrap a statement that only updates statistics: it expands to the
+ * statement itself when stats are enabled and to nothing otherwise.
+ */
+#if NETBACK_ACCEL_STATS
+#define NETBACK_ACCEL_STATS_OP(x) x
+#else
+#define NETBACK_ACCEL_STATS_OP(x)
+#endif
+
+/*! Statistics for a given backend */
+/*
+ * Embedded in struct netback_accel as "stats" when NETBACK_ACCEL_STATS
+ * is non-zero.
+ */
+struct netback_accel_stats {
+       /*! Number of eventq wakeup events */
+       u64 evq_wakeups;
+       /*! Number of eventq timeout events */
+       u64 evq_timeouts;
+       /*! Number of filters used */
+       u32 num_filters;
+       /*! Number of buffer pages registered */
+       u32 num_buffer_pages;
+};
+
+
+/* Debug fs nodes for each of the above stats */
+/*
+ * One debugfs dentry per field of struct netback_accel_stats. Embedded
+ * in struct netback_accel as "dbfs" when CONFIG_DEBUG_FS is defined.
+ */
+struct netback_accel_dbfs {
+       struct dentry *evq_wakeups;
+       struct dentry *evq_timeouts;
+       struct dentry *num_filters;
+       struct dentry *num_buffer_pages;
+};
+
+
+/*! Resource limits for a given NIC */
+/* Embedded in struct netback_accel as "quotas". */
+struct netback_accel_limits {
+       int max_filters;            /*!< Max. number of filters to use. */
+       int max_mcasts;      /*!< Max. number of mcast subscriptions */
+       int max_buf_pages;        /*!< Max. number of pages of NIC buffers */
+};
+
+
+/*! The state for an instance of the back end driver. */
+/*
+ * Instances are chained through next_bend into the global bend_list,
+ * which is protected by bend_list_mutex.
+ */
+struct netback_accel {
+       /*! mutex to protect this state */
+       struct mutex bend_mutex;
+
+       /*! Watches on xenstore */
+       struct xenbus_watch domu_accel_watch;
+       struct xenbus_watch config_accel_watch;
+
+       /*! Pointer to whatever device cookie ties us in to the hypervisor */
+       void *hdev_data;
+
+       /*! FIFO indices. Next page is msg FIFOs. NULL once the shared
+        *  pages have been unmapped. */
+       struct net_accel_shared_page *shared_page;
+
+       /*! Defer control message processing */
+       struct work_struct handle_msg;
+
+       /*! Identifies other end VM and interface.*/
+       int far_end;
+       int vif_num;
+
+       /*! To unmap the shared pages */
+       void *sh_pages_unmap;
+
+       /* Resource tracking */
+       /*! Limits on H/W & Dom0 resources */
+       struct netback_accel_limits quotas;
+
+       /* Hardware resources */
+       /*! The H/W type of associated NIC */
+       enum net_accel_hw_type hw_type;
+       /*! State of allocation (one of the NETBACK_ACCEL_RES_* values) */
+       int hw_state;
+       /*! How to set up the acceleration for this hardware */
+       int (*accel_setup)(struct netback_accel *); 
+       /*! And how to stop it. */
+       void (*accel_shutdown)(struct netback_accel *);
+
+       /*! The physical/real net_dev for this interface */
+       struct net_device *net_dev;
+
+       /*! Magic pointer to locate state in forwarding table */
+       void *fwd_priv;
+
+       /*! Message FIFO towards the domU (going by the field name) */
+       sh_msg_fifo2 to_domU;
+       /*! Message FIFO from the domU (going by the field name) */
+       sh_msg_fifo2 from_domU;
+
+       /*! General notification channel id */
+       int msg_channel;
+       /*! General notification channel irq */
+       int msg_channel_irq;
+
+       /*! Event channel id dedicated to network packet interrupts. */
+       int net_channel; 
+       /*! Event channel irq dedicated to network packets interrupts */
+       int net_channel_irq; 
+
+       /*! The MAC address the frontend goes by. */
+       u8 mac[ETH_ALEN];
+       /*! Driver name of associated NIC */
+       char *nicname;    
+
+       /*! Array of grant handles for the buffer pages mapped */
+       grant_handle_t *buffer_maps; 
+       /*! NOTE(review): undocumented in the original; presumably the
+        *  addresses of the mapped buffer pages, parallel to
+        *  buffer_maps - confirm against the users of this field. */
+       u64 *buffer_addrs;
+       /*! Index into buffer_maps */
+       int buffer_maps_index; 
+       /*! Max number of pages that domU is allowed/will request to map */
+       int max_pages; 
+
+       /*! Pointer to hardware specific private area */
+       void *accel_hw_priv; 
+
+       /*! Wait queue for changes in accelstate. */
+       wait_queue_head_t state_wait_queue;
+
+       /*! Current state of the frontend according to the xenbus
+        *  watch. */
+       XenbusState frontend_state;
+
+       /*! Current state of this backend. */
+       XenbusState backend_state;
+
+       /*! Non-zero if the backend is being removed. */
+       int removing;
+
+       /*! Non-zero if the setup_vnic has been called. */
+       int vnic_is_setup;
+
+#if NETBACK_ACCEL_STATS
+       /*! Per-backend statistics */
+       struct netback_accel_stats stats;
+#endif 
+#if defined(CONFIG_DEBUG_FS)
+       /*! debugfs directory name, directory entry and per-stat nodes */
+       char *dbfs_dir_name;
+       struct dentry *dbfs_dir;
+       struct netback_accel_dbfs dbfs;
+#endif
+
+       /*! Next backend in the global list (NULL ends the list) */
+       struct netback_accel *next_bend;
+};
+
+
+/*
+ * Values for netback_accel.hw_state.  States of resource allocation
+ * we can go through
+ */
+/*! No hardware has yet been allocated. */
+#define NETBACK_ACCEL_RES_NONE  (0)
+/*! Hardware has been allocated. */
+#define NETBACK_ACCEL_RES_ALLOC (1)
+/*
+ * NOTE(review): the next two states are undocumented in the original;
+ * presumably they are later stages of resource setup (filters in place,
+ * hardware info available) - confirm against the code that sets hw_state.
+ */
+#define NETBACK_ACCEL_RES_FILTER (2)
+#define NETBACK_ACCEL_RES_HWINFO (3)
+
+/*! Filtering specification. This assumes that for VNIC support we
+ *  will always want wildcard entries, so it only specifies the
+ *  destination IP/port (plus MAC address and protocol).
+ */
+struct netback_accel_filter_spec {
+       /*! Internal, used to access efx_vi API */
+       void *filter_handle; 
+
+       /*! Destination IP in network order */
+       u32 destip_be;
+       /*! Destination port in network order */
+       u16 destport_be;
+       /*! Mac address */
+       u8  mac[ETH_ALEN];
+       /*! TCP or UDP */
+       u8  proto;      
+};
+
+
+/**************************************************************************
+ * From accel.c
+ **************************************************************************/
+
+/*! \brief Start up all the acceleration plugins 
+ *
+ * \return 0 on success, an errno on failure
+ */
+extern int netback_accel_init_accel(void);
+
+/*! \brief Shut down all the acceleration plugins 
+ */
+extern void netback_accel_shutdown_accel(void);
+
+
+/**************************************************************************
+ * From accel_fwd.c
+ **************************************************************************/
+
+/*! \brief Init the forwarding infrastructure
+ * \return 0 on success, or -ENOMEM if it couldn't get memory for the
+ * forward table 
+ */
+extern int netback_accel_init_fwd(void);
+
+/*! \brief Shut down the forwarding and free memory. */
+extern void netback_accel_shutdown_fwd(void);
+
+/*! Initialise each nic port's forwarding table */
+extern void *netback_accel_init_fwd_port(void);
+extern void netback_accel_shutdown_fwd_port(void *fwd_priv);
+
+/*! \brief Add an entry to the forwarding table. 
+ * \param mac : MAC address, used as hash key
+ * \param context : value to associate with key (can be NULL, see
+ * netback_accel_fwd_set_context)
+ * \return 0 on success, -ENOMEM if table was full and could not grow it
+ */
+extern int netback_accel_fwd_add(const __u8 *mac, void *context,
+                                void *fwd_priv);
+
+/*! \brief Remove an entry from the forwarding table. 
+ * \param mac : the MAC address to remove
+ * \return nothing: it is not an error if the mac was not in the table
+ */
+extern void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv);
+
+/*! \brief Set the context pointer for an existing fwd table entry.
+ * \param mac : key that is already present in the table
+ * \param context : new value to associate with key
+ * \return 0 on success, -ENOENT if mac not present in table.
+ */
+extern int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+                                        void *fwd_priv);
+
+/**************************************************************************
+ * From accel_msg.c
+ **************************************************************************/
+
+
+/*! \brief Send the start-of-day message that handshakes with the VNIC
+ *  and tells it its MAC address.
+ *
+ * \param bend The back end driver data structure
+ * \param version The version of communication to use, e.g. NET_ACCEL_MSG_VERSION
+ */
+extern void netback_accel_msg_tx_hello(struct netback_accel *bend,
+                                      unsigned version);
+
+/*! \brief Send a "there's a new local mac address" message 
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to 
+ * \param mac Pointer to the new mac address
+ */
+extern void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
+                                             const void *mac);
+
+/*! \brief Send a "a mac address that was local has gone away" message 
+ *
+ * \param bend The back end driver data structure for the vnic to send
+ * the message to 
+ * \param mac Pointer to the old mac address
+ */
+extern void netback_accel_msg_tx_old_localmac(struct netback_accel *bend,
+                                             const void *mac);
+
+extern void netback_accel_set_interface_state(struct netback_accel *bend,
+                                             int up);
+
+/*! \brief Process the message queue for a bend that has just
+ * interrupted.
+ * 
+ * Demultiplexes an interrupt from the front end driver, taking
+ * messages from the fifo and taking appropriate action.
+ * 
+ * \param bend The back end driver data structure
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+extern void netback_accel_msg_rx_handler(struct work_struct *arg);
+#else
+extern void netback_accel_msg_rx_handler(void *bend_void);
+#endif
+
+/**************************************************************************
+ * From accel_xenbus.c
+ **************************************************************************/
+/*! List of all the bends currently in existence. */
+extern struct netback_accel *bend_list;
+extern struct mutex bend_list_mutex;
+
+/*! \brief Probe a new network interface. */
+extern int netback_accel_probe(struct xenbus_device *dev);
+
+/*! \brief Remove a network interface. */
+extern int netback_accel_remove(struct xenbus_device *dev);
+
+/*! \brief Shutdown all accelerator backends */
+extern void netback_accel_shutdown_bends(void);
+
+/*! \brief Initiate the xenbus state teardown handshake */
+extern void netback_accel_set_closing(struct netback_accel *bend);
+
+/**************************************************************************
+ * From accel_debugfs.c
+ **************************************************************************/
+/*! Global statistics */
+/*
+ * A single instance, global_stats, is declared below when
+ * NETBACK_ACCEL_STATS is non-zero.
+ */
+struct netback_accel_global_stats {
+       /*! Number of TX packets seen through driverlink */
+       u64 dl_tx_packets;
+       /*! Number of TX packets seen through driverlink we didn't like */
+       u64 dl_tx_bad_packets;
+       /*! Number of RX packets seen through driverlink */
+       u64 dl_rx_packets;
+       /*! Number of mac addresses we are forwarding to */
+       u32 num_fwds;
+};
+
+/*! Debug fs entries for each of the above stats */
+struct netback_accel_global_dbfs {
+       struct dentry *dl_tx_packets;
+       struct dentry *dl_tx_bad_packets;
+       struct dentry *dl_rx_packets;
+       struct dentry *num_fwds;
+};
+
+#if NETBACK_ACCEL_STATS
+extern struct netback_accel_global_stats global_stats;
+#endif
+
+/*! \brief Initialise the debugfs root and populate with global stats */
+extern void netback_accel_debugfs_init(void);
+
+/*! \brief Remove our debugfs root directory */
+extern void netback_accel_debugfs_fini(void);
+
+/*! \brief Add per-bend statistics to debug fs */
+extern int netback_accel_debugfs_create(struct netback_accel *bend);
+/*! \brief Remove per-bend statistics from debug fs */
+extern int netback_accel_debugfs_remove(struct netback_accel *bend);
+
+#endif /* NETBACK_ACCEL_H */
+
+
diff --git a/drivers/xen/sfc_netback/accel_debugfs.c b/drivers/xen/sfc_netback/accel_debugfs.c

new file mode 100644 (file)

index 0000000..6527c4b
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_debugfs.c
@@ -0,0 +1,148 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+
+#include "accel.h"
+
+#if defined(CONFIG_DEBUG_FS)
+static struct dentry *sfc_debugfs_root = NULL;
+#endif
+
+#if NETBACK_ACCEL_STATS
+struct netback_accel_global_stats global_stats;
+#if defined(CONFIG_DEBUG_FS)
+static struct netback_accel_global_dbfs  global_dbfs;
+#endif
+#endif
+
+/*
+ * Create the "sfc_netback" debugfs directory and, when statistics are
+ * compiled in, one read-only file per global counter.
+ *
+ * Failure is not fatal: if the directory cannot be created
+ * sfc_debugfs_root stays NULL and no files are created.
+ */
+void netback_accel_debugfs_init(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+       sfc_debugfs_root = debugfs_create_dir("sfc_netback", NULL);
+       if (sfc_debugfs_root == NULL)
+               return;
+
+#if NETBACK_ACCEL_STATS
+       /*
+        * global_stats and global_dbfs are only defined when
+        * NETBACK_ACCEL_STATS is set, so they may only be referenced
+        * under the same guard.
+        */
+       global_dbfs.num_fwds = debugfs_create_u32
+               ("num_fwds", S_IRUSR | S_IRGRP | S_IROTH,
+                sfc_debugfs_root, &global_stats.num_fwds);
+       global_dbfs.dl_tx_packets = debugfs_create_u64
+               ("dl_tx_packets", S_IRUSR | S_IRGRP | S_IROTH,
+                sfc_debugfs_root, &global_stats.dl_tx_packets);
+       global_dbfs.dl_rx_packets = debugfs_create_u64
+               ("dl_rx_packets", S_IRUSR | S_IRGRP | S_IROTH,
+                sfc_debugfs_root, &global_stats.dl_rx_packets);
+       global_dbfs.dl_tx_bad_packets = debugfs_create_u64
+               ("dl_tx_bad_packets", S_IRUSR | S_IRGRP | S_IROTH,
+                sfc_debugfs_root, &global_stats.dl_tx_bad_packets);
+#endif
+#endif
+}
+
+
+/*
+ * Remove the global statistics files (when they were compiled in) and
+ * then the "sfc_netback" root directory itself.
+ */
+void netback_accel_debugfs_fini(void)
+{
+#if defined(CONFIG_DEBUG_FS)
+#if NETBACK_ACCEL_STATS
+       /* global_dbfs only exists when NETBACK_ACCEL_STATS is set */
+       debugfs_remove(global_dbfs.num_fwds);
+       debugfs_remove(global_dbfs.dl_tx_packets);
+       debugfs_remove(global_dbfs.dl_rx_packets);
+       debugfs_remove(global_dbfs.dl_tx_bad_packets);
+#endif
+
+       debugfs_remove(sfc_debugfs_root);
+       /*
+        * Forget the stale dentry so that a later
+        * netback_accel_debugfs_create() fails with -ENOENT instead of
+        * using it.
+        */
+       sfc_debugfs_root = NULL;
+#endif
+}
+
+
+/*
+ * Create the per-backend debugfs directory "vif<far_end>.<vif_num>"
+ * under the sfc_netback root and, when statistics are compiled in, one
+ * read-only file per per-backend counter.
+ *
+ * Returns 0 on success (and when CONFIG_DEBUG_FS is not defined),
+ * -ENOENT if the root directory does not exist, or -ENOMEM if the name
+ * or the directory could not be created.  On failure
+ * bend->dbfs_dir_name is left NULL.
+ */
+int netback_accel_debugfs_create(struct netback_accel *bend)
+{
+#if defined(CONFIG_DEBUG_FS)
+       if (sfc_debugfs_root == NULL)
+               return -ENOENT;
+
+       /*
+        * Let kasprintf() size the buffer rather than counting digits by
+        * hand, so the name can never overrun its allocation.
+        */
+       bend->dbfs_dir_name = kasprintf(GFP_KERNEL, "vif%d.%d",
+                                       bend->far_end, bend->vif_num);
+       if (bend->dbfs_dir_name == NULL)
+               return -ENOMEM;
+
+       bend->dbfs_dir = debugfs_create_dir(bend->dbfs_dir_name,
+                                           sfc_debugfs_root);
+       if (bend->dbfs_dir == NULL) {
+               kfree(bend->dbfs_dir_name);
+               /*
+                * Clear the pointer: netback_accel_debugfs_remove() frees
+                * dbfs_dir_name and must not free it a second time.
+                */
+               bend->dbfs_dir_name = NULL;
+               return -ENOMEM;
+       }
+
+#if NETBACK_ACCEL_STATS
+       bend->dbfs.evq_wakeups = debugfs_create_u64
+               ("evq_wakeups", S_IRUSR | S_IRGRP | S_IROTH,
+                bend->dbfs_dir, &bend->stats.evq_wakeups);
+       bend->dbfs.evq_timeouts = debugfs_create_u64
+               ("evq_timeouts", S_IRUSR | S_IRGRP | S_IROTH,
+                bend->dbfs_dir, &bend->stats.evq_timeouts);
+       bend->dbfs.num_filters = debugfs_create_u32
+               ("num_filters", S_IRUSR | S_IRGRP | S_IROTH,
+                bend->dbfs_dir, &bend->stats.num_filters);
+       bend->dbfs.num_buffer_pages = debugfs_create_u32
+               ("num_buffer_pages", S_IRUSR | S_IRGRP | S_IROTH,
+                bend->dbfs_dir, &bend->stats.num_buffer_pages);
+#endif
+#endif
+       return 0;
+}
+
+
+/*
+ * Remove the per-backend debugfs files and directory and free the
+ * directory name.  The pointers are cleared afterwards so that calling
+ * this again for the same backend is harmless.  Always returns 0.
+ */
+int netback_accel_debugfs_remove(struct netback_accel *bend)
+{
+#if defined(CONFIG_DEBUG_FS)
+       if (bend->dbfs_dir != NULL) {
+#if NETBACK_ACCEL_STATS
+               debugfs_remove(bend->dbfs.evq_wakeups);
+               debugfs_remove(bend->dbfs.evq_timeouts);
+               debugfs_remove(bend->dbfs.num_filters);
+               debugfs_remove(bend->dbfs.num_buffer_pages);
+#endif
+               debugfs_remove(bend->dbfs_dir);
+               bend->dbfs_dir = NULL;
+       }
+
+       /* kfree(NULL) is a no-op, so no guard is needed */
+       kfree(bend->dbfs_dir_name);
+       bend->dbfs_dir_name = NULL;
+#endif
+       return 0;
+}
+
+
diff --git a/drivers/xen/sfc_netback/accel_fwd.c b/drivers/xen/sfc_netback/accel_fwd.c

new file mode 100644 (file)

index 0000000..385855a
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_fwd.c
@@ -0,0 +1,420 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include "accel.h"
+#include "accel_cuckoo_hash.h"
+#include "accel_util.h"
+#include "accel_solarflare.h"
+
+#include "driverlink_api.h"
+
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+
+/* State stored in the forward table: one slot of a port's fwd_array */
+struct fwd_struct {
+       struct list_head link; /* Forms list (the port's fwd_list of in-use slots) */
+       /*
+        * Opaque pointer supplied by the caller of netback_accel_fwd_add()
+        * or netback_accel_fwd_set_context(); the packet paths read it
+        * back as a struct netback_accel * and tolerate NULL.
+        */
+       void * context;
+       /* Non-zero while this slot is in use */
+       __u8 valid;
+       /* MAC address this slot was added for */
+       __u8 mac[ETH_ALEN];
+};
+
+/*
+ * Max value we support: NUM_FWDS is the number of slots in each port's
+ * fwd_array, and NUM_FWDS_BITS is the size passed to cuckoo_hash_init().
+ * FWD_MASK is not referenced by the code visible in this file section.
+ */
+#define NUM_FWDS_BITS 8
+#define NUM_FWDS (1 << NUM_FWDS_BITS)
+#define FWD_MASK (NUM_FWDS - 1)
+
+/*
+ * Per-port forwarding state.  Allocated by netback_accel_init_fwd_port()
+ * and handed back to callers as an opaque "fwd_priv" pointer.
+ */
+struct port_fwd {
+       /* Make a list (membership of the file-scope port_fwds list) */
+       struct list_head link;
+       /* Hash table to store the fwd_structs (maps a MAC key to a fwd_array index) */
+       cuckoo_hash_table fwd_hash_table;
+       /* The array of fwd_structs (NUM_FWDS entries) */
+       struct fwd_struct *fwd_array;
+       /* Linked list of entries in use. */
+       struct list_head fwd_list;
+       /* Could do something clever with a reader/writer lock. */
+       spinlock_t fwd_lock;
+       /* Make find_free_entry() a bit faster by caching this */
+       int last_free_index;
+};
+
+/*
+ * This is unlocked as it's only called from dl probe and remove,
+ * which are themselves synchronised.  Could get rid of it entirely as
+ * it's never iterated, but useful for debug
+ */
+static struct list_head port_fwds;
+
+
+/*
+ * Walk fwd_array once, circularly, beginning at the cached
+ * last_free_index, looking for a slot whose valid flag is clear.  On
+ * success the cache is updated and the slot index returned; -ENOMEM
+ * means every slot is in use.
+ */
+static int fwd_find_free_entry(struct port_fwd *fwd_set)
+{
+       const int start = fwd_set->last_free_index;
+       int probe;
+
+       for (probe = 0; probe < NUM_FWDS; probe++) {
+               int slot = start + probe;
+
+               /* wrap around the end of the array */
+               if (slot >= NUM_FWDS)
+                       slot -= NUM_FWDS;
+
+               if (fwd_set->fwd_array[slot].valid)
+                       continue;
+
+               fwd_set->last_free_index = slot;
+               return slot;
+       }
+
+       return -ENOMEM;
+}
+
+
+/*
+ * Map a MAC address to its fwd_struct via the port's hash table, or
+ * return NULL when the address is not present.  Caller should hold the
+ * table lock.
+ */
+static inline struct fwd_struct *fwd_find_entry(const __u8 *mac,
+                                               struct port_fwd *fwd_set)
+{
+       cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+       cuckoo_hash_value slot;
+       struct fwd_struct *fwd;
+
+       if (!cuckoo_hash_lookup(&fwd_set->fwd_hash_table,
+                               (cuckoo_hash_key *)(&key),
+                               &slot))
+               return NULL;
+
+       fwd = &fwd_set->fwd_array[slot];
+       /* Complain if the slot's stored address disagrees with the key */
+       DPRINTK_ON(memcmp(fwd->mac, mac, ETH_ALEN) != 0);
+       return fwd;
+}
+
+
+/*
+ * Initialise a nic port's forwarding table: allocate the port_fwd, its
+ * slot array and its hash table, then link it onto port_fwds.  Returns
+ * the new table as an opaque pointer, or NULL if any allocation fails.
+ */
+void *netback_accel_init_fwd_port(void)
+{
+       struct port_fwd *port;
+
+       port = kzalloc(sizeof(struct port_fwd), GFP_KERNEL);
+       if (port == NULL)
+               goto fail;
+
+       spin_lock_init(&port->fwd_lock);
+
+       port->fwd_array = kzalloc(sizeof (struct fwd_struct) * NUM_FWDS,
+                                 GFP_KERNEL);
+       if (port->fwd_array == NULL)
+               goto fail_free_port;
+
+       if (cuckoo_hash_init(&port->fwd_hash_table, NUM_FWDS_BITS, 8) != 0)
+               goto fail_free_array;
+
+       INIT_LIST_HEAD(&port->fwd_list);
+       list_add(&port->link, &port_fwds);
+
+       return port;
+
+fail_free_array:
+       kfree(port->fwd_array);
+fail_free_port:
+       kfree(port);
+fail:
+       return NULL;
+}
+
+
+/*
+ * Tear down a forwarding table created by netback_accel_init_fwd_port().
+ * The table must no longer hold any in-use entries.
+ */
+void netback_accel_shutdown_fwd_port(void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+
+       BUG_ON(fwd_priv == NULL);
+
+       /* Take the table off the list of ports */
+       BUG_ON(list_empty(&port_fwds));
+       list_del(&port->link);
+
+       /* Every MAC must have been removed before the port goes away */
+       BUG_ON(!list_empty(&port->fwd_list));
+
+       cuckoo_hash_destroy(&port->fwd_hash_table);
+       kfree(port->fwd_array);
+       kfree(port);
+}
+
+
+/*
+ * Initialise the (empty) list of per-port forwarding tables.
+ * Always returns 0.
+ */
+int netback_accel_init_fwd(void)
+{
+       INIT_LIST_HEAD(&port_fwds);
+       return 0;
+}
+
+
+/*
+ * Check at shutdown that every per-port forwarding table has already
+ * been removed from port_fwds.
+ */
+void netback_accel_shutdown_fwd(void)
+{
+       BUG_ON(!list_empty(&port_fwds));
+}
+
+
+/*
+ * Insert a MAC address into a port's forwarding table, associating it
+ * with the given context pointer.
+ *
+ * Returns 0 on success, -ENOMEM if there is no free slot, -EEXIST if
+ * the address is already in the table, or the error returned by
+ * cuckoo_hash_add().
+ */
+int netback_accel_fwd_add(const __u8 *mac, void *context, void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+       cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+       struct fwd_struct *entry;
+       unsigned long flags;
+       int slot;
+       int rc = 0;
+
+       BUG_ON(fwd_priv == NULL);
+
+       DPRINTK("Adding mac %pM\n", mac);
+
+       spin_lock_irqsave(&port->fwd_lock, flags);
+
+       slot = fwd_find_free_entry(port);
+       if (slot < 0) {
+               spin_unlock_irqrestore(&port->fwd_lock, flags);
+               return slot;
+       }
+
+       /* Shouldn't already be in the table */
+       if (cuckoo_hash_lookup(&port->fwd_hash_table,
+                              (cuckoo_hash_key *)(&key), &rc) != 0) {
+               spin_unlock_irqrestore(&port->fwd_lock, flags);
+               EPRINTK("MAC address %pM already accelerated.\n", mac);
+               return -EEXIST;
+       }
+
+       rc = cuckoo_hash_add(&port->fwd_hash_table,
+                            (cuckoo_hash_key *)(&key), slot, 1);
+       if (rc == 0) {
+               /* Hash insert worked, so claim and fill in the slot */
+               entry = &port->fwd_array[slot];
+               entry->valid = 1;
+               entry->context = context;
+               memcpy(entry->mac, mac, ETH_ALEN);
+               list_add(&entry->link, &port->fwd_list);
+               NETBACK_ACCEL_STATS_OP(global_stats.num_fwds++);
+       }
+
+       spin_unlock_irqrestore(&port->fwd_lock, flags);
+
+       /*
+        * The frontend is deliberately not told that this address is
+        * local: it should discover that for itself from packets on the
+        * fastpath, and sharing a server does not make an address local
+        * (it could be on a different bridge).
+        */
+
+       return rc;
+}
+
+
+/*
+ * Remove a MAC address from a port's forwarding table.  Does nothing
+ * (beyond taking the lock) if the address is not present.
+ */
+void netback_accel_fwd_remove(const __u8 *mac, void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+       cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac);
+       struct fwd_struct *entry;
+       unsigned long flags;
+
+       DPRINTK("Removing mac %pM\n", mac);
+
+       BUG_ON(fwd_priv == NULL);
+
+       spin_lock_irqsave(&port->fwd_lock, flags);
+
+       entry = fwd_find_entry(mac, port);
+       if (entry == NULL)
+               goto unlock;
+
+       BUG_ON(list_empty(&port->fwd_list));
+       list_del(&entry->link);
+
+       /* Release the slot and drop the hash mapping that pointed at it */
+       entry->valid = 0;
+       cuckoo_hash_remove(&port->fwd_hash_table,
+                          (cuckoo_hash_key *)(&key));
+       NETBACK_ACCEL_STATS_OP(global_stats.num_fwds--);
+
+unlock:
+       spin_unlock_irqrestore(&port->fwd_lock, flags);
+
+       /*
+        * The frontend is not told the address has gone: it currently
+        * only cares about remote addresses and works those out (mostly)
+        * by itself.
+        */
+}
+
+
+/*
+ * Replace the context pointer stored for a MAC address.  Returns 0 if
+ * the address was found, -ENOENT otherwise.
+ */
+int netback_accel_fwd_set_context(const __u8 *mac, void *context,
+                                 void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+       struct fwd_struct *entry;
+       unsigned long flags;
+       int rc;
+
+       BUG_ON(fwd_priv == NULL);
+
+       spin_lock_irqsave(&port->fwd_lock, flags);
+       entry = fwd_find_entry(mac, port);
+       if (entry == NULL) {
+               rc = -ENOENT;
+       } else {
+               entry->context = context;
+               rc = 0;
+       }
+       spin_unlock_irqrestore(&port->fwd_lock, flags);
+
+       return rc;
+}
+
+
+/**************************************************************************
+ * Process a received packet
+ **************************************************************************/
+
+/*
+ * Look up the address at skb->mac.raw in the forward table and return
+ * the context stored for it, or NULL if there is no match (the stored
+ * context may itself be NULL).  Must be called with the appropriate
+ * fwd_lock already held.
+ */
+static struct netback_accel *for_a_vnic(struct netback_pkt_buf *skb,
+                                       struct port_fwd *fwd_set)
+{
+       struct fwd_struct *entry = fwd_find_entry(skb->mac.raw, fwd_set);
+
+       if (entry == NULL)
+               return NULL;
+
+       return entry->context;
+}
+
+
+/*
+ * Return non-zero if the skb is marked as carrying ARP and its ARP
+ * opcode is "reply".
+ *
+ * skb->protocol and ar_op are kept in network byte order, so the
+ * host-order constants are converted with htons() before comparing.
+ */
+static inline int packet_is_arp_reply(struct sk_buff *skb)
+{
+       return skb->protocol == htons(ETH_P_ARP)
+               && arp_hdr(skb)->ar_op == htons(ARPOP_REPLY);
+}
+
+
+/*
+ * Fill in a filter spec from a packet's headers: IP protocol,
+ * destination IP address, the Ethernet header's h_source address and
+ * the destination port of the transport header.  Anything that is not
+ * TCP is read as UDP, with an error print if it is not actually UDP.
+ */
+static inline void hdr_to_filt(struct ethhdr *ethhdr, struct iphdr *ip,
+                              struct netback_accel_filter_spec *spec)
+{
+       /* The transport header starts ihl 32-bit words after the IP header */
+       char *l4_hdr = (char *)ip + 4 * ip->ihl;
+
+       spec->proto = ip->protocol;
+       spec->destip_be = ip->daddr;
+       memcpy(spec->mac, ethhdr->h_source, ETH_ALEN);
+
+       if (ip->protocol == IPPROTO_TCP) {
+               spec->destport_be = ((struct tcphdr *)l4_hdr)->dest;
+               return;
+       }
+
+       EPRINTK_ON(ip->protocol != IPPROTO_UDP);
+       spec->destport_be = ((struct udphdr *)l4_hdr)->dest;
+}
+
+
+/* Return 1 if the packet is TCP or UDP over IPv4, 0 otherwise */
+static inline int netback_accel_can_filter(struct netback_pkt_buf *skb)
+{
+       /* Only look at the IP header once we know there is one */
+       if (skb->protocol != htons(ETH_P_IP))
+               return 0;
+
+       return skb->nh.iph->protocol == IPPROTO_TCP ||
+               skb->nh.iph->protocol == IPPROTO_UDP;
+}
+
+
+/*
+ * Build a filter spec from the packet's Ethernet and IP headers and
+ * hand it to netback_accel_filter_check_add() for this backend.
+ */
+static inline void netback_accel_filter_packet(struct netback_accel *bend,
+                                              struct netback_pkt_buf *skb)
+{
+       struct netback_accel_filter_spec spec;
+
+       hdr_to_filt((struct ethhdr *)(skb->mac.raw), skb->nh.iph, &spec);
+       netback_accel_filter_check_add(bend, &spec);
+}
+
+
+/*
+ * Inspect a received packet.  Broadcast and multicast frames are left
+ * to the slow path untouched.  For a unicast frame whose address at
+ * skb->mac.raw has a non-NULL context in the forward table, and which
+ * is TCP or UDP over IPv4, a filter is requested for the owning
+ * backend.  Nothing is returned.  This is verging on solarflare
+ * specific.
+ */
+void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+       struct netback_accel *bend;
+       unsigned long flags;
+
+       BUG_ON(fwd_priv == NULL);
+
+       /* Checking for bcast is cheaper so do that first */
+       if (is_broadcast_ether_addr(skb->mac.raw) ||
+           is_multicast_ether_addr(skb->mac.raw))
+               return;
+
+       /* It is unicast: insert a filter to pass it off to a VNIC */
+       spin_lock_irqsave(&port->fwd_lock, flags);
+       bend = for_a_vnic(skb, port);
+       if (bend != NULL && netback_accel_can_filter(skb))
+               netback_accel_filter_packet(bend, skb);
+       spin_unlock_irqrestore(&port->fwd_lock, flags);
+}
+
+
+/*
+ * Inspect a transmitted packet.  If it is a broadcast ARP reply (a
+ * gratuitous ARP), send a "new local mac" message carrying the frame's
+ * source address to every table entry that has a non-NULL context.
+ */
+void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv)
+{
+       struct port_fwd *port = fwd_priv;
+       struct fwd_struct *entry;
+       unsigned long flags;
+       __u8 *mac;
+
+       BUG_ON(fwd_priv == NULL);
+
+       if (!is_broadcast_ether_addr(skb_mac_header(skb)) ||
+           !packet_is_arp_reply(skb))
+               return;
+
+       /*
+        * Update our fast path forwarding to reflect this gratuitous
+        * ARP.  The source address follows the destination address in
+        * the Ethernet header.
+        */
+       mac = skb_mac_header(skb) + ETH_ALEN;
+
+       DPRINTK("%s: found gratuitous ARP for %pM\n",
+               __FUNCTION__, mac);
+
+       spin_lock_irqsave(&port->fwd_lock, flags);
+       /*
+        * Might not be local, but let's tell them all it is, and they
+        * can restore the fastpath if they continue to get packets that
+        * way
+        */
+       list_for_each_entry(entry, &port->fwd_list, link) {
+               struct netback_accel *bend = entry->context;
+
+               if (bend == NULL)
+                       continue;
+               netback_accel_msg_tx_new_localmac(bend, mac);
+       }
+       spin_unlock_irqrestore(&port->fwd_lock, flags);
+}
diff --git a/drivers/xen/sfc_netback/accel_msg.c b/drivers/xen/sfc_netback/accel_msg.c

new file mode 100644 (file)

index 0000000..b8982a7
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_msg.c
@@ -0,0 +1,391 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+#include "accel_solarflare.h"
+
+/*
+ * Send a HELLO message, carrying the protocol version and the buffer
+ * page quota, to the front end to start things off.
+ */
+void netback_accel_msg_tx_hello(struct netback_accel *bend, unsigned version)
+{
+       unsigned long lock_state;
+       struct net_accel_msg *msg;
+
+       msg = net_accel_msg_start_send(bend->shared_page,
+                                      &bend->to_domU, &lock_state);
+
+       /* The queue _cannot_ be full, we're the first users. */
+       EPRINTK_ON(msg == NULL);
+       if (msg == NULL)
+               return;
+
+       net_accel_msg_init(msg, NET_ACCEL_MSG_HELLO);
+       msg->u.hello.version = version;
+       msg->u.hello.max_pages = bend->quotas.max_buf_pages;
+       VPRINTK("Sending hello to channel %d\n", bend->msg_channel);
+       net_accel_msg_complete_send_notify(bend->shared_page,
+                                          &bend->to_domU,
+                                          &lock_state,
+                                          bend->msg_channel_irq);
+}
+
+/*
+ * Send a LOCALMAC message of the given type (stored in the message's
+ * flags field) for the given MAC address to the vnic.  If the queue is
+ * full the message is dropped with an error print.
+ */
+static void netback_accel_msg_tx_localmac(struct netback_accel *bend,
+                                         int type, const void *mac)
+{
+       unsigned long lock_state;
+       struct net_accel_msg *msg;
+
+       BUG_ON(bend == NULL || mac == NULL);
+
+       VPRINTK("Sending local mac message: %pM\n", mac);
+
+       msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU,
+                                      &lock_state);
+       if (msg == NULL) {
+               /*
+                * TODO if this happens we may leave a domU
+                * fastpathing packets when they should be delivered
+                * locally.  Solution is get domU to timeout entries
+                * in its fastpath lookup table when it receives no RX
+                * traffic
+                */
+               EPRINTK("%s: saw full queue, may need ARP timer to recover\n",
+                       __FUNCTION__);
+               return;
+       }
+
+       net_accel_msg_init(msg, NET_ACCEL_MSG_LOCALMAC);
+       msg->u.localmac.flags = type;
+       memcpy(msg->u.localmac.mac, mac, ETH_ALEN);
+       net_accel_msg_complete_send_notify(bend->shared_page,
+                                          &bend->to_domU,
+                                          &lock_state,
+                                          bend->msg_channel_irq);
+}
+
+/*
+ * Send an add local mac message to vnic: a LOCALMAC message whose flags
+ * field is NET_ACCEL_MSG_ADD.  mac points at the ETH_ALEN-byte address
+ * to announce.
+ */
+void netback_accel_msg_tx_new_localmac(struct netback_accel *bend,
+                                      const void *mac)
+{
+       netback_accel_msg_tx_localmac(bend, NET_ACCEL_MSG_ADD, mac);
+}
+
+
+/*
+ * Handle a frontend request to map buffer pages.  The page count must
+ * be a power of two and no larger than NET_ACCEL_MSG_MAX_PAGE_REQ.
+ *
+ * The message is turned into its own reply in place: REPLY is always
+ * or-ed into msg->id, plus ERROR on failure.  Returns 0 on success,
+ * -EINVAL for a bad page count, or the negative error from
+ * netback_accel_add_buffers().
+ */
+static int netback_accel_msg_rx_buffer_map(struct netback_accel *bend,
+                                          struct net_accel_msg *msg)
+{
+       int log2_pages;
+       int rc = -EINVAL;
+
+       /* Can only allocate in power of two */
+       log2_pages = log2_ge(msg->u.mapbufs.pages, 0);
+       if (msg->u.mapbufs.pages != pow2(log2_pages)) {
+               EPRINTK("%s: Can only alloc bufs in power of 2 sizes (%d)\n",
+                       __FUNCTION__, msg->u.mapbufs.pages);
+               goto err_out;
+       }
+
+       /*
+        * Sanity.  Assumes NET_ACCEL_MSG_MAX_PAGE_REQ is same for
+        * both directions/domains
+        */
+       if (msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ) {
+               EPRINTK("%s: too many pages in a single message: %d %d\n",
+                       __FUNCTION__, msg->u.mapbufs.pages,
+                       NET_ACCEL_MSG_MAX_PAGE_REQ);
+               goto err_out;
+       }
+
+       rc = netback_accel_add_buffers(bend, msg->u.mapbufs.pages,
+                                      log2_pages, msg->u.mapbufs.grants,
+                                      &msg->u.mapbufs.buf);
+       if (rc < 0)
+               goto err_out;
+
+       msg->id |= NET_ACCEL_MSG_REPLY;
+
+       return 0;
+
+ err_out:
+       EPRINTK("%s: err_out\n", __FUNCTION__);
+       msg->id |= NET_ACCEL_MSG_ERROR | NET_ACCEL_MSG_REPLY;
+       return rc;
+}
+
+
+/*
+ * Hint from frontend that one of our filters is out of date.  Only
+ * messages with the REMOVE flag set are acted on; the filter described
+ * by the message is then removed.  Always returns 0.
+ */
+static int netback_accel_process_fastpath(struct netback_accel *bend,
+                                         struct net_accel_msg *msg)
+{
+       struct netback_accel_filter_spec spec;
+
+       if (!(msg->u.fastpath.flags & NET_ACCEL_MSG_REMOVE))
+               return 0;
+
+       /*
+        * Would be nice to BUG() this but would leave us
+        * vulnerable to naughty frontend
+        */
+       EPRINTK_ON(msg->u.fastpath.flags & NET_ACCEL_MSG_ADD);
+
+       memcpy(spec.mac, msg->u.fastpath.mac, ETH_ALEN);
+       spec.destport_be = msg->u.fastpath.port;
+       spec.destip_be = msg->u.fastpath.ip;
+       spec.proto = msg->u.fastpath.proto;
+
+       netback_accel_filter_remove_spec(bend, &spec);
+
+       return 0;
+}
+
+
+/*
+ * Flow control for message queues: set the QUEUEUNOTFULL flag in the
+ * shared page and, if it was not already set, signal the other end.
+ */
+inline void set_queue_not_full(struct netback_accel *bend)
+{
+       if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B,
+                            (unsigned long *)&bend->shared_page->aflags)) {
+               VPRINTK("queue not full bit already set, not signalling\n");
+               return;
+       }
+
+       notify_remote_via_irq(bend->msg_channel_irq);
+}
+
+
+/*
+ * Flow control for message queues: set the QUEUE0FULL flag in the
+ * shared page and, if it was not already set, signal the other end.
+ */
+inline void set_queue_full(struct netback_accel *bend)
+{
+       if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B,
+                            (unsigned long *)&bend->shared_page->aflags)) {
+               VPRINTK("queue full bit already set, not signalling\n");
+               return;
+       }
+
+       notify_remote_via_irq(bend->msg_channel_irq);
+}
+
+
+/*
+ * Publish the interface up/down state in the shared page, set the
+ * NETUPDOWN flag and, if the flag was not already set, signal the other
+ * end.
+ */
+void netback_accel_set_interface_state(struct netback_accel *bend, int up)
+{
+       bend->shared_page->net_dev_up = up;
+
+       if (test_and_set_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B,
+                            (unsigned long *)&bend->shared_page->aflags)) {
+               VPRINTK("interface up/down bit already set, not signalling\n");
+               return;
+       }
+
+       notify_remote_via_irq(bend->msg_channel_irq);
+}
+
+
+/*
+ * Decide whether a frontend protocol version that differs from ours is
+ * acceptable.  Currently every differing version is refused with
+ * -EPROTO: a newer one must be refused, and although we have discretion
+ * to accept an older one, for now we just reject it too.
+ */
+static int check_rx_hello_version(unsigned version)
+{
+       /* Should only happen if there's been a version mismatch */
+       BUG_ON(version == NET_ACCEL_MSG_VERSION);
+
+       if (version != NET_ACCEL_MSG_VERSION)
+               return -EPROTO;
+
+       return -EINVAL;
+}
+
+
+/*
+ * Dispatch one message received from the frontend, keyed on msg->id.
+ *
+ * Handles HELLO replies (success and error), MAPBUF requests and
+ * FASTPATH hints.  HELLO replies are only accepted while
+ * bend->hw_state is NETBACK_ACCEL_RES_NONE; MAPBUF and FASTPATH only
+ * once it is not.
+ *
+ * Returns 0 on success, -EPROTO for a message that arrives in the wrong
+ * state or has an unknown id, or the error from the handler that was
+ * called.
+ */
+static int process_rx_msg(struct netback_accel *bend,
+                         struct net_accel_msg *msg)
+{
+       int err = 0;
+                     
+       switch (msg->id) {
+       case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO:
+               /* Reply to a HELLO; mark ourselves as connected */
+               DPRINTK("got Hello reply, version %.8x\n",
+                       msg->u.hello.version);
+               
+               /*
+                * Check that we've not successfully done this
+                * already.  NB no check at the moment that this reply
+                * comes after we've actually sent a HELLO as that's
+                * not possible with the current code structure
+                */
+               if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
+                       return -EPROTO;
+
+               /* Store max_pages for accel_setup */
+               if (msg->u.hello.max_pages > bend->quotas.max_buf_pages) {
+                       EPRINTK("More pages than quota allows (%d > %d)\n",
+                               msg->u.hello.max_pages, 
+                               bend->quotas.max_buf_pages);
+                       /* Force it down to the quota */
+                       msg->u.hello.max_pages = bend->quotas.max_buf_pages;
+               }
+               bend->max_pages = msg->u.hello.max_pages;
+               
+               /* Set up the hardware visible to the other end */
+               err = bend->accel_setup(bend);
+               if (err) {
+                       /* This is fatal */
+                       DPRINTK("Hello gave accel_setup error %d\n", err);
+                       netback_accel_set_closing(bend);
+               } else {
+                       /*
+                        * Now add the context so that packet
+                        * forwarding will commence
+                        */
+                       netback_accel_fwd_set_context(bend->mac, bend, 
+                                                     bend->fwd_priv);
+               }
+               break;
+       case NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_ERROR:
+               EPRINTK("got Hello error, versions us:%.8x them:%.8x\n",
+                       NET_ACCEL_MSG_VERSION, msg->u.hello.version);
+
+               if (bend->hw_state != NETBACK_ACCEL_RES_NONE)
+                       return -EPROTO;
+
+               if (msg->u.hello.version != NET_ACCEL_MSG_VERSION) {
+                       /* Error is due to version mismatch */
+                       err = check_rx_hello_version(msg->u.hello.version);
+                       if (err == 0) {
+                               /*
+                                * It's OK to be compatible, send
+                                * another hello with compatible version
+                                */
+                               netback_accel_msg_tx_hello
+                                       (bend, msg->u.hello.version);
+                       } else {
+                               /*
+                                * Tell frontend that we're not going to
+                                * send another HELLO by going to Closing.
+                                */
+                               netback_accel_set_closing(bend);
+                       }
+               } 
+               break;
+       case NET_ACCEL_MSG_MAPBUF:
+               VPRINTK("Got mapped buffers request %d\n",
+                       msg->u.mapbufs.reqid);
+
+               if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
+                       return -EPROTO;
+
+               /*
+                * Frontend wants a buffer table entry for the
+                * supplied pages
+                */
+               err = netback_accel_msg_rx_buffer_map(bend, msg);
+               /*
+                * The handler has turned msg into its own reply (with
+                * the ERROR flag on failure), so it is sent back either
+                * way.
+                */
+               if (net_accel_msg_reply_notify(bend->shared_page,
+                                              bend->msg_channel_irq, 
+                                              &bend->to_domU, msg)) {
+                       /*
+                        * This is fatal as we can't tell the frontend
+                        * about the problem through the message
+                        * queue, and so would otherwise stalemate
+                        */
+                       netback_accel_set_closing(bend);
+               }
+               break;
+       case NET_ACCEL_MSG_FASTPATH:
+               DPRINTK("Got fastpath request\n");
+
+               if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
+                       return -EPROTO;
+
+               err = netback_accel_process_fastpath(bend, msg);
+               break;
+       default:
+               EPRINTK("Huh? Message code is %x\n", msg->id);
+               err = -EPROTO;
+               break;
+       }
+       return err;
+}
+
+
+/*
+ * Demultiplex an IRQ from the frontend driver.
+ *
+ * Runs with bend_mutex held: acknowledges the flow-control flags
+ * addressed to dom0, then drains the from_domU queue, passing each
+ * message to process_rx_msg() until the queue reports no more messages
+ * or a message fails.  If the frontend had flagged its queue as full,
+ * it is told afterwards that there may now be space.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+void netback_accel_msg_rx_handler(struct work_struct *arg)
+#else
+void netback_accel_msg_rx_handler(void *bend_void)
+#endif
+{
+       struct net_accel_msg msg;
+       int err;
+       int queue_was_full = 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       struct netback_accel *bend =
+               container_of(arg, struct netback_accel, handle_msg);
+#else
+       struct netback_accel *bend = (struct netback_accel *)bend_void;
+#endif
+
+       mutex_lock(&bend->bend_mutex);
+
+       /*
+        * This happens when the shared pages have been unmapped, but
+        * the workqueue not flushed yet
+        */
+       if (bend->shared_page == NULL)
+               goto done;
+
+       if ((bend->shared_page->aflags &
+            NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK) != 0) {
+               /* We've been told there may now be space. */
+               if (bend->shared_page->aflags &
+                   NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL)
+                       clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
+                                 (unsigned long *)&bend->shared_page->aflags);
+
+               /* Remember that the frontend's queue was full */
+               if (bend->shared_page->aflags &
+                   NET_ACCEL_MSG_AFLAGS_QUEUEUFULL) {
+                       clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B,
+                                 (unsigned long *)&bend->shared_page->aflags);
+                       queue_was_full = 1;
+               }
+       }
+
+       for (;;) {
+               err = net_accel_msg_recv(bend->shared_page,
+                                        &bend->from_domU, &msg);
+               if (err != 0)
+                       break;
+
+               err = process_rx_msg(bend, &msg);
+               if (err != 0) {
+                       EPRINTK("%s: Error %d\n", __FUNCTION__, err);
+                       break;
+               }
+       }
+
+       /* There will be space now if we can make any. */
+       if (queue_was_full)
+               set_queue_not_full(bend);
+ done:
+       mutex_unlock(&bend->bend_mutex);
+
+       return;
+}
diff --git a/drivers/xen/sfc_netback/accel_solarflare.c b/drivers/xen/sfc_netback/accel_solarflare.c

new file mode 100644 (file)

index 0000000..f5809a2
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_solarflare.c
@@ -0,0 +1,1292 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include "common.h"
+
+#include "accel.h"
+#include "accel_solarflare.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+
+#include "accel_cuckoo_hash.h"
+
+#include "ci/driver/resource/efx_vi.h"
+
+#include "ci/efrm/nic_table.h" 
+#include "ci/efhw/public.h"
+
+#include <xen/evtchn.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#include "driverlink_api.h"
+
+#define SF_XEN_RX_USR_BUF_SIZE 2048
+
+struct falcon_bend_accel_priv {        /* stored in bend->accel_hw_priv */
+       struct efx_vi_state *efx_vih;   /* filled in by efx_vi_alloc() */
+
+       /*! Array of pointers to dma_map state, used so VNIC can
+        *  request their removal in a single message
+        */
+       struct efx_vi_dma_map_state **dma_maps;
+       /*! Number of dma_maps entries in use, i.e. the next free index */
+       int dma_maps_index; 
+
+       /*! Serialises access to filters */
+       spinlock_t filter_lock;      
+       /*! Bitmap of which filters are free */
+       unsigned long free_filters;      
+       /*! Used for index normalisation */
+       u32 filter_idx_mask;            
+       struct netback_accel_filter_spec *fspecs; 
+       cuckoo_hash_table filter_hash_table;
+       /*! Grant references returned by net_accel_grant_page() */
+       u32 txdmaq_gnt;
+       u32 rxdmaq_gnt;
+       u32 doorbell_gnt;
+       u32 evq_rptr_gnt;       /* only granted for Falcon A hardware */
+       u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES];       /* one per evq page */
+       u32 evq_npages;         /* set to 1 << evq_order */
+};
+
+/* Forward declaration */
+static int netback_accel_filter_init(struct netback_accel *);
+static void netback_accel_filter_shutdown(struct netback_accel *);
+
+/**************************************************************************
+ * 
+ * Driverlink stuff
+ *
+ **************************************************************************/
+
+struct driverlink_port {               /* allocated by each bend_dl_probe() */
+       struct list_head link;          /* entry in the dl_ports list */
+       enum net_accel_hw_type type;    /* chosen from the silicon revision */
+       struct net_device *net_dev;
+       struct efx_dl_device *efx_dl_dev;       /* its ->priv points back here */
+       void *fwd_priv;         /* from netback_accel_init_fwd_port() */
+};
+
+static struct list_head dl_ports;      /* initialised in netback_accel_sf_init() */
+
+/* This mutex protects global state, such as the dl_ports list */
+DEFINE_MUTEX(accel_mutex);
+
+static int init_done = 0;      /* set once efx_dl_register_driver() returned 0 */
+
+/* The DL callbacks */
+
+
+#ifdef EFX_USE_FASTCALL
+static enum efx_veto fastcall
+#else
+static enum efx_veto
+#endif
+bend_dl_tx_packet(struct efx_dl_device *efx_dl_dev, struct sk_buff *skb)
+{
+       /* TX hook: feed skbs with a MAC header to the port's forwarding code */
+       struct driverlink_port *dl_port = efx_dl_dev->priv;
+
+       BUG_ON(dl_port == NULL);
+       NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
+
+       if (!skb_mac_header_was_set(skb)) {
+               DPRINTK("Ignoring packet with missing mac address\n");
+               NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_bad_packets++);
+       } else {
+               netback_accel_tx_packet(skb, dl_port->fwd_priv);
+       }
+       return EFX_ALLOW_PACKET;
+}
+
+/* Driverlink RX hook: wrap the raw frame and pass it to the forwarding code */
+#ifdef EFX_USE_FASTCALL
+static enum efx_veto fastcall
+#else
+static enum efx_veto
+#endif
+bend_dl_rx_packet(struct efx_dl_device *efx_dl_dev,
+                 const char *pkt_buf, int pkt_len)
+{
+       struct driverlink_port *dl_port = efx_dl_dev->priv;
+       const struct ethhdr *eth_hdr = (const struct ethhdr *)pkt_buf;
+       struct netback_pkt_buf pkt;
+
+       BUG_ON(dl_port == NULL);
+
+       /* The buffer starts with the Ethernet header; nh is ETH_HLEN in */
+       pkt.protocol = eth_hdr->h_proto;
+       pkt.mac.raw = (char *)pkt_buf;
+       pkt.nh.raw = (char *)pkt_buf + ETH_HLEN;
+
+       NETBACK_ACCEL_STATS_OP(global_stats.dl_rx_packets++);
+       netback_accel_rx_packet(&pkt, dl_port->fwd_priv);
+       return EFX_ALLOW_PACKET;
+}
+
+
+/* Per-packet hooks; bend_dl_probe() registers them for each driverlink device */
+struct efx_dl_callbacks bend_dl_callbacks =
+       {
+               .tx_packet = bend_dl_tx_packet, /* always allows the packet */
+               .rx_packet = bend_dl_rx_packet, /* always allows the packet */
+       };
+
+
+static struct netback_accel_hooks accel_hooks = {      /* for netback_connect_accelerator() */
+       THIS_MODULE,    /* NOTE(review): positional - order must match the struct */
+       &netback_accel_probe,
+       &netback_accel_remove
+};
+
+
+/* Driverlink probe: create port state, hook callbacks, connect to netback */
+static int bend_dl_probe(struct efx_dl_device *efx_dl_dev,
+                        const struct net_device *net_dev,
+                        const struct efx_dl_device_info *dev_info,
+                        const char* silicon_rev)
+{
+       int rc;
+       enum net_accel_hw_type type;
+       struct driverlink_port *port;
+
+       DPRINTK("%s: %s\n", __FUNCTION__, silicon_rev);
+       /* Only the three silicon revisions below are accepted */
+       if (strcmp(silicon_rev, "falcon/a1") == 0)
+               type = NET_ACCEL_MSG_HWTYPE_FALCON_A;
+       else if (strcmp(silicon_rev, "falcon/b0") == 0)
+               type = NET_ACCEL_MSG_HWTYPE_FALCON_B;
+       else if (strcmp(silicon_rev, "siena/a0") == 0)
+               type = NET_ACCEL_MSG_HWTYPE_SIENA_A;
+       else {
+               EPRINTK("%s: unsupported silicon %s\n", __FUNCTION__,
+                       silicon_rev);
+               rc = -EINVAL;
+               goto fail1;
+       }
+       /* Per-port state, linked both ways with the driverlink device */
+       port = kmalloc(sizeof(struct driverlink_port), GFP_KERNEL);
+       if (port == NULL) {
+               EPRINTK("%s: no memory for dl probe\n", __FUNCTION__);
+               rc = -ENOMEM;
+               goto fail1;
+       }
+
+       port->efx_dl_dev = efx_dl_dev;
+       efx_dl_dev->priv = port;
+
+       port->fwd_priv = netback_accel_init_fwd_port();
+       if (port->fwd_priv == NULL) {
+               EPRINTK("%s: failed to set up forwarding for port\n",
+                       __FUNCTION__);
+               rc = -ENOMEM;
+               goto fail2;
+       }
+
+       rc = efx_dl_register_callbacks(efx_dl_dev, &bend_dl_callbacks);
+       if (rc != 0) {
+               EPRINTK("%s: register_callbacks failed\n", __FUNCTION__);
+               goto fail3;
+       }
+
+       port->type = type;
+       port->net_dev = (struct net_device *)net_dev;
+       /* Publish the port so netback_accel_sf_hwtype() can find it by name */
+       mutex_lock(&accel_mutex);
+       list_add(&port->link, &dl_ports);
+       mutex_unlock(&accel_mutex);
+
+       rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
+                                        port->net_dev->name, &accel_hooks);
+
+       if (rc < 0) {
+               EPRINTK("Xen netback accelerator version mismatch\n");
+               goto fail4;
+       } else if (rc > 0) {
+               /*
+                * Future: could accept certain subsets of previous versions.
+                * NOTE(review): this positive rc is returned to our caller as-is
+                */
+               EPRINTK("Xen netback accelerator version mismatch\n");
+               goto fail4;
+       } 
+
+       return 0;
+
+ fail4:        /* undo list_add() and the callback registration */
+       mutex_lock(&accel_mutex);
+       list_del(&port->link);
+       mutex_unlock(&accel_mutex);
+
+       efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
+ fail3:        /* undo netback_accel_init_fwd_port() */
+       netback_accel_shutdown_fwd_port(port->fwd_priv);
+ fail2:        /* undo the port allocation and the ->priv link */
+       efx_dl_dev->priv = NULL;
+       kfree(port);
+ fail1:
+       return rc;
+}
+
+
+static void bend_dl_remove(struct efx_dl_device *efx_dl_dev)
+{
+       /*
+        * Driverlink remove: disconnect this port from netback, take it
+        * off dl_ports, then release everything bend_dl_probe() set up.
+        */
+       struct driverlink_port *dl_port;
+
+       DPRINTK("Unregistering driverlink callbacks.\n");
+
+       /* Disconnect and unlink while holding the lock that guards dl_ports */
+       mutex_lock(&accel_mutex);
+       dl_port = efx_dl_dev->priv;
+       BUG_ON(list_empty(&dl_ports));
+       BUG_ON(dl_port == NULL);
+       BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
+
+       netback_disconnect_accelerator(0, dl_port->net_dev->name);
+       list_del(&dl_port->link);
+       mutex_unlock(&accel_mutex);
+
+       /* Off the list now: finish the teardown without holding the lock */
+       efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
+       netback_accel_shutdown_fwd_port(dl_port->fwd_priv);
+
+       efx_dl_dev->priv = NULL;
+       kfree(dl_port);
+}
+
+
+static void bend_dl_reset_suspend(struct efx_dl_device *efx_dl_dev)
+{
+       struct driverlink_port *dl_port;
+
+       DPRINTK("Driverlink reset suspend.\n");
+
+       /* Detach from netback for the reset; the port itself is kept */
+       mutex_lock(&accel_mutex);
+       dl_port = efx_dl_dev->priv;
+       BUG_ON(list_empty(&dl_ports));
+       BUG_ON(dl_port == NULL);
+       BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
+
+       netback_disconnect_accelerator(0, dl_port->net_dev->name);
+       mutex_unlock(&accel_mutex);
+}
+
+
+static void bend_dl_reset_resume(struct efx_dl_device *efx_dl_dev, int ok)
+{
+       struct driverlink_port *dl_port;
+       int rc;
+
+       DPRINTK("Driverlink reset resume.\n");
+
+       /* Nothing is reconnected when the reset did not succeed */
+       if (!ok)
+               return;
+
+       dl_port = efx_dl_dev->priv;
+       BUG_ON(list_empty(&dl_ports));
+       BUG_ON(dl_port == NULL);
+       BUG_ON(dl_port->efx_dl_dev != efx_dl_dev);
+
+       /* Re-establish the connection dropped by bend_dl_reset_suspend() */
+       rc = netback_connect_accelerator(NETBACK_ACCEL_VERSION, 0,
+                                        dl_port->net_dev->name, &accel_hooks);
+       if (rc == 0)
+               return;
+
+       /* Could not reconnect: unlink the port and release it entirely */
+       EPRINTK("Xen netback accelerator version mismatch\n");
+       mutex_lock(&accel_mutex);
+       list_del(&dl_port->link);
+       mutex_unlock(&accel_mutex);
+       efx_dl_unregister_callbacks(efx_dl_dev, &bend_dl_callbacks);
+       netback_accel_shutdown_fwd_port(dl_port->fwd_priv);
+       efx_dl_dev->priv = NULL;
+       kfree(dl_port);
+}
+
+
+static struct efx_dl_driver bend_dl_driver =   /* see netback_accel_sf_init() */
+       {
+               .name = "SFC Xen backend",
+               .probe = bend_dl_probe,         /* creates a driverlink_port */
+               .remove = bend_dl_remove,       /* frees that driverlink_port */
+               .reset_suspend = bend_dl_reset_suspend, /* disconnects netback */
+               .reset_resume = bend_dl_reset_resume    /* reconnects if ok */
+       };
+
+
+int netback_accel_sf_init(void)
+{
+       struct efhw_nic *nic;
+       int nic_i, rc;
+
+       /* Register with driverlink; on success size every NIC's RX user buffers */
+       INIT_LIST_HEAD(&dl_ports);
+       rc = efx_dl_register_driver(&bend_dl_driver);
+       /* If we couldn't find the NET driver, give up; init_done is untouched */
+       if (rc == -ENOENT)
+               return rc;
+       if (rc != 0) {
+               init_done = 0;
+               return rc;
+       }
+
+       EFRM_FOR_EACH_NIC(nic_i, nic)
+               falcon_nic_set_rx_usr_buf_size(nic, SF_XEN_RX_USR_BUF_SIZE);
+       init_done = 1;
+       return 0;
+}
+
+
+void netback_accel_sf_shutdown(void)
+{
+       /* Only undo a registration that netback_accel_sf_init() completed */
+       if (init_done) {
+               DPRINTK("Unregistering driverlink driver\n");
+               /*
+                * Triggers removal callbacks for all the devices, which
+                * unregister their callbacks, disconnect from netfront, etc.
+                */
+               efx_dl_unregister_driver(&bend_dl_driver);
+       }
+}
+
+
+int netback_accel_sf_hwtype(struct netback_accel *bend)
+{
+       struct driverlink_port *cur;
+       int rc = -ENOENT;
+
+       /* Bind bend to the registered port whose net device is bend->nicname */
+       mutex_lock(&accel_mutex);
+       list_for_each_entry(cur, &dl_ports, link) {
+               if (strcmp(bend->nicname, cur->net_dev->name) != 0)
+                       continue;
+               bend->hw_type = cur->type;
+               bend->accel_setup = netback_accel_setup_vnic_hw;
+               bend->accel_shutdown = netback_accel_shutdown_vnic_hw;
+               bend->fwd_priv = cur->fwd_priv;
+               bend->net_dev = cur->net_dev;
+               rc = 0;
+               break;
+       }
+       mutex_unlock(&accel_mutex);
+       if (rc != 0) {
+               EPRINTK("Failed to identify backend device '%s' with a NIC\n",
+                       bend->nicname);
+       }
+       return rc;
+}
+
+
+/****************************************************************************
+ * Resource management code
+ ***************************************************************************/
+
+static int alloc_page_state(struct netback_accel *bend, int max_pages)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv;
+
+       /* Allocate accel_hw_priv and the per-page arrays: all or nothing */
+       if (max_pages < 0 || max_pages > bend->quotas.max_buf_pages) {
+               EPRINTK("%s: invalid max_pages: %d\n", __FUNCTION__, max_pages);
+               return -EINVAL;
+       }
+       accel_hw_priv = kzalloc(sizeof(*accel_hw_priv), GFP_KERNEL);
+       if (accel_hw_priv == NULL) {
+               EPRINTK("%s: no memory for accel_hw_priv\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+
+       accel_hw_priv->dma_maps = kcalloc(max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ,
+                                         sizeof(*accel_hw_priv->dma_maps), GFP_KERNEL);
+       if (accel_hw_priv->dma_maps == NULL) {
+               EPRINTK("%s: no memory for dma_maps\n", __FUNCTION__);
+               goto fail_priv;
+       }
+
+       bend->buffer_maps = kcalloc(max_pages, sizeof(struct vm_struct *),
+                                   GFP_KERNEL);
+       if (bend->buffer_maps == NULL) {
+               EPRINTK("%s: no memory for buffer_maps\n", __FUNCTION__);
+               goto fail_dma_maps;
+       }
+
+       bend->buffer_addrs = kcalloc(max_pages, sizeof(u64), GFP_KERNEL);
+       if (bend->buffer_addrs == NULL) {
+               EPRINTK("%s: no memory for buffer_addrs\n", __FUNCTION__);
+               goto fail_buffer_maps;
+       }
+       bend->accel_hw_priv = accel_hw_priv;
+       return 0;
+
+ fail_buffer_maps:
+       kfree(bend->buffer_maps);
+       bend->buffer_maps = NULL;       /* don't leave a dangling pointer */
+ fail_dma_maps:
+       kfree(accel_hw_priv->dma_maps);
+ fail_priv:
+       kfree(accel_hw_priv);
+       return -ENOMEM;
+}
+
+
+static int free_page_state(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+
+       DPRINTK("%s: %p\n", __FUNCTION__, bend);
+
+       /* Free what alloc_page_state() set up and clear the stale pointers */
+       if (accel_hw_priv) {
+               kfree(accel_hw_priv->dma_maps);
+               kfree(accel_hw_priv);
+               kfree(bend->buffer_maps);
+               kfree(bend->buffer_addrs);
+               bend->buffer_maps = NULL;
+               bend->buffer_addrs = NULL;
+               bend->accel_hw_priv = NULL;
+               bend->max_pages = 0;
+       }
+       return 0;
+}
+
+
+/* Event queue callback: count the event, then notify the front end */
+static void bend_evq_timeout(void *context, int is_timeout)
+{
+       struct netback_accel *bend = context;
+
+       if (!is_timeout) {
+               /* It's a wakeup event, used by Falcon */
+               VPRINTK("wakeup to %d\n", bend->net_channel);
+               NETBACK_ACCEL_STATS_OP(bend->stats.evq_wakeups++);
+       } else {
+               VPRINTK("timeout event to %d\n", bend->net_channel);
+               NETBACK_ACCEL_STATS_OP(bend->stats.evq_timeouts++);
+       }
+       /* Either way, pass the event on to the vnic front end driver */
+       notify_remote_via_irq(bend->net_channel_irq);
+}
+
+
+/*
+ * Allocate the page state and a VI for this backend and register the
+ * event queue callback used to notify the front end vnic driver
+ */
+static int ef_get_vnic(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *priv;
+       int rc;
+
+       BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_NONE);
+
+       /* Page related state; this also creates bend->accel_hw_priv */
+       rc = alloc_page_state(bend, bend->max_pages);
+       if (rc != 0) {
+               EPRINTK("Failed to allocate page state: %d\n", rc);
+               return rc;
+       }
+       priv = bend->accel_hw_priv;
+
+       rc = efx_vi_alloc(&priv->efx_vih, bend->net_dev->ifindex);
+       if (rc != 0) {
+               EPRINTK("%s: efx_vi_alloc failed %d\n", __FUNCTION__, rc);
+               goto undo_page_state;
+       }
+
+       rc = efx_vi_eventq_register_callback(priv->efx_vih, bend_evq_timeout,
+                                            bend);
+       if (rc != 0) {
+               EPRINTK("%s: register_callback failed %d\n", __FUNCTION__, rc);
+               goto undo_vi;
+       }
+
+       bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
+       return 0;
+
+ undo_vi:
+       efx_vi_free(priv->efx_vih);
+ undo_page_state:
+       free_page_state(bend);
+       return rc;
+}
+
+
+static void ef_free_vnic(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
+
+       /* Inverse of ef_get_vnic(): callback first, then VI, then page state */
+       BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
+       efx_vi_eventq_kill_callback(priv->efx_vih);
+
+       DPRINTK("Hardware is freeable. Will proceed.\n");
+       efx_vi_free(priv->efx_vih);
+       priv->efx_vih = NULL;
+
+       /* This also frees priv itself, so it must not be used afterwards */
+       VPRINTK("Free page state...\n");
+       free_page_state(bend);
+
+       bend->hw_state = NETBACK_ACCEL_RES_NONE;
+}
+
+/* Revoke a grant; if that fails with -EBUSY, shut down the remote domain */
+static inline void ungrant_or_crash(grant_ref_t gntref, int domain) {
+       if (net_accel_ungrant_page(gntref) == -EBUSY)
+               net_accel_shutdown_remote(domain);
+}
+
+
+static void netback_accel_release_hwinfo(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
+       int page;
+
+       /* Revoke every grant made for the far end, then drop to RES_FILTER */
+       DPRINTK("Remove dma q grants %d %d\n", priv->txdmaq_gnt,
+               priv->rxdmaq_gnt);
+       ungrant_or_crash(priv->txdmaq_gnt, bend->far_end);
+       ungrant_or_crash(priv->rxdmaq_gnt, bend->far_end);
+
+       DPRINTK("Remove doorbell grant %d\n", priv->doorbell_gnt);
+       ungrant_or_crash(priv->doorbell_gnt, bend->far_end);
+
+       /* The event queue read pointer page is only released on Falcon A */
+       if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_A) {
+               DPRINTK("Remove rptr grant %d\n", priv->evq_rptr_gnt);
+               ungrant_or_crash(priv->evq_rptr_gnt, bend->far_end);
+       }
+
+       for (page = 0; page < priv->evq_npages; page++) {
+               DPRINTK("Remove evq grant %d\n", priv->evq_mem_gnts[page]);
+               ungrant_or_crash(priv->evq_mem_gnts[page], bend->far_end);
+       }
+
+       bend->hw_state = NETBACK_ACCEL_RES_FILTER;
+}
+
+/* Describe the VI's resources in *hwinfo and grant its pages to the far end */
+static int ef_bend_hwinfo_falcon_common(struct netback_accel *bend, 
+                                       struct net_accel_hw_falcon_b *hwinfo)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+       struct efx_vi_hw_resource_metadata res_mdata;
+       struct efx_vi_hw_resource res_array[EFX_VI_HW_RESOURCE_MAXSIZE];
+       int rc, len = EFX_VI_HW_RESOURCE_MAXSIZE, i, pfn = 0;
+       unsigned long txdmaq_pfn = 0, rxdmaq_pfn = 0;
+
+       rc = efx_vi_hw_resource_get_phys(accel_hw_priv->efx_vih, &res_mdata,
+                                        res_array, &len);
+       if (rc != 0) {
+               DPRINTK("%s: resource_get_phys returned %d\n",
+                       __FUNCTION__, rc);
+               return rc;
+       }
+       /* Copy the NIC identity and queue geometry into the message */
+       hwinfo->nic_arch = res_mdata.nic_arch;
+       hwinfo->nic_variant = res_mdata.nic_variant;
+       hwinfo->nic_revision = res_mdata.nic_revision;
+
+       hwinfo->evq_order = res_mdata.evq_order;
+       hwinfo->evq_offs = res_mdata.evq_offs;
+       hwinfo->evq_capacity = res_mdata.evq_capacity;
+       hwinfo->instance = res_mdata.instance;
+       hwinfo->rx_capacity = res_mdata.rx_capacity;
+       hwinfo->tx_capacity = res_mdata.tx_capacity;
+
+       VPRINTK("evq_order %d evq_offs %d evq_cap %d inst %d rx_cap %d tx_cap %d\n",
+               hwinfo->evq_order, hwinfo->evq_offs, hwinfo->evq_capacity,
+               hwinfo->instance, hwinfo->rx_capacity, hwinfo->tx_capacity);
+       /* Pick the pages to share out of the first len resource entries */
+       for (i = 0; i < len; i++) {
+               struct efx_vi_hw_resource *res = &(res_array[i]);
+               switch (res->type) {
+               case EFX_VI_HW_RESOURCE_TXDMAQ:
+                       txdmaq_pfn = page_to_pfn(virt_to_page(res->address));
+                       break;
+               case EFX_VI_HW_RESOURCE_RXDMAQ: 
+                       rxdmaq_pfn = page_to_pfn(virt_to_page(res->address));
+                       break;
+               case EFX_VI_HW_RESOURCE_EVQTIMER:       /* not passed on */
+                       break;
+               case EFX_VI_HW_RESOURCE_EVQRPTR:
+               case EFX_VI_HW_RESOURCE_EVQRPTR_OFFSET:
+                       hwinfo->evq_rptr = res->address;
+                       break;
+               case EFX_VI_HW_RESOURCE_EVQMEMKVA: 
+                       accel_hw_priv->evq_npages =  1 << res_mdata.evq_order;
+                       pfn = page_to_pfn(virt_to_page(res->address));
+                       break;
+               case EFX_VI_HW_RESOURCE_BELLPAGE:
+                       hwinfo->doorbell_mfn  = res->address;
+                       break;
+               default:
+                       EPRINTK("%s: Unknown hardware resource type %d\n",
+                               __FUNCTION__, res->type);
+                       break;
+               }
+       }
+       /* NOTE(review): the pfns below stay 0 if their resource was not listed */
+       VPRINTK("Passing txdmaq page pfn %lx\n", txdmaq_pfn);
+       rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(txdmaq_pfn), 0);
+       if (rc < 0)
+               goto fail0;
+       accel_hw_priv->txdmaq_gnt = hwinfo->txdmaq_gnt = rc;
+
+       VPRINTK("Passing rxdmaq page pfn %lx\n", rxdmaq_pfn);
+       rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(rxdmaq_pfn), 0);
+       if (rc < 0)
+               goto fail1;
+       accel_hw_priv->rxdmaq_gnt = hwinfo->rxdmaq_gnt = rc;
+
+       VPRINTK("Passing doorbell page mfn %x\n", hwinfo->doorbell_mfn);
+       /* Make the relevant H/W pages mappable by the far end */
+       rc = net_accel_grant_page(bend->hdev_data, hwinfo->doorbell_mfn, 1);
+       if (rc < 0)
+               goto fail2;
+       accel_hw_priv->doorbell_gnt = hwinfo->doorbell_gnt = rc;
+       /* NOTE(review): evq_npages is not checked against evq_mem_gnts[] size */
+       /* Now do the same for the memory pages */
+       /* Convert the page + length we got back for the evq to grants. */
+       for (i = 0; i < accel_hw_priv->evq_npages; i++) {
+               rc = net_accel_grant_page(bend->hdev_data, pfn_to_mfn(pfn), 0);
+               if (rc < 0)
+                       goto fail3;
+               accel_hw_priv->evq_mem_gnts[i] = hwinfo->evq_mem_gnts[i] = rc;
+
+               VPRINTK("Got grant %u for evq pfn %x\n", hwinfo->evq_mem_gnts[i], 
+                       pfn);
+               pfn++;
+       }
+
+       return 0;
+
+ fail3:        /* i is the evq page that failed: revoke the ones before it */
+       for (i = i - 1; i >= 0; i--) {
+               ungrant_or_crash(accel_hw_priv->evq_mem_gnts[i], bend->far_end);
+       }
+       ungrant_or_crash(accel_hw_priv->doorbell_gnt, bend->far_end);
+ fail2:
+       ungrant_or_crash(accel_hw_priv->rxdmaq_gnt, bend->far_end);
+ fail1:
+       ungrant_or_crash(accel_hw_priv->txdmaq_gnt, bend->far_end);     
+ fail0:
+       return rc;
+}
+
+
+static int ef_bend_hwinfo_falcon_a(struct netback_accel *bend, 
+                                  struct net_accel_hw_falcon_a *hwinfo)
+{
+       struct falcon_bend_accel_priv *priv = bend->accel_hw_priv;
+       int rc, page;
+
+       /* Everything but the event queue read pointer is done by common code */
+       rc = ef_bend_hwinfo_falcon_common(bend, &hwinfo->common);
+       if (rc != 0)
+               return rc;
+
+       /*
+        * Unlike the grants made by the common code, where the message
+        * field is the page number, evq_rptr is the entire address
+        * (currently a pointer into the densely mapped timer page).
+        */
+       VPRINTK("Passing evq_rptr pfn %x for rptr %x\n", 
+               hwinfo->common.evq_rptr >> PAGE_SHIFT,
+               hwinfo->common.evq_rptr);
+       rc = net_accel_grant_page(bend->hdev_data, 
+                                 hwinfo->common.evq_rptr >> PAGE_SHIFT, 0);
+       if (rc >= 0) {
+               priv->evq_rptr_gnt = hwinfo->evq_rptr_gnt = rc;
+               VPRINTK("evq_rptr_gnt got %d\n", hwinfo->evq_rptr_gnt);
+               return 0;
+       }
+
+       /* Undo ef_bend_hwinfo_falcon_common() */
+       ungrant_or_crash(priv->txdmaq_gnt, bend->far_end);
+       ungrant_or_crash(priv->rxdmaq_gnt, bend->far_end);
+       ungrant_or_crash(priv->doorbell_gnt, bend->far_end);
+       for (page = 0; page < priv->evq_npages; page++)
+               ungrant_or_crash(priv->evq_mem_gnts[page], bend->far_end);
+
+       return rc;
+}
+
+/* Falcon B (also used for Siena A) needs only the common resource set-up */
+static int ef_bend_hwinfo_falcon_b(struct netback_accel *bend, 
+                                  struct net_accel_hw_falcon_b *hwinfo)
+{
+       return ef_bend_hwinfo_falcon_common(bend, hwinfo);
+}
+
+
+/*
+ * Describe the hardware resources to the front end: fill in the message
+ * according to the H/W type and, on success, advance to RES_HWINFO
+ */
+static int netback_accel_hwinfo(struct netback_accel *bend, 
+                               struct net_accel_msg_hw *msgvi)
+{
+       int rc = 0;
+
+       BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
+       msgvi->type = bend->hw_type;
+
+       if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_A) {
+               rc = ef_bend_hwinfo_falcon_a(bend, &msgvi->resources.falcon_a);
+       } else if (bend->hw_type == NET_ACCEL_MSG_HWTYPE_FALCON_B ||
+                  bend->hw_type == NET_ACCEL_MSG_HWTYPE_SIENA_A) {
+               rc = ef_bend_hwinfo_falcon_b(bend, &msgvi->resources.falcon_b);
+       }
+
+       /*
+        * NET_ACCEL_MSG_HWTYPE_NONE (and anything else) has nothing
+        * to do here. The slow path should just work.
+        */
+
+       if (rc != 0)
+               return rc;
+
+       bend->hw_state = NETBACK_ACCEL_RES_HWINFO;
+       return 0;
+}
+
+
+/*
+ * Allocate hardware resources and make them available to the client
+ * domain.  On failure, everything set up so far is released again.
+ */
+int netback_accel_setup_vnic_hw(struct netback_accel *bend)
+{
+       struct net_accel_msg msg;
+       int err;
+
+       /* Allocate the event queue, VI and so on. */
+       err = ef_get_vnic(bend);
+       if (err) {
+               EPRINTK("Failed to allocate hardware resource for bend: "
+                       "error %d\n", err);
+               return err;
+       }
+
+       /* Set up the filter management */
+       err = netback_accel_filter_init(bend);
+       if (err) {
+               EPRINTK("Filter setup failed, error %d\n", err);
+               goto fail_vnic;
+       }
+
+       net_accel_msg_init(&msg, NET_ACCEL_MSG_SETHW);
+       /*
+        * Extract the low-level hardware info we will actually pass to the
+        * other end, and set up the grants/ioremap permissions needed
+        */
+       err = netback_accel_hwinfo(bend, &msg.u.hw);
+       if (err != 0)
+               goto fail_filter;
+
+       /* Send the message, this is a reply to a hello-reply */
+       err = net_accel_msg_reply_notify(bend->shared_page, 
+                                        bend->msg_channel_irq, 
+                                        &bend->to_domU, &msg);
+       /*
+        * This is logically a reply, and we guarantee space for replies,
+        * but a misbehaving frontend could still make it fail: be tolerant
+        */
+       if (err != 0)
+               goto fail_hwinfo;
+
+       return 0;
+
+ fail_hwinfo:
+       netback_accel_release_hwinfo(bend);
+ fail_filter:
+       netback_accel_filter_shutdown(bend);
+ fail_vnic:
+       ef_free_vnic(bend);
+       return err;
+}
+
+
+/* Free hardware resources, unwinding from whichever stage set-up reached */
+void netback_accel_shutdown_vnic_hw(struct netback_accel *bend)
+{
+       /*
+        * hw_state records how far set-up got; enter the switch there and
+        * fall through the later cases to undo each stage in reverse.
+        * RES_NONE ("null-op" acceleration) has nothing to release.
+        */
+       switch (bend->hw_state) {
+       case NETBACK_ACCEL_RES_HWINFO:
+               VPRINTK("Release hardware resources\n");
+               netback_accel_release_hwinfo(bend);
+               /* deliberate drop through */
+       case NETBACK_ACCEL_RES_FILTER:          
+               VPRINTK("Free filters...\n");
+               netback_accel_filter_shutdown(bend);
+               /* deliberate drop through */
+       case NETBACK_ACCEL_RES_ALLOC:
+               VPRINTK("Free vnic...\n");
+               ef_free_vnic(bend);
+               /* deliberate drop through */
+       case NETBACK_ACCEL_RES_NONE:
+               break;
+       default:
+               BUG();  /* hw_state holds none of the known stages */
+       }
+}
+
+/**************************************************************************
+ * 
+ * Buffer table stuff
+ *
+ **************************************************************************/
+
+/*
+ * Unmap the last i pages that netback_accel_add_buffers() mapped,
+ * newest first, for when it fails half way through.  Each step also
+ * moves bend->buffer_maps_index back by one.
+ */
+static inline void buffer_map_cleanup(struct netback_accel *bend, int i)
+{
+       for (; i > 0; i--) {
+               bend->buffer_maps_index--;
+               net_accel_unmap_device_page(bend->hdev_data, 
+                                           bend->buffer_maps[bend->buffer_maps_index],
+                                           bend->buffer_addrs[bend->buffer_maps_index]);
+       }
+}
+
+/*
+ * Map the granted pages into the VI's DMA space and report the buffer
+ * address in *buf_addr_out.  Mappings made here are undone on failure.
+ */
+int netback_accel_add_buffers(struct netback_accel *bend, int pages, int log2_pages,
+                             u32 *grants, u32 *buf_addr_out)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+       unsigned long long addr_array[NET_ACCEL_MSG_MAX_PAGE_REQ];
+       int rc, i, index;
+       u64 dev_bus_addr;
+
+       /* Make sure we can't overflow the on-stack addr_array */
+       if (pages < 0 || pages > NET_ACCEL_MSG_MAX_PAGE_REQ) {
+               EPRINTK("%s: invalid page count: %d\n", __FUNCTION__, pages);
+               return -EINVAL;
+       }
+
+       /* Make sure we can't overflow the dma_maps array */
+       if (accel_hw_priv->dma_maps_index >= 
+           bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ) {
+               EPRINTK("%s: too many buffer table allocations: %d %d\n",
+                       __FUNCTION__, accel_hw_priv->dma_maps_index, 
+                       bend->max_pages / NET_ACCEL_MSG_MAX_PAGE_REQ);
+               return -EINVAL;
+       }
+
+       /* Make sure we can't overflow the buffer_maps array */
+       if (bend->buffer_maps_index + pages > bend->max_pages) {
+               EPRINTK("%s: too many pages mapped: %d + %d > %d\n", 
+                       __FUNCTION__, bend->buffer_maps_index,
+                       pages, bend->max_pages);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < pages; i++) {
+               VPRINTK("%s: mapping page %d\n", __FUNCTION__, i);
+               rc = net_accel_map_device_page
+                       (bend->hdev_data, grants[i],
+                        &bend->buffer_maps[bend->buffer_maps_index],
+                        &dev_bus_addr);
+               if (rc != 0) {
+                       EPRINTK("error in net_accel_map_device_page\n");
+                       buffer_map_cleanup(bend, i);
+                       return rc;
+               }
+               bend->buffer_addrs[bend->buffer_maps_index] = dev_bus_addr;
+               bend->buffer_maps_index++;
+               addr_array[i] = dev_bus_addr;
+       }
+
+       VPRINTK("%s: mapping dma addresses to vih %p\n", __FUNCTION__, 
+               accel_hw_priv->efx_vih);
+       index = accel_hw_priv->dma_maps_index;
+       rc = efx_vi_dma_map_addrs(accel_hw_priv->efx_vih, addr_array, pages,
+                                 &(accel_hw_priv->dma_maps[index]));
+       if (rc < 0) {
+               EPRINTK("error in dma_map_pages\n");
+               buffer_map_cleanup(bend, i);
+               return rc;
+       }
+
+       accel_hw_priv->dma_maps_index++;
+       NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages += pages);
+
+       *buf_addr_out = efx_vi_dma_get_map_addr(accel_hw_priv->efx_vih, 
+                                               accel_hw_priv->dma_maps[index]);
+       return 0;
+}
+
+
+/*
+ * Release every buffer mapping recorded for this backend: first the
+ * DMA mappings held in the hardware-private state, then the granted
+ * device pages.  Both lists are unwound newest-first.  Always returns 0.
+ */
+int netback_accel_remove_buffers(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *hw;
+       int slot;
+
+       /* Without accel_hw_priv having been set up there is nothing to free */
+       if (bend->hw_state == NETBACK_ACCEL_RES_NONE)
+               return 0;
+
+       hw = bend->accel_hw_priv;
+
+       efx_vi_reset(hw->efx_vih);
+
+       /* Undo the DMA mappings */
+       while (hw->dma_maps_index > 0) {
+               slot = --hw->dma_maps_index;
+               efx_vi_dma_unmap_addrs(hw->efx_vih, hw->dma_maps[slot]);
+       }
+
+       /* Undo the grant mappings of the individual pages */
+       while (bend->buffer_maps_index > 0) {
+               VPRINTK("Unmapping granted buffer %d\n",
+                       bend->buffer_maps_index);
+               slot = --bend->buffer_maps_index;
+               net_accel_unmap_device_page(bend->hdev_data,
+                                           bend->buffer_maps[slot],
+                                           bend->buffer_addrs[slot]);
+       }
+
+       NETBACK_ACCEL_STATS_OP(bend->stats.num_buffer_pages = 0);
+
+       return 0;
+}
+
+/**************************************************************************
+ * 
+ * Filter stuff
+ *
+ **************************************************************************/
+
+/*
+ * Set up the per-backend filter state: the lock, the hash table that
+ * maps (ip, port, proto) keys to filter slots, the array of filter
+ * specs and the bitmap of free slots.
+ *
+ * Must be called with bend->hw_state == NETBACK_ACCEL_RES_ALLOC; on
+ * success the state advances to NETBACK_ACCEL_RES_FILTER.
+ *
+ * Returns 0 on success, -EINVAL if quotas.max_filters cannot be
+ * represented in the free_filters bitmap, -ENOMEM if the spec array
+ * cannot be allocated, or the error from cuckoo_hash_init().
+ */
+static int netback_accel_filter_init(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+       int i, rc;
+
+       BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_ALLOC);
+
+       /*
+        * Each filter slot is tracked by one bit of free_filters, so the
+        * quota must fit in that word.  A quota below one would leave
+        * get_victim_filter() with no valid slot to return.
+        */
+       if (bend->quotas.max_filters < 1 ||
+           bend->quotas.max_filters >
+           (int)(8 * sizeof(accel_hw_priv->free_filters))) {
+               EPRINTK("%s: invalid max_filters %d\n", __FUNCTION__,
+                       bend->quotas.max_filters);
+               return -EINVAL;
+       }
+
+       spin_lock_init(&accel_hw_priv->filter_lock);
+
+       if ((rc = cuckoo_hash_init(&accel_hw_priv->filter_hash_table,
+                                  5 /* space for 32 filters */, 8)) != 0) {
+               EPRINTK("Failed to initialise filter hash table\n");
+               return rc;
+       }
+
+       /* kcalloc() zeroes the array and checks the size multiplication */
+       accel_hw_priv->fspecs = kcalloc(bend->quotas.max_filters,
+                                       sizeof(struct netback_accel_filter_spec),
+                                       GFP_KERNEL);
+
+       if (accel_hw_priv->fspecs == NULL) {
+               EPRINTK("No memory for filter specs.\n");
+               cuckoo_hash_destroy(&accel_hw_priv->filter_hash_table);
+               return -ENOMEM;
+       }
+
+       /*
+        * Mark every slot free.  The shift is done in unsigned long so
+        * that bit 31 and above are set correctly.
+        */
+       for (i = 0; i < bend->quotas.max_filters; i++) {
+               accel_hw_priv->free_filters |= (1UL << i);
+       }
+
+       /* Base mask on highest set bit in max_filters  */
+       accel_hw_priv->filter_idx_mask = (1 << fls(bend->quotas.max_filters)) - 1;
+       VPRINTK("filter setup: max is %x mask is %x\n",
+               bend->quotas.max_filters, accel_hw_priv->filter_idx_mask);
+
+       bend->hw_state = NETBACK_ACCEL_RES_FILTER;
+
+       return 0;
+}
+
+
+/* Fill in the hash-table key that identifies the given filter spec */
+static inline void make_filter_key(cuckoo_hash_ip_key *key,
+                                  struct netback_accel_filter_spec *filt)
+{
+       key->proto = filt->proto;
+       key->local_port = filt->destport_be;
+       key->local_ip = filt->destip_be;
+}
+
+
+/*
+ * Tear down the filter in slot 'filter' if that slot is in use: stop
+ * the hardware filter and remove its key from the hash table.  A slot
+ * whose bit is set in free_filters is left alone.
+ *
+ * The slot's bit in free_filters is not changed here; that is left to
+ * the caller.  The callers in this file hold filter_lock.  A used
+ * slot with no matching hash-table entry is treated as fatal (BUG).
+ */
+static inline
+void netback_accel_free_filter(struct falcon_bend_accel_priv *accel_hw_priv,
+                              int filter)
+{
+       cuckoo_hash_ip_key filter_key;
+
+       /* Test the bit in unsigned long so slots >= 31 are handled */
+       if (!(accel_hw_priv->free_filters & (1UL << filter))) {
+               efx_vi_filter_stop(accel_hw_priv->efx_vih,
+                                  accel_hw_priv->fspecs[filter].filter_handle);
+               make_filter_key(&filter_key, &(accel_hw_priv->fspecs[filter]));
+               if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+                                      (cuckoo_hash_key *)&filter_key)) {
+                       EPRINTK("%s: Couldn't find filter to remove from table\n",
+                               __FUNCTION__);
+                       BUG();
+               }
+       }
+}
+
+
+/*
+ * Undo netback_accel_filter_init(): stop every active filter, release
+ * the spec array and the hash table, and move hw_state back from
+ * NETBACK_ACCEL_RES_FILTER to NETBACK_ACCEL_RES_ALLOC.
+ */
+static void netback_accel_filter_shutdown(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *hw = bend->accel_hw_priv;
+       unsigned long irq_flags;
+       int slot;
+
+       BUG_ON(bend->hw_state != NETBACK_ACCEL_RES_FILTER);
+
+       spin_lock_irqsave(&hw->filter_lock, irq_flags);
+
+       BUG_ON(hw->fspecs == NULL);
+
+       /* Slots already marked free are skipped by the helper */
+       slot = 0;
+       while (slot < bend->quotas.max_filters) {
+               netback_accel_free_filter(hw, slot);
+               slot++;
+       }
+
+       kfree(hw->fspecs);
+       hw->fspecs = NULL;
+       hw->free_filters = 0;
+
+       cuckoo_hash_destroy(&hw->filter_hash_table);
+
+       spin_unlock_irqrestore(&hw->filter_lock, irq_flags);
+
+       bend->hw_state = NETBACK_ACCEL_RES_ALLOC;
+}
+
+
+/*! Pick the filter slot to evict when a new filter has to be inserted
+ *  and no slot is free.
+ */
+static unsigned get_victim_filter(struct netback_accel *bend)
+{
+       struct falcon_bend_accel_priv *hw = bend->accel_hw_priv;
+       unsigned tsc, victim;
+
+       /*
+        * Random replacement is very cheap and low on pathological
+        * worst cases; something cleverer could be done later.
+        */
+       rdtscl(tsc);
+
+       /*
+        * The bottom few bits are of doubtful quality, so discard
+        * them before masking.
+        */
+       victim = (tsc >> 4) & hw->filter_idx_mask;
+
+       /*
+        * The number of filters is not required to be a power of two,
+        * but after masking the value is within one subtraction of a
+        * valid index.
+        */
+       if (victim >= bend->quotas.max_filters)
+               victim -= bend->quotas.max_filters;
+
+       DPRINTK("backend %s->%d has no free filters. Filter %d will be evicted\n",
+               bend->nicname, bend->far_end, victim);
+       return victim;
+}
+
+
+/*
+ * Add a filter for the specified IP/port to the backend.
+ *
+ * If a filter with the same (ip, port, proto) key is already present
+ * nothing is changed.  Otherwise a free slot is used, or, when none is
+ * free, a randomly chosen existing filter is evicted and its slot
+ * reused.
+ *
+ * Returns the filter slot index on success.  Returns -1 if the filter
+ * already exists or the hardware insertion failed, and the error from
+ * cuckoo_hash_add() if the hash-table insertion failed.
+ */
+int
+netback_accel_filter_check_add(struct netback_accel *bend,
+                              struct netback_accel_filter_spec *filt)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+       struct netback_accel_filter_spec *fs;
+       unsigned filter_index;
+       unsigned long flags;
+       int rc, recycling = 0;
+       cuckoo_hash_ip_key filter_key, evict_key;
+
+       BUG_ON(filt->proto != IPPROTO_TCP && filt->proto != IPPROTO_UDP);
+
+       DPRINTK("Will add %s filter for dst ip %08x and dst port %d\n",
+               (filt->proto == IPPROTO_TCP) ? "TCP" : "UDP",
+               be32_to_cpu(filt->destip_be), be16_to_cpu(filt->destport_be));
+
+       spin_lock_irqsave(&accel_hw_priv->filter_lock, flags);
+       /*
+        * Check to see if we're already filtering this IP address and
+        * port. Happens if you insert a filter mid-stream as there
+        * are many packets backed up to be delivered to dom0 already
+        */
+       make_filter_key(&filter_key, filt);
+       if (cuckoo_hash_lookup(&accel_hw_priv->filter_hash_table,
+                              (cuckoo_hash_key *)(&filter_key),
+                              &filter_index)) {
+               DPRINTK("Found matching filter %d already in table\n",
+                       filter_index);
+               rc = -1;
+               goto out;
+       }
+
+       /* Choose a slot: the lowest free one, or a victim to recycle */
+       if (accel_hw_priv->free_filters == 0) {
+               filter_index = get_victim_filter(bend);
+               recycling = 1;
+       } else {
+               filter_index = __ffs(accel_hw_priv->free_filters);
+               clear_bit(filter_index, &accel_hw_priv->free_filters);
+       }
+
+       fs = &accel_hw_priv->fspecs[filter_index];
+
+       if (recycling) {
+               DPRINTK("Removing filter index %d handle %p\n", filter_index,
+                       fs->filter_handle);
+
+               if ((rc = efx_vi_filter_stop(accel_hw_priv->efx_vih,
+                                            fs->filter_handle)) != 0) {
+                       EPRINTK("Couldn't clear NIC filter table entry %d\n", rc);
+               }
+
+               make_filter_key(&evict_key, fs);
+               if (cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+                                      (cuckoo_hash_key *)&evict_key)) {
+                       EPRINTK("Couldn't find filter to remove from table\n");
+                       BUG();
+               }
+               NETBACK_ACCEL_STATS_OP(bend->stats.num_filters--);
+       }
+
+       /* Update the filter spec with new details */
+       *fs = *filt;
+
+       if ((rc = cuckoo_hash_add(&accel_hw_priv->filter_hash_table,
+                                 (cuckoo_hash_key *)&filter_key, filter_index,
+                                 1)) != 0) {
+               EPRINTK("Error (%d) adding filter to table\n", rc);
+               /*
+                * Hand the slot back.  The shift is done in unsigned
+                * long so that slot 31 and above set only their own bit.
+                */
+               accel_hw_priv->free_filters |= (1UL << filter_index);
+               goto out;
+       }
+
+       rc = efx_vi_filter(accel_hw_priv->efx_vih, filt->proto, filt->destip_be,
+                          filt->destport_be,
+                          (struct filter_resource_t **)&fs->filter_handle);
+
+       if (rc != 0) {
+               EPRINTK("Hardware filter insertion failed. Error %d\n", rc);
+               /* Undo both the slot allocation and the hash-table entry */
+               accel_hw_priv->free_filters |= (1UL << filter_index);
+               cuckoo_hash_remove(&accel_hw_priv->filter_hash_table,
+                                  (cuckoo_hash_key *)&filter_key);
+               rc = -1;
+               goto out;
+       }
+
+       NETBACK_ACCEL_STATS_OP(bend->stats.num_filters++);
+
+       VPRINTK("%s: success index %d handle %p\n", __FUNCTION__, filter_index,
+               fs->filter_handle);
+
+       rc = filter_index;
+ out:
+       spin_unlock_irqrestore(&accel_hw_priv->filter_lock, flags);
+       return rc;
+}
+
+
+/*
+ * Remove the filter held in slot filter_index and mark the slot free.
+ * The slot must currently be in use (BUG otherwise).  The caller in
+ * this file holds filter_lock.
+ */
+static void netback_accel_filter_remove(struct netback_accel *bend,
+                                       int filter_index)
+{
+       struct falcon_bend_accel_priv *accel_hw_priv = bend->accel_hw_priv;
+
+       /*
+        * Bit operations are done in unsigned long so that slot 31 and
+        * above touch only their own bit of free_filters.
+        */
+       BUG_ON(accel_hw_priv->free_filters & (1UL << filter_index));
+       netback_accel_free_filter(accel_hw_priv, filter_index);
+       accel_hw_priv->free_filters |= (1UL << filter_index);
+}
+
+
+/*
+ * Remove the filter matching the given spec (destination IP, port,
+ * protocol and MAC).  If no such filter is installed an error is
+ * logged and nothing else happens.
+ */
+void netback_accel_filter_remove_spec(struct netback_accel *bend,
+                                     struct netback_accel_filter_spec *filt)
+{
+       struct falcon_bend_accel_priv *hw = bend->accel_hw_priv;
+       struct netback_accel_filter_spec *installed;
+       cuckoo_hash_ip_key key;
+       unsigned long irq_flags;
+       unsigned slot;
+
+       switch (filt->proto) {
+       case IPPROTO_TCP:
+               DPRINTK("Remove TCP filter for dst ip %08x and dst port %d\n",
+                       be32_to_cpu(filt->destip_be),
+                       be16_to_cpu(filt->destport_be));
+               break;
+       case IPPROTO_UDP:
+               DPRINTK("Remove UDP filter for dst ip %08x and dst port %d\n",
+                       be32_to_cpu(filt->destip_be),
+                       be16_to_cpu(filt->destport_be));
+               break;
+       default:
+               /*
+                * An evil frontend could provoke this, so BUG() is not
+                * an option; it is harmless because the checks below
+                * should fail.
+                */
+               DPRINTK("Non-TCP/UDP filter dst ip %08x and dst port %d\n",
+                       be32_to_cpu(filt->destip_be),
+                       be16_to_cpu(filt->destport_be));
+               break;
+       }
+
+       spin_lock_irqsave(&hw->filter_lock, irq_flags);
+
+       make_filter_key(&key, filt);
+       if (!cuckoo_hash_lookup(&hw->filter_hash_table,
+                               (cuckoo_hash_key *)(&key), &slot)) {
+               EPRINTK("Couldn't find matching filter already in table\n");
+               goto out;
+       }
+
+       /* Compare the full spec in case the lookup hit a hash collision */
+       installed = &hw->fspecs[slot];
+       if (installed->destip_be != filt->destip_be ||
+           installed->destport_be != filt->destport_be ||
+           installed->proto != filt->proto ||
+           memcmp(installed->mac, filt->mac, ETH_ALEN)) {
+               EPRINTK("Entry in hash table does not match filter spec\n");
+               goto out;
+       }
+
+       netback_accel_filter_remove(bend, slot);
+
+ out:
+       spin_unlock_irqrestore(&hw->filter_lock, irq_flags);
+}
diff --git a/drivers/xen/sfc_netback/accel_solarflare.h b/drivers/xen/sfc_netback/accel_solarflare.h

new file mode 100644 (file)

index 0000000..84d2146
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_solarflare.h
@@ -0,0 +1,88 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_SOLARFLARE_H
+#define NETBACK_ACCEL_SOLARFLARE_H
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+
+#include "driverlink_api.h"
+
+#define MAX_NICS 5
+#define MAX_PORTS 2
+
+
+extern int netback_accel_sf_init(void);
+extern void netback_accel_sf_shutdown(void);
+extern int netback_accel_sf_hwtype(struct netback_accel *bend);
+
+extern int netback_accel_sf_char_init(void);
+extern void netback_accel_sf_char_shutdown(void);
+
+extern int netback_accel_setup_vnic_hw(struct netback_accel *bend);
+extern void netback_accel_shutdown_vnic_hw(struct netback_accel *bend);
+
+/*
+ * Map 'pages' granted pages (grant references in 'grants') for the
+ * backend and DMA-map them; the resulting map address is stored in
+ * *buf_addr_out.  Returns 0 on success, -EINVAL when a page or buffer
+ * table limit would be exceeded, or the error from the failing mapping
+ * call.
+ * NOTE(review): the role of log2_pages is not visible from the part of
+ * the implementation reviewed here - confirm against the definition.
+ */
+extern int netback_accel_add_buffers(struct netback_accel *bend, int pages, 
+                                    int log2_pages, u32 *grants,
+                                    u32 *buf_addr_out);
+/*
+ * Unmap every DMA mapping and granted page recorded for the backend.
+ * Does nothing if the hardware state was never set up.  Always
+ * returns 0.
+ */
+extern int netback_accel_remove_buffers(struct netback_accel *bend);
+
+
+/*
+ * Add a filter for the specified IP/port to the backend.  Returns the
+ * filter slot index on success; -1 if an identical filter already
+ * exists or the hardware insertion failed; otherwise the error from
+ * the hash-table insertion.
+ */
+extern int
+netback_accel_filter_check_add(struct netback_accel *bend, 
+                              struct netback_accel_filter_spec *filt);
+/*
+ * Remove a filter entry for the specific device and IP/port.
+ * NOTE(review): no definition named netback_accel_filter_remove_index
+ * appears in the part of accel_solarflare.c reviewed here (it has a
+ * static netback_accel_filter_remove instead) - confirm this
+ * declaration is still needed.
+ */
+extern
+void netback_accel_filter_remove_index(struct netback_accel *bend, 
+                                      int filter_index);
+/*
+ * Remove the filter whose destination IP, port, protocol and MAC all
+ * match 'filt'; logs an error and does nothing if there is no match.
+ */
+extern
+void netback_accel_filter_remove_spec(struct netback_accel *bend, 
+                                     struct netback_accel_filter_spec *filt);
+
+/*
+ * This is designed to look a bit like a skb: it carries pointers into
+ * a packet plus a protocol value, using skb-like member names.
+ */
+struct netback_pkt_buf {
+       /* presumably the start of the link-layer (MAC) header - confirm */
+       union {
+               unsigned char *raw;
+       } mac;
+       /*
+        * presumably the network-layer header, viewed as IP, ARP or raw
+        * bytes - confirm against the users of this struct
+        */
+       union {
+               struct iphdr  *iph;
+               struct arphdr *arph;
+               unsigned char *raw;
+       } nh;
+       /* NOTE(review): encoding/byte order of this value not visible here */
+       int protocol;
+};
+
+/*! \brief Handle a received packet: insert fast path filters as necessary
+ * \param skb The packet buffer
+ */
+extern void netback_accel_rx_packet(struct netback_pkt_buf *skb, void *fwd_priv);
+
+/*! \brief Handle a transmitted packet: update fast path filters as necessary
+ * \param skb The packet buffer
+ */
+extern void netback_accel_tx_packet(struct sk_buff *skb, void *fwd_priv);
+
+#endif /* NETBACK_ACCEL_SOLARFLARE_H */
diff --git a/drivers/xen/sfc_netback/accel_xenbus.c b/drivers/xen/sfc_netback/accel_xenbus.c

new file mode 100644 (file)

index 0000000..4fb82d8
--- /dev/null
+++ b/drivers/xen/sfc_netback/accel_xenbus.c
@@ -0,0 +1,831 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+
+/* drivers/xen/netback/common.h */
+#include "common.h"
+
+#include "accel.h"
+#include "accel_solarflare.h"
+#include "accel_util.h"
+
+#define NODENAME_PATH_FMT "backend/vif/%d/%d"
+
+/*
+ * Fetch the netback_accel hung off a xenbus device's backend_info.
+ * The whole expansion is parenthesised so that the result can be used
+ * safely inside a larger expression, e.g. MACRO(dev)->field.
+ */
+#define NETBACK_ACCEL_FROM_XENBUS_DEVICE(_dev) \
+       ((struct netback_accel *) \
+        ((struct backend_info *)dev_get_drvdata(&(_dev)->dev))->netback_accel_priv)
+
+/* List of all the bends currently in existence. */
+struct netback_accel *bend_list = NULL;
+DEFINE_MUTEX(bend_list_mutex);
+
+/*
+ * Push bend onto the head of the singly linked bend_list.
+ * Must hold bend_list_mutex.
+ */
+static void link_bend(struct netback_accel *bend)
+{
+       bend->next_bend = bend_list;
+       bend_list = bend;
+}
+
+/*
+ * Take bend out of bend_list; a bend that is not on the list is
+ * silently ignored.  Must hold bend_list_mutex.
+ */
+static void unlink_bend(struct netback_accel *bend)
+{
+       /* Address of the pointer that refers to the entry being examined */
+       struct netback_accel **link = &bend_list;
+
+       while (*link != NULL) {
+               if (*link == bend) {
+                       *link = bend->next_bend;
+                       return;
+               }
+               link = &(*link)->next_bend;
+       }
+}
+
+
+/*
+ * Message IRQ from the frontend driver: defer the real handling to
+ * the backend's handle_msg work item.
+ */
+static irqreturn_t msgirq_from_frontend(int irq, void *context)
+{
+       struct xenbus_device *xdev = context;
+       struct netback_accel *bend;
+
+       VPRINTK("irq %d from device %s\n", irq, xdev->nodename);
+
+       bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(xdev);
+       schedule_work(&bend->handle_msg);
+
+       return IRQ_HANDLED;
+}
+
+
+/*
+ * Handler for the "net" IRQ from the frontend driver.  It is never
+ * used functionally, but a handler is needed to pass to the bind
+ * function, and it may get called spuriously.  It only logs the event
+ * and reports it as handled.
+ */
+static irqreturn_t netirq_from_frontend(int irq, void *context)
+{
+       VPRINTK("netirq %d from device %s\n", irq,
+               ((struct xenbus_device *)context)->nodename);
+       
+       return IRQ_HANDLED;
+}
+
+
+/*
+ * Load the hardware quotas (filters, buffer pages, multicasts) from
+ * the "limits" keys under the device's xenbus node.  If they cannot be
+ * read, all three fall back to their defaults.
+ */
+static void cfg_hw_quotas(struct xenbus_device *dev,
+                         struct netback_accel *bend)
+{
+       int err;
+
+       err = xenbus_gather(XBT_NIL, dev->nodename,
+                           "limits/max-filters", "%d",
+                           &bend->quotas.max_filters,
+                           "limits/max-buf-pages", "%d",
+                           &bend->quotas.max_buf_pages,
+                           "limits/max-mcasts", "%d",
+                           &bend->quotas.max_mcasts,
+                           NULL);
+       if (!err)
+               return;
+
+       /*
+        * TODO: if the user set these earlier, they are overwritten
+        * with the defaults here.  That may not be what is wanted,
+        * but it is useful in the startup case.
+        */
+       DPRINTK("Failed to read quotas from xenbus, using defaults\n");
+       bend->quotas.max_filters = NETBACK_ACCEL_DEFAULT_MAX_FILTERS;
+       bend->quotas.max_buf_pages = sfc_netback_max_pages;
+       bend->quotas.max_mcasts = NETBACK_ACCEL_DEFAULT_MAX_MCASTS;
+}
+
+
+/*
+ * Xenbus watch callback for the backend's "limits" node: re-read the
+ * hardware quotas, unless the watch is no longer registered or the
+ * watched node has gone away.
+ */
+static void bend_config_accel_change(struct xenbus_watch *watch,
+                                    const char **vec, unsigned int len)
+{
+       struct netback_accel *bend =
+               container_of(watch, struct netback_accel, config_accel_watch);
+       struct xenbus_device *dev;
+
+       mutex_lock(&bend->bend_mutex);
+
+       if (bend->config_accel_watch.node != NULL) {
+               dev = (struct xenbus_device *)bend->hdev_data;
+               DPRINTK("Watch matched, got dev %p otherend %p\n",
+                       dev, dev->otherend);
+
+               if (xenbus_exists(XBT_NIL, watch->node, "")) {
+                       cfg_hw_quotas(dev, bend);
+               } else {
+                       DPRINTK("Ignoring watch as otherend seems invalid\n");
+               }
+       }
+
+       mutex_unlock(&bend->bend_mutex);
+}
+
+
+/*
+ * Register a watch on "limits" under the backend vif node so that we
+ * are told when the configuration has been set.  Returns 0 or the
+ * error from xenbus; on failure the watch node is cleared to NULL.
+ */
+static int setup_config_accel_watch(struct xenbus_device *dev,
+                                   struct netback_accel *bend)
+{
+       int rc;
+
+       VPRINTK("Setting watch on %s/%s\n", dev->nodename, "limits");
+
+       rc = xenbus_watch_path2(dev, dev->nodename, "limits",
+                               &bend->config_accel_watch,
+                               bend_config_accel_change);
+       if (!rc)
+               return 0;
+
+       EPRINTK("%s: Failed to register xenbus watch: %d\n",
+               __FUNCTION__, rc);
+       bend->config_accel_watch.node = NULL;
+       return rc;
+}
+
+
+/*
+ * Read the frontend's event channels and shared-memory grants from
+ * xenbus.  grants[0] receives the control page grant and grants[1]
+ * the message page grant.  Returns 0 or the xenbus_gather() error.
+ */
+static int
+cfg_frontend_info(struct xenbus_device *dev, struct netback_accel *bend,
+                 int *grants)
+{
+       int rc;
+
+       rc = xenbus_gather(XBT_NIL, dev->otherend,
+                          "accel-msg-channel", "%u", &bend->msg_channel,
+                          "accel-ctrl-page", "%d", &(grants[0]),
+                          "accel-msg-page", "%d", &(grants[1]),
+                          "accel-net-channel", "%u", &bend->net_channel,
+                          NULL);
+       if (rc) {
+               EPRINTK("failed to read event channels or shmem grant: %d\n",
+                       rc);
+               return rc;
+       }
+
+       DPRINTK("got event chan %d and net chan %d from frontend\n",
+               bend->msg_channel, bend->net_channel);
+       return 0;
+}
+
+
+/*
+ * Setup all the comms needed to chat with the front end driver: read
+ * the frontend's channels and grants, load the quotas, map the shared
+ * pages, initialise both message queues, bind the two event channels
+ * and reserve a slot in the forwarding table, then send HELLO.
+ *
+ * Returns 0 on success or a negative error.  On failure everything
+ * acquired here is released again and the corresponding fields of the
+ * backend (IRQ numbers, shared page pointers) are cleared, matching
+ * what cleanup_vnic() does, so no stale handles are left behind.
+ */
+static int setup_vnic(struct xenbus_device *dev)
+{
+       struct netback_accel *bend;
+       int grants[2], err, msgs_per_queue;
+
+       bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
+
+       err = cfg_frontend_info(dev, bend, grants);
+       if (err)
+               goto fail1;
+
+       /*
+        * If we get here, both frontend Connected and configuration
+        * options available.  All is well.
+        */
+
+       /* Get the hardware quotas for the VNIC in question.  */
+       cfg_hw_quotas(dev, bend);
+
+       /* Set up the deferred work handlers */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       INIT_WORK(&bend->handle_msg, 
+                 netback_accel_msg_rx_handler);
+#else
+       INIT_WORK(&bend->handle_msg, 
+                 netback_accel_msg_rx_handler,
+                 (void*)bend);
+#endif
+
+       /* Request the frontend mac */
+       err = net_accel_xen_net_read_mac(dev, bend->mac);
+       if (err)
+               goto fail2;
+
+       /* Set up the shared page. */
+       bend->shared_page = net_accel_map_grants_contig(dev, grants, 2, 
+                                                       &bend->sh_pages_unmap);
+
+       if (bend->shared_page == NULL) {
+               EPRINTK("failed to map shared page for %s\n", dev->otherend);
+               err = -ENOMEM;
+               goto fail2;
+       }
+
+       /* Initialise the shared page(s) used for comms */
+       net_accel_msg_init_page(bend->shared_page, PAGE_SIZE, 
+                               (bend->net_dev->flags & IFF_UP) && 
+                               (netif_carrier_ok(bend->net_dev)));
+
+       /* The second mapped page is split in half, one half per queue */
+       msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
+
+       net_accel_msg_init_queue
+               (&bend->to_domU, &bend->shared_page->queue0,
+                (struct net_accel_msg *)((__u8*)bend->shared_page + PAGE_SIZE),
+                msgs_per_queue);
+
+       net_accel_msg_init_queue
+               (&bend->from_domU, &bend->shared_page->queue1, 
+                (struct net_accel_msg *)((__u8*)bend->shared_page + 
+                                         (3 * PAGE_SIZE / 2)),
+                msgs_per_queue);
+
+       /* Bind the message event channel to a handler
+        *
+        * Note that we will probably get a spurious interrupt when we
+        * do this, so it must not be done until we have set up
+        * everything we need to handle it.
+        */
+       err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+                                                   bend->msg_channel,
+                                                   msgirq_from_frontend,
+                                                   0,
+                                                   "netback_accel",
+                                                   dev);
+       if (err < 0) {
+               EPRINTK("failed to bind event channel: %d\n", err);
+               goto fail3;
+       }
+       bend->msg_channel_irq = err;
+
+       /* TODO: No need to bind this evtchn to an irq. */
+       err = bind_interdomain_evtchn_to_irqhandler(dev->otherend_id,
+                                                   bend->net_channel,
+                                                   netirq_from_frontend,
+                                                   0,
+                                                   "netback_accel",
+                                                   dev);
+       if (err < 0) {
+               EPRINTK("failed to bind net channel: %d\n", err);
+               goto fail4;
+       }
+       bend->net_channel_irq = err;
+
+       /*
+        * Grab ourselves an entry in the forwarding hash table. We do
+        * this now so we don't have the embarrassment of sorting out
+        * an allocation failure while at IRQ. Because we pass NULL as
+        * the context, the actual hash lookup will succeed for this
+        * NIC, but the check for somewhere to forward to will
+        * fail. This is necessary to prevent forwarding before
+        * hardware resources are set up
+        */
+       err = netback_accel_fwd_add(bend->mac, NULL, bend->fwd_priv);
+       if (err) {
+               EPRINTK("failed to add to fwd hash table\n");
+               goto fail5;
+       }
+
+       /*
+        * Say hello to frontend.  Important to do this straight after
+        * obtaining the message queue as otherwise we are vulnerable
+        * to an evil frontend sending a HELLO-REPLY before we've sent
+        * the HELLO and confusing us
+        */
+       netback_accel_msg_tx_hello(bend, NET_ACCEL_MSG_VERSION);
+       return 0;
+
+       /*
+        * Unwind in reverse order of acquisition.  The IRQ fields are
+        * zeroed after unbinding because cleanup_vnic() treats a
+        * non-zero value as "still bound".
+        */
+ fail5:
+       unbind_from_irqhandler(bend->net_channel_irq, dev);
+       bend->net_channel_irq = 0;
+ fail4:
+       unbind_from_irqhandler(bend->msg_channel_irq, dev);
+       bend->msg_channel_irq = 0;
+ fail3:
+       net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap);
+       bend->shared_page = NULL;
+       bend->sh_pages_unmap = NULL;
+ fail2:
+ fail1:
+       return err;
+}
+
+
+/*
+ * Read the "accel" key under the device's xenbus node into
+ * bend->nicname; it names the interface used for acceleration.
+ *
+ * Returns 0 on success, in which case bend->nicname holds the string
+ * returned by xenbus_read().  On failure returns the negative error
+ * and leaves bend->nicname NULL rather than an ERR_PTR value, so a
+ * later NULL check or kfree() on the field is safe.
+ */
+static int read_nicname(struct xenbus_device *dev, struct netback_accel *bend)
+{
+       unsigned int len;
+       char *name;
+
+       /* nic name used to select interface used for acceleration */
+       name = xenbus_read(XBT_NIL, dev->nodename, "accel", &len);
+       if (IS_ERR(name)) {
+               bend->nicname = NULL;
+               return PTR_ERR(name);
+       }
+
+       bend->nicname = name;
+       return 0;
+}
+
+static const char *frontend_name = "sfc_netfront";
+
+/*
+ * Write the name of the frontend driver to "accel-frontend" under the
+ * device's xenbus node, retrying the transaction for as long as it
+ * ends with -EAGAIN.  Returns 0 or the first other error.
+ */
+static int publish_frontend_name(struct xenbus_device *dev)
+{
+       struct xenbus_transaction xbt;
+       int rc;
+
+       for (;;) {
+               rc = xenbus_transaction_start(&xbt);
+               if (rc != 0) {
+                       EPRINTK("%s: transaction start failed\n", __FUNCTION__);
+                       return rc;
+               }
+
+               rc = xenbus_printf(xbt, dev->nodename, "accel-frontend",
+                                  "%s", frontend_name);
+               if (rc != 0) {
+                       EPRINTK("%s: xenbus_printf failed\n", __FUNCTION__);
+                       /* Abort the transaction */
+                       xenbus_transaction_end(xbt, 1);
+                       return rc;
+               }
+
+               rc = xenbus_transaction_end(xbt, 0);
+               if (rc != -EAGAIN)
+                       break;
+       }
+
+       if (rc != 0)
+               EPRINTK("failed to end frontend name transaction\n");
+
+       return rc;
+}
+
+
+/*
+ * Remove "accel-frontend" from the device's xenbus node, retrying the
+ * transaction for as long as it ends with -EAGAIN.  Returns 0 or the
+ * first other error.
+ */
+static int unpublish_frontend_name(struct xenbus_device *dev)
+{
+       struct xenbus_transaction xbt;
+       int rc;
+
+       for (;;) {
+               rc = xenbus_transaction_start(&xbt);
+               if (rc != 0)
+                       return rc;
+
+               rc = xenbus_rm(xbt, dev->nodename, "accel-frontend");
+               if (rc != 0) {
+                       /* Abort the transaction */
+                       xenbus_transaction_end(xbt, 1);
+                       return rc;
+               }
+
+               rc = xenbus_transaction_end(xbt, 0);
+               if (rc != -EAGAIN)
+                       return rc;
+       }
+}
+
+
+/*
+ * Release what a vnic holds, in this order: forwarding table entry,
+ * buffer mappings, hardware resources (via the accel_shutdown hook,
+ * if one is set), the two event-channel IRQs and finally the shared
+ * page mapping.  Each handle is cleared once it has been released.
+ */
+static void cleanup_vnic(struct netback_accel *bend)
+{
+       struct xenbus_device *dev = (struct xenbus_device *)bend->hdev_data;
+
+       DPRINTK("%s: bend %p dev %p\n", __FUNCTION__, bend, dev);
+
+       DPRINTK("%s: Remove %p's mac from fwd table...\n",
+               __FUNCTION__, bend);
+       netback_accel_fwd_remove(bend->mac, bend->fwd_priv);
+
+       /* Free buffer table allocations */
+       netback_accel_remove_buffers(bend);
+
+       DPRINTK("%s: Release hardware resources...\n", __FUNCTION__);
+       if (bend->accel_shutdown != NULL)
+               bend->accel_shutdown(bend);
+
+       /* A zero IRQ field means that channel is not bound */
+       if (bend->net_channel_irq != 0) {
+               unbind_from_irqhandler(bend->net_channel_irq, dev);
+               bend->net_channel_irq = 0;
+       }
+
+       if (bend->msg_channel_irq != 0) {
+               unbind_from_irqhandler(bend->msg_channel_irq, dev);
+               bend->msg_channel_irq = 0;
+       }
+
+       if (bend->sh_pages_unmap != NULL) {
+               DPRINTK("%s: Unmap grants %p\n", __FUNCTION__,
+                       bend->sh_pages_unmap);
+               net_accel_unmap_grants_contig(dev, bend->sh_pages_unmap);
+               bend->sh_pages_unmap = NULL;
+               bend->shared_page = NULL;
+       }
+}
+
+
+/*************************************************************************/
+
+/*
+ * The following code handles accelstate changes between the frontend
+ * and the backend.  It calls setup_vnic and cleanup_vnic in matching
+ * pairs in response to transitions.
+ *
+ * Valid state transitions for Dom0 are as follows:
+ *
+ * Closed->Init       on probe or in response to Init from domU
+ * Closed->Closing    on error/remove
+ *
+ * Init->Connected    in response to Connected from domU
+ * Init->Closing      on error/remove or in response to Closing from domU
+ *
+ * Connected->Closing on error/remove or in response to Closing from domU
+ *
+ * Closing->Closed    in response to Closed from domU
+ *
+ */
+/* Advance the backend state machine in response to a new frontend state.
+ * bend_domu_accel_change() calls this with bend->bend_mutex held. */
+static void netback_accel_frontend_changed(struct xenbus_device *dev,
+                                          XenbusState frontend_state)
+{
+       struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
+       XenbusState backend_state;
+
+       DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
+               __FUNCTION__, xenbus_strstate(bend->frontend_state),
+               xenbus_strstate(frontend_state),dev->nodename, dev->otherend);
+
+       /*
+        * Ignore duplicate state changes.  This can happen if the
+        * frontend changes state twice in quick succession and the
+        * first watch fires in the backend after the second
+        * transition has completed.
+        */
+       if (bend->frontend_state == frontend_state)
+               return;
+
+       bend->frontend_state = frontend_state;
+       backend_state = bend->backend_state;
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+               if (backend_state == XenbusStateClosed &&
+                   !bend->removing)
+                       backend_state = XenbusStateInitialising;
+               break;
+
+       case XenbusStateConnected:
+               if (backend_state == XenbusStateInitialising) {
+                       if (!bend->vnic_is_setup &&
+                           setup_vnic(dev) == 0) {
+                               bend->vnic_is_setup = 1;
+                               backend_state = XenbusStateConnected;
+                       } else {
+                               backend_state = XenbusStateClosing;
+                       }
+               }
+               break;
+
+       case XenbusStateInitWait:
+       case XenbusStateInitialised:
+       default:
+               DPRINTK("Unknown state %s (%d) from frontend.\n",
+                       xenbus_strstate(frontend_state), frontend_state);
+               /* Unexpected state: deliberately fall through to Closing. */
+       case XenbusStateClosing:
+               if (backend_state != XenbusStateClosed)
+                       backend_state = XenbusStateClosing;
+
+               /*
+                * The bend will now persist (with watches active) in
+                * case the frontend comes back again, eg. after
+                * frontend module reload or suspend/resume
+                */
+
+               break;
+
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               if (bend->vnic_is_setup) {
+                       bend->vnic_is_setup = 0;
+                       cleanup_vnic(bend);
+               }
+
+               if (backend_state == XenbusStateClosing)
+                       backend_state = XenbusStateClosed;
+               break;
+       }
+       /* Record and publish the backend state only if it actually changed. */
+       if (backend_state != bend->backend_state) {
+               DPRINTK("Switching from state %s (%d) to %s (%d)\n",
+                       xenbus_strstate(bend->backend_state),
+                       bend->backend_state,
+                       xenbus_strstate(backend_state), backend_state);
+               bend->backend_state = backend_state;
+               net_accel_update_state(dev, backend_state);
+       }
+       /* netback_accel_remove() sleeps on this queue waiting for Closed. */
+       wake_up(&bend->state_wait_queue);
+}
+
+/*
+ * Watch callback: "accelstate" on the frontend's xenbus node has changed.
+ * Reads the new state and feeds it to netback_accel_frontend_changed().
+ */
+static void bend_domu_accel_change(struct xenbus_watch *watch,
+                                  const char **vec, unsigned int len)
+{
+       struct netback_accel *bend;
+       struct xenbus_device *dev;
+       /* Stays Unknown if the read below fails; never left uninitialised. */
+       int state = XenbusStateUnknown;
+
+       bend = container_of(watch, struct netback_accel, domu_accel_watch);
+       if (bend->domu_accel_watch.node == NULL)
+               return;
+
+       dev = (struct xenbus_device *)bend->hdev_data;
+       VPRINTK("Watch matched, got dev %p otherend %p\n",
+               dev, dev->otherend);
+       /*
+        * dev->otherend != NULL check to protect against watch firing
+        * when domain goes away and we haven't yet cleaned up
+        */
+       if (!dev->otherend ||
+           !xenbus_exists(XBT_NIL, watch->node, "") ||
+           strncmp(dev->otherend, vec[XS_WATCH_PATH],
+                   strlen(dev->otherend))) {
+               DPRINTK("Ignoring watch as otherend seems invalid\n");
+               return;
+       }
+
+       mutex_lock(&bend->bend_mutex);
+       xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state);
+       netback_accel_frontend_changed(dev, state);
+       mutex_unlock(&bend->bend_mutex);
+}
+
+/*
+ * Register bend_domu_accel_change() as a xenbus watch on the frontend's
+ * "accelstate" node.  Returns 0 on success; on failure clears the watch's
+ * node pointer and returns the error from xenbus_watch_path2().
+ */
+static int setup_domu_accel_watch(struct xenbus_device *dev,
+                                 struct netback_accel *bend)
+{
+       int rc;
+
+       VPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
+
+       rc = xenbus_watch_path2(dev, dev->otherend, "accelstate",
+                               &bend->domu_accel_watch,
+                               bend_domu_accel_change);
+       if (rc == 0)
+               return 0;
+       EPRINTK("%s: Failed to register xenbus watch: %d\n", __FUNCTION__, rc);
+       bend->domu_accel_watch.node = NULL;
+       return rc;
+}
+/* Probe: allocate the per-device accel state, publish it and set up watches.
+ * Returns 0 on success or the first error, after undoing earlier steps. */
+int netback_accel_probe(struct xenbus_device *dev)
+{
+       struct netback_accel *bend;
+       struct backend_info *binfo;
+       int err;
+
+       DPRINTK("%s: passed device %s\n", __FUNCTION__, dev->nodename);
+
+       /* Allocate structure to store all our state... */
+       bend = kzalloc(sizeof(struct netback_accel), GFP_KERNEL);
+       if (bend == NULL) {
+               DPRINTK("%s: no memory for bend\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       
+       mutex_init(&bend->bend_mutex);
+
+       mutex_lock(&bend->bend_mutex);
+
+       /* ...and store it where we can get at it */
+       binfo = dev_get_drvdata(&dev->dev);
+       binfo->netback_accel_priv = bend;
+       /* And vice-versa */
+       bend->hdev_data = dev;
+
+       DPRINTK("%s: Adding bend %p to list\n", __FUNCTION__, bend);
+       
+       init_waitqueue_head(&bend->state_wait_queue);
+       bend->vnic_is_setup = 0;
+       bend->frontend_state = XenbusStateUnknown;
+       bend->backend_state = XenbusStateClosed;
+       bend->removing = 0;
+       /* NOTE(review): the sscanf() result is not checked. */
+       sscanf(dev->nodename, NODENAME_PATH_FMT, &bend->far_end, 
+              &bend->vif_num);
+
+       err = read_nicname(dev, bend);
+       if (err) {
+               /*
+                * Technically not an error, just means we're not
+                * supposed to accelerate this
+                */
+               DPRINTK("failed to get device name\n");
+               goto fail_nicname;
+       }
+
+       /*
+        * Look up the device name in the list of NICs provided by
+        * driverlink to get the hardware type.
+        */
+       err = netback_accel_sf_hwtype(bend);
+       if (err) {
+               /*
+                * Technically not an error, just means we're not
+                * supposed to accelerate this, probably belongs to
+                * some other backend
+                */
+               DPRINTK("failed to match device name\n");
+               goto fail_init_type;
+       }
+
+       err = publish_frontend_name(dev);
+       if (err)
+               goto fail_publish;
+
+       err = netback_accel_debugfs_create(bend);
+       if (err)
+               goto fail_debugfs;
+       /* NOTE(review): likely dropped as the watch callback takes this mutex */
+       mutex_unlock(&bend->bend_mutex);
+
+       err = setup_config_accel_watch(dev, bend);
+       if (err)
+               goto fail_config_watch;
+
+       err = setup_domu_accel_watch(dev, bend);
+       if (err)
+               goto fail_domu_watch;
+
+       /*
+        * Indicate to the other end that we're ready to start unless
+        * the watch has already fired.
+        */
+       mutex_lock(&bend->bend_mutex);
+       if (bend->backend_state == XenbusStateClosed) {
+               bend->backend_state = XenbusStateInitialising;
+               net_accel_update_state(dev, XenbusStateInitialising);
+       }
+       mutex_unlock(&bend->bend_mutex);
+
+       mutex_lock(&bend_list_mutex);
+       link_bend(bend);
+       mutex_unlock(&bend_list_mutex);
+
+       return 0;
+
+fail_domu_watch:
+
+       unregister_xenbus_watch(&bend->config_accel_watch);
+       kfree(bend->config_accel_watch.node);
+fail_config_watch:
+
+       /*
+        * Flush the scheduled work queue before freeing bend to get
+        * rid of any pending netback_accel_msg_rx_handler()
+        */
+       flush_work_sync(&bend->handle_msg);
+
+       mutex_lock(&bend->bend_mutex);
+       net_accel_update_state(dev, XenbusStateUnknown);
+       netback_accel_debugfs_remove(bend);
+fail_debugfs:
+
+       unpublish_frontend_name(dev);
+fail_publish:
+
+       /* No need to reverse netback_accel_sf_hwtype. */
+fail_init_type:
+
+       kfree(bend->nicname);
+fail_nicname:
+       binfo->netback_accel_priv = NULL;
+       mutex_unlock(&bend->bend_mutex);
+       kfree(bend);
+       return err;
+}
+/* Remove: unlink the bend, announce Closing, wait for the frontend to close
+ * if needed, then drop watches, vnic and xenstore state and free the bend. */
+int netback_accel_remove(struct xenbus_device *dev)
+{
+       struct backend_info *binfo;
+       struct netback_accel *bend; 
+       int frontend_state;
+
+       binfo = dev_get_drvdata(&dev->dev);
+       bend = (struct netback_accel *) binfo->netback_accel_priv;
+
+       DPRINTK("%s: dev %p bend %p\n", __FUNCTION__, dev, bend);
+       
+       BUG_ON(bend == NULL);
+       
+       mutex_lock(&bend_list_mutex);
+       unlink_bend(bend);
+       mutex_unlock(&bend_list_mutex);
+
+       mutex_lock(&bend->bend_mutex);
+
+       /* Reject any requests to connect. */
+       bend->removing = 1;
+
+       /*
+        * Switch to closing to tell the other end that we're going
+        * away.
+        */
+       if (bend->backend_state != XenbusStateClosing) {
+               bend->backend_state = XenbusStateClosing;
+               net_accel_update_state(dev, XenbusStateClosing);
+       }
+       /* Pre-set to Unknown so a failed read below leaves a defined value. */
+       frontend_state = (int)XenbusStateUnknown;
+       xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d",
+                    &frontend_state);
+
+       mutex_unlock(&bend->bend_mutex);
+
+       /*
+        * Wait until this end goes to the closed state, which happens in
+        * response to the other end going to the closed state.  Skipped
+        * if the other end is already closed (or unknown) as there is
+        * then nothing to wait for.  NOTE: wait_event() has no timeout.
+        */
+       if (frontend_state != (int)XenbusStateClosed &&
+           frontend_state != (int)XenbusStateUnknown)
+               wait_event(bend->state_wait_queue,
+                          bend->backend_state == XenbusStateClosed);
+
+       unregister_xenbus_watch(&bend->domu_accel_watch);
+       kfree(bend->domu_accel_watch.node);
+
+       unregister_xenbus_watch(&bend->config_accel_watch);
+       kfree(bend->config_accel_watch.node);
+
+       /*
+        * Flush the scheduled work queue before freeing bend to get
+        * rid of any pending netback_accel_msg_rx_handler()
+        */
+       flush_work_sync(&bend->handle_msg);
+
+       mutex_lock(&bend->bend_mutex);
+
+       /* Tear down the vnic if it was set up. */
+       if (bend->vnic_is_setup) {
+               bend->vnic_is_setup = 0;
+               cleanup_vnic(bend);
+       }
+
+       bend->backend_state = XenbusStateUnknown;
+       net_accel_update_state(dev, XenbusStateUnknown);
+
+       netback_accel_debugfs_remove(bend);
+
+       unpublish_frontend_name(dev);
+
+       kfree(bend->nicname);
+
+       binfo->netback_accel_priv = NULL;
+
+       mutex_unlock(&bend->bend_mutex);
+
+       kfree(bend);
+
+       return 0;
+}
+
+/* Module-unload sanity check: no bend may still be on bend_list. */
+void netback_accel_shutdown_bends(void)
+{
+       /*
+        * Every interface is expected to have had its remove callback
+        * before the module may unload, so the list should be empty.
+        */
+       mutex_lock(&bend_list_mutex);
+       BUG_ON(bend_list);
+       mutex_unlock(&bend_list_mutex);
+}
+
+/* Record Closing as our state and publish it via net_accel_update_state(). */
+void netback_accel_set_closing(struct netback_accel *bend)
+{
+       struct xenbus_device *xdev = (struct xenbus_device *)bend->hdev_data;
+
+       bend->backend_state = XenbusStateClosing;
+       net_accel_update_state(xdev, XenbusStateClosing);
+}
diff --git a/drivers/xen/sfc_netback/ci/compat.h b/drivers/xen/sfc_netback/ci/compat.h

new file mode 100644 (file)

index 0000000..79f96f2
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat.h
@@ -0,0 +1,53 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Compatibility layer.  Provides definitions of fundamental
+ *          types and definitions that are used throughout CI source
+ *          code.  It does not introduce any link time dependencies,
+ *          or include any unnecessary system headers.
+ */
+/*! \cidoxg_include_ci */
+
+#ifndef __CI_COMPAT_H__
+#define __CI_COMPAT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <ci/compat/primitive.h>
+#include <ci/compat/sysdep.h>
+#include <ci/compat/utils.h>
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __CI_COMPAT_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/gcc.h b/drivers/xen/sfc_netback/ci/compat/gcc.h

new file mode 100644 (file)

index 0000000..0cf77c4
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/gcc.h
@@ -0,0 +1,158 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_GCC_H__
+#define __CI_COMPAT_GCC_H__
+
+
+#define CI_HAVE_INT64
+
+
+#if defined(__linux__) && defined(__KERNEL__)
+
+# include <linux/types.h>
+
+typedef __u64                 ci_uint64;
+typedef __s64                 ci_int64;
+# if BITS_PER_LONG == 32
+typedef __s32                 ci_ptr_arith_t;
+typedef __u32                 ci_uintptr_t;
+# else
+typedef __s64                 ci_ptr_arith_t;
+typedef __u64                 ci_uintptr_t;
+# endif
+
+
+/* it's not obvious to me why the below is wrong for x86_64, but
+ * gcc seems to complain on this platform
+ */
+# if defined(__ia64__)
+#  define CI_PRId64            "ld"
+#  define CI_PRIi64            "li"
+#  define CI_PRIo64            "lo"
+#  define CI_PRIu64            "lu"
+#  define CI_PRIx64            "lx"
+#  define CI_PRIX64            "lX"
+# else
+#  define CI_PRId64            "lld"
+#  define CI_PRIi64            "lli"
+#  define CI_PRIo64            "llo"
+#  define CI_PRIu64            "llu"
+#  define CI_PRIx64            "llx"
+#  define CI_PRIX64            "llX"
+# endif
+
+# define CI_PRId32            "d"
+# define CI_PRIi32            "i"
+# define CI_PRIo32            "o"
+# define CI_PRIu32            "u"
+# define CI_PRIx32            "x"
+# define CI_PRIX32            "X"
+
+#else
+
+# include <stdint.h>
+# include <inttypes.h>
+
+typedef uint64_t              ci_uint64;
+typedef int64_t               ci_int64;
+typedef intptr_t              ci_ptr_arith_t;
+typedef uintptr_t             ci_uintptr_t;
+
+# define CI_PRId64            PRId64
+# define CI_PRIi64            PRIi64
+# define CI_PRIo64            PRIo64
+# define CI_PRIu64            PRIu64
+# define CI_PRIx64            PRIx64
+# define CI_PRIX64            PRIX64
+
+# define CI_PRId32            PRId32
+# define CI_PRIi32            PRIi32
+# define CI_PRIo32            PRIo32
+# define CI_PRIu32            PRIu32
+# define CI_PRIx32            PRIx32
+# define CI_PRIX32            PRIX32
+
+#endif
+
+
+typedef ci_uint64                       ci_fixed_descriptor_t;
+
+#define from_fixed_descriptor(desc) ((ci_uintptr_t)(desc))
+#define to_fixed_descriptor(desc) ((ci_fixed_descriptor_t)(ci_uintptr_t)(desc))
+
+
+#if __GNUC__ >= 3 && !defined(__cplusplus)
+/*
+** Checks that [p_mbr] has the same type as [&c_type::mbr_name].
+*/
+# define CI_CONTAINER(c_type, mbr_name, p_mbr)                         \
+   __builtin_choose_expr(                                              \
+     __builtin_types_compatible_p(__typeof__(&((c_type*)0)->mbr_name), \
+                                __typeof__(p_mbr)),                    \
+     __CI_CONTAINER(c_type, mbr_name, p_mbr), (void)0)
+
+# define ci_restrict  __restrict__
+#endif
+
+
+#if !defined(__KERNEL__) || defined(__unix__)
+#define CI_HAVE_NPRINTF  1
+#endif
+
+
+/* At what version was this introduced? */
+#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
+# define CI_LIKELY(t)    __builtin_expect((t), 1)
+# define CI_UNLIKELY(t)  __builtin_expect((t), 0)
+#endif
+
+/**********************************************************************
+ * Attributes
+ */
+#if __GNUC__ >= 3 && defined(NDEBUG)
+# define CI_HF __attribute__((visibility("hidden")))
+# define CI_HV __attribute__((visibility("hidden")))
+#else
+# define CI_HF
+# define CI_HV
+#endif
+
+#if __GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+# define ci_noinline  static __attribute__((__noinline__))
+/* (Linux 2.6 defines its own "noinline", so we use the "__noinline__" form) */
+#else
+# define ci_noinline  static
+#endif
+
+#define CI_ALIGN(x) __attribute__ ((aligned (x)))
+
+#define CI_PRINTF_LIKE(a,b) __attribute__((format(printf,a,b)))
+
+#endif  /* __CI_COMPAT_GCC_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/gcc_x86.h b/drivers/xen/sfc_netback/ci/compat/gcc_x86.h

new file mode 100644 (file)

index 0000000..438f0ba
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/gcc_x86.h
@@ -0,0 +1,115 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_GCC_X86_H__
+#define __CI_COMPAT_GCC_X86_H__
+
+/*
+** The facts:
+**
+**   SSE   sfence
+**   SSE2  lfence, mfence, pause
+*/
+
+/* 
+   Barriers to enforce ordering with respect to:
+
+   normal memory use: ci_wmb, ci_rmb, ci_mb
+   IO bus access use: ci_wiob, ci_riob, ci_iob
+*/
+#if defined(__x86_64__)
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
+#else
+# define ci_x86_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
+
+/* ?? measure the impact of latency of sfence on a modern processor before we
+   take a decision on how to integrate with respect to writecombining */
+
+/* The "memory" clobber is required: without it GCC is free to move or
+** cache ordinary loads and stores across the asm (even a __volatile__
+** one), so the fence would order the CPU but not the compiler.  With it
+** each macro below is also a compiler barrier, matching ci_x86_mb().
+*/
+#define ci_gcc_fence()    __asm__ __volatile__ ("":::"memory")
+
+#if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define ci_x86_sfence()  __asm__ __volatile__ ("sfence":::"memory")
+# define ci_x86_lfence()  __asm__ __volatile__ ("lfence":::"memory")
+# define ci_x86_mfence()  __asm__ __volatile__ ("mfence":::"memory")
+#else
+# define ci_x86_sfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF8":::"memory")
+# define ci_x86_lfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xE8":::"memory")
+# define ci_x86_mfence() __asm__ __volatile__ (".byte 0x0F, 0xAE, 0xF0":::"memory")
+#endif
+
+
+/* x86 processors to P4 Xeon store in-order unless executing streaming
+   extensions or when using writecombining 
+
+   Hence we do not define ci_wmb to use sfence by default. Requirement is that
+   we do not use writecombining to memory and any code which uses SSE
+   extensions must call sfence directly 
+
+   We need to track non intel clones which may support out of order store.
+
+*/
+
+#if CI_CPU_OOS
+# if CI_CPU_HAS_SSE
+#  define ci_wmb()     ci_x86_sfence()
+# else
+#  define ci_wmb()     ci_x86_mb()
+# endif
+#else
+# define ci_wmb()       ci_gcc_fence()
+#endif
+
+#if CI_CPU_HAS_SSE2
+# define ci_rmb()      ci_x86_lfence()
+# define ci_mb()       ci_x86_mfence()
+# define ci_riob()     ci_x86_lfence()
+# define ci_wiob()     ci_x86_sfence()
+# define ci_iob()      ci_x86_mfence()
+#else
+# if CI_CPU_HAS_SSE
+#  define ci_wiob()    ci_x86_sfence()
+# else
+#  define ci_wiob()    ci_x86_mb()
+# endif
+# define ci_rmb()      ci_x86_mb()
+# define ci_mb()       ci_x86_mb()
+# define ci_riob()     ci_x86_mb()
+# define ci_iob()      ci_x86_mb()
+#endif
+
+typedef unsigned long   ci_phys_addr_t;
+#define ci_phys_addr_fmt  "%lx"
+
+#endif  /* __CI_COMPAT_GCC_X86_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/primitive.h b/drivers/xen/sfc_netback/ci/compat/primitive.h

new file mode 100644 (file)

index 0000000..3e58685
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/primitive.h
@@ -0,0 +1,77 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_PRIMITIVE_H__
+#define __CI_COMPAT_PRIMITIVE_H__
+
+
+/**********************************************************************
+ * Primitive types.
+ */
+
+typedef unsigned char                   ci_uint8;
+typedef signed char                     ci_int8;  /* plain char may be unsigned */
+
+typedef unsigned short                  ci_uint16;
+typedef short                           ci_int16;
+
+typedef unsigned int                    ci_uint32;
+typedef int                             ci_int32;
+
+/* 64-bit support is platform dependent. */
+
+
+/**********************************************************************
+ * Other fancy types.
+ */
+
+typedef ci_uint8                        ci_octet;
+
+typedef enum {
+  CI_FALSE = 0,
+  CI_TRUE
+} ci_boolean_t;
+
+
+/**********************************************************************
+ * Some nice types you'd always assumed were standards.
+ * (Really, they are SYSV "standards".)
+ */
+
+#ifdef _WIN32
+typedef unsigned long                   ulong;              
+typedef unsigned int                    uint;
+typedef char*                           caddr_t;
+#elif defined(__linux__) && defined(__KERNEL__)
+#include <linux/types.h>
+#elif defined(__linux__)
+#include <sys/types.h>
+#endif
+
+
+#endif  /* __CI_COMPAT_PRIMITIVE_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/sysdep.h b/drivers/xen/sfc_netback/ci/compat/sysdep.h

new file mode 100644 (file)

index 0000000..7f7423c
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/sysdep.h
@@ -0,0 +1,166 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_SYSDEP_H__
+#define __CI_COMPAT_SYSDEP_H__
+
+
+/**********************************************************************
+ * Platform definition fixups.
+ */
+
+#if defined(__ci_ul_driver__) && !defined(__ci_driver__)
+# define __ci_driver__
+#endif
+
+#if defined(__ci_driver__) && !defined(__ci_ul_driver__) && \
+   !defined(__KERNEL__)
+# define __KERNEL__
+#endif
+
+
+/**********************************************************************
+ * Sanity checks (no cheating!)
+ */
+
+#if defined(__KERNEL__) && !defined(__ci_driver__)
+# error Insane.
+#endif
+
+#if defined(__KERNEL__) && defined(__ci_ul_driver__)
+# error Madness.
+#endif
+
+#if defined(__unix__) && defined(_WIN32)
+# error Strange.
+#endif
+
+#if defined(__GNUC__) && defined(_MSC_VER)
+# error Crazy.
+#endif
+
+
+/**********************************************************************
+ * Compiler and processor dependencies.
+ */
+
+#if defined(__GNUC__)
+
+# include <ci/compat/gcc.h>
+
+# if defined(__i386__)
+#  include <ci/compat/x86.h>
+#  include <ci/compat/gcc_x86.h>
+# elif defined(__x86_64__)
+#  include <ci/compat/x86_64.h>
+#  include <ci/compat/gcc_x86.h>
+# elif defined(__PPC__)
+#  include <ci/compat/ppc.h>
+#  include <ci/compat/gcc_ppc.h>
+# elif defined(__ia64__)
+#  include <ci/compat/ia64.h>
+#  include <ci/compat/gcc_ia64.h>
+# else
+#  error Unknown processor - GNU C
+# endif
+
+#elif defined(_MSC_VER)
+
+# include <ci/compat/msvc.h>
+
+# if defined(__i386__)
+#  include <ci/compat/x86.h>
+#  include <ci/compat/msvc_x86.h>
+# elif defined(__x86_64__)
+#  include <ci/compat/x86_64.h>
+#  include <ci/compat/msvc_x86_64.h>
+# else
+#  error Unknown processor MSC
+# endif
+
+#elif defined(__PGI)
+
+# include <ci/compat/x86.h>
+# include <ci/compat/pg_x86.h>
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# if __INTEL_COMPILER >= 700
+#  include <ci/compat/gcc.h>
+#  include <ci/compat/x86.h>
+#  include <ci/compat/gcc_x86.h>
+# else
+#  error Old Intel compiler not supported.  Yet.
+# endif
+
+#else
+# error Unknown compiler.
+#endif
+
+
+/**********************************************************************
+ * Misc stuff (that probably shouldn't be here).
+ */
+
+#ifdef __sun
+# ifdef __KERNEL__
+#  define _KERNEL
+#  define _SYSCALL32
+#  ifdef _LP64
+#   define _SYSCALL32_IMPL
+#  endif
+# else
+#  define _REENTRANT
+# endif
+#endif
+
+
+/**********************************************************************
+ * Defaults for anything left undefined.
+ */
+
+#ifndef  CI_LIKELY
+# define CI_LIKELY(t)    (t)
+# define CI_UNLIKELY(t)  (t)
+#endif
+
+#ifndef  ci_restrict
+# define ci_restrict
+#endif
+
+#ifndef  ci_inline
+# define ci_inline  static inline
+#endif
+
+#ifndef  ci_noinline
+# define ci_noinline  static
+#endif
+
+#endif  /* __CI_COMPAT_SYSDEP_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/utils.h b/drivers/xen/sfc_netback/ci/compat/utils.h

new file mode 100644 (file)

index 0000000..34d4c99
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/utils.h
@@ -0,0 +1,269 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Handy utility macros.
+ *   \date  2003/01/17
+ */
+
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_UTILS_H__
+#define __CI_COMPAT_UTILS_H__
+
+
+/**********************************************************************
+ * Alignment -- [align] must be a power of 2.
+ **********************************************************************/
+
+  /*! Align forward onto next boundary. */
+
+#define CI_ALIGN_FWD(p, align)               (((p)+(align)-1u) & ~((align)-1u))
+
+
+  /*! Align back onto prev boundary. */
+
+#define CI_ALIGN_BACK(p, align)              ((p) & ~((align)-1u))
+
+
+  /*! How far to next boundary? */
+
+#define CI_ALIGN_NEEDED(p, align, signed_t)  (-(signed_t)(p) & ((align)-1u))
+
+
+  /*! How far beyond prev boundary? */
+
+#define CI_OFFSET(p, align)                  ((p) & ((align)-1u))
+
+
+  /*! Does object fit in gap before next boundary? */
+
+#define CI_FITS(p, size, align, signed_t)                      \
+  (CI_ALIGN_NEEDED((p) + 1, (align), signed_t) + 1 >= (size))
+
+
+  /*! Align forward onto next boundary. */
+
+#define CI_PTR_ALIGN_FWD(p, align)                                        \
+  ((char*) CI_ALIGN_FWD(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
+
+  /*! Align back onto prev boundary. */
+
+#define CI_PTR_ALIGN_BACK(p, align)                                        \
+  ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))
+
+  /*! How far to next boundary? */
+
+#define CI_PTR_ALIGN_NEEDED(p, align)                                  \
+  CI_ALIGN_NEEDED(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)),    \
+                 ci_ptr_arith_t)
+
+  /*! Bytes from [p] to the next boundary, never zero: gives [align] if [p] is already aligned. */
+
+#define CI_PTR_ALIGN_NEEDED_NZ(p, align)                                       \
+  ((align) - (((char*)(p)) -                                                    \
+  ((char*) CI_ALIGN_BACK(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align))))))
+
+  /*! How far beyond prev boundary? */
+
+#define CI_PTR_OFFSET(p, align)                                        \
+  CI_OFFSET(((ci_ptr_arith_t)(p)), ((ci_ptr_arith_t)(align)))
+
+
+  /* Integer-flavoured aliases: expand to CI_ALIGN_FWD and CI_ALIGN_BACK. */
+
+#define CI_ROUND_UP(i, align)      CI_ALIGN_FWD((i), (align))
+
+#define CI_ROUND_DOWN(i, align)    CI_ALIGN_BACK((i), (align))
+
+
+/**********************************************************************
+ * Byte-order
+ **********************************************************************/
+
+/* These are not flags.  They are enumeration values for use with
+ * CI_MY_BYTE_ORDER. */
+#define CI_BIG_ENDIAN          1
+#define CI_LITTLE_ENDIAN       0
+
+/*
+** Note that these byte-swapping primitives may leave junk in bits above
+** the range they operate on.
+**
+** The CI_BSWAP_nn() routines require that bits above [nn] are zero.  Use
+** CI_BSWAPM_nn(x) if this cannot be guaranteed.
+*/
+
+/* ?? May be able to improve on some of these with inline assembler on some
+** platforms.
+*/
+
+#define CI_BSWAP_16(v)    ((((v) & 0xff) << 8) | ((v) >> 8))
+#define CI_BSWAPM_16(v)   ((((v) & 0xff) << 8) | (((v) & 0xff00) >> 8))
+
+#define CI_BSWAP_32(v)    (((v) >> 24)               |         \
+                          (((v) & 0x00ff0000) >> 8) |  \
+                          (((v) & 0x0000ff00) << 8) |  \
+                          ((v) << 24))
+#define CI_BSWAPM_32(v)   ((((v) & 0xff000000) >> 24) |        \
+                          (((v) & 0x00ff0000) >> 8)  | \
+                          (((v) & 0x0000ff00) << 8)  | \
+                          ((v) << 24))
+
+#define CI_BSWAP_64(v)    (((v) >> 56)                        |        \
+                          (((v) & 0x00ff000000000000) >> 40) | \
+                          (((v) & 0x0000ff0000000000) >> 24) | \
+                          (((v) & 0x000000ff00000000) >> 8)  | \
+                          (((v) & 0x00000000ff000000) << 8)  | \
+                          (((v) & 0x0000000000ff0000) << 24) | \
+                          (((v) & 0x000000000000ff00) << 40) | \
+                          ((v) << 56))
+
+# define CI_BSWAPPED_16_IF(c,v)  ((c) ? CI_BSWAP_16(v) : (v))
+# define CI_BSWAPPED_32_IF(c,v)  ((c) ? CI_BSWAP_32(v) : (v))
+# define CI_BSWAPPED_64_IF(c,v)  ((c) ? CI_BSWAP_64(v) : (v))
+# define CI_BSWAP_16_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_16(v); }while(0)
+# define CI_BSWAP_32_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_32(v); }while(0)
+# define CI_BSWAP_64_IF(c,v)     do{ if((c)) (v) = CI_BSWAP_64(v); }while(0)
+
+#if (CI_MY_BYTE_ORDER == CI_LITTLE_ENDIAN)
+# define CI_BSWAP_LE16(v)    (v)
+# define CI_BSWAP_LE32(v)    (v)
+# define CI_BSWAP_LE64(v)    (v)
+# define CI_BSWAP_BE16(v)    CI_BSWAP_16(v)
+# define CI_BSWAP_BE32(v)    CI_BSWAP_32(v)
+# define CI_BSWAP_BE64(v)    CI_BSWAP_64(v)
+# define CI_BSWAPM_LE16(v)   (v)
+# define CI_BSWAPM_LE32(v)   (v)
+# define CI_BSWAPM_LE64(v)   (v)
+# define CI_BSWAPM_BE16(v)   CI_BSWAPM_16(v)
+# define CI_BSWAPM_BE32(v)   CI_BSWAPM_32(v)
+#elif (CI_MY_BYTE_ORDER == CI_BIG_ENDIAN)
+# define CI_BSWAP_BE16(v)    (v)
+# define CI_BSWAP_BE32(v)    (v)
+# define CI_BSWAP_BE64(v)    (v)
+# define CI_BSWAP_LE16(v)    CI_BSWAP_16(v)
+# define CI_BSWAP_LE32(v)    CI_BSWAP_32(v)
+# define CI_BSWAP_LE64(v)    CI_BSWAP_64(v)
+# define CI_BSWAPM_BE16(v)   (v)
+# define CI_BSWAPM_BE32(v)   (v)
+# define CI_BSWAPM_BE64(v)   (v)
+# define CI_BSWAPM_LE16(v)   CI_BSWAPM_16(v)
+# define CI_BSWAPM_LE32(v)   CI_BSWAPM_32(v)
+#else
+# error Bad endian.
+#endif
+
+
+/**********************************************************************
+ * Get pointer to struct from pointer to member
+ **********************************************************************/
+
+#define CI_MEMBER_OFFSET(c_type, mbr_name)  \
+  ((ci_uint32) (ci_uintptr_t)(&((c_type*)0)->mbr_name))
+
+#define CI_MEMBER_SIZE(c_type, mbr_name)        \
+  sizeof(((c_type*)0)->mbr_name)
+
+#define __CI_CONTAINER(c_type, mbr_name, p_mbr)  \
+  ( (c_type*) ((char*)(p_mbr) - CI_MEMBER_OFFSET(c_type, mbr_name)) )
+
+#ifndef CI_CONTAINER
+# define CI_CONTAINER(t,m,p)  __CI_CONTAINER(t,m,p)
+#endif
+
+
+/**********************************************************************
+ * Structure member initialiser.
+ **********************************************************************/
+
+#ifndef CI_STRUCT_MBR
+# define CI_STRUCT_MBR(name, val)      .name = val
+#endif
+
+
+/**********************************************************************
+ * min / max
+ **********************************************************************/ 
+
+#define CI_MIN(x,y) (((x) < (y)) ? (x) : (y))
+#define CI_MAX(x,y) (((x) > (y)) ? (x) : (y))
+
+/**********************************************************************
+ * abs
+ **********************************************************************/ 
+
+#define CI_ABS(x) (((x) < 0) ? -(x) : (x))
+
+/**********************************************************************
+ * Conditional debugging
+ **********************************************************************/ 
+
+#ifdef NDEBUG
+# define CI_DEBUG(x)
+# define CI_NDEBUG(x)      x
+# define CI_IF_DEBUG(y,n)  (n)
+# define CI_DEBUG_ARG(x)
+#else
+# define CI_DEBUG(x)       x
+# define CI_NDEBUG(x)
+# define CI_IF_DEBUG(y,n)  (y)
+# define CI_DEBUG_ARG(x)   ,x
+#endif
+
+#ifdef __KERNEL__
+#define CI_KERNEL_ARG(x)   ,x
+#else
+#define CI_KERNEL_ARG(x)
+#endif
+
+#ifdef _WIN32
+# define CI_KERNEL_ARG_WIN(x) CI_KERNEL_ARG(x)
+# define CI_ARG_WIN(x) ,x
+#else
+# define CI_KERNEL_ARG_WIN(x)
+# define CI_ARG_WIN(x) 
+#endif
+
+#ifdef __unix__
+# define CI_KERNEL_ARG_UNIX(x) CI_KERNEL_ARG(x)
+# define CI_ARG_UNIX(x) ,x
+#else
+# define CI_KERNEL_ARG_UNIX(x)
+# define CI_ARG_UNIX(x) 
+#endif
+
+#ifdef __linux__
+# define CI_KERNEL_ARG_LINUX(x) CI_KERNEL_ARG(x)
+# define CI_ARG_LINUX(x) ,x
+#else
+# define CI_KERNEL_ARG_LINUX(x)
+# define CI_ARG_LINUX(x) 
+#endif
+
+
+#endif  /* __CI_COMPAT_UTILS_H__ */
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/x86.h b/drivers/xen/sfc_netback/ci/compat/x86.h

new file mode 100644 (file)

index 0000000..2c1dfb3
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/x86.h
@@ -0,0 +1,48 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_compat  */
+
+#ifndef __CI_COMPAT_X86_H__
+#define __CI_COMPAT_X86_H__
+
+
+#define CI_MY_BYTE_ORDER   CI_LITTLE_ENDIAN
+
+#define CI_WORD_SIZE       4
+#define CI_PTR_SIZE        4
+
+#define CI_PAGE_SIZE       4096
+#define CI_PAGE_SHIFT      12
+#define CI_PAGE_MASK       (~(CI_PAGE_SIZE - 1))
+
+#define CI_CPU_HAS_SSE    1    /* SSE extensions supported */
+#define CI_CPU_HAS_SSE2           0    /* SSE2 extensions supported */
+#define CI_CPU_OOS        0    /* CPU does out of order stores */
+
+
+#endif  /* __CI_COMPAT_X86_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/compat/x86_64.h b/drivers/xen/sfc_netback/ci/compat/x86_64.h

new file mode 100644 (file)

index 0000000..c09f540
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/compat/x86_64.h
@@ -0,0 +1,54 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Arch stuff for AMD x86_64.
+ *   \date  2004/08/17
+ */
+
+/*! \cidoxg_include_ci_compat  */
+#ifndef __CI_COMPAT_X86_64_H__
+#define __CI_COMPAT_X86_64_H__
+
+
+#define CI_MY_BYTE_ORDER       CI_LITTLE_ENDIAN
+
+#define CI_WORD_SIZE           8
+#define CI_PTR_SIZE            8
+
+#define CI_PAGE_SIZE           4096
+#define CI_PAGE_SHIFT          12
+#define CI_PAGE_MASK           (~(CI_PAGE_SIZE - 1))
+
+#define CI_CPU_HAS_SSE         1       /* SSE extensions supported */
+
+/* SSE2 disabled while investigating BUG1060 */
+#define CI_CPU_HAS_SSE2                0       /* SSE2 extensions supported */
+#define CI_CPU_OOS             0       /* CPU does out of order stores */
+
+
+#endif  /* __CI_COMPAT_X86_64_H__ */
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/config.h b/drivers/xen/sfc_netback/ci/tools/config.h

new file mode 100644 (file)

index 0000000..fb802f9
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/config.h
@@ -0,0 +1,49 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_CONFIG_H__
+#define __CI_TOOLS_CONFIG_H__
+
+
+/**********************************************************************
+ * Debugging.
+ */
+
+#define CI_INCLUDE_ASSERT_VALID           0
+
+/* Set non-zero to allow info about who has allocated what to appear in
+ * /proc/drivers/level5/mem.
+ * However, note that doing so can lead to a segfault when you unload the
+ * driver, and other weirdness, i.e. I don't think the code for it is quite
+ * right (written by Oktet, hacked by gel), but it does work well enough to be
+ * useful.
+ */
+#define CI_MEMLEAK_DEBUG_ALLOC_TABLE     0
+
+
+#endif  /* __CI_TOOLS_CONFIG_H__ */
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/debug.h b/drivers/xen/sfc_netback/ci/tools/debug.h

new file mode 100644 (file)

index 0000000..a25c2c4
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/debug.h
@@ -0,0 +1,336 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_DEBUG_H__
+#define __CI_TOOLS_DEBUG_H__
+
+#define CI_LOG_E(x)       x              /* errors      */
+#define CI_LOG_W(x)       x              /* warnings    */
+#define CI_LOG_I(x)       x              /* information */
+#define CI_LOG_V(x)       x              /* verbose     */
+
+/* Build time asserts. We paste the line number into the type name
+ * so that the macro can be used more than once per file even if the
+ * compiler objects to multiple identical typedefs. Collisions
+ * between uses in different header files are still possible. */
+#ifndef CI_BUILD_ASSERT
+#define __CI_BUILD_ASSERT_NAME(_x) __CI_BUILD_ASSERT_ILOATHECPP(_x)
+#define __CI_BUILD_ASSERT_ILOATHECPP(_x)  __CI_BUILD_ASSERT__ ##_x
+#define CI_BUILD_ASSERT(e)\
+ typedef char  __CI_BUILD_ASSERT_NAME(__LINE__)[(e)?1:-1]
+#endif
+
+
+#ifdef NDEBUG
+/* NDEBUG build: the assertion helpers listed here expand to nothing. */
+# define _ci_check(exp, file, line)
+# define _ci_assert2(e, x, y, file, line)
+# define _ci_assert(exp, file, line)
+# define _ci_assert_equal(exp1, exp2, file, line)
+# define _ci_assert_equal_msg(exp1, exp2, msg, file, line)
+# define _ci_assert_equiv(exp1, exp2, file, line)
+# define _ci_assert_nequal(exp1, exp2, file, line)
+# define _ci_assert_le(exp1, exp2, file, line)
+# define _ci_assert_lt(exp1, exp2, file, line)
+# define _ci_assert_ge(exp1, exp2, file, line)
+# define _ci_assert_gt(exp1, exp2, file, line)
+# define _ci_assert_impl(exp1, exp2, file, line)
+# define _ci_verify(exp, file, line) \
+  do { \
+    (void)(exp); \
+  } while (0)
+
+# define CI_DEBUG_TRY(exp) \
+  do { \
+    (void)(exp); \
+  } while (0)
+
+#define CI_TRACE(exp,fmt)
+#define CI_TRACE_INT(integer)
+#define CI_TRACE_INT32(integer)
+#define CI_TRACE_INT64(integer)
+#define CI_TRACE_UINT(integer)
+#define CI_TRACE_UINT32(integer)
+#define CI_TRACE_UINT64(integer)
+#define CI_TRACE_HEX(integer)
+#define CI_TRACE_HEX32(integer)
+#define CI_TRACE_HEX64(integer)
+#define CI_TRACE_PTR(pointer)
+#define CI_TRACE_STRING(string)
+#define CI_TRACE_MAC(mac)
+#define CI_TRACE_IP(ip_be32)
+#define CI_TRACE_ARP(arp_pkt)
+
+#else
+
+# define _CI_ASSERT_FMT   "\nfrom %s:%d"
+
+# define _ci_check(exp, file, line)                             \
+  do {                                                          \
+    if (CI_UNLIKELY(!(exp)))                                    \
+      ci_warn(("ci_check(%s)"_CI_ASSERT_FMT, #exp,              \
+               (file), (line)));                                \
+  } while (0)
+
+/*
+ * NOTE: ci_fail() emits the file and line where the assert is actually
+ *       coded.
+ */
+
+# define _ci_assert(exp, file, line)                            \
+  do {                                                          \
+    if (CI_UNLIKELY(!(exp)))                                    \
+      ci_fail(("ci_assert(%s)"_CI_ASSERT_FMT, #exp,            \
+               (file), (line)));                                \
+  } while (0)
+
+# define _ci_assert2(e, x, y, file, line)  do {                 \
+    if(CI_UNLIKELY( ! (e) ))                                    \
+      ci_fail(("ci_assert(%s)\nwhere [%s=%"CI_PRIx64"] "        \
+               "[%s=%"CI_PRIx64"]\nat %s:%d\nfrom %s:%d", #e    \
+               , #x, (ci_uint64)(ci_uintptr_t)(x)               \
+               , #y, (ci_uint64)(ci_uintptr_t)(y),              \
+               __FILE__, __LINE__, (file), (line)));            \
+  } while (0)
+
+# define _ci_verify(exp, file, line)                            \
+  do {                                                          \
+    if (CI_UNLIKELY(!(exp)))                                    \
+      ci_fail(("ci_verify(%s)"_CI_ASSERT_FMT, #exp,             \
+               (file), (line)));                                \
+  } while (0)
+
+# define _ci_assert_equal(x, y, f, l)  _ci_assert2((x)==(y), x, y, (f), (l))
+# define _ci_assert_nequal(x, y, f, l) _ci_assert2((x)!=(y), x, y, (f), (l))
+# define _ci_assert_le(x, y, f, l)     _ci_assert2((x)<=(y), x, y, (f), (l))
+# define _ci_assert_lt(x, y, f, l)     _ci_assert2((x)< (y), x, y, (f), (l))
+# define _ci_assert_ge(x, y, f, l)     _ci_assert2((x)>=(y), x, y, (f), (l))
+# define _ci_assert_gt(x, y, f, l)     _ci_assert2((x)> (y), x, y, (f), (l))
+# define _ci_assert_or(x, y, f, l)     _ci_assert2((x)||(y), x, y, (f), (l))
+# define _ci_assert_impl(x, y, f, l)   _ci_assert2(!(x) || (y), x, y, (f), (l))
+# define _ci_assert_equiv(x, y, f, l)  _ci_assert2(!(x)== !(y), x, y, (f), (l))
+/* Fail unless exp1 == exp2; also prints [msg] as four chars, high byte first. */
+#define _ci_assert_equal_msg(exp1, exp2, msg, file, line)       \
+  do {                                                          \
+    if (CI_UNLIKELY((exp1)!=(exp2)))                            \
+      ci_fail(("ci_assert_equal_msg(%s == %s) were "            \
+               "(%"CI_PRIx64":%"CI_PRIx64") with msg[%c%c%c%c]" \
+               _CI_ASSERT_FMT, #exp1, #exp2,                    \
+               (ci_uint64)(ci_uintptr_t)(exp1),                 \
+               (ci_uint64)(ci_uintptr_t)(exp2),                 \
+               (int)((((ci_uint32)(msg)) >> 24) & 0xff),        \
+               (int)((((ci_uint32)(msg)) >> 16) & 0xff),        \
+               (int)((((ci_uint32)(msg)) >> 8 ) & 0xff),        \
+               (int)((((ci_uint32)(msg))      ) & 0xff),        \
+               (file), (line)));                                \
+  } while (0)
+
+# define CI_DEBUG_TRY(exp)  CI_TRY(exp)
+
+#define CI_TRACE(exp,fmt)                                              \
+  ci_log("%s:%d:%s] " #exp "=" fmt,                                     \
+         __FILE__, __LINE__, __FUNCTION__, (exp))
+
+
+#define CI_TRACE_INT(integer)                                          \
+  ci_log("%s:%d:%s] " #integer "=%d",                                   \
+         __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_INT32(integer)                                                \
+  ci_log("%s:%d:%s] " #integer "=%d",                                   \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_int32)integer))
+
+
+#define CI_TRACE_INT64(integer)                                                \
+  ci_log("%s:%d:%s] " #integer "=%lld",                                 \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_int64)integer))
+
+/* Log "file:line:function] <expr>=<value>" for an unsigned int expression. */
+#define CI_TRACE_UINT(integer)                                         \
+  ci_log("%s:%d:%s] " #integer "=%u",                                   \
+         __FILE__, __LINE__, __FUNCTION__, (integer))
+
+/* Log "file:line:function] <expr>=<value>" with the value cast to ci_uint32. */
+#define CI_TRACE_UINT32(integer)                                       \
+  ci_log("%s:%d:%s] " #integer "=%u",                                   \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)(integer)))
+
+/* Log "file:line:function] <expr>=<value>" with the value cast to ci_uint64. */
+#define CI_TRACE_UINT64(integer)                                       \
+  ci_log("%s:%d:%s] " #integer "=%llu",                                 \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)(integer)))
+
+
+#define CI_TRACE_HEX(integer)                                          \
+  ci_log("%s:%d:%s] " #integer "=0x%x",                                 \
+         __FILE__, __LINE__, __FUNCTION__, (integer))
+
+
+#define CI_TRACE_HEX32(integer)                                                \
+  ci_log("%s:%d:%s] " #integer "=0x%x",                                 \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_uint32)integer))
+
+
+#define CI_TRACE_HEX64(integer)                                                \
+  ci_log("%s:%d:%s] " #integer "=0x%llx",                               \
+         __FILE__, __LINE__, __FUNCTION__, ((ci_uint64)integer))
+
+
+#define CI_TRACE_PTR(pointer)                                          \
+  ci_log("%s:%d:%s] " #pointer "=0x%p",                                 \
+         __FILE__, __LINE__, __FUNCTION__, (pointer))
+
+
+#define CI_TRACE_STRING(string)                                                \
+  ci_log("%s:%d:%s] " #string "=%s",                                    \
+         __FILE__, __LINE__, __FUNCTION__, (string))
+
+
+#define CI_TRACE_MAC(mac)                                              \
+  ci_log("%s:%d:%s] " #mac "=" CI_MAC_PRINTF_FORMAT,                    \
+         __FILE__, __LINE__, __FUNCTION__, CI_MAC_PRINTF_ARGS(mac))
+
+
+#define CI_TRACE_IP(ip_be32)                                           \
+  ci_log("%s:%d:%s] " #ip_be32 "=" CI_IP_PRINTF_FORMAT, __FILE__,       \
+         __LINE__, __FUNCTION__, CI_IP_PRINTF_ARGS(&(ip_be32)))
+
+
+#define CI_TRACE_ARP(arp_pkt)                                           \
+  ci_log("%s:%d:%s]\n"CI_ARP_PRINTF_FORMAT,                             \
+         __FILE__, __LINE__, __FUNCTION__, CI_ARP_PRINTF_ARGS(arp_pkt))
+
+#endif  /* NDEBUG */
+
+#define ci_check(exp) \
+        _ci_check(exp, __FILE__, __LINE__)
+
+#define ci_assert(exp) \
+        _ci_assert(exp, __FILE__, __LINE__)
+
+#define ci_verify(exp) \
+        _ci_verify(exp, __FILE__, __LINE__)
+
+#define ci_assert_equal(exp1, exp2) \
+        _ci_assert_equal(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_equal_msg(exp1, exp2, msg) \
+        _ci_assert_equal_msg(exp1, exp2, msg, __FILE__, __LINE__)
+
+#define ci_assert_nequal(exp1, exp2) \
+        _ci_assert_nequal(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_le(exp1, exp2) \
+        _ci_assert_le(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_lt(exp1, exp2) \
+        _ci_assert_lt(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_ge(exp1, exp2) \
+        _ci_assert_ge(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_gt(exp1, exp2) \
+        _ci_assert_gt(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_impl(exp1, exp2) \
+        _ci_assert_impl(exp1, exp2, __FILE__, __LINE__)
+
+#define ci_assert_equiv(exp1, exp2) \
+        _ci_assert_equiv(exp1, exp2, __FILE__, __LINE__)
+
+
+#define CI_TEST(exp)                            \
+  do{                                           \
+    if( CI_UNLIKELY(!(exp)) )                   \
+      ci_fail(("CI_TEST(%s)", #exp));           \
+  }while(0)
+
+
+#define CI_TRY(exp)                            \
+  do{                                          \
+    int _trc;                                  \
+    _trc=(exp);                                        \
+    if( CI_UNLIKELY(_trc < 0) )                        \
+      ci_sys_fail(#exp, _trc);                 \
+  }while(0)
+
+
+#define CI_TRY_RET(exp)                                                         \
+  do{                                                                   \
+    int _trc;                                                           \
+    _trc=(exp);                                                                 \
+    if( CI_UNLIKELY(_trc < 0) ) {                                       \
+      ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__); \
+      return _trc;                                                      \
+    }                                                                   \
+  }while(0)
+
+#define CI_LOGLEVEL_TRY_RET(logfn, exp)                                    \
+  do{                                                                   \
+    int _trc;                                                           \
+    _trc=(exp);                                                                 \
+    if( CI_UNLIKELY(_trc < 0) ) {                                       \
+      logfn (ci_log("%s returned %d at %s:%d", #exp, _trc, __FILE__, __LINE__)); \
+      return _trc;                                                      \
+    }                                                                   \
+  }while(0)
+
+
+#define CI_SOCK_TRY(exp)                       \
+  do{                                          \
+    ci_sock_err_t _trc;                                \
+    _trc=(exp);                                        \
+    if( CI_UNLIKELY(!ci_sock_errok(_trc)) )    \
+      ci_sys_fail(#exp, _trc.val);             \
+  }while(0)
+
+
+#define CI_SOCK_TRY_RET(exp)                                                \
+  do{                                                                       \
+    ci_sock_err_t _trc;                                                             \
+    _trc=(exp);                                                                     \
+    if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) {                               \
+      ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
+      return ci_sock_errcode(_trc);                                         \
+    }                                                                       \
+  }while(0)
+
+
+#define CI_SOCK_TRY_SOCK_RET(exp)                                           \
+  do{                                                                       \
+    ci_sock_err_t _trc;                                                             \
+    _trc=(exp);                                                                     \
+    if( CI_UNLIKELY(!ci_sock_errok(_trc)) ) {                               \
+      ci_log("%s returned %d at %s:%d", #exp, _trc.val, __FILE__, __LINE__); \
+      return _trc;                                                          \
+    }                                                                       \
+  }while(0)
+
+#endif  /* __CI_TOOLS_DEBUG_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/log.h b/drivers/xen/sfc_netback/ci/tools/log.h

new file mode 100644 (file)

index 0000000..a9d471d
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/log.h
@@ -0,0 +1,269 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Functions for logging and pretty-printing.
+ *   \date  2002/08/07
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_LOG_H__
+#define __CI_TOOLS_LOG_H__
+
+#include <stdarg.h>
+
+
+/**********************************************************************
+ * Logging.
+ */
+
+/* size of internal log buffer */ 
+#define  CI_LOG_MAX_LINE        512
+/* uses of ci_log must ensure that all trace messages are shorter than this */ 
+#define  CI_LOG_MAX_MSG_LENGTH        (CI_LOG_MAX_LINE-50)
+
+extern void ci_vlog(const char* fmt, va_list args)  CI_HF;
+extern void ci_log(const char* fmt, ...) CI_PRINTF_LIKE(1,2) CI_HF;
+
+  /*! Set the prefix for log messages.
+  **
+  ** Uses the storage pointed to by \em prefix.  Therefore \em prefix must
+  ** be allocated on the heap, or statically.
+  */
+extern void ci_set_log_prefix(const char* prefix)  CI_HF;
+
+typedef void (*ci_log_fn_t)(const char* msg);
+extern ci_log_fn_t  ci_log_fn  CI_HV;
+
+/* Log functions. */
+extern void ci_log_null(const char* msg) CI_HF;
+extern void ci_log_stderr(const char* msg) CI_HF;
+extern void ci_log_stdout(const char* msg) CI_HF;
+extern void ci_log_syslog(const char* msg) CI_HF;
+
+/*! Call the following to install special logging behaviours. */
+extern void ci_log_buffer_till_fail(void) CI_HF;
+extern void ci_log_buffer_till_exit(void) CI_HF;
+
+extern void __ci_log_unique(const char* msg) CI_HF;
+extern ci_log_fn_t __ci_log_unique_fn CI_HV;
+ci_inline void ci_log_uniquify(void) {
+  if( ci_log_fn != __ci_log_unique ) {
+    __ci_log_unique_fn = ci_log_fn;
+    ci_log_fn = __ci_log_unique;
+  }
+}
+
+extern void ci_log_file(const char* msg) CI_HF;
+extern int  ci_log_file_fd CI_HV;
+
+extern void __ci_log_nth(const char* msg) CI_HF;
+extern ci_log_fn_t __ci_log_nth_fn CI_HV;
+extern int  ci_log_nth_n CI_HV;  /* default 100 */
+ci_inline void ci_log_nth(void) {
+  if( ci_log_fn != __ci_log_nth ) {
+    __ci_log_nth_fn = ci_log_fn;
+    ci_log_fn = __ci_log_nth;
+  }
+}
+
+extern int  ci_log_level  CI_HV;
+
+extern int  ci_log_options  CI_HV;
+#define CI_LOG_PID             0x1
+#define CI_LOG_TID             0x2
+#define CI_LOG_TIME            0x4
+#define CI_LOG_DELTA           0x8
+
+/**********************************************************************
+ * Used to define which mode we are in
+ */
+#if (defined(_WIN32) && !defined(__KERNEL__))
+typedef enum {
+  ci_log_md_NULL=0,
+    ci_log_md_ioctl,
+    ci_log_md_stderr,
+    ci_log_md_stdout,
+    ci_log_md_file,
+    ci_log_md_serial,
+    ci_log_md_syslog,
+    ci_log_md_pidfile
+} ci_log_mode_t;
+extern ci_log_mode_t ci_log_mode;
+#endif
+
+/**********************************************************************
+ * Pretty-printing.
+ */
+
+extern char ci_printable_char(char c) CI_HF;
+
+extern void (*ci_hex_dump_formatter)(char* buf, const ci_octet* s,
+                                    int i, int off, int len) CI_HV;
+extern void ci_hex_dump_format_octets(char*,const ci_octet*,int,int,int) CI_HF;
+extern void ci_hex_dump_format_dwords(char*,const ci_octet*,int,int,int) CI_HF;
+
+extern void ci_hex_dump_row(char* buf, volatile const void* s, int len,
+                           ci_ptr_arith_t address) CI_HF;
+  /*!< A row contains up to 16 bytes.  Row starts at [address & 15u], so
+  ** therefore [len + (address & 15u)] must be <= 16.
+  */
+
+extern void ci_hex_dump(ci_log_fn_t, volatile const void*,
+                       int len, ci_ptr_arith_t address) CI_HF;
+
+extern int  ci_hex_dump_to_raw(const char* src_hex, void* buf,
+                              unsigned* addr_out_opt, int* skip)  CI_HF;
+  /*!< Recovers raw data from a single line of a hex dump.  [buf] must be at
+  ** least 16 bytes long.  Returns the number of bytes written to [buf] (in
+  ** range 1 -> 16), or -1 if [src_hex] doesn't contain hex data.  Does not
+  ** cope with missing bytes at the start of a line.
+  */
+
+extern int ci_format_eth_addr(char* buf, const void* eth_mac_addr,
+                             char sep)  CI_HF;
+  /*!< This will write 18 characters to <buf> including terminating null.
+  ** Returns number of bytes written excluding null.  If [sep] is zero, ':'
+  ** is used.
+  */
+
+extern int ci_parse_eth_addr(void* eth_mac_addr,
+                            const char* str, char sep) CI_HF;
+  /*!< If [sep] is zero, absolutely any separator is accepted (even
+  ** inconsistent separators).  Returns 0 on success, -1 on error.
+  */
+
+extern int ci_format_ip4_addr(char* buf, unsigned addr_be32) CI_HF;
+  /*!< Formats the IP address (in network endian) in dotted-quad.  Returns
+  ** the number of bytes written (up to 15), excluding the null.  [buf]
+  ** must be at least 16 bytes long.
+  */
+
+#if defined(__unix__) && ! defined(__KERNEL__)
+extern int ci_format_select_set(char* s, int len_s, int nfds, const fd_set*);
+extern int ci_format_select(char* s, int len_s,
+                           int nfds, const fd_set* rds, const fd_set* wrs,
+                           const fd_set* exs, struct timeval* timeout);
+#endif
+
+
+/**********************************************************************
+ * Error checking.
+ */
+
+extern void (*ci_fail_stop_fn)(void) CI_HV;
+
+extern void ci_fail_stop(void) CI_HF;
+extern void ci_fail_hang(void) CI_HF;
+extern void ci_fail_bomb(void) CI_HF;
+extern void ci_backtrace(void) CI_HF;
+
+#if defined __linux__ && !defined __KERNEL__
+extern void ci_fail_abort (void) CI_HF;
+#endif
+
+#ifdef __GNUC__
+extern void
+__ci_fail(const char*, ...) CI_PRINTF_LIKE(1,2) CI_HF;
+#else
+# if _PREFAST_
+  extern void _declspec(noreturn) __ci_fail(const char* fmt, ...);
+# else 
+  extern void __ci_fail(const char* fmt, ...);
+# endif
+
+#endif
+
+#define ci_warn(x)                                                        \
+  do{ ci_log("WARN at %s:%d", __FILE__, __LINE__); }while(0)  /* NOTE: x is not used; only the location is logged */
+
+#define ci_fail(x)                                                        \
+  do{ ci_log("FAIL at %s:%d", __FILE__, __LINE__);  __ci_fail x; }while(0)
+
+extern void __ci_sys_fail(const char* fn, int rc,
+                         const char* file, int line) CI_HF;
+#define ci_sys_fail(fn, rc)  __ci_sys_fail(fn, rc, __FILE__, __LINE__)
+
+/**********************************************************************
+ * Logging to buffer (src/citools/log_buffer.c)
+ */
+
+/*! Divert ci_log() messages to the log buffer
+ *  normally they go to the  system console */
+extern void ci_log_buffer_till_fail(void) CI_HF;
+
+/*! Dump the contents of the log buffer to the system console */
+extern void ci_log_buffer_dump(void) CI_HF;
+
+
+/**********************************************************************
+ * Some useful pretty-printing.
+ */
+
+#ifdef  __linux__
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x)          \
+  (((x) & MSG_OOB         ) ? "OOB "         :""),     \
+  (((x) & MSG_PEEK        ) ? "PEEK "        :""),     \
+  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :""),     \
+  (((x) & MSG_EOR         ) ? "EOR "         :""),     \
+  (((x) & MSG_CTRUNC      ) ? "CTRUNC "      :""),     \
+  (((x) & MSG_TRUNC       ) ? "TRUNC "       :""),     \
+  (((x) & MSG_WAITALL     ) ? "WAITALL "     :""),     \
+  (((x) & MSG_DONTWAIT    ) ? "DONTWAIT "    :""),     \
+  (((x) & MSG_NOSIGNAL    ) ? "NOSIGNAL "    :""),     \
+  (((x) & MSG_ERRQUEUE    ) ? "ERRQUEUE "    :""),     \
+  (((x) & MSG_CONFIRM     ) ? "CONFIRM "     :"")
+#endif
+
+#ifdef  _WIN32
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x)          \
+  (((x) & MSG_OOB         ) ? "OOB "         :""),     \
+  (((x) & MSG_PEEK        ) ? "PEEK "        :""),     \
+  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :"")
+#endif
+
+#ifdef  __sun__
+# define CI_SOCKCALL_FLAGS_FMT "%s%s%s%s%s%s%s%s%s"
+
+# define CI_SOCKCALL_FLAGS_PRI_ARG(x)          \
+  (((x) & MSG_OOB         ) ? "OOB "         :""),     \
+  (((x) & MSG_PEEK        ) ? "PEEK "        :""),     \
+  (((x) & MSG_DONTROUTE   ) ? "DONTROUTE "   :""),     \
+  (((x) & MSG_EOR         ) ? "EOR "         :""),     \
+  (((x) & MSG_CTRUNC      ) ? "CTRUNC "      :""),     \
+  (((x) & MSG_TRUNC       ) ? "TRUNC "       :""),     \
+  (((x) & MSG_WAITALL     ) ? "WAITALL "     :""),     \
+  (((x) & MSG_DONTWAIT    ) ? "DONTWAIT "    :""),     \
+  (((x) & MSG_NOTIFICATION) ? "NOTIFICATION" :"")
+#endif
+
+#endif  /* __CI_TOOLS_LOG_H__ */
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h b/drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h

new file mode 100644 (file)

index 0000000..33af3f1
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h
@@ -0,0 +1,370 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools_platform  */
+
+#ifndef __CI_TOOLS_GCC_X86_H__
+#define __CI_TOOLS_GCC_X86_H__
+
+
+/**********************************************************************
+ * Free-running cycle counters.
+ */
+
+#define CI_HAVE_FRC64
+#define CI_HAVE_FRC32
+
+#define ci_frc32(pval)  __asm__ __volatile__("rdtsc" : "=a" (*(pval)) : : "edx")  /* low 32 bits of the TSC */
+
+#if defined(__x86_64__)
+ci_inline void ci_frc64(ci_uint64* pval) {
+  /* temp fix until we figure how to get this out in one bite */          
+  ci_uint64 low, high;
+  __asm__ __volatile__("rdtsc" : "=a" (low) , "=d" (high));            
+  *pval = (high << 32) | low;
+}
+
+#else
+#define ci_frc64(pval)  __asm__ __volatile__("rdtsc" : "=A" (*(pval)))  /* 32-bit x86: "=A" is the edx:eax pair */
+#endif
+
+#define ci_frc_flush()  /* ?? Need a pipeline barrier. */
+
+
+/**********************************************************************
+ * Atomic integer.
+ */
+
+/*
+** int  ci_atomic_read(a)         { return a->n;        }
+** void ci_atomic_set(a, v)       { a->n = v;           }
+** void ci_atomic_inc(a)          { ++a->n;             }
+** void ci_atomic_dec(a)          { --a->n;             }
+** int  ci_atomic_inc_and_test(a) { return ++a->n == 0; }
+** int  ci_atomic_dec_and_test(a) { return --a->n == 0; }
+** void ci_atomic_and(a, v)       { a->n &= v;          }
+** void ci_atomic_or(a, v)        { a->n |= v;          }
+*/
+
+typedef struct { volatile ci_int32 n; } ci_atomic_t;
+
+#define CI_ATOMIC_INITIALISER(i)  {(i)}
+
+static inline ci_int32  ci_atomic_read(const ci_atomic_t* a) { return a->n; }
+static inline void ci_atomic_set(ci_atomic_t* a, int v) { a->n = v; ci_wmb();   }
+
+static inline void ci_atomic_inc(ci_atomic_t* a)
+{ __asm__ __volatile__("lock; incl %0" : "+m" (a->n)); }
+
+ 
+static inline void ci_atomic_dec(ci_atomic_t* a)
+{ __asm__ __volatile__("lock; decl %0" : "+m" (a->n)); }
+
+static inline int ci_atomic_inc_and_test(ci_atomic_t* a) {
+  char r;
+  __asm__ __volatile__("lock; incl %0; sete %1"
+                      : "+m" (a->n), "=qm" (r));
+  return r;
+}
+
+static inline int ci_atomic_dec_and_test(ci_atomic_t* a) {
+  char r;
+  __asm__ __volatile__("lock; decl %0; sete %1"
+                      : "+m" (a->n), "=qm" (r));
+  return r;
+}
+
+ci_inline int
+ci_atomic_xadd (ci_atomic_t *a, int v) {
+   __asm__ ("lock xadd %0, %1" : "=r" (v), "+m" (a->n) : "0" (v));
+  return v;
+}
+ci_inline int
+ci_atomic_xchg (ci_atomic_t *a, int v) {
+   __asm__ ("lock xchg %0, %1" : "=r" (v), "+m" (a->n) : "0" (v));
+  return v;
+}
+
+ci_inline void ci_atomic32_or(volatile ci_uint32* p, ci_uint32 mask)
+{ __asm__ __volatile__("lock; orl %1, %0" : "+m" (*p) : "ir" (mask)); }
+
+ci_inline void ci_atomic32_and(volatile ci_uint32* p, ci_uint32 mask)
+{ __asm__ __volatile__("lock; andl %1, %0" : "+m" (*p) : "ir" (mask)); }
+
+ci_inline void ci_atomic32_add(volatile ci_uint32* p, ci_uint32 v)
+{ __asm__ __volatile__("lock; addl %1, %0" : "+m" (*p) : "ir" (v)); }
+
+ci_inline void ci_atomic32_inc(volatile ci_uint32* p)
+{ __asm__ __volatile__("lock; incl %0" : "+m" (*p)); }
+
+ci_inline int ci_atomic32_dec_and_test(volatile ci_uint32* p) {
+  char r;
+  __asm__ __volatile__("lock; decl %0; sete %1" : "+m" (*p), "=qm" (r));
+  return r;
+}
+
+#define ci_atomic_or(a, v)   ci_atomic32_or ((ci_uint32*) &(a)->n, (v))
+#define ci_atomic_and(a, v)  ci_atomic32_and((ci_uint32*) &(a)->n, (v))
+#define ci_atomic_add(a, v)  ci_atomic32_add((ci_uint32*) &(a)->n, (v))
+
+extern int ci_glibc_uses_nptl (void) CI_HF;
+extern int ci_glibc_nptl_broken(void) CI_HF;
+extern int ci_glibc_gs_get_is_multihreaded_offset (void) CI_HF;
+extern int ci_glibc_gs_is_multihreaded_offset CI_HV;
+
+#if !defined(__x86_64__)
+#ifdef __GLIBC__
+/* Returns non-zero if the calling process might be multithreaded, returns 0 if
+ * it definitely isn't (i.e. if reimplementing this function for other
+ * architectures and platforms, you can safely just return 1).
+ */
+static inline int ci_is_multithreaded (void) {
+
+  while (1) {
+    if (ci_glibc_gs_is_multihreaded_offset >= 0) {
+      /* NPTL keeps a variable that tells us this hanging off gs (i.e. in thread-
+       * local storage); just return this
+       */
+      int r;
+      __asm__ __volatile__ ("movl %%gs:(%1), %0"
+                            : "=r" (r)
+                            : "r" (ci_glibc_gs_is_multihreaded_offset));
+      return r;
+    }
+
+    if (ci_glibc_gs_is_multihreaded_offset == -2) {
+      /* This means we've already determined that the libc version is NOT good
+       * for our funky "is multithreaded" hack
+       */
+      return 1;
+    }
+
+    /* If we get here, it means this is the first time the function has been
+     * called -- detect the libc version and go around again.
+     */
+    ci_glibc_gs_is_multihreaded_offset = ci_glibc_gs_get_is_multihreaded_offset ();
+
+    /* Go around again.  We do the test here rather than at the top so that we go
+     * quicker in the common case
+     */
+  }
+}
+
+#else    /* def __GLIBC__ */
+
+#define ci_is_multithreaded() 1 /* ?? Is there a POSIX way of finding out */
+                                /*    whether the application is single */
+                                /*    threaded? */
+
+#endif   /* def __GLIBC__ */
+
+#else    /* defined __x86_64__ */
+
+static inline int ci_is_multithreaded (void) {
+  /* No easy way to tell on x86_64; so assume we're multithreaded */
+  return 1;
+}
+
+#endif    /* defined __x86_64__ */
+
+
+/**********************************************************************
+ * Compare and swap.
+ */
+
+#define CI_HAVE_COMPARE_AND_SWAP
+
+ci_inline int ci_cas32_succeed(volatile ci_int32* p, ci_int32 oldval,
+                               ci_int32 newval) {
+  char ret;
+  ci_int32 prevval;
+  __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+ci_inline int ci_cas32_fail(volatile ci_int32* p, ci_int32 oldval,
+                            ci_int32 newval) {
+  char ret;
+  ci_int32 prevval;
+  __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+#ifdef __x86_64__
+ci_inline int ci_cas64_succeed(volatile ci_int64* p, ci_int64 oldval,
+                              ci_int64 newval) {
+  char ret;
+  ci_int64 prevval;
+  __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+ci_inline int ci_cas64_fail(volatile ci_int64* p, ci_int64 oldval,
+                           ci_int64 newval) {
+  char ret;
+  ci_int64 prevval;
+  __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+#endif
+
+ci_inline int ci_cas32u_succeed(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
+  char ret;
+  ci_uint32 prevval;
+  __asm__ __volatile__("lock; cmpxchgl %3, %1; sete %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+ci_inline int ci_cas32u_fail(volatile ci_uint32* p, ci_uint32 oldval, ci_uint32 newval) {
+  char ret;
+  ci_uint32 prevval;
+  __asm__ __volatile__("lock; cmpxchgl %3, %1; setne %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+ci_inline int ci_cas64u_succeed(volatile ci_uint64* p, ci_uint64 oldval,
+                              ci_uint64 newval) {
+  char ret;
+  ci_uint64 prevval;
+  __asm__ __volatile__("lock; cmpxchgq %3, %1; sete %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+ci_inline int ci_cas64u_fail(volatile ci_uint64* p, ci_uint64 oldval,
+                           ci_uint64 newval) {
+  char ret;
+  ci_uint64 prevval;
+  __asm__ __volatile__("lock; cmpxchgq %3, %1; setne %0"
+                      : "=q"(ret), "+m"(*p), "=a"(prevval)
+                      : "r"(newval), "a"(oldval));
+  return ret;
+}
+
+#ifdef __x86_64__
+
+# define ci_cas_uintptr_succeed(p,o,n)                         \
+    ci_cas64u_succeed((volatile ci_uint64*) (p), (o), (n))
+# define ci_cas_uintptr_fail(p,o,n)                            \
+    ci_cas64u_fail((volatile ci_uint64*) (p), (o), (n))
+
+#else
+
+# define ci_cas_uintptr_succeed(p,o,n)                         \
+    ci_cas32u_succeed((volatile ci_uint32*) (p), (o), (n))
+# define ci_cas_uintptr_fail(p,o,n)                            \
+    ci_cas32u_fail((volatile ci_uint32*) (p), (o), (n))
+
+#endif
+
+
+/**********************************************************************
+ * Atomic bit field.
+ */
+
+typedef ci_uint32  ci_bits;
+#define CI_BITS_N                      32u
+
+#define CI_BITS_DECLARE(name, n)                       \
+  ci_bits name[((n) + CI_BITS_N - 1u) / CI_BITS_N]
+
+ci_inline void ci_bits_clear_all(volatile ci_bits* b, int n_bits)
+{ memset((void*) b, 0, (n_bits+CI_BITS_N-1u) / CI_BITS_N * sizeof(ci_bits)); }
+
+ci_inline void ci_bit_set(volatile ci_bits* b, int i) {  /* atomically set bit i of the bit array at b */
+  __asm__ __volatile__("lock; btsl %1, %0"
+                      : "+m" (*b)  /* "+m": bts reads the word as well as writing it */
+                      : "Ir" (i) : "memory");  /* a register index may reach words beyond *b */
+}
+
+ci_inline void ci_bit_clear(volatile ci_bits* b, int i) {  /* atomically clear bit i of the bit array at b */
+  __asm__ __volatile__("lock; btrl %1, %0"
+                      : "+m" (*b)  /* "+m": btr reads the word as well as writing it */
+                      : "Ir" (i) : "memory");  /* a register index may reach words beyond *b */
+}
+
+ci_inline int  ci_bit_test(volatile ci_bits* b, int i) {
+  char rc;
+  __asm__("btl %2, %1; setc %0"
+         : "=r" (rc)
+         : "m" (*b), "Ir" (i));
+  return rc;
+}
+
+ci_inline int ci_bit_test_and_set(volatile ci_bits* b, int i) {
+  char rc;
+  __asm__ __volatile__("lock; btsl %2, %1; setc %0"
+                      : "=r" (rc), "+m" (*b)
+                      : "Ir" (i));
+  return rc;
+}
+
+ci_inline int ci_bit_test_and_clear(volatile ci_bits* b, int i) {
+  char rc;
+  __asm__ __volatile__("lock; btrl %2, %1; setc %0"
+                      : "=r" (rc), "+m" (*b)
+                      : "Ir" (i));
+  return rc;
+}
+
+/* These mask ops only work within a single ci_bits word. */
+#define ci_bit_mask_set(b,m)   ci_atomic32_or((b), (m))
+#define ci_bit_mask_clear(b,m) ci_atomic32_and((b), ~(m))
+
+
+/**********************************************************************
+ * Misc.
+ */
+
+#if __GNUC__ >= 3
+# define ci_spinloop_pause()  __asm__("pause") 
+#else
+# define ci_spinloop_pause()  __asm__(".byte 0xf3, 0x90")
+#endif
+
+
+#define CI_HAVE_ADDC32  /* ci_add_carry32(sum, v): sum += v, then add the carry-out back into sum */
+#define ci_add_carry32(sum, v)  __asm__("addl %1, %0 ;"                          \
+                                       "adcl $0, %0 ;"                   \
+                                       : "=r" (sum)                      \
+                                       : "g" ((ci_uint32) (v)), "0" (sum))
+
+
+#endif  /* __CI_TOOLS_GCC_X86_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h b/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h

new file mode 100644 (file)

index 0000000..e0870b6
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h
@@ -0,0 +1,361 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+
+/*! \cidoxg_include_ci_tools_platform  */
+
+#ifndef __CI_TOOLS_LINUX_KERNEL_H__
+#define __CI_TOOLS_LINUX_KERNEL_H__
+
+/**********************************************************************
+ * Need to know the kernel version.
+ */
+
+#ifndef LINUX_VERSION_CODE
+# include <linux/version.h>
+# ifndef UTS_RELEASE
+   /* 2.6.18 onwards defines UTS_RELEASE in a separate header */
+#  include <linux/utsrelease.h>
+# endif
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) || \
+    LINUX_VERSION_CODE >= KERNEL_VERSION(2,7,0)
+# error "Linux 2.6 required"
+#endif
+
+
+#include <linux/slab.h>     /* kmalloc / kfree */
+#include <linux/vmalloc.h>  /* vmalloc / vfree */
+#include <linux/interrupt.h>/* in_interrupt()  */
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/ctype.h>
+#include <linux/uio.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/kmap_types.h>
+#include <asm/semaphore.h>
+
+#include <ci/tools/config.h>
+
+#define ci_in_irq        in_irq
+#define ci_in_interrupt  in_interrupt
+#define ci_in_atomic     in_atomic
+
+
+/**********************************************************************
+ * Misc stuff.
+ */
+
+#ifdef BUG
+# define  CI_BOMB     BUG
+#endif
+
+ci_inline void* __ci_alloc(size_t n)
+{ return kmalloc(n, (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)); }
+
+ci_inline void* __ci_atomic_alloc(size_t n)
+{ return kmalloc(n, GFP_ATOMIC ); }
+
+ci_inline void  __ci_free(void* p)     { kfree(p);          }  /* kfree() wrapper; no 'return expr' in a void fn */
+ci_inline void* __ci_vmalloc(size_t n) { return vmalloc(n); }  /* vmalloc() wrapper */
+ci_inline void  __ci_vfree(void* p)    { vfree(p);          }  /* vfree() wrapper */
+
+
+#if CI_MEMLEAK_DEBUG_ALLOC_TABLE
+  #define ci_alloc(s)     ci_alloc_memleak_debug (s, __FILE__, __LINE__)
+  #define ci_atomic_alloc(s)  ci_atomic_alloc_memleak_debug(s, __FILE__, __LINE__)
+  #define ci_free         ci_free_memleak_debug
+  #define ci_vmalloc(s)   ci_vmalloc_memleak_debug (s, __FILE__,__LINE__)
+  #define ci_vfree        ci_vfree_memleak_debug
+  #define ci_alloc_fn     ci_alloc_fn_memleak_debug
+  #define ci_vmalloc_fn   ci_vmalloc_fn_memleak_debug
+#else /* !CI_MEMLEAK_DEBUG_ALLOC_TABLE */
+  #define ci_alloc_fn     __ci_alloc
+  #define ci_vmalloc_fn   __ci_vmalloc
+#endif 
+
+#ifndef ci_alloc
+  #define ci_atomic_alloc __ci_atomic_alloc
+  #define ci_alloc        __ci_alloc
+  #define ci_free         __ci_free
+  #define ci_vmalloc      __ci_vmalloc
+  #define ci_vmalloc_fn   __ci_vmalloc
+  #define ci_vfree        __ci_vfree
+#endif
+
+#define ci_sprintf        sprintf
+#define ci_vsprintf       vsprintf
+#define ci_snprintf       snprintf
+#define ci_vsnprintf      vsnprintf
+#define ci_sscanf         sscanf
+
+
+#define CI_LOG_FN_DEFAULT  ci_log_syslog
+
+
+/*--------------------------------------------------------------------
+ *
+ * irqs_disabled - needed for kmap helpers on some kernels 
+ *
+ *--------------------------------------------------------------------*/
+#ifdef irqs_disabled
+# define ci_irqs_disabled irqs_disabled
+#else  /* kernel provides no irqs_disabled(): open-code it for x86 */
+# if defined(__i386__) || defined(__x86_64__)
+#   define ci_irqs_disabled(x)                  \
+  ({                                            \
+    unsigned long flags;                        \
+    local_save_flags(flags);                    \
+    !(flags & (1<<9));  /* bit 9 = EFLAGS.IF */ \
+  })
+# else
+#  error "Need to implement irqs_disabled() for your architecture"
+# endif
+#endif
+
+
+/**********************************************************************
+ * kmap helpers. 
+ *
+ * Use ci_k(un)map for code paths which are not in an atomic context.
+ * For atomic code you need to use ci_k(un)map_in_atomic. This will grab
+ * one of the per-CPU kmap slots.
+ *
+ * NB in_interrupt != in_irq. If you don't know the difference then
+ * don't use kmap_in_atomic
+ *
+ * 2.4 allocates kmap slots by function. We are going to re-use the
+ * skb module's slot - we also use the same interlock
+ * 
+ * 2.6 allocates kmap slots by type as well as by function. We are
+ * going to use the currently (2.6.10) unused SOFTIRQ slot
+ *
+ */
+
+ci_inline void* ci_kmap(struct page *page) {
+  CI_DEBUG(if( ci_in_atomic() | ci_in_interrupt() | ci_in_irq() )  BUG());
+  return kmap(page);
+}
+
+ci_inline void ci_kunmap(struct page *page) {
+  kunmap(page);
+}
+
+#define CI_KM_SLOT KM_SOFTIRQ0
+
+
+typedef struct semaphore ci_semaphore_t;
+
+ci_inline void
+ci_sem_init (ci_semaphore_t *sem, int val) {
+  sema_init (sem, val);
+}
+
+ci_inline void
+ci_sem_down (ci_semaphore_t *sem) {
+  down (sem);
+}
+
+ci_inline int
+ci_sem_trydown (ci_semaphore_t *sem) {
+  return down_trylock (sem);
+}
+
+ci_inline void
+ci_sem_up (ci_semaphore_t *sem) {
+  up (sem);
+}
+
+ci_inline int
+ci_sem_get_count(ci_semaphore_t *sem) {
+  return sem->count.counter;
+}
+
+ci_inline void* ci_kmap_in_atomic(struct page *page) 
+{
+  CI_DEBUG(if( ci_in_irq() )  BUG());
+
+  /* iSCSI can call without in_interrupt() but with irqs_disabled()
+     and in a context that can't sleep, so we need to check that
+     too */
+  if(ci_in_interrupt() || ci_irqs_disabled())
+    return kmap_atomic(page, CI_KM_SLOT);
+  else
+    return kmap(page);
+}
+
+ci_inline void ci_kunmap_in_atomic(struct page *page, void* kaddr) 
+{
+  CI_DEBUG(if( ci_in_irq() )  BUG());
+
+  /* iSCSI can call without in_interrupt() but with irqs_disabled()
+     and in a context that can't sleep, so we need to check that
+     too */
+  if(ci_in_interrupt() || ci_irqs_disabled())
+    kunmap_atomic(kaddr, CI_KM_SLOT);
+  else
+    kunmap(page);
+}
+
+/**********************************************************************
+ * spinlock implementation: used by <ci/tools/spinlock.h>
+ */
+
+#define CI_HAVE_SPINLOCKS
+
+typedef ci_uintptr_t                           ci_lock_holder_t;
+#define ci_lock_thisthread             (ci_lock_holder_t)current                       
+#define ci_lock_no_holder     (ci_lock_holder_t)NULL
+
+typedef spinlock_t                     ci_lock_i;
+typedef spinlock_t                     ci_irqlock_i;
+typedef unsigned long                  ci_irqlock_state_t;
+
+#define IRQLOCK_CYCLES  500000
+
+#define ci_lock_ctor_i(l)              spin_lock_init(l)
+#define ci_lock_dtor_i(l)              do{}while(0)
+#define ci_lock_lock_i(l)              spin_lock(l)
+#define ci_lock_trylock_i(l)           spin_trylock(l)
+#define ci_lock_unlock_i(l)            spin_unlock(l)
+
+#define ci_irqlock_ctor_i(l)           spin_lock_init(l)
+#define ci_irqlock_dtor_i(l)           do{}while(0)
+#define ci_irqlock_lock_i(l,s)         spin_lock_irqsave(l,*(s))
+#define ci_irqlock_unlock_i(l,s)       spin_unlock_irqrestore(l, *(s))
+
+
+/**********************************************************************
+ * register access
+ */
+
+#include <asm/io.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
+typedef volatile void __iomem* ioaddr_t;
+#else
+typedef unsigned long ioaddr_t;
+#endif
+
+
+
+/**********************************************************************
+ * thread implementation -- kernel dependencies probably should be
+ * moved to driver/linux_kernel.h
+ */
+
+#define ci_linux_daemonize(name) daemonize(name)
+
+#include <linux/workqueue.h>
+
+
+typedef struct {
+  void*                        (*fn)(void* arg);
+  void*                        arg;
+  const char*          name;
+  int                  thrd_id;
+  struct completion    exit_event;
+  struct work_struct   keventd_witem;
+} ci_kernel_thread_t;
+
+
+typedef ci_kernel_thread_t* cithread_t;
+
+
+extern int cithread_create(cithread_t* tid, void* (*fn)(void*), void* arg,
+                          const char* name);
+extern int cithread_detach(cithread_t kt);
+extern int cithread_join(cithread_t kt);
+
+
+/* Kernel sysctl variables. */
+extern int sysctl_tcp_wmem[3];
+extern int sysctl_tcp_rmem[3];
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#define LINUX_HAS_SYSCTL_MEM_MAX
+extern ci_uint32 sysctl_wmem_max;
+extern ci_uint32 sysctl_rmem_max;
+#endif
+
+
+/*--------------------------------------------------------------------
+ *
+ * ci_bigbuf_t: An abstraction of a large buffer.  Needed because in the
+ * Linux kernel, large buffers need to be allocated with vmalloc(), whereas
+ * smaller buffers should use kmalloc().  This abstraction chooses the
+ * appropriate mechanism.
+ *
+ *--------------------------------------------------------------------*/
+
+typedef struct {
+  char*                p;
+  int          is_vmalloc;
+} ci_bigbuf_t;
+
+
+ci_inline int ci_bigbuf_alloc(ci_bigbuf_t* bb, size_t bytes) {  /* returns 0, or -ENOMEM on failure */
+  if( bytes >= CI_PAGE_SIZE && ! ci_in_atomic() ) {  /* vmalloc() is only tried outside atomic context */
+    bb->is_vmalloc = 1;
+    if( (bb->p = vmalloc(bytes)) )  return 0;
+  }
+  bb->is_vmalloc = 0;  /* small request, atomic context, or vmalloc() failed: use kmalloc() */
+  bb->p = kmalloc(bytes, ci_in_atomic() ? GFP_ATOMIC : GFP_KERNEL);  /* same context test as above */
+  return bb->p ? 0 : -ENOMEM;
+}
+
+ci_inline void ci_bigbuf_free(ci_bigbuf_t* bb) {
+  if( bb->is_vmalloc )  vfree(bb->p);
+  else                  kfree(bb->p);
+}
+
+ci_inline char* ci_bigbuf_ptr(ci_bigbuf_t* bb)
+{ return bb->p; }
+
+/**********************************************************************
+ * struct iovec abstraction (for Windows port)
+ */
+
+typedef struct iovec ci_iovec;
+
+/* Accessors for buffer/length */
+#define CI_IOVEC_BASE(i) ((i)->iov_base)
+#define CI_IOVEC_LEN(i)  ((i)->iov_len)
+
+/**********************************************************************
+ * Signals
+ */
+
+ci_inline void
+ci_send_sig(int signum)
+{
+  send_sig(signum, current, 0);
+}
+
+#endif  /* __CI_TOOLS_LINUX_KERNEL_H__ */
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netback/ci/tools/sysdep.h b/drivers/xen/sfc_netback/ci/tools/sysdep.h

new file mode 100644 (file)

index 0000000..9be16dd
--- /dev/null
+++ b/drivers/xen/sfc_netback/ci/tools/sysdep.h
@@ -0,0 +1,132 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*! \cidoxg_include_ci_tools */
+
+#ifndef __CI_TOOLS_SYSDEP_H__
+#define __CI_TOOLS_SYSDEP_H__
+
+/* Make this header self-sufficient */
+#include <ci/compat.h>
+#include <ci/tools/log.h>
+#include <ci/tools/debug.h>
+
+
+/**********************************************************************
+ * Platform dependencies.
+ */
+
+#if defined(__KERNEL__)
+
+# if defined(__linux__)
+#  include <ci/tools/platform/linux_kernel.h>
+# elif defined(_WIN32)
+#  include <ci/tools/platform/win32_kernel.h>
+# elif defined(__sun__)
+#  include <ci/tools/platform/sunos_kernel.h>
+# else
+#  error Unknown platform.
+# endif
+
+#elif defined(_WIN32)
+
+# include <ci/tools/platform/win32.h>
+
+#elif defined(__unix__)
+
+# include <ci/tools/platform/unix.h>
+
+#else
+
+# error Unknown platform.
+
+#endif
+
+#if defined(__linux__)
+/*! Linux sendfile() support enable/disable. */
+# define CI_HAVE_SENDFILE            /* provide sendfile i/f */
+
+# define CI_HAVE_OS_NOPAGE
+#endif
+
+#if defined(__sun__)
+# define CI_HAVE_SENDFILE           /* provide sendfile i/f */
+# define CI_HAVE_SENDFILEV           /* provide sendfilev i/f */
+
+# define CI_IOCTL_SENDFILE           /*  use efrm CI_SENDFILEV ioctl */
+#endif
+
+#if defined(_WIN32)
+typedef ci_uint32 ci_uerr_t; /* range of OS user-mode return codes */
+typedef ci_uint32 ci_kerr_t; /* range of OS kernel-mode return codes */
+#elif defined(__unix__)
+typedef ci_int32 ci_uerr_t; /* range of OS user-mode return codes */
+typedef ci_int32 ci_kerr_t; /* range of OS kernel-mode return codes */
+#endif
+
+
+/**********************************************************************
+ * Compiler and processor dependencies.
+ */
+
+#if defined(__GNUC__)
+
+#if defined(__i386__) || defined(__x86_64__)
+# include <ci/tools/platform/gcc_x86.h>
+#elif defined(__PPC__)
+#  include <ci/tools/platform/gcc_ppc.h>
+#elif defined(__ia64__)
+#  include <ci/tools/platform/gcc_ia64.h>
+#else
+# error Unknown processor.
+#endif
+
+#elif defined(_MSC_VER)
+
+#if defined(__i386__)
+# include <ci/tools/platform/msvc_x86.h>
+# elif defined(__x86_64__)
+# include <ci/tools/platform/msvc_x86_64.h>
+#else
+# error Unknown processor.
+#endif
+
+#elif defined(__PGI)
+
+# include <ci/tools/platform/pg_x86.h>
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# include <ci/tools/platform/gcc_x86.h>
+
+#else
+# error Unknown compiler.
+#endif
+
+
+#endif  /* __CI_TOOLS_SYSDEP_H__ */
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netfront/Makefile b/drivers/xen/sfc_netfront/Makefile

new file mode 100644 (file)

index 0000000..0e4a54b
--- /dev/null
+++ b/drivers/xen/sfc_netfront/Makefile
@@ -0,0 +1,11 @@
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netfront -Idrivers/xen/sfc_netutil -Idrivers/xen/netfront
+EXTRA_CFLAGS += -D__ci_driver__
+EXTRA_CFLAGS += -Werror
+
+ifdef GCOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_FRONTEND)    := sfc_netfront.o
+
+sfc_netfront-objs := accel_msg.o accel_bufs.o accel_netfront.o accel_vi.o accel_xenbus.o accel_tso.o accel_ssr.o accel_debugfs.o falcon_event.o falcon_vi.o pt_tx.o vi_init.o
diff --git a/drivers/xen/sfc_netfront/accel.h b/drivers/xen/sfc_netfront/accel.h

new file mode 100644 (file)

index 0000000..a915aaa
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel.h
@@ -0,0 +1,495 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_H
+#define NETFRONT_ACCEL_H
+
+#include "accel_msg_iface.h"
+#include "accel_cuckoo_hash.h"
+#include "accel_bufs.h"
+
+#include "etherfabric/ef_vi.h"
+
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/version.h>
+#include <linux/list.h>
+/* Result codes returned by netfront_accel_vi_tx_post(). */
+enum netfront_accel_post_status {
+       NETFRONT_ACCEL_STATUS_GOOD,
+       NETFRONT_ACCEL_STATUS_BUSY,
+       NETFRONT_ACCEL_STATUS_CANT
+};
+
+#define NETFRONT_ACCEL_STATS 1
+#if NETFRONT_ACCEL_STATS
+#define NETFRONT_ACCEL_STATS_OP(x) x
+#else
+#define NETFRONT_ACCEL_STATS_OP(x)
+#endif
+
+/* Values held in netfront_accel_vnic.msg_state. */
+enum netfront_accel_msg_state {
+       NETFRONT_ACCEL_MSG_NONE = 0,
+       NETFRONT_ACCEL_MSG_HELLO = 1,
+       NETFRONT_ACCEL_MSG_HW = 2
+};
+
+/* Tracks the fragments of a jumbo packet as events arrive (netfront_accel_vnic.jumbo_state). */
+typedef struct {
+       u32 in_progress;
+       u32 total_len;
+       struct sk_buff *skb;
+} netfront_accel_jumbo_state;
+
+/* SSR bookkeeping embedded in each vnic (netfront_accel_vnic.ssr_state). */
+struct netfront_accel_ssr_state {
+       /** List of tracked connections. */
+       struct list_head conns;
+
+       /** Free efx_ssr_conn instances. */
+       struct list_head free_conns;
+};
+
+/* Fastpath counters that are added to the netdev stats (see netfront_accel_vnic). */
+struct netfront_accel_netdev_stats {
+       /* Fastpath stats. */
+       u32 fastpath_rx_pkts;
+       u32 fastpath_rx_bytes;
+       u32 fastpath_rx_errors;
+       u32 fastpath_tx_pkts; 
+       u32 fastpath_tx_bytes;
+       u32 fastpath_tx_errors;
+};
+
+/* One dentry per netfront_accel_netdev_stats counter; presumably its debugfs file - confirm. */
+struct netfront_accel_netdev_dbfs {
+       struct dentry *fastpath_rx_pkts;
+       struct dentry *fastpath_rx_bytes;
+       struct dentry *fastpath_rx_errors;
+       struct dentry *fastpath_tx_pkts; 
+       struct dentry *fastpath_tx_bytes;
+       struct dentry *fastpath_tx_errors;
+};
+
+/* Internal, optional statistics; a vnic holds them only when NETFRONT_ACCEL_STATS is non-zero. */
+struct netfront_accel_stats {
+       /** Fast path events */
+       u64 fastpath_tx_busy;
+
+       /** TX DMA queue status */
+       u64 fastpath_tx_completions;
+
+       /** The number of events processed. */
+       u64 event_count;
+
+       /** Number of frame trunc events seen on fastpath */
+       u64 fastpath_frm_trunc;
+
+       /** Number of rx discard (bad crc) events seen on fastpath */
+       u64 fastpath_crc_bad;
+
+       /** Number of rx discard (bad csum) events seen on fastpath */
+       u64 fastpath_csum_bad;
+
+       /** Number of rx discard (bad rights) events seen on fastpath */
+       u64 fastpath_rights_bad;
+
+       /** Number of rx discard ("other") events seen on fastpath */
+       u64 fastpath_discard_other;
+
+       /** Number of no rx descriptor trunc events seen on fastpath */
+       u64 rx_no_desc_trunc;
+
+       /** The number of misc bad events processed. */
+       u64 bad_event_count;
+
+       /** Number of events dealt with in poll loop */
+       u32 events_per_poll_max;
+       u32 events_per_poll_tx_max;
+       u32 events_per_poll_rx_max;
+
+       /** Largest number of concurrently outstanding tx descriptors */
+       u32 fastpath_tx_pending_max;
+
+       /** The number of events since the last interrupt. */
+       u32 event_count_since_irq;
+
+       /** The max number of events between interrupts. */
+       u32 events_per_irq_max;
+
+       /** The number of interrupts. */
+       u64 irq_count;
+
+       /** The number of useless interrupts. */
+       u64 useless_irq_count;
+
+       /** The number of polls scheduled. */
+       u64 poll_schedule_count;
+
+       /** The number of polls called. */
+       u64 poll_call_count;
+
+       /** The number of rechecks. */
+       u64 poll_reschedule_count;
+
+       /** Number of times we've called netif_stop_queue/netif_wake_queue */
+       u64 queue_stops;
+       u64 queue_wakes;
+
+       /** SSR stats */
+       u64 ssr_bursts;
+       u64 ssr_drop_stream;
+       u64 ssr_misorder;
+       u64 ssr_slow_start;
+       u64 ssr_merges;
+       u64 ssr_too_many;
+       u64 ssr_new_stream;
+};
+
+/* One dentry per netfront_accel_stats counter; presumably its debugfs file - confirm. */
+struct netfront_accel_dbfs {
+       struct dentry *fastpath_tx_busy;
+       struct dentry *fastpath_tx_completions;
+       struct dentry *fastpath_tx_pending_max;
+       struct dentry *fastpath_frm_trunc;
+       struct dentry *fastpath_crc_bad;
+       struct dentry *fastpath_csum_bad;
+       struct dentry *fastpath_rights_bad;
+       struct dentry *fastpath_discard_other;
+       struct dentry *rx_no_desc_trunc;
+       struct dentry *event_count;
+       struct dentry *bad_event_count;
+       struct dentry *events_per_poll_max;
+       struct dentry *events_per_poll_rx_max;
+       struct dentry *events_per_poll_tx_max;
+       struct dentry *event_count_since_irq;
+       struct dentry *events_per_irq_max;
+       struct dentry *irq_count;
+       struct dentry *useless_irq_count;
+       struct dentry *poll_schedule_count;
+       struct dentry *poll_call_count;
+       struct dentry *poll_reschedule_count;
+       struct dentry *queue_stops;
+       struct dentry *queue_wakes;
+       struct dentry *ssr_bursts;
+       struct dentry *ssr_drop_stream;
+       struct dentry *ssr_misorder;
+       struct dentry *ssr_slow_start;
+       struct dentry *ssr_merges;
+       struct dentry *ssr_too_many;
+       struct dentry *ssr_new_stream;
+};
+
+/* State for one accelerated virtual network interface ("vnic"). */
+typedef struct netfront_accel_vnic {
+       struct netfront_accel_vnic *next;
+       
+       struct mutex vnic_mutex;
+
+       spinlock_t tx_lock;
+
+       struct netfront_accel_bufpages bufpages;
+       struct netfront_accel_bufinfo *rx_bufs;
+       struct netfront_accel_bufinfo *tx_bufs;
+       
+       /** Hardware & VI state */
+       ef_vi vi;
+
+       ef_vi_state *vi_state;
+
+       ef_eventq_state evq_state;
+
+       void *evq_mapping;
+
+       /** Hardware dependent state */
+       union {
+               struct {
+                       /** Falcon A or B */
+                       enum net_accel_hw_type type; 
+                       u32 *evq_rptr;
+                       u32 *doorbell;
+                       void *evq_rptr_mapping;
+                       void *doorbell_mapping;
+                       void *txdmaq_mapping;
+                       void *rxdmaq_mapping;
+               } falcon;
+       } hw;
+  
+       /** RX DMA queue status */
+       u32 rx_dma_level;
+
+       /** Number of RX descriptors waiting to be pushed to the card. */
+       u32 rx_dma_batched;
+#define NETFRONT_ACCEL_RX_DESC_BATCH 16
+
+       /**
+        * Hash table of remote mac addresses to decide whether to try
+        * fast path
+        */
+       cuckoo_hash_table fastpath_table;
+       spinlock_t table_lock;
+
+       /** the local mac address of virtual interface we're accelerating */
+       u8 mac[ETH_ALEN];
+
+       int rx_pkt_stride;
+       int rx_skb_stride;
+
+       /**
+        * Keep track of fragments of jumbo packets as events are
+        * delivered by NIC 
+        */
+       netfront_accel_jumbo_state jumbo_state;
+
+       struct net_device *net_dev;
+
+       /** These two gate the enabling of fast path operations */
+       int frontend_ready;
+       int backend_netdev_up;
+
+       int irq_enabled;
+       spinlock_t irq_enabled_lock;
+
+       int tx_enabled;
+
+       int poll_enabled;
+
+       /** A spare slot for a TX packet.  This is treated as an
+        * extension of the DMA queue.  Reads require either
+        * netfront's tx_lock or the vnic tx_lock; writes require both
+        * locks */
+       struct sk_buff *tx_skb;
+
+       /** Keep track of fragments of SSR packets */
+       struct netfront_accel_ssr_state ssr_state;
+
+       struct xenbus_device *dev;
+
+       /** Event channel for messages */
+       int msg_channel;
+       int msg_channel_irq;
+
+       /** Event channel for network interrupts. */
+       int net_channel;
+       int net_channel_irq;
+
+       struct net_accel_shared_page *shared_page;
+
+       grant_ref_t ctrl_page_gnt;
+       grant_ref_t msg_page_gnt;
+
+       /** Message Qs, 1 each way. */
+       sh_msg_fifo2 to_dom0;
+       sh_msg_fifo2 from_dom0;
+
+       enum netfront_accel_msg_state msg_state;
+
+       /** Watch on accelstate */
+       struct xenbus_watch backend_accel_watch;
+       /** Watch on frontend's MAC address */
+       struct xenbus_watch mac_address_watch;
+
+       /** Work to process received irq/msg */
+       struct work_struct msg_from_bend;
+
+       /** Wait queue for changes in accelstate. */
+       wait_queue_head_t state_wait_queue;
+
+       /** The current accelstate of this driver. */
+       XenbusState frontend_state;
+
+       /** The most recent accelstate seen by the xenbus watch. */
+       XenbusState backend_state;
+
+       /** Non-zero if we should reject requests to connect. */
+       int removing;
+
+       /** Non-zero if the domU shared state has been initialised. */
+       int domU_state_is_setup;
+
+       /** Non-zero if the dom0 shared state has been initialised. */
+       int dom0_state_is_setup;
+
+       /* Those statistics that are added to the netdev stats */
+       struct netfront_accel_netdev_stats netdev_stats;
+       struct netfront_accel_netdev_stats stats_last_read;
+#ifdef CONFIG_DEBUG_FS
+       struct netfront_accel_netdev_dbfs netdev_dbfs;
+#endif
+
+       /* These statistics are internal and optional */
+#if NETFRONT_ACCEL_STATS
+       struct netfront_accel_stats stats;
+#ifdef CONFIG_DEBUG_FS
+       struct netfront_accel_dbfs dbfs;
+#endif
+#endif
+
+       /** debugfs directory for this interface */
+       struct dentry *dbfs_dir;
+} netfront_accel_vnic;
+
+
+/* Module parameters */
+extern unsigned sfc_netfront_max_pages;
+extern unsigned sfc_netfront_buffer_split;
+
+extern const char *frontend_name;
+extern struct netfront_accel_hooks accel_hooks;
+extern struct workqueue_struct *netfront_accel_workqueue;
+
+
+extern
+void netfront_accel_vi_ctor(netfront_accel_vnic *vnic);
+
+extern
+int netfront_accel_vi_init(netfront_accel_vnic *vnic, 
+                          struct net_accel_msg_hw *hw_msg);
+
+extern
+void netfront_accel_vi_dtor(netfront_accel_vnic *vnic);
+
+
+/**
+ * Add new buffers which have been registered with the NIC.
+ *
+ * @v   vnic     The vnic instance to process the response.
+ *
+ * The buffers contained in the message are added to the buffer pool.
+ */
+extern
+void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx);
+
+/**
+ * Put a packet on the tx DMA queue.
+ *
+ * @v  vnic     The vnic instance to accept the packet.
+ * @v  skb      A sk_buff to send.
+ *
+ * Attempt to send a packet.  On success, the skb is owned by the DMA
+ * queue and will be released when the completion event arrives.
+ */
+extern enum netfront_accel_post_status
+netfront_accel_vi_tx_post(netfront_accel_vnic *vnic,
+                         struct sk_buff *skb);
+
+
+/**
+ * Process events in response to an interrupt.
+ *
+ * @v   vnic       The vnic instance to poll.
+ * @v   rx_packets The maximum number of rx packets to process.
+ * @ret rx_done    The number of rx packets processed.
+ *
+ * The vnic will process events until there are no more events
+ * remaining or the specified number of rx packets has been processed.
+ * The split from the interrupt call is to allow Linux NAPI
+ * polling.
+ */
+extern
+int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets);
+
+
+/**
+ * Iterate over the fragments of a packet buffer.
+ *
+ * @v   skb      The packet buffer to examine.
+ * @v   idx      A variable name for the fragment index.
+ * @v   data     A variable name for the address of the fragment data.
+ * @v   length   A variable name for the fragment length.
+ * @v   code     A section of code to execute for each fragment.
+ *
+ * This macro iterates over the fragments in a packet buffer and
+ * executes the code for each of them.
+ */
+#define NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT(skb, frag_idx,                \
+                                                frag_data, frag_len,   \
+                                                code)                  \
+       do {                                                            \
+               int frag_idx;                                           \
+               void *frag_data;                                        \
+               unsigned int      frag_len;                             \
+                                                                       \
+               frag_data = skb->data;                                  \
+               frag_len = skb_headlen(skb);                            \
+               frag_idx = 0;                                           \
+               while (1) { /* For each fragment */                     \
+                       code;                                           \
+                       if (frag_idx >= skb_shinfo(skb)->nr_frags) {    \
+                               break;                                  \
+                       } else {                                        \
+                               skb_frag_t *fragment;                   \
+                               fragment = &skb_shinfo(skb)->frags[frag_idx]; \
+                               frag_len = skb_frag_size(fragment);     \
+                               frag_data = ((void*)page_address(skb_frag_page(fragment)) \
+                                            + fragment->page_offset);  \
+                       };                                              \
+                       frag_idx++;                                     \
+               }                                                       \
+       } while(0)
+
+/* Mask the event channel that carries this vnic's network interrupts. */
+static inline void netfront_accel_disable_net_interrupts(netfront_accel_vnic *vnic)
+{
+       mask_evtchn(vnic->net_channel);
+}
+
+/* Unmask the event channel that carries this vnic's network interrupts. */
+static inline void netfront_accel_enable_net_interrupts(netfront_accel_vnic *vnic)
+{
+       unmask_evtchn(vnic->net_channel);
+}
+
+void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
+                                   u32 ip, u16 port, u8 protocol);
+
+/* Process an IRQ received from back end driver */
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+extern void netfront_accel_msg_from_bend(struct work_struct *context);
+#else
+extern void netfront_accel_msg_from_bend(void *context);
+#endif
+
+extern void vnic_stop_fastpath(netfront_accel_vnic *vnic);
+
+extern int netfront_accel_probe(struct net_device *net_dev, 
+                               struct xenbus_device *dev);
+extern int netfront_accel_remove(struct xenbus_device *dev);
+extern void netfront_accel_set_closing(netfront_accel_vnic *vnic);
+
+extern int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic);
+
+extern void netfront_accel_debugfs_init(void);
+extern void netfront_accel_debugfs_fini(void);
+extern int netfront_accel_debugfs_create(netfront_accel_vnic *vnic);
+extern int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic);
+
+#endif /* NETFRONT_ACCEL_H */
diff --git a/drivers/xen/sfc_netfront/accel_bufs.c b/drivers/xen/sfc_netfront/accel_bufs.c

new file mode 100644 (file)

index 0000000..f96f73c
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_bufs.c
@@ -0,0 +1,393 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <xen/gnttab.h>
+
+#include "accel_bufs.h"
+#include "accel_util.h"
+
+#include "accel.h"
+
+
+/*
+ * Allocate the zeroed array of descriptor-block pointers for @manager; the
+ * blocks themselves are added by netfront_accel_add_bufs().  0 or -ENOMEM.
+ */
+static int 
+netfront_accel_alloc_buf_desc_blocks(struct netfront_accel_bufinfo *manager,
+                                    int pages)
+{
+       /* kcalloc() guards the count * size multiplication against overflow. */
+       manager->desc_blocks = kcalloc(NETFRONT_ACCEL_BUF_NUM_BLOCKS(pages),
+                       sizeof(struct netfront_accel_pkt_desc *), GFP_KERNEL);
+       return manager->desc_blocks ? 0 : -ENOMEM;
+}
+
+/*
+ * Allocate @bufpages->page_list and ->grant_list, @pages zeroed entries each
+ * (kcalloc() overflow-checks the size).  0, or -ENOMEM with no list held.
+ */
+static int 
+netfront_accel_alloc_buf_lists(struct netfront_accel_bufpages *bufpages,
+                              int pages)
+{
+       bufpages->page_list = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+       if (bufpages->page_list == NULL)
+               return -ENOMEM;
+       bufpages->grant_list = kcalloc(pages, sizeof(grant_ref_t), GFP_KERNEL);
+       if (bufpages->grant_list != NULL)
+               return 0;
+       kfree(bufpages->page_list);
+       bufpages->page_list = NULL;
+       return -ENOMEM;
+}
+
+/*
+ * Allocate the rx/tx descriptor-block arrays, the page/grant lists and
+ * @pages buffer pages (pages / sfc_netfront_buffer_split size the tx array).
+ * Returns 0, or a negative errno with everything allocated here released.
+ */
+int netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
+                                   struct netfront_accel_bufinfo *rx_manager,
+                                   struct netfront_accel_bufinfo *tx_manager,
+                                   int pages)
+{
+       unsigned tx_pages = pages / sfc_netfront_buffer_split;
+       int n, rc;
+
+       rc = netfront_accel_alloc_buf_desc_blocks(rx_manager, pages - tx_pages);
+       if (rc < 0)
+               goto rx_fail;
+
+       rc = netfront_accel_alloc_buf_desc_blocks(tx_manager, tx_pages);
+       if (rc < 0)
+               goto tx_fail;
+
+       rc = netfront_accel_alloc_buf_lists(bufpages, pages);
+       if (rc < 0)
+               goto lists_fail;
+
+       for (n = 0; n < pages; n++) {
+               void *tmp = (void*)__get_free_page(GFP_KERNEL);
+               if (tmp == NULL)
+                       break;
+               bufpages->page_list[n] = tmp;
+       }
+
+       if (n != pages) {
+               EPRINTK("%s: not enough pages: %d != %d\n", __FUNCTION__, n, 
+                       pages);
+               /* Only [0, n) was filled in; page_list[n] was never written. */
+               while (--n >= 0)
+                       free_page((unsigned long)(bufpages->page_list[n]));
+               rc = -ENOMEM;
+               goto pages_fail;
+       }
+
+       bufpages->max_pages = pages;
+       bufpages->page_reqs = 0;
+       return 0;
+
+ pages_fail:
+       kfree(bufpages->page_list);
+       kfree(bufpages->grant_list);
+       bufpages->page_list = NULL;
+       bufpages->grant_list = NULL;
+ lists_fail:
+       kfree(tx_manager->desc_blocks);
+       tx_manager->desc_blocks = NULL;
+ tx_fail:
+       kfree(rx_manager->desc_blocks);
+       rx_manager->desc_blocks = NULL;
+ rx_fail:
+       return rc;
+}
+/* Ungrant and free every buffer page, then free the bookkeeping arrays.
+ * NOTE(review): the blocks desc_blocks[] points to are not freed here. */
+void netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
+                                   struct netfront_accel_bufinfo *rx_manager,
+                                   struct netfront_accel_bufinfo *tx_manager)
+{
+       int pg;
+
+       for (pg = 0; pg < bufpages->max_pages; pg++) {
+               if (bufpages->grant_list[pg])
+                       net_accel_ungrant_page(bufpages->grant_list[pg]);
+               free_page((unsigned long)bufpages->page_list[pg]);
+       }
+
+       if (!bufpages->max_pages)
+               return;
+       kfree(bufpages->page_list);
+       kfree(bufpages->grant_list);
+       kfree(rx_manager->desc_blocks);
+       kfree(tx_manager->desc_blocks);
+}
+
+
+/*
+ * Allocate and initialise a buffer manager.  If @lock is NULL the manager
+ * allocates its own spinlock (internally_locked = 1); otherwise it uses
+ * @lock.  Returns NULL if an allocation fails.
+ */
+struct netfront_accel_bufinfo *netfront_accel_init_bufs(spinlock_t *lock)
+{
+       struct netfront_accel_bufinfo *bufs;
+
+       bufs = kmalloc(sizeof(*bufs), GFP_KERNEL);
+       if (bufs == NULL)
+               return NULL;
+
+       bufs->npages = bufs->nused = 0;
+       bufs->first_free = -1;
+       bufs->desc_blocks = NULL;
+       bufs->internally_locked = (lock == NULL);
+       bufs->lock = lock;
+
+       if (bufs->internally_locked) {
+               bufs->lock = kmalloc(sizeof(*bufs->lock), GFP_KERNEL);
+               if (bufs->lock == NULL) {
+                       kfree(bufs);
+                       return NULL;
+               }
+               spin_lock_init(bufs->lock);
+       }
+       return bufs;
+}
+
+/* Free a manager from netfront_accel_init_bufs(), and its lock if it owns one. */
+void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *bufs)
+{
+       /* kfree(NULL) is a no-op, so a borrowed lock is simply left alone. */
+       kfree(bufs->internally_locked ? bufs->lock : NULL);
+       kfree(bufs);
+}
+/*
+ * Fill @msg with a NET_ACCEL_MSG_MAPBUF request for @pages pages starting at
+ * @bufpages->page_list[@offset], granting each page that has no grant yet.
+ * Returns 0, or -EIO after revoking the grants recorded for earlier pages.
+ */
+int netfront_accel_buf_map_request(struct xenbus_device *dev,
+                                  struct netfront_accel_bufpages *bufpages,
+                                  struct net_accel_msg *msg, 
+                                  int pages, int offset)
+{
+       int i;
+
+       net_accel_msg_init(msg, NET_ACCEL_MSG_MAPBUF);
+       BUG_ON(pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
+       msg->u.mapbufs.pages = pages;
+
+       for (i = 0; i < msg->u.mapbufs.pages; i++) {
+               unsigned long mfn;      /* full width: an int could truncate */
+               int gnt;
+
+               /* Already granted by an earlier attempt that found the queue full. */
+               if (bufpages->grant_list[offset+i] != 0) {
+                       msg->u.mapbufs.grants[i] = 
+                               bufpages->grant_list[offset+i];
+                       continue;
+               }
+
+               mfn = virt_to_mfn(bufpages->page_list[offset+i]);
+               VPRINTK("%s: Granting page %d, mfn %08lx\n",
+                       __FUNCTION__, i, mfn);
+               /*
+                * Test the result in a signed local before recording it, so
+                * a failed grant is never left in grant_list looking live.
+                */
+               gnt = net_accel_grant_page(dev, mfn, 0);
+               if (gnt < 0) {
+                       EPRINTK("%s: Failed to grant buffer: %d\n",
+                               __FUNCTION__, gnt);
+                       goto error;
+               }
+               bufpages->grant_list[offset+i] = gnt;
+               msg->u.mapbufs.grants[i] = gnt;
+       }
+
+       /* This is interpreted on return as the offset in the page_list */
+       msg->u.mapbufs.reqid = offset;
+       return 0;
+
+error:
+       /* Ungrant all the pages we've successfully granted. */
+       while (--i >= 0) {
+               net_accel_ungrant_page(bufpages->grant_list[offset+i]);
+               bufpages->grant_list[offset+i] = 0;
+       }
+       return -EIO;
+}
+
+
+/* Process a MAPBUF reply: add its pages' buffers to @manager's free list; 0 or -ENOMEM. */
+int netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
+                           struct netfront_accel_bufinfo *manager, 
+                           struct net_accel_msg *msg)
+{
+       int msg_pages, page_offset, i, newtot;
+       int old_block_count, new_block_count;
+       u32 msg_buf;
+       unsigned long flags;
+
+       VPRINTK("%s: manager %p msg %p\n", __FUNCTION__, manager, msg);
+
+       BUG_ON(msg->id != (NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY));
+
+       msg_pages = msg->u.mapbufs.pages;
+       msg_buf = msg->u.mapbufs.buf;
+       page_offset = msg->u.mapbufs.reqid; /* offset into bufpages->page_list */
+
+       spin_lock_irqsave(manager->lock, flags); /* taken unconditionally here */
+       newtot = manager->npages + msg_pages;
+       old_block_count = /* blocks needed for the current pages, rounded up */
+               (manager->npages + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
+               NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+       new_block_count = /* blocks needed once the new pages are added */
+               (newtot + NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK - 1) >>
+               NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+
+       for (i = old_block_count; i < new_block_count; i++) {
+               struct netfront_accel_pkt_desc *block;
+               if (manager->desc_blocks[i] != NULL) { /* block already allocated */
+                       VPRINTK("Not needed\n");
+                       continue;
+               }
+               block = kzalloc(NETFRONT_ACCEL_BUFS_PER_BLOCK * 
+                               sizeof(netfront_accel_pkt_desc), GFP_ATOMIC);
+               if (block == NULL) { /* npages unchanged; earlier blocks stay in place */
+                       spin_unlock_irqrestore(manager->lock, flags);
+                       return -ENOMEM;
+               }
+               manager->desc_blocks[i] = block;
+       }
+       for (i = manager->npages; i < newtot; i++) { /* i: page index within the manager */
+               int k, j = i - manager->npages; /* j: page index within this message */
+               int block_num;
+               int block_idx;
+               struct netfront_accel_pkt_desc *pkt;
+
+               block_num = i >> NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT;
+               block_idx = (NETFRONT_ACCEL_BUFS_PER_PAGE*i)
+                       & (NETFRONT_ACCEL_BUFS_PER_BLOCK-1);
+
+               pkt = manager->desc_blocks[block_num] + block_idx; /* first descriptor of page i */
+               
+               for (k = 0; k < NETFRONT_ACCEL_BUFS_PER_PAGE; k++) {
+                       BUG_ON(page_offset + j >= bufpages->max_pages);
+
+                       pkt[k].buf_id = NETFRONT_ACCEL_BUFS_PER_PAGE * i + k;
+                       pkt[k].pkt_kva = bufpages->page_list[page_offset + j] +
+                               (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * k;
+                       pkt[k].pkt_buff_addr = msg_buf +
+                               (PAGE_SIZE/NETFRONT_ACCEL_BUFS_PER_PAGE) * 
+                               (NETFRONT_ACCEL_BUFS_PER_PAGE * j + k);
+                       pkt[k].next_free = manager->first_free; /* push onto the free list */
+                       manager->first_free = pkt[k].buf_id;
+                       *(int*)(pkt[k].pkt_kva) = pkt[k].buf_id; /* stamp buf_id at the start of the buffer */
+
+                       VPRINTK("buf %d desc %p kva %p buffaddr %x\n",
+                               pkt[k].buf_id, &(pkt[k]), pkt[k].pkt_kva, 
+                               pkt[k].pkt_buff_addr);
+               }
+       }
+       manager->npages = newtot;
+       spin_unlock_irqrestore(manager->lock, flags);
+       VPRINTK("Added %d pages. Total is now %d\n", msg_pages,
+               manager->npages);
+       return 0;
+}
+
+
+/* Map buffer @id to its descriptor; BUG()s if @id is beyond the pages added. */
+netfront_accel_pkt_desc *
+netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id)
+{
+       const int blk = id >> NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT;
+       const int slot = id & (NETFRONT_ACCEL_BUFS_PER_BLOCK - 1);
+
+       BUG_ON(id >= manager->npages * NETFRONT_ACCEL_BUFS_PER_PAGE);
+       BUG_ON(slot >= NETFRONT_ACCEL_BUFS_PER_BLOCK);
+       return &manager->desc_blocks[blk][slot];
+}
+/*
+ * Allocate a buffer from the buffer manager: pop the head of the free list
+ * (taking the lock only if internally_locked).  NULL if no buffer is free.
+ */
+netfront_accel_pkt_desc *
+netfront_accel_buf_get(struct netfront_accel_bufinfo *manager)
+{
+       netfront_accel_pkt_desc *buf = NULL;
+       unsigned long flags = 0;
+       int bufno;
+
+       if (manager->first_free == -1)
+               return NULL;
+
+       if (manager->internally_locked)
+               spin_lock_irqsave(manager->lock, flags);
+       bufno = manager->first_free;
+       if (bufno != -1) {
+               buf = netfront_accel_buf_find(manager, bufno);
+               manager->first_free = buf->next_free;
+               manager->nused++;
+       }
+       if (manager->internally_locked)
+               spin_unlock_irqrestore(manager->lock, flags);
+
+       /* The unlocked test above can race: buf may still be NULL here. */
+       if (buf != NULL) {
+               VPRINTK("Allocated buffer %i, buffaddr %x\n", bufno,
+                       buf->pkt_buff_addr);
+       }
+       return buf;
+}
+
+
+/*
+ * Release buffer @id back to the buffer manager pool.  Returns non-zero if
+ * the free list was empty before the buffer was returned.
+ */
+int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, u16 id)
+{
+       netfront_accel_pkt_desc *desc = netfront_accel_buf_find(manager, id);
+       unsigned long flags = 0;
+       int was_empty;
+
+       VPRINTK("Freeing buffer %i\n", id);
+       BUG_ON(id == (u16)-1);
+
+       if (manager->internally_locked)
+               spin_lock_irqsave(manager->lock, flags);
+       /* Push the descriptor onto the head of the free list. */
+       was_empty = (manager->first_free == -1);
+       desc->next_free = manager->first_free;
+       manager->first_free = id;
+       manager->nused--;
+
+       if (manager->internally_locked)
+               spin_unlock_irqrestore(manager->lock, flags);
+
+       return was_empty;
+}
diff --git a/drivers/xen/sfc_netfront/accel_bufs.h b/drivers/xen/sfc_netfront/accel_bufs.h

new file mode 100644 (file)

index 0000000..4ff3eaa
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_bufs.h
@@ -0,0 +1,181 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_BUFS_H
+#define NETFRONT_ACCEL_BUFS_H
+
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <xen/xenbus.h>
+
+#include "accel_msg_iface.h"
+
+
+/*! Buffer descriptor structure */
+typedef struct netfront_accel_pkt_desc {
+       int buf_id;
+       u32 pkt_buff_addr;
+       void *pkt_kva;
+       /* This is the socket buffer currently married to this buffer */
+       struct sk_buff *skb;
+       int next_free;          /* id of the next free buffer; -1 ends the list */
+} netfront_accel_pkt_desc;
+
+
+#define NETFRONT_ACCEL_DEFAULT_BUF_PAGES (384)
+#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT (4)   /* 16 pages per block */
+#define NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK             \
+       (1 << (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT))
+#define NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT (1)         /* 2 buffers per page */
+#define NETFRONT_ACCEL_BUFS_PER_PAGE                   \
+       (1 << (NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT))
+#define NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT            \
+       (NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK_SHIFT +     \
+        NETFRONT_ACCEL_BUFS_PER_PAGE_SHIFT)
+#define NETFRONT_ACCEL_BUFS_PER_BLOCK                  \
+       (1 << NETFRONT_ACCEL_BUFS_PER_BLOCK_SHIFT)      /* 32 buffers per block */
+#define NETFRONT_ACCEL_BUF_NUM_BLOCKS(max_pages)                       \
+       (((max_pages)+NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK-1) /           \
+        NETFRONT_ACCEL_BUF_PAGES_PER_BLOCK)    /* blocks for max_pages, rounded up */
+
+/*! Buffer management structure. */
+struct netfront_accel_bufinfo {
+       /* number added to this manager */
+       unsigned npages;
+       /* number currently used from this manager */
+       unsigned nused;
+
+       int first_free;         /* id at the head of the free list, -1 if empty */
+
+       int internally_locked;  /* non-zero: buf_get/buf_put take 'lock' themselves */
+       spinlock_t *lock;
+
+       /*
+        * array of pointers (length NETFRONT_ACCEL_BUF_NUM_BLOCKS) to
+        * pkt descs
+        */
+       struct netfront_accel_pkt_desc **desc_blocks; 
+};
+
+/*! Pages allocated for network buffers, with the grant for each page. */
+struct netfront_accel_bufpages {
+       /* length of lists of pages/grants */
+       int max_pages;
+       /* list of pages allocated for network buffers */
+       void **page_list;
+       /* list of grants for the above pages */
+       grant_ref_t *grant_list;
+       
+       /* number of page requests that have been made */
+       unsigned page_reqs;
+};
+
+
+/*! Allocate memory for the buffer manager, set up locks etc.
+ * Optionally takes a lock to use, if not supplied it makes its own.
+ *
+ * \return pointer to netfront_accel_bufinfo structure that represents the
+ * buffer manager
+ */
+extern struct netfront_accel_bufinfo *
+netfront_accel_init_bufs(spinlock_t *lock);
+
+/*! Allocate memory for the buffers
+ */
+extern int
+netfront_accel_alloc_buffer_mem(struct netfront_accel_bufpages *bufpages,
+                               struct netfront_accel_bufinfo *rx_res,
+                               struct netfront_accel_bufinfo *tx_res,
+                               int pages);
+extern void
+netfront_accel_free_buffer_mem(struct netfront_accel_bufpages *bufpages,
+                              struct netfront_accel_bufinfo *rx_res,
+                              struct netfront_accel_bufinfo *tx_res);
+
+/*! Release memory for the buffer manager, buffers, etc.
+ *
+ * \param manager pointer to netfront_accel_bufinfo structure that
+ * represents the buffer manager
+ */
+extern void netfront_accel_fini_bufs(struct netfront_accel_bufinfo *manager);
+
+/*! Release a buffer.
+ *
+ * \param manager  The buffer manager which owns the buffer.
+ * \param id   The buffer identifier.
+ * \return 1 if the manager had no free buffers before the call, else 0 */
+extern int netfront_accel_buf_put(struct netfront_accel_bufinfo *manager, 
+                                 u16 id);
+
+/*! Get the packet descriptor associated with a buffer id.
+ *
+ * \param manager  The buffer manager which owns the buffer.
+ * \param id       The buffer identifier.
+ *
+ * The returned value is the packet descriptor for this buffer.
+ */
+extern netfront_accel_pkt_desc *
+netfront_accel_buf_find(struct netfront_accel_bufinfo *manager, u16 id);
+
+
+/*! Fill out a request for buffers to be mapped by the back end driver
+ * \param dev      The xenbus device
+ * \param bufpages The page/grant list structure
+ * \param msg      Pointer to a net_accel_msg to complete.
+ * \param pages    Number of pages covered by this request
+ * \param offset   Page offset of the request (callers pass page_reqs)
+ * \return 0 on success */
+extern int 
+netfront_accel_buf_map_request(struct xenbus_device *dev,
+                              struct netfront_accel_bufpages *bufpages,
+                              struct net_accel_msg *msg, 
+                              int pages, int offset);
+
+/*! Process a response to a buffer request. 
+ * 
+ * Deal with a received message from the back end in response to our
+ * request for buffers
+ * 
+ * \param manager The buffer manager
+ * \param msg The received message from the back end describing new
+ * buffers
+ * \return 0 on success
+ */
+extern int 
+netfront_accel_add_bufs(struct netfront_accel_bufpages *bufpages,
+                       struct netfront_accel_bufinfo *manager,
+                       struct net_accel_msg *msg);
+
+
+/*! Allocate a buffer from the buffer manager
+ *
+ * \param manager The buffer manager data structure
+ * \return Pointer to the buffer descriptor, or NULL when the manager
+ *         has no free buffer.
+ */
+struct netfront_accel_pkt_desc *
+netfront_accel_buf_get(struct netfront_accel_bufinfo *manager);
+
+#endif /* NETFRONT_ACCEL_BUFS_H */
+
diff --git a/drivers/xen/sfc_netfront/accel_debugfs.c b/drivers/xen/sfc_netfront/accel_debugfs.c

new file mode 100644 (file)

index 0000000..cd2d2c5
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_debugfs.c
@@ -0,0 +1,227 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+
+#include "accel.h"
+
+#if defined(CONFIG_DEBUG_FS)
+static struct dentry *sfc_debugfs_root = NULL;
+#endif
+/* Create the driver-wide debugfs directory that holds every vnic's entries */
+void netfront_accel_debugfs_init(void)
+{
+#ifdef CONFIG_DEBUG_FS
+       sfc_debugfs_root = debugfs_create_dir(frontend_name, NULL);
+#endif
+}
+
+/* Remove the driver-wide debugfs directory, if it was ever created */
+void netfront_accel_debugfs_fini(void)
+{
+#ifdef CONFIG_DEBUG_FS
+       if (sfc_debugfs_root != NULL)
+               debugfs_remove(sfc_debugfs_root);
+#endif
+}
+
+/* Create this vnic's debugfs directory and one read-only file per counter */
+int netfront_accel_debugfs_create(netfront_accel_vnic *vnic)
+{
+#if defined(CONFIG_DEBUG_FS)
+       const unsigned int ro_mode = S_IRUSR | S_IRGRP | S_IROTH;
+       struct dentry *dir;
+
+       if (sfc_debugfs_root == NULL)
+               return -ENOENT;
+       dir = debugfs_create_dir(vnic->net_dev->name, sfc_debugfs_root);
+       vnic->dbfs_dir = dir;
+       if (dir == NULL)
+               return -ENOMEM;
+       vnic->netdev_dbfs.fastpath_rx_pkts =
+               debugfs_create_u32("fastpath_rx_pkts", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_rx_pkts);
+       vnic->netdev_dbfs.fastpath_rx_bytes =
+               debugfs_create_u32("fastpath_rx_bytes", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_rx_bytes);
+       vnic->netdev_dbfs.fastpath_rx_errors =
+               debugfs_create_u32("fastpath_rx_errors", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_rx_errors);
+       vnic->netdev_dbfs.fastpath_tx_pkts =
+               debugfs_create_u32("fastpath_tx_pkts", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_tx_pkts);
+       vnic->netdev_dbfs.fastpath_tx_bytes =
+               debugfs_create_u32("fastpath_tx_bytes", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_tx_bytes);
+       vnic->netdev_dbfs.fastpath_tx_errors =
+               debugfs_create_u32("fastpath_tx_errors", ro_mode, dir,
+                                  &vnic->netdev_stats.fastpath_tx_errors);
+#if NETFRONT_ACCEL_STATS
+       vnic->dbfs.irq_count =
+               debugfs_create_u64("irq_count", ro_mode, dir,
+                                  &vnic->stats.irq_count);
+       vnic->dbfs.useless_irq_count =
+               debugfs_create_u64("useless_irq_count", ro_mode, dir,
+                                  &vnic->stats.useless_irq_count);
+       vnic->dbfs.poll_schedule_count =
+               debugfs_create_u64("poll_schedule_count", ro_mode, dir,
+                                  &vnic->stats.poll_schedule_count);
+       vnic->dbfs.poll_call_count =
+               debugfs_create_u64("poll_call_count", ro_mode, dir,
+                                  &vnic->stats.poll_call_count);
+       vnic->dbfs.poll_reschedule_count =
+               debugfs_create_u64("poll_reschedule_count", ro_mode, dir,
+                                  &vnic->stats.poll_reschedule_count);
+       vnic->dbfs.queue_stops =
+               debugfs_create_u64("queue_stops", ro_mode, dir,
+                                  &vnic->stats.queue_stops);
+       vnic->dbfs.queue_wakes =
+               debugfs_create_u64("queue_wakes", ro_mode, dir,
+                                  &vnic->stats.queue_wakes);
+       vnic->dbfs.ssr_bursts =
+               debugfs_create_u64("ssr_bursts", ro_mode, dir,
+                                  &vnic->stats.ssr_bursts);
+       vnic->dbfs.ssr_drop_stream =
+               debugfs_create_u64("ssr_drop_stream", ro_mode, dir,
+                                  &vnic->stats.ssr_drop_stream);
+       vnic->dbfs.ssr_misorder =
+               debugfs_create_u64("ssr_misorder", ro_mode, dir,
+                                  &vnic->stats.ssr_misorder);
+       vnic->dbfs.ssr_slow_start =
+               debugfs_create_u64("ssr_slow_start", ro_mode, dir,
+                                  &vnic->stats.ssr_slow_start);
+       vnic->dbfs.ssr_merges =
+               debugfs_create_u64("ssr_merges", ro_mode, dir,
+                                  &vnic->stats.ssr_merges);
+       vnic->dbfs.ssr_too_many =
+               debugfs_create_u64("ssr_too_many", ro_mode, dir,
+                                  &vnic->stats.ssr_too_many);
+       vnic->dbfs.ssr_new_stream =
+               debugfs_create_u64("ssr_new_stream", ro_mode, dir,
+                                  &vnic->stats.ssr_new_stream);
+
+       vnic->dbfs.fastpath_tx_busy =
+               debugfs_create_u64("fastpath_tx_busy", ro_mode, dir,
+                                  &vnic->stats.fastpath_tx_busy);
+       vnic->dbfs.fastpath_tx_completions =
+               debugfs_create_u64("fastpath_tx_completions", ro_mode, dir,
+                                  &vnic->stats.fastpath_tx_completions);
+       vnic->dbfs.fastpath_tx_pending_max =
+               debugfs_create_u32("fastpath_tx_pending_max", ro_mode, dir,
+                                  &vnic->stats.fastpath_tx_pending_max);
+       vnic->dbfs.event_count =
+               debugfs_create_u64("event_count", ro_mode, dir,
+                                  &vnic->stats.event_count);
+       vnic->dbfs.bad_event_count =
+               debugfs_create_u64("bad_event_count", ro_mode, dir,
+                                  &vnic->stats.bad_event_count);
+       vnic->dbfs.event_count_since_irq =
+               debugfs_create_u32("event_count_since_irq", ro_mode, dir,
+                                  &vnic->stats.event_count_since_irq);
+       vnic->dbfs.events_per_irq_max =
+               debugfs_create_u32("events_per_irq_max", ro_mode, dir,
+                                  &vnic->stats.events_per_irq_max);
+       vnic->dbfs.fastpath_frm_trunc =
+               debugfs_create_u64("fastpath_frm_trunc", ro_mode, dir,
+                                  &vnic->stats.fastpath_frm_trunc);
+       vnic->dbfs.fastpath_crc_bad =
+               debugfs_create_u64("fastpath_crc_bad", ro_mode, dir,
+                                  &vnic->stats.fastpath_crc_bad);
+       vnic->dbfs.fastpath_csum_bad =
+               debugfs_create_u64("fastpath_csum_bad", ro_mode, dir,
+                                  &vnic->stats.fastpath_csum_bad);
+       vnic->dbfs.fastpath_rights_bad =
+               debugfs_create_u64("fastpath_rights_bad", ro_mode, dir,
+                                  &vnic->stats.fastpath_rights_bad);
+       vnic->dbfs.fastpath_discard_other =
+               debugfs_create_u64("fastpath_discard_other", ro_mode, dir,
+                                  &vnic->stats.fastpath_discard_other);
+       vnic->dbfs.rx_no_desc_trunc =
+               debugfs_create_u64("rx_no_desc_trunc", ro_mode, dir,
+                                  &vnic->stats.rx_no_desc_trunc);
+       vnic->dbfs.events_per_poll_max =
+               debugfs_create_u32("events_per_poll_max", ro_mode, dir,
+                                  &vnic->stats.events_per_poll_max);
+       vnic->dbfs.events_per_poll_rx_max =
+               debugfs_create_u32("events_per_poll_rx_max", ro_mode, dir,
+                                  &vnic->stats.events_per_poll_rx_max);
+       vnic->dbfs.events_per_poll_tx_max =
+               debugfs_create_u32("events_per_poll_tx_max", ro_mode, dir,
+                                  &vnic->stats.events_per_poll_tx_max);
+#endif /* NETFRONT_ACCEL_STATS */
+#endif /* CONFIG_DEBUG_FS */
+       return 0;
+}
+
+/* Remove every debugfs file created for this vnic, then its directory */
+int netfront_accel_debugfs_remove(netfront_accel_vnic *vnic)
+{
+#if defined(CONFIG_DEBUG_FS)
+       if (vnic->dbfs_dir == NULL)
+               return 0;
+       /* Entries first, then the directory that holds them */
+       debugfs_remove(vnic->netdev_dbfs.fastpath_rx_pkts);
+       debugfs_remove(vnic->netdev_dbfs.fastpath_rx_bytes);
+       debugfs_remove(vnic->netdev_dbfs.fastpath_rx_errors);
+       debugfs_remove(vnic->netdev_dbfs.fastpath_tx_pkts);
+       debugfs_remove(vnic->netdev_dbfs.fastpath_tx_bytes);
+       debugfs_remove(vnic->netdev_dbfs.fastpath_tx_errors);
+#if NETFRONT_ACCEL_STATS
+       debugfs_remove(vnic->dbfs.irq_count);
+       debugfs_remove(vnic->dbfs.useless_irq_count);
+       debugfs_remove(vnic->dbfs.poll_schedule_count);
+       debugfs_remove(vnic->dbfs.poll_call_count);
+       debugfs_remove(vnic->dbfs.poll_reschedule_count);
+       debugfs_remove(vnic->dbfs.queue_stops);
+       debugfs_remove(vnic->dbfs.queue_wakes);
+       debugfs_remove(vnic->dbfs.ssr_bursts);
+       debugfs_remove(vnic->dbfs.ssr_drop_stream);
+       debugfs_remove(vnic->dbfs.ssr_misorder);
+       debugfs_remove(vnic->dbfs.ssr_slow_start);
+       debugfs_remove(vnic->dbfs.ssr_merges);
+       debugfs_remove(vnic->dbfs.ssr_too_many);
+       debugfs_remove(vnic->dbfs.ssr_new_stream);
+
+       debugfs_remove(vnic->dbfs.fastpath_tx_busy);
+       debugfs_remove(vnic->dbfs.fastpath_tx_completions);
+       debugfs_remove(vnic->dbfs.fastpath_tx_pending_max);
+       debugfs_remove(vnic->dbfs.event_count);
+       debugfs_remove(vnic->dbfs.bad_event_count);
+       debugfs_remove(vnic->dbfs.event_count_since_irq);
+       debugfs_remove(vnic->dbfs.events_per_irq_max);
+       debugfs_remove(vnic->dbfs.fastpath_frm_trunc);
+       debugfs_remove(vnic->dbfs.fastpath_crc_bad);
+       debugfs_remove(vnic->dbfs.fastpath_csum_bad);
+       debugfs_remove(vnic->dbfs.fastpath_rights_bad);
+       debugfs_remove(vnic->dbfs.fastpath_discard_other);
+       debugfs_remove(vnic->dbfs.rx_no_desc_trunc);
+       debugfs_remove(vnic->dbfs.events_per_poll_max);
+       debugfs_remove(vnic->dbfs.events_per_poll_rx_max);
+       debugfs_remove(vnic->dbfs.events_per_poll_tx_max);
+#endif /* NETFRONT_ACCEL_STATS */
+       debugfs_remove(vnic->dbfs_dir);
+#endif /* CONFIG_DEBUG_FS */
+       return 0;
+}
diff --git a/drivers/xen/sfc_netfront/accel_msg.c b/drivers/xen/sfc_netfront/accel_msg.c

new file mode 100644 (file)

index 0000000..045af8b
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_msg.c
@@ -0,0 +1,567 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+
+#include <xen/xenbus.h>
+
+#include "accel.h"
+#include "accel_msg_iface.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+
+#include "netfront.h" /* drivers/xen/netfront/netfront.h */
+
+static void vnic_start_interrupts(netfront_accel_vnic *vnic)
+{
+       struct netfront_info *np = netdev_priv(vnic->net_dev);
+       unsigned long flags;
+
+       /* Prime our interrupt, serialised by irq_enabled_lock */
+       spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+       if (netfront_accel_vi_enable_interrupts(vnic)) {
+               /*
+                * Nothing pending yet: flag interrupts as enabled and
+                * have them delivered through the back end
+                */
+               vnic->irq_enabled = 1;
+               netfront_accel_enable_net_interrupts(vnic);
+       } else {
+               /* Something is already pending, so skip the interrupt
+                * and schedule the NAPI poll directly */
+               netfront_accel_disable_net_interrupts(vnic);
+               vnic->irq_enabled = 0;
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++);
+               napi_schedule(&np->napi);
+       }
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+/* Disable net interrupts and clear irq_enabled, under irq_enabled_lock */
+static void vnic_stop_interrupts(netfront_accel_vnic *vnic)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vnic->irq_enabled_lock, flags);
+       netfront_accel_disable_net_interrupts(vnic);
+       vnic->irq_enabled = 0;
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags);
+}
+
+/* Enable tx, then polling, then interrupts for the accelerated path */
+static void vnic_start_fastpath(netfront_accel_vnic *vnic)
+{
+       struct net_device *net_dev = vnic->net_dev;
+       struct netfront_info *np = netdev_priv(net_dev);
+       unsigned long flags;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       spin_lock_irqsave(&vnic->tx_lock, flags);
+       vnic->tx_enabled = 1;
+       spin_unlock_irqrestore(&vnic->tx_lock, flags);
+       /* poll_enabled is flipped between napi_disable() and napi_enable() */
+       napi_disable(&np->napi);
+       vnic->poll_enabled = 1;
+       napi_enable(&np->napi);
+       /* Interrupts are primed last, once tx and polling are enabled */
+       vnic_start_interrupts(vnic);
+}
+
+/* Stop interrupts, then tx (dropping any held tx_skb), then polling */
+void vnic_stop_fastpath(netfront_accel_vnic *vnic)
+{
+       struct net_device *net_dev = vnic->net_dev;
+       struct netfront_info *np = (struct netfront_info *)netdev_priv(net_dev);
+       unsigned long flags1, flags2;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       vnic_stop_interrupts(vnic);
+       /* vnic->tx_lock is the outer lock; np->tx_lock nests inside it */
+       spin_lock_irqsave(&vnic->tx_lock, flags1);
+       vnic->tx_enabled = 0;
+       spin_lock_irqsave(&np->tx_lock, flags2);
+       if (vnic->tx_skb != NULL) {
+               dev_kfree_skb_any(vnic->tx_skb);
+               vnic->tx_skb = NULL;
+               if (netfront_check_queue_ready(net_dev)) {
+                       netif_wake_queue(net_dev);
+                       NETFRONT_ACCEL_STATS_OP
+                               (vnic->stats.queue_wakes++);
+               }
+       }
+       spin_unlock_irqrestore(&np->tx_lock, flags2);
+       spin_unlock_irqrestore(&vnic->tx_lock, flags1);
+       
+       /* Must prevent polls and hold lock to modify poll_enabled */
+       napi_disable(&np->napi);
+       spin_lock_irqsave(&vnic->irq_enabled_lock, flags1);
+       vnic->poll_enabled = 0;
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags1);
+       napi_enable(&np->napi);
+}
+
+/* Set backend_netdev_up; start the fastpath if the front end is ready */
+static void netfront_accel_interface_up(netfront_accel_vnic *vnic)
+{
+       if (vnic->backend_netdev_up)
+               return;         /* already marked up */
+
+       vnic->backend_netdev_up = 1;
+       if (vnic->frontend_ready)
+               vnic_start_fastpath(vnic);
+}
+
+/* Clear backend_netdev_up; stop the fastpath if the front end is ready */
+static void netfront_accel_interface_down(netfront_accel_vnic *vnic)
+{
+       if (!vnic->backend_netdev_up)
+               return;         /* already marked down */
+
+       vnic->backend_netdev_up = 0;
+       if (vnic->frontend_ready)
+               vnic_stop_fastpath(vnic);
+}
+
+
+static int vnic_add_bufs(netfront_accel_vnic *vnic,
+                        struct net_accel_msg *msg)
+{
+       struct netfront_accel_bufinfo *bufinfo;
+       int offset, rc;
+
+       BUG_ON(msg->u.mapbufs.pages > NET_ACCEL_MSG_MAX_PAGE_REQ);
+       /*
+        * The request id doubles as the page offset: offsets below the
+        * rx share of max_pages feed the rx pool, the rest the tx pool.
+        */
+       offset = msg->u.mapbufs.reqid;
+       if (offset < vnic->bufpages.max_pages -
+           (vnic->bufpages.max_pages / sfc_netfront_buffer_split))
+               bufinfo = vnic->rx_bufs;
+       else
+               bufinfo = vnic->tx_bufs;
+
+       rc = netfront_accel_add_bufs(&vnic->bufpages, bufinfo, msg);
+       if (rc != 0)
+               return rc;
+       netfront_accel_vi_add_bufs(vnic, bufinfo == vnic->rx_bufs);
+       if (offset + msg->u.mapbufs.pages != vnic->bufpages.max_pages) {
+               VPRINTK("%s: got buffers back %d %d\n", __FUNCTION__, 
+                       offset, msg->u.mapbufs.pages);
+               return rc;
+       }
+       VPRINTK("%s: got all buffers back\n", __FUNCTION__);
+       vnic->frontend_ready = 1;
+       if (vnic->backend_netdev_up)
+               vnic_start_fastpath(vnic);
+       return rc;
+}
+
+
+/* The largest [o] such that (1ul << o) <= n; returns 0 for n == 0.
+ * n is shifted down instead of a probe bit up, so no shift can overflow. */
+inline unsigned log2_le(unsigned long n) {
+       unsigned order = 0;
+       while ((n >>= 1) != 0) ++order;
+       return order;
+}
+
+static int vnic_send_buffer_requests(netfront_accel_vnic *vnic,
+                                    struct netfront_accel_bufpages *bufpages)
+{
+       struct net_accel_msg msg;
+       int rc = 0, sent = 0;
+
+       while (bufpages->page_reqs < bufpages->max_pages) {
+               int offset = bufpages->page_reqs;
+               /* Sized from the pages still outstanding, then capped at
+                * NET_ACCEL_MSG_MAX_PAGE_REQ */
+               int pages = pow2(log2_le(bufpages->max_pages -
+                                        bufpages->page_reqs));
+               if (pages > NET_ACCEL_MSG_MAX_PAGE_REQ)
+                       pages = NET_ACCEL_MSG_MAX_PAGE_REQ;
+
+               BUG_ON(offset < 0);
+               BUG_ON(pages <= 0);
+
+               rc = netfront_accel_buf_map_request(vnic->dev, bufpages,
+                                                   &msg, pages, offset);
+               if (rc != 0) {
+                       EPRINTK("%s: problem with grant, stopping for now\n",
+                               __FUNCTION__);
+                       break;
+               }
+               rc = net_accel_msg_send(vnic->shared_page,
+                                       &vnic->to_dom0, &msg);
+               if (rc < 0) {
+                       VPRINTK("%s: queue full, stopping for now\n",
+                               __FUNCTION__);
+                       break;
+               }
+               sent++;
+               bufpages->page_reqs += pages;
+       }
+
+       /* One notification covers every request queued above */
+       if (sent)
+               net_accel_msg_notify(vnic->msg_channel_irq);
+
+       return rc;
+}
+
+
+/*
+ * In response to dom0 saying "my queue is full", we reply with this
+ * when it is no longer full
+ */
+inline void vnic_set_queue_not_full(netfront_accel_vnic *vnic)
+{
+       /* Signal dom0 only when this call changes the bit from 0 to 1 */
+       if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B,
+                            (unsigned long *)&vnic->shared_page->aflags))
+               notify_remote_via_irq(vnic->msg_channel_irq);
+       else
+               VPRINTK("queue not full bit already set, not signalling\n");
+}
+
+/*
+ * Tell dom0 that the queue we want to use is full; it should answer
+ * by setting MSG_AFLAGS_QUEUEUNOTFULL in due course
+ */
+inline void vnic_set_queue_full(netfront_accel_vnic *vnic)
+{
+       unsigned long *aflags = (unsigned long *)&vnic->shared_page->aflags;
+
+       if (!test_and_set_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B, aflags))
+               notify_remote_via_irq(vnic->msg_channel_irq);
+       else
+               VPRINTK("queue full bit already set, not signalling\n");
+}
+
+
+static int vnic_check_hello_version(unsigned version)
+{
+       int rc = 0;
+
+       /*
+        * Only an exact match of NET_ACCEL_MSG_VERSION is accepted:
+        *  - a newer peer speaks a protocol we do not know, so it
+        *    must be refused;
+        *  - an older peer could be accepted at our discretion,
+        *    but for now it is rejected as well.
+        *
+        * Either kind of mismatch therefore yields -EPROTO.
+        */
+       if (version != NET_ACCEL_MSG_VERSION)
+               rc = -EPROTO;
+
+       return rc;
+}
+
+
+static int vnic_process_hello_msg(netfront_accel_vnic *vnic,
+                                 struct net_accel_msg *msg)
+{
+       unsigned pages = sfc_netfront_max_pages;
+       int err = 0;
+
+       if (vnic_check_hello_version(msg->u.hello.version) >= 0) {
+               vnic->backend_netdev_up = vnic->shared_page->net_dev_up;
+               msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY;
+               msg->u.hello.version = NET_ACCEL_MSG_VERSION;
+               /* A non-zero limit from the peer can only lower ours */
+               if (msg->u.hello.max_pages != 0 &&
+                   msg->u.hello.max_pages < pages)
+                       pages = msg->u.hello.max_pages;
+               msg->u.hello.max_pages = pages;
+
+               /* Buffer memory for both the rx and the tx manager */
+               err = netfront_accel_alloc_buffer_mem(&vnic->bufpages,
+                                                     vnic->rx_bufs,
+                                                     vnic->tx_bufs,
+                                                     pages);
+               if (err)
+                       msg->id |= NET_ACCEL_MSG_ERROR;
+       } else {
+               /* Version mismatch: answer with an error and our version */
+               msg->id = NET_ACCEL_MSG_HELLO | NET_ACCEL_MSG_REPLY
+                       | NET_ACCEL_MSG_ERROR;
+               msg->u.hello.version = NET_ACCEL_MSG_VERSION;
+       }
+
+       /* The reply goes out on success and on failure alike */
+       net_accel_msg_reply_notify(vnic->shared_page, vnic->msg_channel_irq,
+                                  &vnic->to_dom0, msg);
+       return err;
+}
+
+
+static int vnic_process_localmac_msg(netfront_accel_vnic *vnic,
+                                    struct net_accel_msg *msg)
+{
+       unsigned long flags;
+       cuckoo_hash_mac_key key;
+
+       /* Only messages flagged NET_ACCEL_MSG_ADD need any action here */
+       if (!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD))
+               return 0;
+
+       DPRINTK("MAC has moved, could be local: %pM\n",
+               msg->u.localmac.mac);
+       key = cuckoo_mac_to_key(msg->u.localmac.mac);
+       /* Drop any fastpath entry for it; not a big deal if none exists */
+       spin_lock_irqsave(&vnic->table_lock, flags);
+       cuckoo_hash_remove(&vnic->fastpath_table, (cuckoo_hash_key *)&key);
+       spin_unlock_irqrestore(&vnic->table_lock, flags);
+       return 0;
+}
+
+
+static int vnic_process_rx_msg(netfront_accel_vnic *vnic,
+                              struct net_accel_msg *msg)
+{
+       int err;
+
+       /*
+        * msg_state advances NONE -> HELLO -> HW; every message
+        * asserts the state it expects to arrive in.
+        */
+       switch (msg->id) {
+       case NET_ACCEL_MSG_HELLO:
+               /* Hello, reply with Reply */
+               DPRINTK("got Hello, with version %.8x\n",
+                       msg->u.hello.version);
+               BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_NONE);
+               err = vnic_process_hello_msg(vnic, msg);
+               if (err == 0)
+                       vnic->msg_state = NETFRONT_ACCEL_MSG_HELLO;
+               return err;
+       case NET_ACCEL_MSG_SETHW:
+               /* Hardware info message */
+               DPRINTK("got H/W info\n");
+               BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HELLO);
+               err = netfront_accel_vi_init(vnic, &msg->u.hw);
+               if (err == 0)
+                       vnic->msg_state = NETFRONT_ACCEL_MSG_HW;
+               return err;
+       case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY:
+               /* Reply to one of our buffer map requests */
+               VPRINTK("Got mapped buffers back\n");
+               BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+               return vnic_add_bufs(vnic, msg);
+       case NET_ACCEL_MSG_MAPBUF | NET_ACCEL_MSG_REPLY | NET_ACCEL_MSG_ERROR:
+               /* No buffers.  Can't use the fast path. */
+               EPRINTK("Got mapped buffers error.  Cannot accelerate.\n");
+               BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+               return -EIO;
+       case NET_ACCEL_MSG_LOCALMAC:
+               /* Should be add, remove not currently used */
+               EPRINTK_ON(!(msg->u.localmac.flags & NET_ACCEL_MSG_ADD));
+               BUG_ON(vnic->msg_state != NETFRONT_ACCEL_MSG_HW);
+               return vnic_process_localmac_msg(vnic, msg);
+       default:
+               /* Any other message id is rejected as a protocol error */
+               EPRINTK("Huh? Message code is 0x%x\n", msg->id);
+               return -EPROTO;
+       }
+       /* not reached: every case above returns */
+}
+
+
+/*
+ * Process an IRQ received from back end driver.
+ *
+ * On kernels >= 2.6.20 the argument is the vnic's msg_from_bend work
+ * item; on older kernels it is the vnic itself.
+ *
+ * Everything runs under vnic->vnic_mutex.  The function acknowledges
+ * the notification bits set in the shared page's aflags, drains the
+ * incoming message queue through vnic_process_rx_msg(), sends pending
+ * buffer map requests once msg_state is NETFRONT_ACCEL_MSG_HW, and
+ * calls netfront_accel_set_closing() on any error other than -ENOENT.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+void netfront_accel_msg_from_bend(struct work_struct *context)
+#else
+void netfront_accel_msg_from_bend(void *context)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       netfront_accel_vnic *vnic = 
+               container_of(context, netfront_accel_vnic, msg_from_bend);
+#else
+       netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
+#endif
+       struct net_accel_msg msg;
+       int err, queue_was_full = 0;
+       
+       mutex_lock(&vnic->vnic_mutex);
+
+       /*
+        * This happens when the shared pages have been unmapped but
+        * the workqueue has yet to be flushed 
+        */
+       if (!vnic->dom0_state_is_setup) 
+               goto unlock_out;
+
+       /*
+        * Acknowledge every notification flag addressed to us.  The
+        * loop repeats until no bit of the mask remains set.
+        * NOTE(review): assumes the mask holds only the three bits
+        * handled below; any other bit would never be cleared here and
+        * the loop would not terminate -- confirm against the mask
+        * definition.
+        */
+       while ((vnic->shared_page->aflags & NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK)
+              != 0) {
+               if (vnic->shared_page->aflags &
+                   NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL) {
+                       /* We've been told there may now be space. */
+                       clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B,
+                                 (unsigned long *)&vnic->shared_page->aflags);
+               }
+
+               if (vnic->shared_page->aflags &
+                   NET_ACCEL_MSG_AFLAGS_QUEUE0FULL) {
+                       /*
+                        * There will be space at the end of this
+                        * function if we can make any.
+                        */
+                       clear_bit(NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B,
+                                 (unsigned long *)&vnic->shared_page->aflags);
+                       /* Remembered so the "not full" notice is sent at done: */
+                       queue_was_full = 1;
+               }
+
+               if (vnic->shared_page->aflags &
+                   NET_ACCEL_MSG_AFLAGS_NETUPDOWN) {
+                       DPRINTK("%s: net interface change\n", __FUNCTION__);
+                       clear_bit(NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B,
+                                 (unsigned long *)&vnic->shared_page->aflags);
+                       if (vnic->shared_page->net_dev_up)
+                               netfront_accel_interface_up(vnic);
+                       else
+                               netfront_accel_interface_down(vnic);
+               }
+       }
+
+       /* Pull msg out of shared memory */
+       while ((err = net_accel_msg_recv(vnic->shared_page, &vnic->from_dom0,
+                                        &msg)) == 0) {
+               err = vnic_process_rx_msg(vnic, &msg);
+               
+               /* A handler failure stops draining and is reported at done: */
+               if (err != 0)
+                       goto done;
+       }
+
+       /*
+        * Send any pending buffer map request messages that we can,
+        * and mark domU->dom0 as full if necessary.  
+        */
+       if (vnic->msg_state == NETFRONT_ACCEL_MSG_HW &&
+           vnic->bufpages.page_reqs < vnic->bufpages.max_pages) {
+               if (vnic_send_buffer_requests(vnic, &vnic->bufpages) == -ENOSPC)
+                       vnic_set_queue_full(vnic);
+       }
+
+       /* 
+        * If there are no messages then this is not an error.  It
+        * just means that we've finished processing the queue.
+        */
+       if (err == -ENOENT)
+               err = 0;
+ done:
+       /* We will now have made space in the dom0->domU queue if we can */
+       if (queue_was_full)
+               vnic_set_queue_not_full(vnic);
+
+       /* Any remaining error puts the vnic into the closing state */
+       if (err != 0) {
+               EPRINTK("%s returned %d\n", __FUNCTION__, err);
+               netfront_accel_set_closing(vnic);
+       }
+
+ unlock_out:
+       mutex_unlock(&vnic->vnic_mutex);
+
+       return;
+}
+
+
+/*
+ * Interrupt handler for the message channel.  Does no processing
+ * itself: it queues the vnic's msg_from_bend work item on
+ * netfront_accel_workqueue and reports the interrupt as handled.
+ */
+irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
+{
+       netfront_accel_vnic *vnic = context;
+
+       VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
+       queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Process an interrupt received from the NIC via backend.
+ *
+ * If the vnic currently has interrupts enabled they are disabled and
+ * NAPI polling is scheduled; otherwise the interrupt is only counted
+ * as useless.  Always returns IRQ_HANDLED.
+ */
+irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
+{
+       netfront_accel_vnic *vnic = context;
+       struct net_device *net_dev = vnic->net_dev;
+       struct netfront_info *np;
+       unsigned long irq_flags;
+
+       VPRINTK("net irq %d from device %s\n", irq, vnic->dev->nodename);
+
+       NETFRONT_ACCEL_STATS_OP(vnic->stats.irq_count++);
+
+       BUG_ON(net_dev == NULL);
+
+       spin_lock_irqsave(&vnic->irq_enabled_lock, irq_flags);
+
+       if (!vnic->irq_enabled) {
+               /* Interrupt arrived while we had them switched off */
+               spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.useless_irq_count++);
+               DPRINTK("%s: irq when disabled\n", __FUNCTION__);
+               return IRQ_HANDLED;
+       }
+
+       /* Switch interrupts off before handing over to the poll routine */
+       netfront_accel_disable_net_interrupts(vnic);
+       vnic->irq_enabled = 0;
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
+
+#if NETFRONT_ACCEL_STATS
+       vnic->stats.poll_schedule_count++;
+       /* Track the largest number of events seen between two irqs */
+       if (vnic->stats.event_count_since_irq >
+           vnic->stats.events_per_irq_max)
+               vnic->stats.events_per_irq_max =
+                       vnic->stats.event_count_since_irq;
+       vnic->stats.event_count_since_irq = 0;
+#endif
+       np = netdev_priv(net_dev);
+       napi_schedule(&np->napi);
+
+       return IRQ_HANDLED;
+}
+
+
+/*
+ * Send a NET_ACCEL_MSG_FASTPATH message, with flags set to
+ * NET_ACCEL_MSG_REMOVE, describing the given MAC address, IP address,
+ * port and protocol, then notify the back end on the message channel.
+ *
+ * Returns without doing anything when net_accel_msg_start_send() yields
+ * no message slot.
+ */
+void netfront_accel_msg_tx_fastpath(netfront_accel_vnic *vnic, const void *mac,
+                                   u32 ip, u16 port, u8 protocol)
+{
+       struct net_accel_msg *out;
+       unsigned long send_flags;
+
+       out = net_accel_msg_start_send(vnic->shared_page, &vnic->to_dom0,
+                                      &send_flags);
+       if (!out)
+               return;
+
+       net_accel_msg_init(out, NET_ACCEL_MSG_FASTPATH);
+
+       out->u.fastpath.flags = NET_ACCEL_MSG_REMOVE;
+       out->u.fastpath.ip = ip;
+       out->u.fastpath.port = port;
+       out->u.fastpath.proto = protocol;
+       memcpy(out->u.fastpath.mac, mac, ETH_ALEN);
+
+       net_accel_msg_complete_send_notify(vnic->shared_page, &vnic->to_dom0,
+                                          &send_flags, vnic->msg_channel_irq);
+}
diff --git a/drivers/xen/sfc_netfront/accel_netfront.c b/drivers/xen/sfc_netfront/accel_netfront.c

new file mode 100644 (file)

index 0000000..8ddec93
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_netfront.c
@@ -0,0 +1,330 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+
+/* drivers/xen/netfront/netfront.h */
+#include "netfront.h"
+
+#include "accel.h"
+#include "accel_bufs.h"
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+#include "accel_ssr.h"
+ 
+#ifdef EFX_GCOV
+#include "gcov.h"
+#endif
+
+/*
+ * Fetch the accelerator's vnic from the accel_priv field of the
+ * netfront_info private area of net device _nd.  The argument is now
+ * actually used; previously the expansion silently relied on a variable
+ * called net_dev existing in the caller's scope.
+ */
+#define NETFRONT_ACCEL_VNIC_FROM_NETDEV(_nd)                           \
+       ((netfront_accel_vnic *)((struct netfront_info *)netdev_priv(_nd))->accel_priv)
+
+/*
+ * Transmit hook: offer skb to the accelerated path.
+ *
+ * Returns 0 when transmit is disabled on the vnic or when
+ * netfront_accel_vi_tx_post() reports NETFRONT_ACCEL_STATUS_CANT, and 1
+ * otherwise.  On NETFRONT_ACCEL_STATUS_BUSY the skb is parked in
+ * vnic->tx_skb and the net device queue is stopped.
+ */
+static int netfront_accel_netdev_start_xmit(struct sk_buff *skb,
+                                           struct net_device *net_dev)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+       struct netfront_info *np = netdev_priv(net_dev);
+       unsigned long vnic_flags, np_flags;
+       int status;
+       int rc = 0;
+
+       BUG_ON(vnic == NULL);
+
+       /* The vnic tx lock is held across the whole operation */
+       spin_lock_irqsave(&vnic->tx_lock, vnic_flags);
+
+       if (vnic->tx_enabled) {
+               status = netfront_accel_vi_tx_post(vnic, skb);
+
+               if (status == NETFRONT_ACCEL_STATUS_BUSY) {
+                       BUG_ON(vnic->net_dev != net_dev);
+                       DPRINTK("%s stopping queue\n", __FUNCTION__);
+
+                       /*
+                        * tx_skb may only be written with both
+                        * netfront's tx_lock and the vnic tx_lock held
+                        */
+                       spin_lock_irqsave(&np->tx_lock, np_flags);
+                       BUG_ON(vnic->tx_skb != NULL);
+                       vnic->tx_skb = skb;
+                       netif_stop_queue(net_dev);
+                       spin_unlock_irqrestore(&np->tx_lock, np_flags);
+
+                       NETFRONT_ACCEL_STATS_OP(vnic->stats.queue_stops++);
+               }
+
+               rc = (status != NETFRONT_ACCEL_STATUS_CANT);
+       }
+
+       spin_unlock_irqrestore(&vnic->tx_lock, vnic_flags);
+
+       return rc;
+}
+
+
+/*
+ * Poll hook: process up to *budget received packets on the vnic.
+ *
+ * *budget is reduced by the number of packets handled.  Returns 1 when
+ * the whole allowance was consumed (more work may remain) and 0 when
+ * polling is disabled or fewer packets than allowed were handled.
+ */
+static int netfront_accel_netdev_poll(struct net_device *net_dev, int *budget)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+       int quota = *budget;
+       int handled;
+
+       BUG_ON(vnic == NULL);
+
+       /* Can check this without lock as modifier excludes polls */
+       if (!vnic->poll_enabled)
+               return 0;
+
+       handled = netfront_accel_vi_poll(vnic, quota);
+       *budget -= handled;
+
+       NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_call_count++);
+
+       VPRINTK("%s: done %d allowed %d\n",
+               __FUNCTION__, handled, quota);
+
+       netfront_accel_ssr_end_of_burst(vnic, &vnic->ssr_state);
+
+       if (handled >= quota) {
+               /* Allowance used up: ask to be polled again */
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_reschedule_count++);
+               return 1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * Process request from netfront to start napi interrupt mode (i.e.
+ * enable interrupts as it has finished polling).
+ *
+ * Returns 1 when netfront_accel_vi_enable_interrupts() returns zero,
+ * which tells the caller there was still something to do; returns 0
+ * when polling is disabled or interrupts were enabled.
+ */
+static int netfront_accel_start_napi_interrupts(struct net_device *net_dev)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+       unsigned long irq_flags;
+
+       BUG_ON(vnic == NULL);
+
+       /*
+        * No lock needed for this read: the writer excludes poll
+        * before modifying poll_enabled.
+        */
+       if (!vnic->poll_enabled)
+               return 0;
+
+       /* There was something there: report work outstanding */
+       if (!netfront_accel_vi_enable_interrupts(vnic))
+               return 1;
+
+       spin_lock_irqsave(&vnic->irq_enabled_lock, irq_flags);
+       vnic->irq_enabled = 1;
+       netfront_accel_enable_net_interrupts(vnic);
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
+
+       return 0;
+}
+
+
+/*
+ * Process request from netfront to stop napi interrupt mode (i.e.
+ * disable interrupts as it is starting to poll).
+ *
+ * Does nothing beyond taking and releasing irq_enabled_lock when
+ * polling is not enabled on the vnic.
+ */
+static void netfront_accel_stop_napi_interrupts(struct net_device *net_dev)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+       unsigned long irq_flags;
+
+       BUG_ON(vnic == NULL);
+
+       spin_lock_irqsave(&vnic->irq_enabled_lock, irq_flags);
+
+       if (vnic->poll_enabled) {
+               netfront_accel_disable_net_interrupts(vnic);
+               vnic->irq_enabled = 0;
+       }
+
+       spin_unlock_irqrestore(&vnic->irq_enabled_lock, irq_flags);
+}
+
+
+/*
+ * Report whether the vnic can accept another packet: non-zero when no
+ * skb is parked in vnic->tx_skb, zero otherwise.
+ */
+static int netfront_accel_check_ready(struct net_device *net_dev)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+
+       BUG_ON(vnic == NULL);
+
+       /* Read of tx_skb is protected by netfront's tx_lock */
+       return !vnic->tx_skb;
+}
+
+
+/*
+ * Fold the fast path counters into the caller's statistics.
+ *
+ * A snapshot of the six fast path counters is taken, the change since
+ * the previous call is added to lnkst (packets and bytes) and devst
+ * (errors), and the snapshot is stored in vnic->stats_last_read for the
+ * next call.  Always returns 0.
+ */
+static int netfront_accel_get_stats(struct net_device *net_dev,
+                                   struct net_device_stats *devst,
+                                   struct netfront_stats *lnkst)
+{
+       netfront_accel_vnic *vnic = NETFRONT_ACCEL_VNIC_FROM_NETDEV(net_dev);
+       struct netfront_accel_netdev_stats now;
+       const struct netfront_accel_netdev_stats *prev;
+
+       BUG_ON(vnic == NULL);
+
+       /* Snapshot first so every delta is computed from one reading */
+       now.fastpath_rx_pkts   = vnic->netdev_stats.fastpath_rx_pkts;
+       now.fastpath_rx_bytes  = vnic->netdev_stats.fastpath_rx_bytes;
+       now.fastpath_rx_errors = vnic->netdev_stats.fastpath_rx_errors;
+       now.fastpath_tx_pkts   = vnic->netdev_stats.fastpath_tx_pkts;
+       now.fastpath_tx_bytes  = vnic->netdev_stats.fastpath_tx_bytes;
+       now.fastpath_tx_errors = vnic->netdev_stats.fastpath_tx_errors;
+
+       prev = &vnic->stats_last_read;
+
+       lnkst->rx_packets += now.fastpath_rx_pkts - prev->fastpath_rx_pkts;
+       lnkst->rx_bytes += now.fastpath_rx_bytes - prev->fastpath_rx_bytes;
+       devst->rx_errors += now.fastpath_rx_errors - prev->fastpath_rx_errors;
+       lnkst->tx_packets += now.fastpath_tx_pkts - prev->fastpath_tx_pkts;
+       lnkst->tx_bytes += now.fastpath_tx_bytes - prev->fastpath_tx_bytes;
+       devst->tx_errors += now.fastpath_tx_errors - prev->fastpath_tx_errors;
+
+       vnic->stats_last_read = now;
+
+       return 0;
+}
+
+
+/* Hook table handed to netfront by netfront_accel_init() */
+struct netfront_accel_hooks accel_hooks = {
+       .new_device     = netfront_accel_probe,
+       .remove         = netfront_accel_remove,
+       .netdev_poll    = netfront_accel_netdev_poll,
+       .start_xmit     = netfront_accel_netdev_start_xmit,
+       .start_napi_irq = netfront_accel_start_napi_interrupts,
+       .stop_napi_irq  = netfront_accel_stop_napi_interrupts,
+       .check_ready    = netfront_accel_check_ready,
+       .get_stats      = netfront_accel_get_stats,
+};
+
+
+/* Module parameter "max_pages" */
+unsigned int sfc_netfront_max_pages = NETFRONT_ACCEL_DEFAULT_BUF_PAGES;
+module_param_named(max_pages, sfc_netfront_max_pages, uint, 0644);
+MODULE_PARM_DESC(max_pages, "Number of buffer pages to request");
+
+/* Module parameter "buffer_split" */
+unsigned int sfc_netfront_buffer_split = 2;
+module_param_named(buffer_split, sfc_netfront_buffer_split, uint, 0644);
+MODULE_PARM_DESC(buffer_split,
+                "Fraction of buffers to use for TX, rest for RX");
+
+
+/* Name used for the workqueue and when registering with netfront */
+const char *frontend_name = "sfc_netfront";
+
+/* Workqueue on which the message channel work items are queued */
+struct workqueue_struct *netfront_accel_workqueue;
+
+/*
+ * Module initialisation: create the driver's workqueue and debugfs
+ * entries, then register the accelerator hooks with netfront.
+ *
+ * Returns 0 on success, and also immediately when running in the
+ * initial Xen domain.  Returns -ENOMEM if the workqueue cannot be
+ * created and -EINVAL if netfront_accelerator_loaded() returns
+ * non-zero.
+ */
+static int __init netfront_accel_init(void)
+{
+       int rc;
+#ifdef EFX_GCOV        
+       gcov_provider_init(THIS_MODULE);
+#endif
+
+       /*
+        * If we're running on dom0, netfront hasn't initialised
+        * itself, so we need to keep away
+        */
+       if (is_initial_xendomain())
+               return 0;
+
+       if (!is_pow2(sizeof(struct net_accel_msg)))
+               EPRINTK("%s: bad structure size\n", __FUNCTION__);
+
+       /*
+        * create_workqueue() returns NULL on failure; bail out rather
+        * than queueing on, flushing or destroying a NULL workqueue.
+        */
+       netfront_accel_workqueue = create_workqueue(frontend_name);
+       if (netfront_accel_workqueue == NULL) {
+               EPRINTK("%s: failed to create workqueue\n", __FUNCTION__);
+               rc = -ENOMEM;
+               goto fail_gcov;
+       }
+
+       netfront_accel_debugfs_init();
+
+       rc = netfront_accelerator_loaded(NETFRONT_ACCEL_VERSION,
+                                        frontend_name, &accel_hooks);
+
+       if (rc < 0) {
+               EPRINTK("Xen netfront accelerator version mismatch\n");
+               rc = -EINVAL;
+               goto fail;
+       }
+
+       if (rc > 0) {
+               /* 
+                * In future may want to add backwards compatibility
+                * and accept certain subsets of previous versions
+                */
+               EPRINTK("Xen netfront accelerator version mismatch\n");
+               rc = -EINVAL;
+               goto fail;
+       }
+
+       return 0;
+
+ fail:
+       netfront_accel_debugfs_fini();
+       flush_workqueue(netfront_accel_workqueue);
+       destroy_workqueue(netfront_accel_workqueue);
+ fail_gcov:
+#ifdef EFX_GCOV
+       gcov_provider_fini(THIS_MODULE);
+#endif
+       return rc;
+}
+module_init(netfront_accel_init);
+
+/*
+ * Module exit: unhook from netfront, remove the debugfs entries, then
+ * flush and destroy the workqueue.  Does nothing in the initial Xen
+ * domain.
+ */
+static void __exit netfront_accel_exit(void)
+{
+       if (is_initial_xendomain())
+               return;
+
+       /* Unhook from normal netfront */
+       DPRINTK("%s: unhooking\n", __FUNCTION__);
+       netfront_accelerator_stop(frontend_name);
+       DPRINTK("%s: done\n", __FUNCTION__);
+
+       netfront_accel_debugfs_fini();
+
+       /* Let queued work finish before the workqueue goes away */
+       flush_workqueue(netfront_accel_workqueue);
+       destroy_workqueue(netfront_accel_workqueue);
+
+#ifdef EFX_GCOV
+       gcov_provider_fini(THIS_MODULE);
+#endif
+}
+module_exit(netfront_accel_exit);
+
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/xen/sfc_netfront/accel_ssr.c b/drivers/xen/sfc_netfront/accel_ssr.c

new file mode 100644 (file)

index 0000000..9c44144
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_ssr.c
@@ -0,0 +1,308 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+
+#include "accel_ssr.h"
+
+/* Non-zero when the list head's next pointer is not NULL */
+static inline int list_valid(struct list_head *lh)
+{
+       return lh->next != NULL;
+}
+
+static void netfront_accel_ssr_deliver (struct netfront_accel_vnic *vnic,
+                                       struct netfront_accel_ssr_state *st,
+                                       struct netfront_accel_ssr_conn *c);
+
+/** Construct an SSR state.
+ *
+ * @v st     The SSR state to initialise
+ *
+ * Both connection lists are initialised empty and up to 8 connection
+ * entries are allocated onto the free list.  Allocation stops silently
+ * at the first kmalloc() failure, so fewer entries (possibly none) may
+ * be available afterwards.
+ */
+void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st)
+{
+       struct netfront_accel_ssr_conn *conn;
+       unsigned n;
+
+       INIT_LIST_HEAD(&st->conns);
+       INIT_LIST_HEAD(&st->free_conns);
+
+       for (n = 0; n < 8; ++n) {
+               conn = kmalloc(sizeof(*conn), GFP_KERNEL);
+               if (!conn)
+                       break;
+
+               conn->skb = NULL;
+               conn->n_in_order_pkts = 0;
+               list_add(&conn->link, &st->free_conns);
+       }
+}
+
+
+/** Destructor for an SSR state.
+ *
+ * @v vnic   The vnic, passed on when delivering held SKBs
+ * @v st     The SSR state to tear down
+ *
+ * Frees every connection entry on both lists.  An SKB still held by a
+ * tracked connection is delivered before its entry is freed.
+ */
+void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
+                            struct netfront_accel_ssr_state *st)
+{
+       struct netfront_accel_ssr_conn *conn;
+
+       /* The two list heads must be in the same state */
+       BUG_ON(list_valid(&st->conns) != list_valid(&st->free_conns));
+
+       /* Return cleanly if the state was never initialised */
+       if (!list_valid(&st->conns))
+               return;
+
+       /* Entries on the free list must not be holding an SKB */
+       while (!list_empty(&st->free_conns)) {
+               conn = list_entry(st->free_conns.prev,
+                                 struct netfront_accel_ssr_conn, link);
+               list_del(&conn->link);
+               BUG_ON(conn->skb != NULL);
+               kfree(conn);
+       }
+
+       /* Tracked entries: hand over any held SKB, then free */
+       while (!list_empty(&st->conns)) {
+               conn = list_entry(st->conns.prev,
+                                 struct netfront_accel_ssr_conn, link);
+               list_del(&conn->link);
+               if (conn->skb != NULL)
+                       netfront_accel_ssr_deliver(vnic, st, conn);
+               kfree(conn);
+       }
+}
+
+
+/** Calc IP checksum and deliver to the OS
+ *
+ * @v vnic   The vnic (used for statistics)
+ * @v st     The SSR state
+ * @v c      The SSR connection state; must be holding an SKB
+ *
+ * On return the connection no longer holds an SKB.
+ */
+static void netfront_accel_ssr_deliver(netfront_accel_vnic *vnic,
+                                      struct netfront_accel_ssr_state *st,
+                                      struct netfront_accel_ssr_conn *c)
+{
+       struct sk_buff *held = c->skb;
+
+       BUG_ON(held == NULL);
+
+       /*
+        * A non-empty frag_list means packets were chained together,
+        * so the IP header checksum has to be recalculated.
+        */
+       if (skb_shinfo(held)->frag_list) {
+               struct iphdr *iph = c->iph;
+
+               NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_bursts);
+               iph->check = 0;
+               iph->check = ip_fast_csum((unsigned char *) iph, iph->ihl);
+       }
+
+       VPRINTK("%s: %d\n", __FUNCTION__, held->len);
+
+       netif_receive_skb(held);
+       c->skb = NULL;
+}
+
+
+/** Push held skbs down into network stack.
+ *
+ * @v vnic     The vnic (used for statistics and delivery)
+ * @v st       SSR state
+ *
+ * Only called if we are tracking one or more connections.
+ */
+void __netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+                                      struct netfront_accel_ssr_state *st)
+{
+       struct netfront_accel_ssr_conn *conn;
+       struct netfront_accel_ssr_conn *tail;
+
+       BUG_ON(list_empty(&st->conns));
+
+       list_for_each_entry(conn, &st->conns, link) {
+               if (conn->skb != NULL)
+                       netfront_accel_ssr_deliver(vnic, st, conn);
+       }
+
+       /*
+        * Only the entry at the tail of the list is examined: it is
+        * moved to the free list once it has seen no packet for more
+        * than HZ / 50 + 1 jiffies (about 20ms).
+        */
+       tail = list_entry(st->conns.prev, struct netfront_accel_ssr_conn,
+                         link);
+       if (jiffies - tail->last_pkt_jiffies > (HZ / 50 + 1)) {
+               NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_drop_stream);
+               list_del(&tail->link);
+               list_add(&tail->link, &st->free_conns);
+       }
+}
+
+
+/** Process SKB and decide whether to dispatch it to the stack now or
+ * later.
+ *
+ * @v vnic     The vnic (used for statistics and delivery)
+ * @v st       SSR state
+ * @v skb      SKB to examine
+ * @ret rc     0 => deliver SKB to kernel now, otherwise the SKB belongs
+ *             to us.
+ *
+ * Only unfragmented TCP over IPv4 segments whose checksum has already
+ * been verified are considered.  A segment is merged into a held SKB
+ * only after its connection has seen 300 in-order segments.
+ */
+int netfront_accel_ssr_skb(struct netfront_accel_vnic *vnic,
+                          struct netfront_accel_ssr_state *st,
+                          struct sk_buff *skb) {
+       int data_length, dont_merge;
+       struct netfront_accel_ssr_conn *c;
+       struct iphdr *iph;
+       struct tcphdr *th;
+       unsigned th_seq;
+
+       BUG_ON(skb_shinfo(skb)->frag_list != NULL);
+       BUG_ON(skb->next != NULL);
+
+       /* We're not interested if it isn't TCP over IPv4. */
+       iph = (struct iphdr *) skb->data;
+       if (skb->protocol != htons(ETH_P_IP) ||
+           iph->protocol != IPPROTO_TCP) {
+               return 0;
+       }
+
+       /* Ignore segments that fail csum or are fragmented. */
+       if (unlikely((skb->ip_summed - CHECKSUM_UNNECESSARY) |
+                    (iph->frag_off & htons(IP_MF | IP_OFFSET)))) {
+               return 0;
+       }
+
+       th = (struct tcphdr*)(skb->data + iph->ihl * 4);
+       data_length = ntohs(iph->tot_len) - iph->ihl * 4 - th->doff * 4;
+       th_seq = ntohl(th->seq);
+       /* Never merge segments without payload or with URG, SYN or RST */
+       dont_merge = (data_length == 0) | th->urg | th->syn | th->rst;
+
+       list_for_each_entry(c, &st->conns, link) {
+               /* Any non-zero difference means a different connection */
+               if ((c->saddr  - iph->saddr) |
+                   (c->daddr  - iph->daddr) |
+                   (c->source - th->source) |
+                   (c->dest   - th->dest  ))
+                       continue;
+
+               /* Re-insert at head of list to reduce lookup time. */
+               list_del(&c->link);
+               list_add(&c->link, &st->conns);
+               c->last_pkt_jiffies = jiffies;
+
+               if (unlikely(th_seq - c->next_seq)) {
+                       /* Out-of-order, so start counting again. */
+                       if (c->skb)
+                               netfront_accel_ssr_deliver(vnic, st, c);
+                       c->n_in_order_pkts = 0;
+                       c->next_seq = th_seq + data_length;
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_misorder);
+                       return 0;
+               }
+               c->next_seq = th_seq + data_length;
+
+               if (++c->n_in_order_pkts < 300) {
+                       /* May be in slow-start, so don't merge. */
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_slow_start);
+                       return 0;
+               }
+
+               if (unlikely(dont_merge)) {
+                       if (c->skb)
+                               netfront_accel_ssr_deliver(vnic, st, c);
+                       return 0;
+               }
+
+               if (c->skb) {
+                       /* Grow the held packet's IP length by the new payload */
+                       c->iph->tot_len = ntohs(c->iph->tot_len);
+                       c->iph->tot_len += data_length;
+                       c->iph->tot_len = htons(c->iph->tot_len);
+                       c->th->ack_seq = th->ack_seq;
+                       c->th->fin |= th->fin;
+                       c->th->psh |= th->psh;
+                       c->th->window = th->window;
+
+                       /* Remove the headers from this skb. */
+                       skb_pull(skb, skb->len - data_length);
+
+                       /*
+                        * Tack the new skb onto the head skb's frag_list.
+                        * This is exactly the format that fragmented IP
+                        * datagrams are reassembled into.
+                        */
+                       BUG_ON(skb->next != 0);
+                       if ( ! skb_shinfo(c->skb)->frag_list)
+                               skb_shinfo(c->skb)->frag_list = skb;
+                       else
+                               c->skb_tail->next = skb;
+                       c->skb_tail = skb;
+                       c->skb->len += skb->len;
+                       c->skb->data_len += skb->len;
+                       c->skb->truesize += skb->truesize;
+
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_merges);
+
+                       /*
+                        * If the next packet might push this super-packet
+                        * over the limit for an IP packet, deliver it now.
+                        * This is slightly conservative, but close enough.
+                        */
+                       if (c->skb->len + 
+                           (PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)
+                           > 16384)
+                               netfront_accel_ssr_deliver(vnic, st, c);
+
+                       return 1;
+               }
+               else {
+                       /* First segment to be held for this connection */
+                       c->iph = iph;
+                       c->th = th;
+                       c->skb = skb;
+                       return 1;
+               }
+       }
+
+       /* We're not yet tracking this connection. */
+
+       if (dont_merge) {
+               return 0;
+       }
+
+       if (list_empty(&st->free_conns)) {
+               /*
+                * With no free entry the tail of the tracked list is
+                * reused.  If that list is empty as well (every
+                * allocation in netfront_accel_ssr_init() failed) there
+                * is no entry at all: list_entry() on the bare list
+                * head would yield a bogus pointer, so give up here.
+                */
+               if (list_empty(&st->conns))
+                       return 0;
+
+               c = list_entry(st->conns.prev, 
+                              struct netfront_accel_ssr_conn,
+                              link);
+               if (c->skb) {
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_too_many);
+                       return 0;
+               }
+       }
+       else {
+               c = list_entry(st->free_conns.next,
+                              struct netfront_accel_ssr_conn,
+                              link);
+       }
+       /* Start tracking: move the entry to the head of the tracked list */
+       list_del(&c->link);
+       list_add(&c->link, &st->conns);
+       c->saddr = iph->saddr;
+       c->daddr = iph->daddr;
+       c->source = th->source;
+       c->dest = th->dest;
+       c->next_seq = th_seq + data_length;
+       c->n_in_order_pkts = 0;
+       BUG_ON(c->skb != NULL);
+       NETFRONT_ACCEL_STATS_OP(++vnic->stats.ssr_new_stream);
+       return 0;
+}
diff --git a/drivers/xen/sfc_netfront/accel_ssr.h b/drivers/xen/sfc_netfront/accel_ssr.h

new file mode 100644 (file)

index 0000000..1d10f46
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_ssr.h
@@ -0,0 +1,88 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_SSR_H
+#define NETFRONT_ACCEL_SSR_H
+
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/list.h>
+
+#include "accel.h"
+
+/** State for Soft Segment Reassembly (SSR). */
+
+struct netfront_accel_ssr_conn {
+       /** Links this entry into an SSR state's conns or free_conns list. */
+       struct list_head link;
+
+       /** Addresses and ports identifying the connection, stored as
+        * copied from the IP and TCP headers and compared directly
+        * against those header fields.
+        */
+       unsigned saddr, daddr;
+       unsigned short source, dest;
+
+       /** Number of in-order packets we've seen with payload. */
+       unsigned n_in_order_pkts;
+
+       /** Next in-order sequence number. */
+       unsigned next_seq;
+
+       /** Time we last saw a packet on this connection. */
+       unsigned long last_pkt_jiffies;
+
+       /** The SKB we are currently holding.  If NULL, then all following
+        * fields are undefined.
+        */
+       struct sk_buff *skb;
+
+       /** The tail of the frag_list of SKBs we're holding.  Only valid
+        * after at least one merge.
+        */
+       struct sk_buff *skb_tail;
+
+       /** The IP header of the skb we are holding. */
+       struct iphdr *iph;
+       
+       /** The TCP header of the skb we are holding. */
+       struct tcphdr *th;
+};
+
+extern void netfront_accel_ssr_init(struct netfront_accel_ssr_state *st);
+extern void netfront_accel_ssr_fini(netfront_accel_vnic *vnic,
+                                   struct netfront_accel_ssr_state *st);
+
+extern void
+__netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+                                 struct netfront_accel_ssr_state *st);
+
+extern int  netfront_accel_ssr_skb(netfront_accel_vnic *vnic,
+                                  struct netfront_accel_ssr_state *st,
+                                  struct sk_buff *skb);
+
+/*
+ * End-of-burst hook: calls __netfront_accel_ssr_end_of_burst() unless
+ * no connections are being tracked (st->conns is empty).
+ */
+static inline void
+netfront_accel_ssr_end_of_burst(netfront_accel_vnic *vnic,
+                               struct netfront_accel_ssr_state *st)
+{
+       if (list_empty(&st->conns))
+               return;
+
+       __netfront_accel_ssr_end_of_burst(vnic, st);
+}
+
+#endif /* NETFRONT_ACCEL_SSR_H */
diff --git a/drivers/xen/sfc_netfront/accel_tso.c b/drivers/xen/sfc_netfront/accel_tso.c

new file mode 100644 (file)

index 0000000..1133ebb
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_tso.c
@@ -0,0 +1,509 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/pci.h>
+#include <linux/tcp.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+
+#include "accel.h"
+#include "accel_util.h"
+
+#include "accel_tso.h"
+
+/*
+ * Header offset helpers.  ETH_HDR_LEN and SKB_IP_OFF expand to the same
+ * expression (the network header offset); SKB_TCP_OFF is the transport
+ * header offset.
+ */
+#define ETH_HDR_LEN(skb)  skb_network_offset(skb)
+#define SKB_TCP_OFF(skb)  skb_transport_offset(skb)
+#define SKB_IP_OFF(skb)   skb_network_offset(skb)
+
+/*
+ * Set a maximum number of buffers in each output packet to make life
+ * a little simpler - if this is reached it will just move on to
+ * another packet.  It also sizes the on-stack iovec array used when
+ * the packets are handed to the hardware.
+ */
+#define ACCEL_TSO_MAX_BUFFERS (6)
+
+/** TSO State.
+ *
+ * The state used during segmentation.  It is put into this data structure
+ * just to make it easy to pass into inline functions.
+ */
+struct netfront_accel_tso_state {
+       /** bytes of payload we've yet to segment (starts at skb->len
+        *  minus the header length)
+        */
+       unsigned remaining_len;
+
+       /** sequence number for the next output packet, host endian;
+        *  advanced by gso_size each time a packet is started
+        */
+       unsigned seqnum;
+
+       /** remaining payload space in current packet; reset to gso_size
+        *  whenever a new packet is started
+        */
+       unsigned packet_space;
+
+       /** List of packets to be output, containing the buffers and
+        *  iovecs to describe each packet.  New packets are pushed on
+        *  the head, so the list is most-recent-first.
+        */
+       struct netfront_accel_tso_output_packet *output_packets;
+
+       /** Total number of buffers in output_packets */
+       unsigned buffers;
+
+       /** Total number of packets in output_packets */
+       unsigned packets;
+
+       /** Input Fragment Cursor.
+        *
+        * Where we are in the current fragment of the incoming SKB.  These
+        * values get updated in place when we split a fragment over
+        * multiple packets.
+        */
+       struct {
+               /** address of current position */
+               void *addr;
+               /** remaining length of the current fragment, in bytes */
+               unsigned int len;
+       } ifc; /*  == ifc Input Fragment Cursor */
+
+       /** Parameters.
+        *
+        * These values are set once at the start of the TSO send.  All
+        * except ip_id stay fixed; ip_id is incremented for every output
+        * packet that is started.
+        */
+       struct {
+               /* the number of bytes of header (Ethernet + IP + TCP) */
+               unsigned int header_length;
+
+               /* The number of bytes to put in each outgoing segment:
+                * header_length + gso_size.
+                */
+               int full_packet_size;
+               
+               /* Current IP ID, host endian. */
+               unsigned ip_id;
+
+               /* Max size of each output packet payload */
+               int gso_size;
+       } p;
+};
+
+
+/**
+ * Verify that our various assumptions about sk_buffs and the conditions
+ * under which TSO will be attempted hold true.
+ *
+ * Each EPRINTK_ON() argument below is the *failure* condition:
+ *  - skb->protocol is not IPv4;
+ *  - the Ethernet header at skb->data does not carry ETH_P_IP;
+ *  - the IP protocol is not TCP;
+ *  - the TCP header extends beyond the linear (skb_headlen) area.
+ *
+ * Nothing is returned, so the caller carries on whatever the outcome.
+ * NOTE(review): EPRINTK_ON is defined elsewhere; presumably it only logs.
+ *
+ * @v skb             The sk_buff to check.
+ */
+static inline void tso_check_safe(struct sk_buff *skb) {
+       EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
+       EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
+       EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
+       EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
+}
+
+
+
+/**
+ * Parse the SKB header and initialise state.
+ *
+ * Fills in the fixed parameters (header length, segment sizes), the
+ * starting IP ID and TCP sequence number in host byte order, the amount
+ * of payload to segment, and empties the output packet list.  The input
+ * fragment cursor (st->ifc) is NOT set here; the caller does that.
+ *
+ * @v st              TSO state to initialise
+ * @v skb             Socket buffer being segmented
+ */
+static inline void tso_start(struct netfront_accel_tso_state *st, 
+                            struct sk_buff *skb) {
+
+       /*
+        * All ethernet/IP/TCP headers combined size is TCP header size
+        * plus offset of TCP header relative to start of packet.
+        */
+       st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
+       st->p.full_packet_size = (st->p.header_length
+                                 + skb_shinfo(skb)->gso_size);
+       st->p.gso_size = skb_shinfo(skb)->gso_size;
+
+       /*
+        * Both header fields are big-endian on the wire; keep host-order
+        * copies so they can be incremented per output packet.
+        */
+       st->p.ip_id = ntohs(ip_hdr(skb)->id);
+       st->seqnum = ntohl(tcp_hdr(skb)->seq);
+
+       /* Flags that are not expected on a packet handed over for TSO */
+       EPRINTK_ON(tcp_hdr(skb)->urg);
+       EPRINTK_ON(tcp_hdr(skb)->syn);
+       EPRINTK_ON(tcp_hdr(skb)->rst);
+
+       /* Payload only: everything after the combined headers */
+       st->remaining_len = skb->len - st->p.header_length;
+
+       st->output_packets = NULL;
+       st->buffers = 0;
+       st->packets = 0;
+
+       VPRINTK("Starting new TSO: hl %d ps %d gso %d seq %x len %d\n",
+               st->p.header_length, st->p.full_packet_size, st->p.gso_size,
+               st->seqnum, skb->len);
+}
+
+/**
+ * Add another NIC mapped buffer onto an output packet.
+ *
+ * The bookkeeping lives in the tail of the buffer itself: a
+ * netfront_accel_tso_output_packet at offset NETFRONT_ACCEL_TSO_BUF_LENGTH
+ * followed by a netfront_accel_tso_buffer.
+ *
+ * @v vnic            VNIC whose tx buffer pool is used
+ * @v st              TSO state
+ * @v first           non-zero: this buffer also starts a new output
+ *                    packet; zero: append to the current output packet
+ * @ret rc            0 on success, -1 if no buffer was available
+ */
+static inline int tso_start_new_buffer(netfront_accel_vnic *vnic,
+                                      struct netfront_accel_tso_state *st,
+                                      int first)
+{
+       struct netfront_accel_pkt_desc *desc;
+       struct netfront_accel_tso_buffer *meta;
+       struct netfront_accel_tso_output_packet *pkt;
+
+       /* Get a mapped packet buffer */
+       desc = netfront_accel_buf_get(vnic->tx_bufs);
+       if (!desc) {
+               DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+               return -1;
+       }
+
+       /* Per-buffer meta-data sits after the per-packet meta-data slot */
+       meta = (struct netfront_accel_tso_buffer *)
+               (desc->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH
+                + sizeof(struct netfront_accel_tso_output_packet));
+       meta->buf = desc;
+       meta->length = 0;
+
+       if (first) {
+               /* Start an empty output packet and push it on the stack */
+               pkt = (struct netfront_accel_tso_output_packet *)
+                       (desc->pkt_kva + NETFRONT_ACCEL_TSO_BUF_LENGTH);
+               pkt->next = st->output_packets;
+               pkt->tso_bufs = NULL;
+               pkt->tso_bufs_len = 0;
+               st->output_packets = pkt;
+       }
+
+       /* Push the buffer on the head of the current packet's list */
+       pkt = st->output_packets;
+       meta->next = pkt->tso_bufs;
+       pkt->tso_bufs = meta;
+       pkt->tso_bufs_len++;
+
+       BUG_ON(pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
+
+       st->buffers++;
+
+       /*
+        * No skb context yet; only the last buffer of the whole send
+        * gets a non-NULL skb later.
+        */
+       desc->skb = NULL;
+
+       return 0;
+}
+
+
+/* Generate a new header, and prepare for the new packet.
+ *
+ * Allocates the first buffer of a new output packet, copies the original
+ * headers into it and patches the per-segment fields: TCP sequence
+ * number, FIN/PSH flags, IP total length and IP ID.  Both checksums are
+ * zeroed.
+ *
+ * @v vnic           VNIC
+ * @v skb             Socket buffer
+ * @v st               TSO state
+ * @ret rc           0 on success, or -1 if failed to alloc header
+ */
+
+static inline 
+int tso_start_new_packet(netfront_accel_vnic *vnic,
+                        struct sk_buff *skb,
+                        struct netfront_accel_tso_state *st) 
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+       struct iphdr *tsoh_iph;
+       struct tcphdr *tsoh_th;
+       unsigned ip_length;
+
+       /*
+        * fastpath_tx_busy is deliberately not bumped here: every caller
+        * reacts to failure by taking the unwind path of
+        * netfront_accel_enqueue_skb_tso(), which already counts the
+        * event.  Counting here as well recorded one failure twice.
+        */
+       if (tso_start_new_buffer(vnic, st, 1) < 0)
+               return -1;
+
+       /* This has been set up by tso_start_new_buffer() */
+       tso_buf = st->output_packets->tso_bufs;
+
+       /* Copy in the header */
+       memcpy(tso_buf->buf->pkt_kva, skb->data, st->p.header_length);
+       tso_buf->length = st->p.header_length;
+
+       tsoh_th = (struct tcphdr*) 
+               (tso_buf->buf->pkt_kva + SKB_TCP_OFF(skb));
+       tsoh_iph = (struct iphdr*) 
+               (tso_buf->buf->pkt_kva + SKB_IP_OFF(skb));
+
+       /* Set to zero to encourage falcon to fill these in */
+       tsoh_th->check  = 0;
+       tsoh_iph->check = 0;
+
+       tsoh_th->seq = htonl(st->seqnum);
+       st->seqnum += st->p.gso_size;
+
+       if (st->remaining_len > st->p.gso_size) {
+               /* This packet will not finish the TSO burst. */
+               ip_length = st->p.full_packet_size - ETH_HDR_LEN(skb);
+               tsoh_th->fin = 0;
+               tsoh_th->psh = 0;
+       } else {
+               /* This packet will be the last in the TSO burst. */
+               ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
+                            + st->remaining_len);
+               tsoh_th->fin = tcp_hdr(skb)->fin;
+               tsoh_th->psh = tcp_hdr(skb)->psh;
+       }
+
+       tsoh_iph->tot_len = htons(ip_length);
+
+       /* Linux leaves suitable gaps in the IP ID space for us to fill. */
+       tsoh_iph->id = htons(st->p.ip_id++);
+
+       /* A fresh packet may carry up to gso_size bytes of payload */
+       st->packet_space = st->p.gso_size; 
+
+       st->packets++;
+
+       return 0;
+}
+
+
+
+/*
+ * Point the input fragment cursor at a new source region of len bytes
+ * starting at addr.
+ */
+static inline void tso_get_fragment(struct netfront_accel_tso_state *st, 
+                                   int len, void *addr)
+{
+       st->ifc.addr = addr;
+       st->ifc.len = len;
+}
+
+
+/*
+ * Abandon a partially built TSO send: pop every output packet off the
+ * state and return each of its buffers to the VNIC's tx buffer pool.
+ * On exit st->buffers must be back at zero.
+ */
+static inline void tso_unwind(netfront_accel_vnic *vnic, 
+                             struct netfront_accel_tso_state *st)
+{
+       struct netfront_accel_tso_output_packet *pkt;
+       struct netfront_accel_tso_buffer *cur;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       for (pkt = st->output_packets; pkt != NULL;
+            pkt = st->output_packets) {
+               /* Unlink the packet before its buffers are released */
+               st->output_packets = pkt->next;
+
+               for (cur = pkt->tso_bufs; cur != NULL;
+                    cur = pkt->tso_bufs) {
+                       /* Unlink first: the link lives in the buffer */
+                       pkt->tso_bufs = cur->next;
+                       pkt->tso_bufs_len--;
+                       st->buffers--;
+
+                       netfront_accel_buf_put(vnic->tx_bufs,
+                                              cur->buf->buf_id);
+               }
+       }
+
+       BUG_ON(st->buffers != 0);
+}
+
+
+
+/*
+ * Copy as much as possible of the current input fragment into the
+ * newest buffer of the newest output packet.  The amount copied is
+ * limited by what is left of the fragment, the payload space left in
+ * the packet and the room left in the buffer; it may be nothing at all.
+ */
+static inline
+void tso_fill_packet_with_fragment(netfront_accel_vnic *vnic,
+                                  struct netfront_accel_tso_state *st) 
+{
+       struct netfront_accel_tso_buffer *dst;
+       int chunk, room;
+
+       BUG_ON(st->output_packets == NULL);
+       BUG_ON(st->output_packets->tso_bufs == NULL);
+
+       dst = st->output_packets->tso_bufs;
+
+       /* Nothing to copy, or nowhere to put it */
+       if (st->ifc.len == 0 || st->packet_space == 0 ||
+           dst->length == NETFRONT_ACCEL_TSO_BUF_LENGTH)
+               return;
+
+       chunk = min(st->ifc.len, st->packet_space);
+       room = NETFRONT_ACCEL_TSO_BUF_LENGTH - dst->length;
+       chunk = min(chunk, room);
+
+       memcpy(dst->buf->pkt_kva + dst->length, st->ifc.addr, chunk);
+       dst->length += chunk;
+
+       /* Account for the bytes just consumed */
+       st->ifc.addr += chunk;
+       st->ifc.len -= chunk;
+       st->packet_space -= chunk;
+       st->remaining_len -= chunk;
+
+       BUG_ON(dst->length > NETFRONT_ACCEL_TSO_BUF_LENGTH);
+}
+
+
+/*
+ * Segment a TSO skb in software and queue the resulting packets on the
+ * VNIC's virtual interface.
+ *
+ * The payload is copied into buffers from vnic->tx_bufs; each output
+ * packet starts with a patched copy of the original headers.  Nothing
+ * is handed to the hardware until every packet has been built and the
+ * TX ring is known to have room for all buffers.
+ *
+ * Returns NETFRONT_ACCEL_STATUS_GOOD once all packets have been passed
+ * to ef_vi_transmitv(), or NETFRONT_ACCEL_STATUS_BUSY (with every
+ * buffer released again) if a buffer could not be obtained or the TX
+ * ring is too full.
+ */
+int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
+                                  struct sk_buff *skb)
+{
+       struct netfront_accel_tso_state state;
+       struct netfront_accel_tso_buffer *tso_buf = NULL;
+       struct netfront_accel_tso_output_packet *reversed_list = NULL;
+       struct netfront_accel_tso_output_packet *tmp_pkt;
+       ef_iovec iovecs[ACCEL_TSO_MAX_BUFFERS];
+       int frag_i, rc, dma_id;
+       skb_frag_t *f;
+
+       tso_check_safe(skb);
+
+       if (skb->ip_summed != CHECKSUM_PARTIAL)
+               EPRINTK("Trying to TSO send a packet without HW checksum\n");
+
+       tso_start(&state, skb);
+
+       /*
+        * Setup the first payload fragment.  If the skb header area
+        * contains exactly the headers and all payload is in the frag
+        * list things are a little simpler
+        */
+       if (skb_headlen(skb) == state.p.header_length) {
+               /* Grab the first payload fragment. */
+               BUG_ON(skb_shinfo(skb)->nr_frags < 1);
+               frag_i = 0;
+               f = &skb_shinfo(skb)->frags[frag_i];
+               tso_get_fragment(&state, skb_frag_size(f),
+                                page_address(skb_frag_page(f)) + f->page_offset);
+       } else {
+               /*
+                * Payload starts in the linear area, right after the
+                * headers.  frag_i == -1 makes the ++frag_i in the loop
+                * below land on frags[0] once this part is consumed.
+                */
+               int hl = state.p.header_length;
+               tso_get_fragment(&state,  skb_headlen(skb) - hl, 
+                                skb->data + hl);
+               frag_i = -1;
+       }
+
+       if (tso_start_new_packet(vnic, skb, &state) < 0) {
+               DPRINTK("%s: out of first start-packet memory\n",
+                       __FUNCTION__);
+               goto unwind;
+       }
+
+       /*
+        * Main copy loop: fill, then advance whichever of fragment,
+        * buffer or packet has been exhausted.
+        */
+       while (1) {
+               tso_fill_packet_with_fragment(vnic, &state);
+               
+               /* Move onto the next fragment? */
+               if (state.ifc.len == 0) {
+                       if (++frag_i >= skb_shinfo(skb)->nr_frags)
+                               /* End of payload reached. */
+                               break;
+                       f = &skb_shinfo(skb)->frags[frag_i];
+                       tso_get_fragment(&state, skb_frag_size(f),
+                                        page_address(skb_frag_page(f)) +
+                                        f->page_offset);
+               }
+
+               /*
+                * Start a new buffer?
+                *
+                * NOTE(review): if the current buffer becomes exactly
+                * full at the same moment packet_space reaches 0, this
+                * appears to add an empty buffer to the finished packet
+                * before the next packet is started below - confirm
+                * whether a zero-length iovec is acceptable.
+                */
+               if ((state.output_packets->tso_bufs->length == 
+                    NETFRONT_ACCEL_TSO_BUF_LENGTH) &&
+                   tso_start_new_buffer(vnic, &state, 0)) {
+                       DPRINTK("%s: out of start-buffer memory\n",
+                               __FUNCTION__);
+                       goto unwind;
+               }
+
+               /*
+                * Start a new packet?  Either the segment is full, or
+                * the packet already uses the maximum number of buffers
+                * and the newest one is full.
+                */
+               if ((state.packet_space == 0 || 
+                    ((state.output_packets->tso_bufs_len >=
+                      ACCEL_TSO_MAX_BUFFERS) &&
+                     (state.output_packets->tso_bufs->length >= 
+                      NETFRONT_ACCEL_TSO_BUF_LENGTH))) &&
+                   tso_start_new_packet(vnic, skb, &state) < 0) {
+                       DPRINTK("%s: out of start-packet memory\n",
+                               __FUNCTION__);
+                       goto unwind;
+               }
+
+       }
+
+       /* Check for space: one TX descriptor is needed per buffer */
+       if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
+               DPRINTK("%s: Not enough TX space (%d)\n",
+                       __FUNCTION__, state.buffers);
+               goto unwind;
+       }
+
+       /*
+        * Store the skb context in the most recent buffer (i.e. the
+        * last buffer that will be sent)
+        */
+       state.output_packets->tso_bufs->buf->skb = skb;
+
+       /* Reverse the list of packets as we construct it on a stack */
+       while (state.output_packets != NULL) {
+               tmp_pkt = state.output_packets;
+               state.output_packets = tmp_pkt->next;
+               tmp_pkt->next = reversed_list;
+               reversed_list = tmp_pkt;
+       }
+
+       /* Pass off to hardware, oldest packet first */
+       while (reversed_list != NULL) {
+               tmp_pkt = reversed_list;
+               reversed_list = tmp_pkt->next;
+
+               BUG_ON(tmp_pkt->tso_bufs_len > ACCEL_TSO_MAX_BUFFERS);
+               BUG_ON(tmp_pkt->tso_bufs_len == 0);
+
+               /* The id of the packet's newest (last sent) buffer */
+               dma_id = tmp_pkt->tso_bufs->buf->buf_id;
+
+               /*
+                * Make an iovec of the buffers in the list, reversing
+                * the buffers as we go as they are constructed on a
+                * stack
+                */
+               tso_buf = tmp_pkt->tso_bufs;
+               for (frag_i = tmp_pkt->tso_bufs_len - 1;
+                    frag_i >= 0;
+                    frag_i--) {
+                       iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
+                       iovecs[frag_i].iov_len = tso_buf->length;
+                       tso_buf = tso_buf->next;
+               }
+
+               rc = ef_vi_transmitv(&vnic->vi, iovecs, tmp_pkt->tso_bufs_len,
+                                    dma_id);
+               /*
+                * We checked for space already, so it really should
+                * succeed
+                */
+               BUG_ON(rc != 0);
+       }
+
+       /* Track number of tx fastpath stats */
+       vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+       vnic->netdev_stats.fastpath_tx_pkts += state.packets;
+#if NETFRONT_ACCEL_STATS
+       {
+               /* High-water mark of packets sent but not yet completed */
+               unsigned n;
+               n = vnic->netdev_stats.fastpath_tx_pkts -
+                       vnic->stats.fastpath_tx_completions;
+               if (n > vnic->stats.fastpath_tx_pending_max)
+                       vnic->stats.fastpath_tx_pending_max = n;
+       }
+#endif
+
+       return NETFRONT_ACCEL_STATUS_GOOD;
+ 
+ unwind:
+       /* Give back every buffer taken so far; nothing was transmitted */
+       tso_unwind(vnic, &state);
+
+       NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+
+       return NETFRONT_ACCEL_STATUS_BUSY;
+}
+
+
+
diff --git a/drivers/xen/sfc_netfront/accel_tso.h b/drivers/xen/sfc_netfront/accel_tso.h

new file mode 100644 (file)

index 0000000..b9c3ca8
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_tso.h
@@ -0,0 +1,57 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETFRONT_ACCEL_TSO_H
+#define NETFRONT_ACCEL_TSO_H
+
+#include "accel_bufs.h"
+
+/* Track the buffers used in each output packet */
+struct netfront_accel_tso_buffer {
+       /* Next buffer in the same list (singly linked) */
+       struct netfront_accel_tso_buffer *next;
+       /* The packet buffer descriptor this entry describes */
+       struct netfront_accel_pkt_desc *buf;
+       /* Number of data bytes currently stored in the buffer */
+       unsigned length;
+};
+
+/* Track the output packets formed from each input packet */
+struct netfront_accel_tso_output_packet {
+       /* Next output packet in the list (singly linked) */
+       struct netfront_accel_tso_output_packet *next;
+       /* Head of the list of buffers making up this packet */
+       struct netfront_accel_tso_buffer *tso_bufs;
+       /* Number of entries on the tso_bufs list */
+       unsigned tso_bufs_len;
+};
+
+
+/*
+ * Max available space in a buffer for data once meta-data has taken
+ * its place.  Each buffer is PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE
+ * bytes; room for one netfront_accel_tso_buffer and one
+ * netfront_accel_tso_output_packet is reserved out of that.
+ */
+#define NETFRONT_ACCEL_TSO_BUF_LENGTH                                  \
+       ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)                     \
+        - sizeof(struct netfront_accel_tso_buffer)                     \
+        - sizeof(struct netfront_accel_tso_output_packet))
+
+/*
+ * Segment a TSO skb and queue the resulting packets for transmission on
+ * the VNIC.  Returns a NETFRONT_ACCEL_STATUS_* code: GOOD when queued,
+ * BUSY when buffers or TX ring space ran out (nothing is sent then).
+ */
+int netfront_accel_enqueue_skb_tso(netfront_accel_vnic *vnic,
+                                  struct sk_buff *skb);
+
+#endif /* NETFRONT_ACCEL_TSO_H */
diff --git a/drivers/xen/sfc_netfront/accel_vi.c b/drivers/xen/sfc_netfront/accel_vi.c

new file mode 100644 (file)

index 0000000..987a9f4
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_vi.c
@@ -0,0 +1,1203 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <asm/io.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_bufs.h"
+#include "accel_tso.h"
+#include "accel_ssr.h"
+#include "netfront.h"
+
+#include "etherfabric/ef_vi.h"
+
+/*
+ * Max available space in a buffer for data once meta-data has taken
+ * its place.  Only a netfront_accel_tso_buffer is reserved at the end
+ * of each PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE sized buffer here.
+ */
+#define NETFRONT_ACCEL_TX_BUF_LENGTH                                   \
+       ((PAGE_SIZE / NETFRONT_ACCEL_BUFS_PER_PAGE)                     \
+        - sizeof(struct netfront_accel_tso_buffer))
+
+/* Bound on the number of buffers used for one multi-buffer transmit */
+#define ACCEL_TX_MAX_BUFFERS (6)
+/* NOTE(review): presumably the number of events fetched per poll - confirm at its use */
+#define ACCEL_VI_POLL_EVENTS (8)
+
+/*
+ * Combined set-up and tear-down of the virtual interface (VI).
+ *
+ * With a non-NULL hw_msg the hardware resources described by the
+ * message are mapped (event queue, doorbell page, TX/RX DMA queues) and
+ * the ef_vi state is allocated and initialised.  On any failure the
+ * steps already completed are undone via the label ladder at the end.
+ *
+ * With hw_msg == NULL the function jumps straight to the top of that
+ * ladder ("fini") and so releases everything a successful init set up.
+ *
+ * Returns 0 only for a successful init; every other path, including
+ * the hw_msg == NULL tear-down, returns -EIO.
+ */
+static
+int netfront_accel_vi_init_fini(netfront_accel_vnic *vnic, 
+                               struct net_accel_msg_hw *hw_msg)
+{
+       struct ef_vi_nic_type nic_type;
+       struct net_accel_hw_falcon_b *hw_info;
+       void *io_kva, *evq_base, *rx_dma_kva, *tx_dma_kva, *doorbell_kva;
+       u32 *evq_gnts;
+       u32 evq_order;
+       int vi_state_size;
+       u8 vi_data[VI_MAPPINGS_SIZE];
+
+       /* NULL message selects the tear-down path */
+       if (hw_msg == NULL)
+               goto fini;
+
+       /* And create the local macs table lock */
+       spin_lock_init(&vnic->table_lock);
+       
+       /*
+        * Create fastpath table, initial size 8, key length 8.
+        * NOTE(review): the 3 is presumably log2 of the initial size -
+        * confirm against cuckoo_hash_init().
+        */
+       if (cuckoo_hash_init(&vnic->fastpath_table, 3, 8)) {
+               EPRINTK("failed to allocate fastpath table\n");
+               goto fail_cuckoo;
+       }
+
+       /* Remembered so the tear-down path knows what was mapped */
+       vnic->hw.falcon.type = hw_msg->type;
+
+       switch (hw_msg->type) {
+       case NET_ACCEL_MSG_HWTYPE_FALCON_A:
+               hw_info = &hw_msg->resources.falcon_a.common;
+               /* Need the extra rptr register page on A1 */
+               io_kva = net_accel_map_iomem_page
+                       (vnic->dev, hw_msg->resources.falcon_a.evq_rptr_gnt,
+                        &vnic->hw.falcon.evq_rptr_mapping);
+               if (io_kva == NULL) {
+                       EPRINTK("%s: evq_rptr permission failed\n", __FUNCTION__);
+                       goto evq_rptr_fail;
+               }
+
+               /* Register sits at evq_rptr's offset within the mapped page */
+               vnic->hw.falcon.evq_rptr = io_kva + 
+                       (hw_info->evq_rptr & (PAGE_SIZE - 1));
+               break;
+       case NET_ACCEL_MSG_HWTYPE_FALCON_B:
+       case NET_ACCEL_MSG_HWTYPE_SIENA_A:
+               hw_info = &hw_msg->resources.falcon_b;
+               break;
+       default:
+               goto bad_type;
+       }
+
+       /**** Event Queue ****/
+
+       /* Map the event queue pages */
+       evq_gnts = hw_info->evq_mem_gnts;
+       evq_order = hw_info->evq_order;
+
+       EPRINTK_ON(hw_info->evq_offs != 0);
+
+       DPRINTK("Will map evq %d pages\n", 1 << evq_order);
+
+       evq_base =
+               net_accel_map_grants_contig(vnic->dev, evq_gnts, 1 << evq_order,
+                                           &vnic->evq_mapping);
+       if (evq_base == NULL) {
+               EPRINTK("%s: evq_base failed\n", __FUNCTION__);
+               goto evq_fail;
+       }
+
+       /**** Doorbells ****/
+       /* Set up the doorbell mappings. */
+       doorbell_kva = 
+               net_accel_map_iomem_page(vnic->dev, hw_info->doorbell_gnt,
+                                        &vnic->hw.falcon.doorbell_mapping);
+       if (doorbell_kva == NULL) {
+               EPRINTK("%s: doorbell permission failed\n", __FUNCTION__);
+               goto doorbell_fail;
+       }
+       vnic->hw.falcon.doorbell = doorbell_kva;
+
+       /* On Falcon_B and Siena we get the rptr from the doorbell page */
+       if (hw_msg->type == NET_ACCEL_MSG_HWTYPE_FALCON_B ||
+           hw_msg->type == NET_ACCEL_MSG_HWTYPE_SIENA_A) {
+               vnic->hw.falcon.evq_rptr = 
+                       (u32 *)((char *)vnic->hw.falcon.doorbell 
+                               + hw_info->evq_rptr);
+       }
+
+       /**** DMA Queue ****/
+
+       /* Set up the DMA Queues from the message. */
+       tx_dma_kva = net_accel_map_grants_contig
+               (vnic->dev, &(hw_info->txdmaq_gnt), 1, 
+                &vnic->hw.falcon.txdmaq_mapping);
+       if (tx_dma_kva == NULL) {
+               EPRINTK("%s: TX dma failed\n", __FUNCTION__);
+               goto tx_dma_fail;
+       }
+
+       rx_dma_kva = net_accel_map_grants_contig
+               (vnic->dev, &(hw_info->rxdmaq_gnt), 1, 
+                &vnic->hw.falcon.rxdmaq_mapping);
+       if (rx_dma_kva == NULL) {
+               EPRINTK("%s: RX dma failed\n", __FUNCTION__);
+               goto rx_dma_fail;
+       }
+
+       /* Full confession */
+       DPRINTK("Mapped H/W"
+               "  Tx DMAQ grant %x -> %p\n"
+               "  Rx DMAQ grant %x -> %p\n"
+               "  EVQ grant %x -> %p\n",
+               hw_info->txdmaq_gnt, tx_dma_kva,
+               hw_info->rxdmaq_gnt, rx_dma_kva,
+               evq_gnts[0], evq_base
+               );
+
+       /* Scratch area that the two ef_vi_init_mapping_*() calls fill in */
+       memset(vi_data, 0, sizeof(vi_data));
+       
+       /* TODO BUG11305: convert efhw_arch to ef_vi_arch
+        * e.g.
+        * arch = ef_vi_arch_from_efhw_arch(hw_info->nic_arch);
+        * assert(arch >= 0);
+        * nic_type.arch = arch;
+        */
+       nic_type.arch = (unsigned char)hw_info->nic_arch;
+       nic_type.variant = (char)hw_info->nic_variant;
+       nic_type.revision = (unsigned char)hw_info->nic_revision;
+       
+       /*
+        * Event queue size in bytes is (1 << evq_order) pages.
+        * NOTE(review): the final 0xdeadbeef argument looks like a
+        * deliberate poison value; its meaning is defined by
+        * ef_vi_init_mapping_evq() - confirm.
+        */
+       ef_vi_init_mapping_evq(vi_data, nic_type, hw_info->instance, 
+                              1 << (evq_order + PAGE_SHIFT), evq_base, 
+                              (void *)0xdeadbeef);
+
+       ef_vi_init_mapping_vi(vi_data, nic_type, hw_info->rx_capacity, 
+                             hw_info->tx_capacity, hw_info->instance, 
+                             doorbell_kva, rx_dma_kva, tx_dma_kva, 0);
+
+       /* VI state is sized from the RX and TX ring capacities */
+       vi_state_size = ef_vi_calc_state_bytes(hw_info->rx_capacity,
+                                              hw_info->tx_capacity);
+       vnic->vi_state = (ef_vi_state *)kmalloc(vi_state_size, GFP_KERNEL);
+       if (vnic->vi_state == NULL) {
+               EPRINTK("%s: kmalloc for VI state failed\n", __FUNCTION__);
+               goto vi_state_fail;
+       }
+       ef_vi_init(&vnic->vi, vi_data, vnic->vi_state, &vnic->evq_state, 0);
+
+       ef_eventq_state_init(&vnic->vi);
+
+       ef_vi_state_init(&vnic->vi);
+
+       return 0;
+
+       /*
+        * Tear-down ladder: each label undoes one set-up step and falls
+        * through to the labels for the steps that preceded it.
+        */
+fini:
+       kfree(vnic->vi_state);
+       vnic->vi_state = NULL;
+vi_state_fail:
+       net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.rxdmaq_mapping);
+rx_dma_fail:
+       net_accel_unmap_grants_contig(vnic->dev, vnic->hw.falcon.txdmaq_mapping);
+tx_dma_fail:
+       net_accel_unmap_iomem_page(vnic->dev, vnic->hw.falcon.doorbell_mapping);
+       vnic->hw.falcon.doorbell = NULL;
+doorbell_fail:
+       net_accel_unmap_grants_contig(vnic->dev, vnic->evq_mapping);
+evq_fail:
+       /* Only Falcon A mapped a separate rptr register page */
+       if (vnic->hw.falcon.type == NET_ACCEL_MSG_HWTYPE_FALCON_A)
+               net_accel_unmap_iomem_page(vnic->dev, 
+                                          vnic->hw.falcon.evq_rptr_mapping);
+       vnic->hw.falcon.evq_rptr = NULL;
+evq_rptr_fail:
+bad_type:
+       cuckoo_hash_destroy(&vnic->fastpath_table);
+fail_cuckoo:
+       return -EIO;
+}
+
+
+/*
+ * Constructor: no resources are taken here.  A NULL vi_state is what
+ * netfront_accel_vi_dtor() tests to decide whether there is anything
+ * to tear down.
+ */
+void netfront_accel_vi_ctor(netfront_accel_vnic *vnic)
+{
+       /* Just mark the VI as uninitialised. */
+       vnic->vi_state = NULL;
+}
+
+
+/*
+ * Initialise the VI from the hardware description in hw_msg.
+ * Returns whatever netfront_accel_vi_init_fini() returns.
+ */
+int netfront_accel_vi_init(netfront_accel_vnic *vnic,
+                          struct net_accel_msg_hw *hw_msg)
+{
+       /* A NULL message would select the helper's tear-down path */
+       BUG_ON(!hw_msg);
+
+       return netfront_accel_vi_init_fini(vnic, hw_msg);
+}
+
+
+/*
+ * Destructor: release the VI resources, but only if the VI state was
+ * ever allocated.
+ */
+void netfront_accel_vi_dtor(netfront_accel_vnic *vnic)
+{
+       if (vnic->vi_state == NULL)
+               return;
+
+       /* NULL message selects the tear-down path of the helper */
+       netfront_accel_vi_init_fini(vnic, NULL);
+}
+
+
+/*
+ * Queue one receive buffer on the VI and push the queued descriptors
+ * to the card when either a full batch has accumulated or fewer than a
+ * batch's worth of descriptors have been posted in total.
+ */
+static
+void netfront_accel_vi_post_rx(netfront_accel_vnic *vnic, u16 id,
+                              netfront_accel_pkt_desc *buf)
+{
+       /* Count of descriptors queued but not yet pushed, including this one */
+       int pending = vnic->rx_dma_batched + 1;
+
+#if 0
+       VPRINTK("Posting buffer %d (0x%08x) for rx at index %d, space is %d\n",
+               id, buf->pkt_buff_addr, pending - 1,
+               ef_vi_receive_space(&vnic->vi));
+#endif
+       /* Set up a virtual buffer descriptor */
+       ef_vi_receive_init(&vnic->vi, buf->pkt_buff_addr, id,
+                          /*rx_bytes=max*/0);
+
+       vnic->rx_dma_level++;
+
+       /*
+        * Batch the pushes: descriptors may sit unpushed for a while as
+        * long as plenty are already available to the card.
+        */
+       if (pending >= NETFRONT_ACCEL_RX_DESC_BATCH ||
+           vnic->rx_dma_level < NETFRONT_ACCEL_RX_DESC_BATCH) {
+#if 0
+               VPRINTK("Flushing %d rx descriptors.\n", pending);
+#endif
+               /* Push buffer to hardware */
+               ef_vi_receive_push(&vnic->vi);
+               pending = 0;
+       }
+
+       vnic->rx_dma_batched = pending;
+}
+
+
+/*
+ * Re-post a completed receive buffer if the RX ring has room for it,
+ * otherwise return it to the rx buffer pool.  After a successful
+ * re-post, keep topping the ring up from the pool until the ring is
+ * full or the pool is empty.
+ */
+inline
+void netfront_accel_vi_post_rx_or_free(netfront_accel_vnic *vnic, u16 id,
+                                      netfront_accel_pkt_desc *buf)
+{
+       netfront_accel_pkt_desc *extra;
+
+       VPRINTK("%s: %d\n", __FUNCTION__, id);
+
+       if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched) {
+               VPRINTK("RX space is full\n");
+               netfront_accel_buf_put(vnic->rx_bufs, id);
+               return;
+       }
+
+       VPRINTK("Completed buffer %d is reposted\n", id);
+       netfront_accel_vi_post_rx(vnic, id, buf);
+
+       /* Opportunistically push more buffers out to the NIC */
+       for (;;) {
+               if (ef_vi_receive_space(&vnic->vi) <= vnic->rx_dma_batched)
+                       break;
+
+               extra = netfront_accel_buf_get(vnic->rx_bufs);
+               if (extra == NULL)
+                       break;
+
+               netfront_accel_vi_post_rx(vnic, extra->buf_id, extra);
+       }
+}
+
+
+/*
+ * Fill the RX ring from the rx buffer pool until the ring is full or
+ * the pool runs dry.  Does nothing except trace when is_rx is zero.
+ */
+void netfront_accel_vi_add_bufs(netfront_accel_vnic *vnic, int is_rx)
+{
+       netfront_accel_pkt_desc *desc;
+
+       if (is_rx) {
+               while (ef_vi_receive_space(&vnic->vi) >
+                      vnic->rx_dma_batched) {
+                       VPRINTK("%s: %d\n", __FUNCTION__, vnic->rx_dma_level);
+
+                       desc = netfront_accel_buf_get(vnic->rx_bufs);
+                       if (desc == NULL)
+                               break;
+
+                       /* Add it to the rx dma queue. */
+                       netfront_accel_vi_post_rx(vnic, desc->buf_id, desc);
+               }
+       }
+
+       VPRINTK("%s: done\n", __FUNCTION__);
+}
+
+
+/*
+ * Working state for sending one skb as several buffers.  Initialised by
+ * multi_post_start().
+ */
+struct netfront_accel_multi_state {
+       /* Bytes still to be handled; starts at skb->len */
+       unsigned remaining_len;
+
+       /* Number of entries on output_buffers */
+       unsigned buffers;
+
+       /* Buffers obtained so far; new ones are pushed on the head */
+       struct netfront_accel_tso_buffer *output_buffers;
+
+       /* Where we are in the current fragment of the SKB. */
+       struct {
+               /* address of current position */
+               void *addr;
+               /* remaining length */    
+               unsigned int len;
+       } ifc; /*  == Input Fragment Cursor */
+};
+
+
+static inline void multi_post_start(struct netfront_accel_multi_state *st, 
+                                   struct sk_buff *skb)
+{
+       st->remaining_len = skb->len;
+       st->output_buffers = NULL;
+       st->buffers = 0;
+       st->ifc.len = skb_headlen(skb);
+       st->ifc.addr = skb->data;
+}
+
+static int multi_post_start_new_buffer(netfront_accel_vnic *vnic, 
+                                      struct netfront_accel_multi_state *st)
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+       struct netfront_accel_pkt_desc *buf;
+
+       /* Get a mapped packet buffer */
+       buf = netfront_accel_buf_get(vnic->tx_bufs);
+       if (buf == NULL) {
+               DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+               return -1;
+       }
+
+       /* Store a bit of meta-data at the end */
+       tso_buf = (struct netfront_accel_tso_buffer *)
+               (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+       tso_buf->buf = buf;
+
+       tso_buf->length = 0;
+       
+       tso_buf->next = st->output_buffers;
+       st->output_buffers = tso_buf;
+       st->buffers++;
+
+       BUG_ON(st->buffers >= ACCEL_TX_MAX_BUFFERS);
+
+       /*
+        * Store the context, set to NULL, last packet buffer will get
+        * non-NULL later
+        */
+       tso_buf->buf->skb = NULL;
+       
+       return 0;
+}
+
+
+static void
+multi_post_fill_buffer_with_fragment(netfront_accel_vnic *vnic,
+                                    struct netfront_accel_multi_state *st)
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+       unsigned n, space;
+
+       BUG_ON(st->output_buffers == NULL);
+       tso_buf = st->output_buffers;
+
+       if (st->ifc.len == 0) return;
+       if (tso_buf->length == NETFRONT_ACCEL_TX_BUF_LENGTH) return;
+
+       BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+       space = NETFRONT_ACCEL_TX_BUF_LENGTH - tso_buf->length;
+       n = min(st->ifc.len, space);
+
+       memcpy(tso_buf->buf->pkt_kva + tso_buf->length, st->ifc.addr, n);
+
+       st->remaining_len -= n;
+       st->ifc.len -= n;
+       tso_buf->length += n;
+       st->ifc.addr += n;
+
+       BUG_ON(tso_buf->length > NETFRONT_ACCEL_TX_BUF_LENGTH);
+
+       return;
+}
+
+
+static inline void multi_post_unwind(netfront_accel_vnic *vnic,
+                                    struct netfront_accel_multi_state *st)
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       while (st->output_buffers != NULL) {
+               tso_buf = st->output_buffers;
+               st->output_buffers = tso_buf->next;
+               st->buffers--;
+               netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id);
+       }
+       BUG_ON(st->buffers != 0);
+}
+
+
+static enum netfront_accel_post_status
+netfront_accel_enqueue_skb_multi(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+       struct netfront_accel_multi_state state;
+       ef_iovec iovecs[ACCEL_TX_MAX_BUFFERS];
+       skb_frag_t *f;
+       int frag_i, rc, dma_id;
+
+       multi_post_start(&state, skb);
+
+       frag_i = -1;
+
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               /* Set to zero to encourage falcon to work it out for us */
+               *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
+       }
+
+       if (multi_post_start_new_buffer(vnic, &state)) {
+               DPRINTK("%s: out of buffers\n", __FUNCTION__);
+               goto unwind;
+       }
+
+       while (1) {
+               multi_post_fill_buffer_with_fragment(vnic, &state);
+
+               /* Move onto the next fragment? */
+               if (state.ifc.len == 0) {
+                       if (++frag_i >= skb_shinfo(skb)->nr_frags)
+                               /* End of payload reached. */
+                               break;
+                       f = &skb_shinfo(skb)->frags[frag_i];
+                       state.ifc.len = skb_frag_size(f);
+                       state.ifc.addr = page_address(skb_frag_page(f))
+                                        + f->page_offset;
+               }
+
+               /* Start a new buffer? */
+               if ((state.output_buffers->length == 
+                    NETFRONT_ACCEL_TX_BUF_LENGTH) &&
+                   multi_post_start_new_buffer(vnic, &state)) {
+                       DPRINTK("%s: out of buffers\n", __FUNCTION__);
+                       goto unwind;
+               }
+       }
+
+       /* Check for space */
+       if (ef_vi_transmit_space(&vnic->vi) < state.buffers) {
+               DPRINTK("%s: Not enough TX space (%d)\n", __FUNCTION__, state.buffers);
+               goto unwind;
+       }
+
+       /* Store the skb in what will be the last buffer's context */
+       state.output_buffers->buf->skb = skb;
+       /* Remember dma_id of what will be the last buffer */ 
+       dma_id = state.output_buffers->buf->buf_id;
+
+       /*
+        * Make an iovec of the buffers in the list, reversing the
+        * buffers as we go as they are constructed on a stack
+        */
+       tso_buf = state.output_buffers;
+       for (frag_i = state.buffers-1; frag_i >= 0; frag_i--) {
+               iovecs[frag_i].iov_base = tso_buf->buf->pkt_buff_addr;
+               iovecs[frag_i].iov_len = tso_buf->length;
+               tso_buf = tso_buf->next;
+       }
+       
+       rc = ef_vi_transmitv(&vnic->vi, iovecs, state.buffers, dma_id);
+       /* We checked for space already, so it really should succeed */
+       BUG_ON(rc != 0);
+
+       /* Track number of tx fastpath stats */
+       vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+       vnic->netdev_stats.fastpath_tx_pkts ++;
+#if NETFRONT_ACCEL_STATS
+       {
+               u32 n;
+               n = vnic->netdev_stats.fastpath_tx_pkts -
+                       (u32)vnic->stats.fastpath_tx_completions;
+               if (n > vnic->stats.fastpath_tx_pending_max)
+                       vnic->stats.fastpath_tx_pending_max = n;
+       }
+#endif
+       return NETFRONT_ACCEL_STATUS_GOOD;
+
+unwind:
+       multi_post_unwind(vnic, &state);
+
+       NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+
+       return NETFRONT_ACCEL_STATUS_BUSY;
+}
+
+
+static enum netfront_accel_post_status 
+netfront_accel_enqueue_skb_single(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+       struct netfront_accel_tso_buffer *tso_buf;
+       struct netfront_accel_pkt_desc *buf;
+       u8 *kva;
+       int rc;
+
+       if (ef_vi_transmit_space(&vnic->vi) < 1) {
+               DPRINTK("%s: No TX space\n", __FUNCTION__);
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+               return NETFRONT_ACCEL_STATUS_BUSY;
+       }
+
+       buf = netfront_accel_buf_get(vnic->tx_bufs);
+       if (buf == NULL) {
+               DPRINTK("%s: No buffer for TX\n", __FUNCTION__);
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_busy++);
+               return NETFRONT_ACCEL_STATUS_BUSY;
+       }
+
+       /* Track number of tx fastpath stats */
+       vnic->netdev_stats.fastpath_tx_pkts++;
+       vnic->netdev_stats.fastpath_tx_bytes += skb->len;
+
+#if NETFRONT_ACCEL_STATS
+       {
+               u32 n;
+               n = vnic->netdev_stats.fastpath_tx_pkts - 
+                       (u32)vnic->stats.fastpath_tx_completions;
+               if (n > vnic->stats.fastpath_tx_pending_max)
+                       vnic->stats.fastpath_tx_pending_max = n;
+       }
+#endif
+       
+       /* Store the context */
+       buf->skb = skb;
+       
+       kva = buf->pkt_kva;
+
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               /* Set to zero to encourage falcon to work it out for us */
+               *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
+       }
+       NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
+               (skb, idx, frag_data, frag_len, {
+                       /* Copy in payload */
+                       VPRINTK("*** Copying %d bytes to %p\n", frag_len, kva);
+                       memcpy(kva, frag_data, frag_len);
+                       kva += frag_len;
+               });
+
+       VPRINTK("%s: id %d pkt %p kva %p buff_addr 0x%08x\n", __FUNCTION__,
+               buf->buf_id, buf, buf->pkt_kva, buf->pkt_buff_addr);
+
+
+       /* Set up the TSO meta-data for a single buffer/packet */
+       tso_buf = (struct netfront_accel_tso_buffer *)
+               (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+       tso_buf->next = NULL;
+       tso_buf->buf = buf;
+       tso_buf->length = skb->len;
+
+       rc = ef_vi_transmit(&vnic->vi, buf->pkt_buff_addr, skb->len,
+                           buf->buf_id);
+       /* We checked for space already, so it really should succeed */
+       BUG_ON(rc != 0);
+
+       return NETFRONT_ACCEL_STATUS_GOOD;
+}
+
+
+enum netfront_accel_post_status 
+netfront_accel_vi_tx_post(netfront_accel_vnic *vnic, struct sk_buff *skb)
+{
+       struct ethhdr *pkt_eth_hdr;
+       struct iphdr *pkt_ipv4_hdr;
+       int value, try_fastpath;
+
+       /*
+        * This assumes that the data field points to the dest mac
+        * address.
+        */
+       cuckoo_hash_mac_key key = cuckoo_mac_to_key(skb->data);
+
+       /*
+        * NB very important that all things that could return "CANT"
+        * are tested before things that return "BUSY", because once it
+        * returns "BUSY" it is assumed that it won't return "CANT"
+        * next time it is tried
+        */
+
+       /*
+        * Do a fastpath send if fast path table lookup returns true.
+        * We do this without the table lock and so may get the wrong
+        * answer, but current opinion is that's not a big problem 
+        */
+       try_fastpath = cuckoo_hash_lookup(&vnic->fastpath_table, 
+                                         (cuckoo_hash_key *)(&key), &value);
+
+       if (!try_fastpath) {
+               VPRINTK("try fast path false for mac: %pM\n", skb->data);
+               
+               return NETFRONT_ACCEL_STATUS_CANT;
+       }
+
+       /* Check to see if the packet can be sent. */
+       if (skb_headlen(skb) < sizeof(*pkt_eth_hdr) + sizeof(*pkt_ipv4_hdr)) {
+               EPRINTK("%s: Packet header is too small\n", __FUNCTION__);
+               return NETFRONT_ACCEL_STATUS_CANT;
+       }
+
+       pkt_eth_hdr  = (void*)skb->data;
+       pkt_ipv4_hdr = (void*)(pkt_eth_hdr+1);
+
+       if (be16_to_cpu(pkt_eth_hdr->h_proto) != ETH_P_IP) {
+               DPRINTK("%s: Packet is not IPV4 (ether_type=0x%04x)\n", __FUNCTION__,
+                       be16_to_cpu(pkt_eth_hdr->h_proto));
+               return NETFRONT_ACCEL_STATUS_CANT;
+       }
+       
+       if (pkt_ipv4_hdr->protocol != IPPROTO_TCP &&
+           pkt_ipv4_hdr->protocol != IPPROTO_UDP) {
+               DPRINTK("%s: Packet is not TCP/UDP (ip_protocol=0x%02x)\n",
+                       __FUNCTION__, pkt_ipv4_hdr->protocol);
+               return NETFRONT_ACCEL_STATUS_CANT;
+       }
+       
+       VPRINTK("%s: %d bytes, gso %d\n", __FUNCTION__, skb->len, 
+               skb_shinfo(skb)->gso_size);
+       
+       if (skb_shinfo(skb)->gso_size) {
+               return netfront_accel_enqueue_skb_tso(vnic, skb);
+       }
+
+       if (skb->len <= NETFRONT_ACCEL_TX_BUF_LENGTH) {
+               return netfront_accel_enqueue_skb_single(vnic, skb);
+       }
+
+       return netfront_accel_enqueue_skb_multi(vnic, skb);
+}
+
+
+/*
+ * Copy the data to required end destination. NB. len is the total new
+ * length of the socket buffer, not the amount of data to copy
+ */
+inline
+int ef_vnic_copy_to_skb(netfront_accel_vnic *vnic, struct sk_buff *skb, 
+                       struct netfront_accel_pkt_desc *buf, int len)
+{
+       int i, extra = len - skb->len;
+       char c = 0; /* sink for the cache-touching reads; must start defined */
+       int pkt_stride = vnic->rx_pkt_stride;
+       int skb_stride = vnic->rx_skb_stride;
+       char *skb_start;
+       
+       /*
+        * This pulls stuff into the cache - have seen performance
+        * benefit in this, but disabled by default
+        */
+       skb_start = skb->data;
+       if (pkt_stride) {
+               for (i = 0; i < len; i += pkt_stride) {
+                       c += ((volatile char*)(buf->pkt_kva))[i];
+               }
+       }
+       if (skb_stride) {
+               for (i = skb->len; i < len ; i += skb_stride) {
+                       c += ((volatile char*)(skb_start))[i];
+               }
+       }
+
+       if (skb_tailroom(skb) >= extra) {
+               memcpy(skb_put(skb, extra), buf->pkt_kva, extra);
+               return 0;
+       }
+
+       return -ENOSPC;
+}
+
+
+static void discard_jumbo_state(netfront_accel_vnic *vnic) 
+{
+
+       if (vnic->jumbo_state.skb != NULL) {
+               dev_kfree_skb_any(vnic->jumbo_state.skb);
+
+               vnic->jumbo_state.skb = NULL;
+       }
+       vnic->jumbo_state.in_progress = 0;
+}
+
+
+static void  netfront_accel_vi_rx_complete(netfront_accel_vnic *vnic,
+                                          struct sk_buff *skb)
+{
+       cuckoo_hash_mac_key key;
+       unsigned long flags;
+       int value;
+       struct net_device *net_dev;
+
+
+       key = cuckoo_mac_to_key(skb->data + ETH_ALEN);
+
+       /*
+        * If this is a MAC address that we want to do fast path TX
+        * to, and we don't already, add it to the fastpath table.
+        * The initial lookup is done without the table lock and so
+        * may get the wrong answer, but current opinion is that's not
+        * a big problem
+        */
+       if (is_valid_ether_addr(skb->data + ETH_ALEN) &&
+           !cuckoo_hash_lookup(&vnic->fastpath_table, (cuckoo_hash_key *)&key,
+                               &value)) {
+               spin_lock_irqsave(&vnic->table_lock, flags);
+                  
+               cuckoo_hash_add_check(&vnic->fastpath_table,
+                                     (cuckoo_hash_key *)&key,
+                                     1, 1);
+               
+               spin_unlock_irqrestore(&vnic->table_lock, flags);
+       }
+
+       if (compare_ether_addr(skb->data, vnic->mac)) {
+               struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN);
+               u16 port;
+
+               DPRINTK("%s: saw wrong MAC address %pM\n",
+                       __FUNCTION__, skb->data);
+
+               if (ip->protocol == IPPROTO_TCP) {
+                       struct tcphdr *tcp = (struct tcphdr *)
+                               ((char *)ip + 4 * ip->ihl);
+                       port = tcp->dest;
+               } else {
+                       struct udphdr *udp = (struct udphdr *)
+                               ((char *)ip + 4 * ip->ihl);
+                       EPRINTK_ON(ip->protocol != IPPROTO_UDP);
+                       port = udp->dest;
+               }
+
+               netfront_accel_msg_tx_fastpath(vnic, skb->data,
+                                              ip->daddr, port,
+                                              ip->protocol);
+       }
+
+       net_dev = vnic->net_dev;
+       skb->protocol = eth_type_trans(skb, net_dev);
+       /* CHECKSUM_UNNECESSARY as hardware has done it already */
+       skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+       if (!netfront_accel_ssr_skb(vnic, &vnic->ssr_state, skb))
+               netif_receive_skb(skb);
+}
+
+
+/* Process one RX or RX_DISCARD event: returns 1 when a complete packet
+ * has been passed to netfront_accel_vi_rx_complete(), 0 in every other
+ * case (fragment stored, packet dropped, or discard event). */
+static int netfront_accel_vi_poll_process_rx(netfront_accel_vnic *vnic, 
+                                            ef_event *ev)
+{
+       struct netfront_accel_bufinfo *bufinfo = vnic->rx_bufs;
+       struct netfront_accel_pkt_desc *buf = NULL;
+       struct sk_buff *skb;
+       int id, len, sop = 0, cont = 0;
+
+       VPRINTK("Rx event.\n");
+       /* Complete the receive and get the request id of the buffer */
+       id = ef_vi_receive_done(&vnic->vi, ev);
+
+       if (id < 0 || id >= bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE) {
+               EPRINTK("Rx packet %d is invalid\n", id);
+               /* Carry on round the loop if more events */
+               goto bad_packet;
+       }
+       /* Get our buffer descriptor */
+       buf = netfront_accel_buf_find(bufinfo, id);
+
+       len = EF_EVENT_RX_BYTES(*ev);
+
+       /* An RX buffer has been removed from the DMA ring. */
+       vnic->rx_dma_level--;
+
+       if (EF_EVENT_TYPE(*ev) == EF_EVENT_TYPE_RX) {
+               sop = EF_EVENT_RX_SOP(*ev);
+               cont = EF_EVENT_RX_CONT(*ev);
+
+               skb = vnic->jumbo_state.skb;
+
+               VPRINTK("Rx packet %d: %d bytes so far; sop %d; cont %d\n", 
+                       id, len, sop, cont);
+
+               if (sop) {
+                       if (!vnic->jumbo_state.in_progress) {
+                               vnic->jumbo_state.in_progress = 1;
+                               BUG_ON(vnic->jumbo_state.skb != NULL);
+                       } else {
+                               /*
+                                * This fragment shows a missing tail in 
+                                * previous one, but is itself possibly OK
+                                */
+                               DPRINTK("sop and in_progress => no tail\n");
+
+                               /* Release the socket buffer we already had */
+                               discard_jumbo_state(vnic);
+
+                               /* Now start processing this fragment */
+                               vnic->jumbo_state.in_progress = 1;
+                               skb = NULL;
+                       }
+               } else if (!vnic->jumbo_state.in_progress) {
+                       DPRINTK("!sop and !in_progress => missing head\n");
+                       goto missing_head;
+               }
+
+               if (!cont) {
+                       /* Update state for next time */
+                       vnic->jumbo_state.in_progress = 0;
+                       vnic->jumbo_state.skb = NULL;
+               } else if (!vnic->jumbo_state.in_progress) {
+                       DPRINTK("cont and !in_progress => missing head\n");
+                       goto missing_head;
+               }
+
+               if (skb == NULL) {
+                       BUG_ON(!sop);
+
+                       if (!cont)
+                               skb = alloc_skb(len+NET_IP_ALIGN, GFP_ATOMIC);
+                       else
+                               skb = alloc_skb(vnic->net_dev->mtu+NET_IP_ALIGN, 
+                                               GFP_ATOMIC);
+
+                       if (skb == NULL) {
+                               DPRINTK("%s: Couldn't get an rx skb.\n",
+                                       __FUNCTION__);
+                               netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+                               /*
+                                * Dropping this fragment means we
+                                * should discard the rest too
+                                */
+                               discard_jumbo_state(vnic);
+
+                               /* Carry on round the loop if more events */
+                               return 0;
+                       }
+
+               }
+               
+               /* Copy the data to required end destination */
+               if (ef_vnic_copy_to_skb(vnic, skb, buf, len) != 0) {
+                       /* No space in the skb - suggests > MTU packet */
+                       EPRINTK("%s: Rx packet too large (%d)\n",
+                               __FUNCTION__, len);
+                       netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+                       /* jumbo_state may no longer own skb: free it here */
+                       if (skb != vnic->jumbo_state.skb)
+                               dev_kfree_skb_any(skb);
+                       discard_jumbo_state(vnic);
+                       return 0;
+               }
+               
+               /* Put the buffer back in the DMA queue. */
+               netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+
+               if (cont) {
+                       vnic->jumbo_state.skb = skb;
+
+                       return 0;
+               } else {
+                       /* Track number of rx fastpath packets */
+                       vnic->netdev_stats.fastpath_rx_pkts++;
+                       vnic->netdev_stats.fastpath_rx_bytes += len;
+
+                       netfront_accel_vi_rx_complete(vnic, skb);
+
+                       return 1;
+               }
+       } else {
+               BUG_ON(EF_EVENT_TYPE(*ev) != EF_EVENT_TYPE_RX_DISCARD);
+
+               if (EF_EVENT_RX_DISCARD_TYPE(*ev) 
+                   == EF_EVENT_RX_DISCARD_TRUNC) {
+                       DPRINTK("%s: " EF_EVENT_FMT 
+                               " buffer %d FRM_TRUNC q_id %d\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+                               EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_frm_trunc);
+               } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) 
+                         == EF_EVENT_RX_DISCARD_OTHER) {
+                       DPRINTK("%s: " EF_EVENT_FMT 
+                               " buffer %d RX_DISCARD_OTHER q_id %d\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+                               EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_discard_other);
+               } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) ==
+                          EF_EVENT_RX_DISCARD_CSUM_BAD) {
+                       DPRINTK("%s: " EF_EVENT_FMT 
+                               " buffer %d DISCARD CSUM_BAD q_id %d\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+                               EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_csum_bad);
+               } else if (EF_EVENT_RX_DISCARD_TYPE(*ev) ==
+                          EF_EVENT_RX_DISCARD_CRC_BAD) {
+                       DPRINTK("%s: " EF_EVENT_FMT 
+                               " buffer %d DISCARD CRC_BAD q_id %d\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+                               EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_crc_bad);
+               } else {
+                       BUG_ON(EF_EVENT_RX_DISCARD_TYPE(*ev) !=
+                              EF_EVENT_RX_DISCARD_RIGHTS);
+                       DPRINTK("%s: " EF_EVENT_FMT 
+                               " buffer %d DISCARD RIGHTS q_id %d\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(*ev), id,
+                               EF_EVENT_RX_DISCARD_Q_ID(*ev) );
+                       NETFRONT_ACCEL_STATS_OP(++vnic->stats.fastpath_rights_bad);
+               }
+       }
+
+       /* discard type drops through here */
+
+bad_packet:
+       /* Release the socket buffer we already had */
+       discard_jumbo_state(vnic);
+
+missing_head:
+       BUG_ON(vnic->jumbo_state.in_progress != 0);
+       BUG_ON(vnic->jumbo_state.skb != NULL);
+
+       if (id >= 0 && id < bufinfo->npages*NETFRONT_ACCEL_BUFS_PER_PAGE)
+               /* Put the buffer back in the DMA queue. */
+               netfront_accel_vi_post_rx_or_free(vnic, (u16)id, buf);
+
+       vnic->netdev_stats.fastpath_rx_errors++;
+
+       DPRINTK("%s experienced bad packet/missing fragment error: %d \n",
+               __FUNCTION__, ev->rx.flags);
+
+       return 0;
+}
+
+
+static void netfront_accel_vi_not_busy(netfront_accel_vnic *vnic)
+{
+       struct netfront_info *np = ((struct netfront_info *)
+                                   netdev_priv(vnic->net_dev));
+       int handled;
+       unsigned long flags;
+
+       /*
+        * We hold the vnic tx_lock which is sufficient to exclude
+        * writes to tx_skb
+        */
+
+       if (vnic->tx_skb != NULL) {
+               DPRINTK("%s trying to send spare buffer\n", __FUNCTION__);
+               
+               handled = netfront_accel_vi_tx_post(vnic, vnic->tx_skb);
+               
+               if (handled != NETFRONT_ACCEL_STATUS_BUSY) {
+                       DPRINTK("%s restarting tx\n", __FUNCTION__);
+
+                       /* Need netfront tx_lock and vnic tx_lock to
+                        * write tx_skb */
+                       spin_lock_irqsave(&np->tx_lock, flags);
+
+                       vnic->tx_skb = NULL;
+
+                       if (netfront_check_queue_ready(vnic->net_dev)) {
+                               netif_wake_queue(vnic->net_dev);
+                               NETFRONT_ACCEL_STATS_OP
+                                       (vnic->stats.queue_wakes++);
+                       }
+                       spin_unlock_irqrestore(&np->tx_lock, flags);
+
+               }
+               
+               /*
+                * Should never get a CANT, as it checks that before
+                * deciding it was BUSY first time round 
+                */
+               BUG_ON(handled == NETFRONT_ACCEL_STATUS_CANT);
+       }
+}
+
+
+static void netfront_accel_vi_tx_complete(netfront_accel_vnic *vnic, 
+                                         struct netfront_accel_tso_buffer *tso_buf,
+                                         int is_last)
+{
+       struct netfront_accel_tso_buffer *next;
+
+       /* 
+        * We get a single completion for every call to
+        * ef_vi_transmitv so handle any other buffers which are part
+        * of the same packet 
+        */
+       while (tso_buf != NULL) {
+               if (tso_buf->buf->skb != NULL) {
+                       dev_kfree_skb_any(tso_buf->buf->skb);
+                       tso_buf->buf->skb = NULL;
+               }
+
+               next = tso_buf->next;
+
+               netfront_accel_buf_put(vnic->tx_bufs, tso_buf->buf->buf_id);
+
+               tso_buf = next;
+       }
+
+       /*
+        * If this was the last one in the batch, we try and send any
+        * pending tx_skb. There should now be buffers and
+        * descriptors
+        */
+       if (is_last)
+               netfront_accel_vi_not_busy(vnic);
+}
+
+
+static void netfront_accel_vi_poll_process_tx(netfront_accel_vnic *vnic,
+                                             ef_event *ev)
+{
+       struct netfront_accel_pkt_desc *buf;
+       struct netfront_accel_tso_buffer *tso_buf;
+       ef_request_id ids[EF_VI_TRANSMIT_BATCH];
+       int i, n_ids;
+       unsigned long flags;
+
+       /* Get the request ids for this tx completion event. */
+       n_ids = ef_vi_transmit_unbundle(&vnic->vi, ev, ids);
+
+       /* Take the tx buffer spin lock and hold for the duration */
+       spin_lock_irqsave(&vnic->tx_lock, flags);
+
+       for (i = 0; i < n_ids; ++i) {
+               VPRINTK("Tx packet %d complete\n", ids[i]);
+               buf = netfront_accel_buf_find(vnic->tx_bufs, ids[i]);
+               NETFRONT_ACCEL_STATS_OP(vnic->stats.fastpath_tx_completions++);
+
+               tso_buf = (struct netfront_accel_tso_buffer *)
+                       (buf->pkt_kva + NETFRONT_ACCEL_TX_BUF_LENGTH);
+               BUG_ON(tso_buf->buf != buf);
+
+               netfront_accel_vi_tx_complete(vnic, tso_buf, i == (n_ids-1));
+       }
+
+       spin_unlock_irqrestore(&vnic->tx_lock, flags);
+}
+
+
+int netfront_accel_vi_poll(netfront_accel_vnic *vnic, int rx_packets)
+{
+       ef_event ev[ACCEL_VI_POLL_EVENTS];
+       int rx_remain = rx_packets, rc, events, i;
+#if NETFRONT_ACCEL_STATS
+       int n_evs_polled = 0, rx_evs_polled = 0, tx_evs_polled = 0;
+#endif
+       BUG_ON(rx_packets <= 0);
+
+       events = ef_eventq_poll(&vnic->vi, ev, 
+                               min(rx_remain, ACCEL_VI_POLL_EVENTS));
+       i = 0;
+       NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
+
+       VPRINTK("%s: %d events\n", __FUNCTION__, events);
+
+       /* Loop over each event */
+       while (events) {
+               VPRINTK("%s: Event "EF_EVENT_FMT", index %lu\n", __FUNCTION__, 
+                       EF_EVENT_PRI_ARG(ev[i]),        
+                       (unsigned long)(vnic->vi.evq_state->evq_ptr));
+
+               if ((EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX) ||
+                   (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_RX_DISCARD)) {
+                       rc = netfront_accel_vi_poll_process_rx(vnic, &ev[i]);
+                       rx_remain -= rc;
+                       BUG_ON(rx_remain < 0);
+                       NETFRONT_ACCEL_STATS_OP(rx_evs_polled++);
+               } else if (EF_EVENT_TYPE(ev[i]) == EF_EVENT_TYPE_TX) {
+                       netfront_accel_vi_poll_process_tx(vnic, &ev[i]);
+                       NETFRONT_ACCEL_STATS_OP(tx_evs_polled++);
+               } else if (EF_EVENT_TYPE(ev[i]) == 
+                          EF_EVENT_TYPE_RX_NO_DESC_TRUNC) {
+                       DPRINTK("%s: RX_NO_DESC_TRUNC " EF_EVENT_FMT "\n",
+                               __FUNCTION__, EF_EVENT_PRI_ARG(ev[i]));
+                       discard_jumbo_state(vnic);
+                       NETFRONT_ACCEL_STATS_OP(vnic->stats.rx_no_desc_trunc++);
+               } else {
+                       EPRINTK("Unexpected event " EF_EVENT_FMT "\n", 
+                               EF_EVENT_PRI_ARG(ev[i]));
+                       NETFRONT_ACCEL_STATS_OP(vnic->stats.bad_event_count++);
+               }
+
+               i++;
+
+               /* Carry on round the loop if more events and more space */
+               if (i == events) {
+                       if (rx_remain == 0)
+                               break;
+
+                       events = ef_eventq_poll(&vnic->vi, ev, 
+                                               min(rx_remain, 
+                                                   ACCEL_VI_POLL_EVENTS));
+                       i = 0;
+                       NETFRONT_ACCEL_STATS_OP(n_evs_polled += events);
+               }
+       }
+       
+#if NETFRONT_ACCEL_STATS
+       vnic->stats.event_count += n_evs_polled;
+       vnic->stats.event_count_since_irq += n_evs_polled;
+       if (n_evs_polled > vnic->stats.events_per_poll_max)
+               vnic->stats.events_per_poll_max = n_evs_polled;
+       if (rx_evs_polled > vnic->stats.events_per_poll_rx_max)
+               vnic->stats.events_per_poll_rx_max = rx_evs_polled;
+       if (tx_evs_polled > vnic->stats.events_per_poll_tx_max)
+               vnic->stats.events_per_poll_tx_max = tx_evs_polled;
+#endif
+
+       return rx_packets - rx_remain;
+}
+
+
+int netfront_accel_vi_enable_interrupts(netfront_accel_vnic *vnic)
+{
+       u32 sw_evq_ptr;
+
+       /* Sanity-check before vnic is first used by the trace below */
+       BUG_ON(vnic == NULL);
+       BUG_ON(vnic->vi.evq_state == NULL);
+       VPRINTK("%s: checking for event on %p\n", __FUNCTION__, &vnic->vi.evq_state);
+
+       /* An event is already pending: return 0 without arming a wakeup */
+       if (ef_eventq_has_event(&vnic->vi)) {
+               VPRINTK("%s: found event\n",  __FUNCTION__);
+               return 0;
+       }
+
+       VPRINTK("evq_ptr=0x%08x  evq_mask=0x%08x\n",
+               vnic->evq_state.evq_ptr, vnic->vi.evq_mask);
+  
+       /* Request a wakeup from the hardware. */
+       sw_evq_ptr = vnic->evq_state.evq_ptr & vnic->vi.evq_mask;
+
+       BUG_ON(vnic->hw.falcon.evq_rptr == NULL);
+
+       VPRINTK("Requesting wakeup at 0x%08x, rptr %p\n", sw_evq_ptr,
+               vnic->hw.falcon.evq_rptr);
+       *(volatile u32 *)(vnic->hw.falcon.evq_rptr) = (sw_evq_ptr >> 3);
+
+       /* Wakeup requested: return 1 */
+       return 1;
+}
diff --git a/drivers/xen/sfc_netfront/accel_xenbus.c b/drivers/xen/sfc_netfront/accel_xenbus.c

new file mode 100644 (file)

index 0000000..98d5334
--- /dev/null
+++ b/drivers/xen/sfc_netfront/accel_xenbus.c
@@ -0,0 +1,775 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+
+#include "accel.h"
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+#include "accel_bufs.h"
+#include "accel_ssr.h"
+/* drivers/xen/netfront/netfront.h */
+#include "netfront.h"
+
+/*
+ * Record the Closing state in the vnic and pass the same state to
+ * net_accel_update_state() for the vnic's xenbus device.
+ */
+void netfront_accel_set_closing(netfront_accel_vnic *vnic)
+{
+       const XenbusState closing = XenbusStateClosing;
+
+       vnic->frontend_state = closing;
+       net_accel_update_state(vnic->dev, closing);
+}
+       
+
+/*
+ * Xenbus watch callback for the frontend "mac" node: re-reads the MAC
+ * address into vnic->mac and logs an error if the read fails.
+ */
+static void mac_address_change(struct xenbus_watch *watch,
+                              const char **vec, unsigned int len)
+{
+       netfront_accel_vnic *owner =
+               container_of(watch, netfront_accel_vnic, mac_address_watch);
+       int status;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       status = net_accel_xen_net_read_mac(owner->dev, owner->mac);
+       if (status == 0)
+               return;
+
+       EPRINTK("%s: failed to read mac (%d)\n", __FUNCTION__, status);
+}
+
+
+/*
+ * Register mac_address_change() as a xenbus watch on <dev->nodename>/mac.
+ *
+ * Returns 0 on success.  On failure the watch's node pointer is reset to
+ * NULL and the error from xenbus_watch_path2() is returned.
+ */
+static int setup_mac_address_watch(struct xenbus_device *dev,
+                                  netfront_accel_vnic *vnic)
+{
+       int rc;
+
+       DPRINTK("Setting watch on %s/%s\n", dev->nodename, "mac");
+
+       rc = xenbus_watch_path2(dev, dev->nodename, "mac",
+                               &vnic->mac_address_watch,
+                               mac_address_change);
+       if (rc == 0)
+               return 0;
+
+       EPRINTK("%s: Failed to register xenbus watch: %d\n",
+               __FUNCTION__, rc);
+       vnic->mac_address_watch.node = NULL;
+       return rc;
+}
+
+
+/*
+ * Grant the other end access to one page and publish the grant reference
+ * as <dev->nodename>/<name> through xenbus.
+ *
+ * Returns 0 and stores the reference in *gnt_ref on success.  On any
+ * failure a non-zero error is returned, the grant (if it was made) is
+ * revoked again and *gnt_ref is left untouched.
+ */
+static int make_named_grant(struct xenbus_device *dev, void *page, 
+                           const char *name, grant_ref_t *gnt_ref)
+{
+       struct xenbus_transaction tr;
+       int err;
+       /*
+        * Deliberately a signed int: if the result were held in a
+        * grant_ref_t (an unsigned type) the "gnt < 0" test below could
+        * never be true and grant failures would be published as if they
+        * were valid references.
+        * NOTE(review): assumes net_accel_grant_page() reports failure as a
+        * negative value - confirm against its definition.
+        */
+       int gnt;
+
+       gnt = net_accel_grant_page(dev, virt_to_mfn(page), 0);
+       if (gnt < 0)
+               return gnt;
+
+       do {
+               err = xenbus_transaction_start(&tr);
+               if (err != 0) {
+                       EPRINTK("%s: transaction start failed %d\n",
+                               __FUNCTION__, err);
+                       goto fail;
+               }
+               err = xenbus_printf(tr, dev->nodename, name, "%d", gnt);
+               if (err != 0) {
+                       EPRINTK("%s: xenbus_printf failed %d\n", __FUNCTION__,
+                               err);
+                       xenbus_transaction_end(tr, 1);
+                       goto fail;
+               }
+               err = xenbus_transaction_end(tr, 0);
+       } while (err == -EAGAIN);
+       
+       if (err != 0) {
+               EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, err);
+               goto fail;
+       }
+       
+       *gnt_ref = gnt;
+
+       return 0;
+
+ fail:
+       /*
+        * The reference was never published and the caller does not learn
+        * it, so revoke the grant here rather than leaking it.
+        */
+       net_accel_ungrant_page(gnt);
+       return err;
+}
+
+
+/*
+ * Revoke the grant gnt_ref and delete <dev->nodename>/<name> through
+ * xenbus, retrying the transaction while it ends with -EAGAIN.
+ *
+ * Returns 0 on success or the first non-retryable xenbus error.  The page
+ * is ungranted before the xenbus work, whatever its outcome.
+ */
+static int remove_named_grant(struct xenbus_device *dev,
+                             const char *name, grant_ref_t gnt_ref)
+{
+       struct xenbus_transaction xbt;
+       int rc;
+
+       net_accel_ungrant_page(gnt_ref);
+
+       for (;;) {
+               rc = xenbus_transaction_start(&xbt);
+               if (rc != 0) {
+                       EPRINTK("%s: transaction start failed %d\n",
+                               __FUNCTION__, rc);
+                       return rc;
+               }
+
+               rc = xenbus_rm(xbt, dev->nodename, name);
+               if (rc != 0) {
+                       EPRINTK("%s: xenbus_rm failed %d\n", __FUNCTION__,
+                               rc);
+                       /* Abort the transaction; nothing to commit. */
+                       xenbus_transaction_end(xbt, 1);
+                       return rc;
+               }
+
+               rc = xenbus_transaction_end(xbt, 0);
+               if (rc != -EAGAIN)
+                       break;
+       }
+
+       if (rc != 0)
+               EPRINTK("%s: transaction end failed %d\n", __FUNCTION__, rc);
+
+       return rc;
+}
+
+
+/*
+ * Allocate and initialise the accelerator state (vnic) for net_dev and
+ * attach it to the netfront private data as np->accel_priv.
+ *
+ * Returns the new vnic on success.  On failure returns:
+ *   ERR_PTR(-EALREADY) if np->accel_priv is already set,
+ *   ERR_PTR(-ENOMEM)   if the vnic cannot be allocated,
+ *   ERR_PTR(err)       if reading the MAC or registering the MAC watch
+ *                      fails; the vnic is torn down and freed first.
+ */
+static 
+netfront_accel_vnic *netfront_accel_vnic_ctor(struct net_device *net_dev,
+                                             struct xenbus_device *dev)
+{
+       struct netfront_info *np =
+               (struct netfront_info *)netdev_priv(net_dev);
+       netfront_accel_vnic *vnic;
+       int err;
+
+       /*
+        * A bug in earlier versions of Xen accel plugin system meant
+        * you could be probed twice for the same device on suspend
+        * cancel.  Be tolerant of that.
+        */ 
+       if (np->accel_priv != NULL)
+               return ERR_PTR(-EALREADY);
+
+       /* Alloc mem for state */
+       vnic = kzalloc(sizeof(netfront_accel_vnic), GFP_KERNEL);
+       if (vnic == NULL) {
+               EPRINTK("%s: no memory for vnic state\n", __FUNCTION__);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       spin_lock_init(&vnic->tx_lock);
+
+       /* The rest of the initialisation runs with the vnic mutex held. */
+       mutex_init(&vnic->vnic_mutex);
+       mutex_lock(&vnic->vnic_mutex);
+
+       /* Store so state can be retrieved from device */
+       BUG_ON(np->accel_priv != NULL);
+       np->accel_priv = vnic;
+       vnic->dev = dev;
+       vnic->net_dev = net_dev;
+       spin_lock_init(&vnic->irq_enabled_lock);
+       netfront_accel_ssr_init(&vnic->ssr_state);
+
+       init_waitqueue_head(&vnic->state_wait_queue);
+       vnic->backend_state = XenbusStateUnknown;
+       vnic->frontend_state = XenbusStateClosed;
+       vnic->removing = 0;
+       vnic->domU_state_is_setup = 0;
+       vnic->dom0_state_is_setup = 0;
+       vnic->poll_enabled = 0;
+       vnic->tx_enabled = 0;
+       vnic->tx_skb = NULL;
+
+       /* INIT_WORK() takes a third (data) argument on kernels before 2.6.20. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+       INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend);
+#else
+       INIT_WORK(&vnic->msg_from_bend, netfront_accel_msg_from_bend, vnic);
+#endif
+
+       netfront_accel_debugfs_create(vnic);
+
+       mutex_unlock(&vnic->vnic_mutex);
+
+       err = net_accel_xen_net_read_mac(dev, vnic->mac);
+       if (err) 
+               goto fail_mac;
+
+       /* Setup a watch on the frontend's MAC address */
+       err = setup_mac_address_watch(dev, vnic);
+       if (err)
+               goto fail_mac;
+
+       return vnic;
+
+fail_mac:
+
+       /*
+        * Undo the initialisation above.  No MAC watch is registered on
+        * either path that reaches this label, so there is none to remove.
+        */
+       mutex_lock(&vnic->vnic_mutex);
+
+       netfront_accel_debugfs_remove(vnic);
+
+       netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
+
+       EPRINTK_ON(vnic->tx_skb != NULL);
+
+       vnic->frontend_state = XenbusStateUnknown;
+       net_accel_update_state(dev, XenbusStateUnknown);
+
+       mutex_unlock(&vnic->vnic_mutex);
+
+       np->accel_priv = NULL;
+       kfree(vnic);
+
+       return ERR_PTR(err);
+}
+
+
+/*
+ * Tear down and free a vnic built by netfront_accel_vnic_ctor().
+ *
+ * Unregisters the MAC watch, flushes the accel workqueue, then (under the
+ * vnic mutex) removes the debugfs and SSR state and sets the accel state
+ * to Unknown.  Finally detaches the vnic from np->accel_priv and frees it.
+ */
+static void netfront_accel_vnic_dtor(netfront_accel_vnic *vnic)
+{
+       struct net_device *net_dev = vnic->net_dev;
+       struct netfront_info *np = 
+               (struct netfront_info *)netdev_priv(net_dev);
+
+       /*
+        * Now that we no longer hold the lock it is safe to remove
+        * this watch and synchronise with the completion of
+        * watches.
+        */
+       DPRINTK("%s: unregistering xenbus mac watch\n", __FUNCTION__);
+       unregister_xenbus_watch(&vnic->mac_address_watch);
+       kfree(vnic->mac_address_watch.node);
+
+       /* Flush the accel workqueue before the vnic is dismantled. */
+       flush_workqueue(netfront_accel_workqueue);
+
+       mutex_lock(&vnic->vnic_mutex);
+
+       netfront_accel_debugfs_remove(vnic);
+
+       netfront_accel_ssr_fini(vnic, &vnic->ssr_state);
+
+       EPRINTK_ON(vnic->tx_skb != NULL);
+
+       vnic->frontend_state = XenbusStateUnknown;
+       net_accel_update_state(vnic->dev, XenbusStateUnknown);
+
+       mutex_unlock(&vnic->vnic_mutex);
+
+       np->accel_priv = NULL;
+       kfree(vnic);
+}
+
+
+/*
+ * Create the domU-side resources shared with the backend: the tx/rx
+ * buffer state, the two-page control/message area (granted and published
+ * as "accel-ctrl-page" and "accel-msg-page"), and the message and net
+ * event channels (published as "accel-msg-channel" and
+ * "accel-net-channel").
+ *
+ * Returns 0 on success.  On failure returns a negative error and releases
+ * whatever had been set up, in reverse order of acquisition.
+ */
+static int vnic_setup_domU_shared_state(struct xenbus_device *dev,
+                                       netfront_accel_vnic *vnic)
+{
+       struct xenbus_transaction tr;
+       int err;
+       int msgs_per_queue;
+
+
+       DPRINTK("Setting up domU shared state.\n");
+
+       /* Each queue gets half of the message page. */
+       msgs_per_queue = (PAGE_SIZE/2) / sizeof(struct net_accel_msg);
+
+       /* Allocate buffer state */
+       vnic->tx_bufs = netfront_accel_init_bufs(&vnic->tx_lock);
+       if (vnic->tx_bufs == NULL) {
+               err = -ENOMEM;
+               EPRINTK("%s: Failed to allocate tx buffers\n", __FUNCTION__);
+               goto fail_tx_bufs;
+       }
+
+       vnic->rx_bufs = netfront_accel_init_bufs(NULL);
+       if (vnic->rx_bufs == NULL) {
+               err = -ENOMEM;
+               EPRINTK("%s: Failed to allocate rx buffers\n", __FUNCTION__);
+               goto fail_rx_bufs;
+       }
+
+       /* 
+        * This allocates two pages, one for the shared page and one
+        * for the message queue.
+        */
+       vnic->shared_page = (struct net_accel_shared_page *)
+               __get_free_pages(GFP_KERNEL, 1);
+       if (vnic->shared_page == NULL) {
+               EPRINTK("%s: no memory for shared pages\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto fail_shared_page;
+       }
+
+       /* Queue from dom0 uses the first half of the second page ... */
+       net_accel_msg_init_queue
+               (&vnic->from_dom0, &vnic->shared_page->queue0, 
+                (struct net_accel_msg *)((u8*)vnic->shared_page + PAGE_SIZE),
+                msgs_per_queue);
+
+       /* ... and the queue to dom0 uses the second half. */
+       net_accel_msg_init_queue
+               (&vnic->to_dom0, &vnic->shared_page->queue1,
+                (struct net_accel_msg *)((u8*)vnic->shared_page +
+                                         (3 * PAGE_SIZE / 2)),
+                msgs_per_queue);
+       
+       vnic->msg_state = NETFRONT_ACCEL_MSG_NONE;
+
+       err = make_named_grant(dev, vnic->shared_page, "accel-ctrl-page",
+                              &vnic->ctrl_page_gnt);
+       if (err) {
+               EPRINTK("couldn't make ctrl-page named grant\n");
+               goto fail_ctrl_page_grant;
+       }
+
+       err = make_named_grant(dev, (u8*)vnic->shared_page + PAGE_SIZE,
+                              "accel-msg-page", &vnic->msg_page_gnt);
+       if (err) {
+               EPRINTK("couldn't make msg-page named grant\n");
+               goto fail_msg_page_grant;
+       }
+
+       /* Create xenbus msg event channel */
+       err = bind_listening_port_to_irqhandler
+               (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
+                IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
+       if (err < 0) {
+               EPRINTK("Couldn't bind msg event channel\n");
+               goto fail_msg_irq;
+       }
+       vnic->msg_channel_irq = err;
+       vnic->msg_channel = irq_to_evtchn_port(vnic->msg_channel_irq);
+       
+       /* Create xenbus net event channel */
+       err = bind_listening_port_to_irqhandler
+               (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
+                IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
+       if (err < 0) {
+               EPRINTK("Couldn't bind net event channel\n");
+               goto fail_net_irq;
+       }
+       vnic->net_channel_irq = err;
+       vnic->net_channel = irq_to_evtchn_port(vnic->net_channel_irq);
+       /* Want to ensure we don't get interrupts before we're ready */
+       netfront_accel_disable_net_interrupts(vnic);
+
+       DPRINTK("otherend %d has msg ch %u (%u) and net ch %u (%u)\n",
+               dev->otherend_id, vnic->msg_channel, vnic->msg_channel_irq, 
+               vnic->net_channel, vnic->net_channel_irq);
+
+       /* Publish both event channels, retrying while the commit says -EAGAIN. */
+       do {
+               err = xenbus_transaction_start(&tr);
+               if (err != 0) {
+                       EPRINTK("%s: Transaction start failed %d\n",
+                               __FUNCTION__, err);
+                       goto fail_transaction;
+               }
+
+               err = xenbus_printf(tr, dev->nodename, "accel-msg-channel",
+                                   "%u", vnic->msg_channel);
+               if (err != 0) {
+                       EPRINTK("%s: event channel xenbus write failed %d\n",
+                               __FUNCTION__, err);
+                       xenbus_transaction_end(tr, 1);
+                       goto fail_transaction;
+               }
+
+               err = xenbus_printf(tr, dev->nodename, "accel-net-channel",
+                                   "%u", vnic->net_channel);
+               if (err != 0) {
+                       EPRINTK("%s: net channel xenbus write failed %d\n",
+                               __FUNCTION__, err);
+                       xenbus_transaction_end(tr, 1);
+                       goto fail_transaction;
+               }
+
+               err = xenbus_transaction_end(tr, 0);
+       } while (err == -EAGAIN);
+
+       if (err != 0) {
+               EPRINTK("%s: Transaction end failed %d\n", __FUNCTION__, err);
+               goto fail_transaction;
+       }
+
+       DPRINTK("Completed setting up domU shared state\n");
+
+       return 0;
+
+       /*
+        * Error unwind.  Each label undoes everything acquired *before* the
+        * step that jumped to it, so the bodies run in reverse order of
+        * acquisition: net irq, msg irq, msg-page grant, ctrl-page grant,
+        * shared pages, rx bufs, tx bufs.
+        */
+fail_transaction:
+
+       unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+fail_net_irq:
+
+       unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+fail_msg_irq:
+
+       /* The msg-page grant was the last one made, so it goes first. */
+       remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+fail_msg_page_grant:
+
+       /* Reached when the msg-page grant failed: only ctrl-page exists. */
+       remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+fail_ctrl_page_grant:
+
+       free_pages((unsigned long)vnic->shared_page, 1);
+       vnic->shared_page = NULL;
+fail_shared_page:
+
+       netfront_accel_fini_bufs(vnic->rx_bufs);
+fail_rx_bufs:
+
+       netfront_accel_fini_bufs(vnic->tx_bufs);
+fail_tx_bufs:
+
+       /*
+        * Undo the memory allocation created when we got the HELLO.
+        * NOTE(review): by this point rx_bufs/tx_bufs have already been
+        * handed to netfront_accel_fini_bufs() (or were never allocated),
+        * whereas vnic_remove_domU_shared_state() frees the buffer memory
+        * before calling fini.  Confirm netfront_accel_free_buffer_mem()
+        * does not dereference them.
+        */
+       netfront_accel_free_buffer_mem(&vnic->bufpages,
+                                      vnic->rx_bufs,
+                                      vnic->tx_bufs);
+
+       DPRINTK("Failed to setup domU shared state with code %d\n", err);
+
+       return err;
+}
+
+
+/*
+ * Release everything vnic_setup_domU_shared_state() created: the
+ * published event-channel keys, both irq handlers, both named grants, the
+ * shared pages and the buffer memory/state.
+ *
+ * Xenbus errors are ignored here; removal is best effort.
+ */
+static void vnic_remove_domU_shared_state(struct xenbus_device *dev, 
+                                         netfront_accel_vnic *vnic)
+{
+       struct xenbus_transaction tr;
+       
+       /*
+        * Don't remove any watches because we currently hold the
+        * mutex and the watches take the mutex.
+        */
+
+       DPRINTK("%s: removing event channel irq handlers %d %d\n",
+               __FUNCTION__, vnic->net_channel_irq, vnic->msg_channel_irq);
+       /*
+        * Delete the published channel keys, retrying on -EAGAIN.  If the
+        * transaction cannot even be started, give up on the keys and carry
+        * on with the rest of the teardown.
+        */
+       do {
+               if (xenbus_transaction_start(&tr) != 0)
+                       break;
+               xenbus_rm(tr, dev->nodename, "accel-msg-channel");
+               xenbus_rm(tr, dev->nodename, "accel-net-channel");
+       } while (xenbus_transaction_end(tr, 0) == -EAGAIN);
+
+       unbind_from_irqhandler(vnic->net_channel_irq, vnic);
+       unbind_from_irqhandler(vnic->msg_channel_irq, vnic);
+
+       /* ungrant pages for msg channel */
+       remove_named_grant(dev, "accel-ctrl-page", vnic->ctrl_page_gnt);
+       remove_named_grant(dev, "accel-msg-page", vnic->msg_page_gnt);
+       free_pages((unsigned long)vnic->shared_page, 1);
+       vnic->shared_page = NULL;
+
+       /* ungrant pages for buffers, and free buffer memory */
+       netfront_accel_free_buffer_mem(&vnic->bufpages,
+                                      vnic->rx_bufs,
+                                      vnic->tx_bufs);
+       netfront_accel_fini_bufs(vnic->rx_bufs);
+       netfront_accel_fini_bufs(vnic->tx_bufs);
+}
+
+
+/*
+ * Set up the dom0-provided side of the connection: construct the virtual
+ * interface and queue one pass of backend message processing.
+ * The dev argument is not used.
+ */
+static void vnic_setup_dom0_shared_state(struct xenbus_device *dev,
+                                       netfront_accel_vnic *vnic)
+{
+       DPRINTK("Setting up dom0 shared state\n");
+
+       netfront_accel_vi_ctor(vnic);
+
+       /*
+        * Message processing will be enabled when this function
+        * returns, but we might have missed an interrupt.  Schedule a
+        * check just in case.
+        */
+       queue_work(netfront_accel_workqueue, &vnic->msg_from_bend);
+}
+
+
+/*
+ * Counterpart of vnic_setup_dom0_shared_state(): stop the fast path, then
+ * destroy the virtual interface.  The dev argument is not used.
+ */
+static void vnic_remove_dom0_shared_state(struct xenbus_device *dev,
+                                         netfront_accel_vnic *vnic)
+{
+       DPRINTK("Removing dom0 shared state\n");
+
+       /* Stop the fast path before the interface it uses is destroyed. */
+       vnic_stop_fastpath(vnic);
+
+       netfront_accel_vi_dtor(vnic);
+}
+
+
+/*************************************************************************/
+
+/*
+ * The following code handles accelstate changes between the frontend
+ * and the backend.  In response to transitions, calls the following
+ * functions in matching pairs:
+ *
+ *   vnic_setup_domU_shared_state
+ *   vnic_remove_domU_shared_state
+ *
+ *   vnic_setup_dom0_shared_state
+ *   vnic_remove_dom0_shared_state
+ *
+ * Valid state transitions for DomU are as follows:
+ *
+ * Closed->Init       on probe or in response to Init from dom0
+ *
+ * Init->Connected    in response to Init from dom0
+ * Init->Closing      on error providing dom0 is in Init
+ * Init->Closed       on remove or in response to Closing from dom0
+ *
+ * Connected->Closing on error/remove
+ * Connected->Closed  in response to Closing from dom0
+ *
+ * Closing->Closed    in response to Closing from dom0
+ *
+ */
+
+
+/*
+ * Function to deal with Xenbus accel state change in backend.
+ *
+ * Records the new backend state, sets up or removes the domU/dom0 shared
+ * state as that state requires, publishes any resulting change of the
+ * frontend state and wakes waiters on vnic->state_wait_queue.
+ * Duplicate notifications of the current backend state are ignored.
+ *
+ * The visible caller, backend_accel_state_change(), holds
+ * vnic->vnic_mutex across this call.
+ */
+static void netfront_accel_backend_accel_changed(netfront_accel_vnic *vnic,
+                                                XenbusState backend_state)
+{
+       struct xenbus_device *dev = vnic->dev;
+       XenbusState frontend_state;
+       int state;
+
+       DPRINTK("%s: changing from %s to %s. nodename %s, otherend %s\n",
+               __FUNCTION__, xenbus_strstate(vnic->backend_state),
+               xenbus_strstate(backend_state), dev->nodename, dev->otherend);
+
+       /*
+        * Ignore duplicate state changes.  This can happen if the
+        * backend changes state twice in quick succession and the
+        * first watch fires in the frontend after the second
+        * transition has completed.
+        */
+       if (vnic->backend_state == backend_state)
+               return;
+
+       vnic->backend_state = backend_state;
+       frontend_state = vnic->frontend_state;
+
+       switch (backend_state) {
+       case XenbusStateInitialising:
+               /*
+                * It's possible for us to miss the closed state from
+                * dom0, so do the work here.
+                */
+               if (vnic->domU_state_is_setup) {
+                       vnic_remove_domU_shared_state(dev, vnic);
+                       vnic->domU_state_is_setup = 0;
+               }
+
+               if (frontend_state != XenbusStateInitialising) {
+                       /* Make sure the backend doesn't go away. */
+                       frontend_state = XenbusStateInitialising;
+                       net_accel_update_state(dev, frontend_state);
+                       /*
+                        * Re-read the backend state.  Pre-load "state" so
+                        * that a failed read is seen as Unknown (and we
+                        * bail out below) instead of using an
+                        * uninitialised value.
+                        */
+                       state = (int)XenbusStateUnknown;
+                       xenbus_scanf(XBT_NIL, dev->otherend, "accelstate", "%d", &state);
+                       backend_state = (XenbusState)state;
+                       if (backend_state != XenbusStateInitialising)
+                               break;
+               }
+
+               /* Start the new connection. */
+               if (!vnic->removing) {
+                       BUG_ON(vnic->domU_state_is_setup);
+                       if (vnic_setup_domU_shared_state(dev, vnic) == 0) {
+                               vnic->domU_state_is_setup = 1;
+                               frontend_state = XenbusStateConnected;
+                       } else
+                               frontend_state = XenbusStateClosing;
+               }
+               break;
+       case XenbusStateConnected:
+               if (vnic->domU_state_is_setup &&
+                   !vnic->dom0_state_is_setup) {
+                       vnic_setup_dom0_shared_state(dev, vnic);
+                       vnic->dom0_state_is_setup = 1;
+               }
+               break;
+       /* Any state not listed explicitly is handled like Closing. */
+       default:
+       case XenbusStateClosing:
+               if (vnic->dom0_state_is_setup) {
+                       vnic_remove_dom0_shared_state(dev, vnic);
+                       vnic->dom0_state_is_setup = 0;
+               }
+               frontend_state = XenbusStateClosed;
+               break;
+       case XenbusStateUnknown:
+       case XenbusStateClosed:
+               if (vnic->domU_state_is_setup) {
+                       vnic_remove_domU_shared_state(dev, vnic);
+                       vnic->domU_state_is_setup = 0;
+               }
+               break;
+       }
+
+       if (frontend_state != vnic->frontend_state) {
+               DPRINTK("Switching from state %s (%d) to %s (%d)\n",
+                       xenbus_strstate(vnic->frontend_state),
+                       vnic->frontend_state,
+                       xenbus_strstate(frontend_state), frontend_state);
+               vnic->frontend_state = frontend_state;
+               net_accel_update_state(dev, frontend_state);
+       }
+
+       /* netfront_accel_remove() waits on this for domU state to be released. */
+       wake_up(&vnic->state_wait_queue);
+}
+
+
+/*
+ * Xenbus watch callback for the backend "accelstate" node.  Under the
+ * vnic mutex, reads the backend's current state (Unknown if the read
+ * fails) and hands it to netfront_accel_backend_accel_changed().
+ */
+static void backend_accel_state_change(struct xenbus_watch *watch,
+                                      const char **vec, unsigned int len)
+{
+       netfront_accel_vnic *owner =
+               container_of(watch, struct netfront_accel_vnic,
+                            backend_accel_watch);
+       /* Stays Unknown if xenbus_scanf() below does not fill it in. */
+       int accelstate = (int)XenbusStateUnknown;
+
+       DPRINTK("%s\n", __FUNCTION__);
+
+       mutex_lock(&owner->vnic_mutex);
+
+       xenbus_scanf(XBT_NIL, owner->dev->otherend, "accelstate", "%d",
+                    &accelstate);
+       netfront_accel_backend_accel_changed(owner, accelstate);
+
+       mutex_unlock(&owner->vnic_mutex);
+}
+
+
+/*
+ * Register backend_accel_state_change() as a xenbus watch on
+ * <dev->otherend>/accelstate.
+ *
+ * Returns 0 on success.  On failure the watch's node pointer is reset to
+ * NULL and the error from xenbus_watch_path2() is returned.
+ */
+static int setup_dom0_accel_watch(struct xenbus_device *dev,
+                                 netfront_accel_vnic *vnic)
+{
+       int rc;
+
+       DPRINTK("Setting watch on %s/%s\n", dev->otherend, "accelstate");
+
+       rc = xenbus_watch_path2(dev, dev->otherend, "accelstate",
+                               &vnic->backend_accel_watch,
+                               backend_accel_state_change);
+       if (rc == 0)
+               return 0;
+
+       EPRINTK("%s: Failed to register xenbus watch: %d\n",
+               __FUNCTION__, rc);
+       vnic->backend_accel_watch.node = NULL;
+       return rc;
+}
+
+
+/*
+ * Probe entry point: build the vnic for net_dev, start watching the
+ * backend's accel state and, unless the watch has already moved us on,
+ * announce Initialising to the other end.
+ *
+ * Returns 0 on success, the constructor's error if the vnic cannot be
+ * created, or the watch registration error (after destroying the vnic).
+ */
+int netfront_accel_probe(struct net_device *net_dev, struct xenbus_device *dev)
+{
+       netfront_accel_vnic *vnic;
+       int rc;
+
+       DPRINTK("Probe passed device %s\n", dev->nodename);
+
+       vnic = netfront_accel_vnic_ctor(net_dev, dev);
+       if (IS_ERR(vnic))
+               return PTR_ERR(vnic);
+
+       /* Watching the backend accel state is what sets things going. */
+       rc = setup_dom0_accel_watch(dev, vnic);
+       if (rc != 0)
+               goto watch_failed;
+
+       /*
+        * Tell the other end we are ready to start, but only if the watch
+        * has not fired and changed our state in the meantime.
+        */
+       mutex_lock(&vnic->vnic_mutex);
+       VPRINTK("setup success, updating accelstate\n");
+       if (vnic->frontend_state == XenbusStateClosed) {
+               vnic->frontend_state = XenbusStateInitialising;
+               net_accel_update_state(dev, XenbusStateInitialising);
+       }
+       mutex_unlock(&vnic->vnic_mutex);
+
+       DPRINTK("Probe done device %s\n", dev->nodename);
+
+       return 0;
+
+ watch_failed:
+       netfront_accel_vnic_dtor(vnic);
+       EPRINTK("%s: probe failed with code %d\n", __FUNCTION__, rc);
+       return rc;
+}
+
+
+/*
+ * Remove entry point: refuse new connections, start closing an existing
+ * one, wait until the domU shared state has been released, then drop the
+ * backend watch and destroy the vnic.
+ *
+ * Always returns 0.  BUGs if the device has no vnic attached.
+ */
+int netfront_accel_remove(struct xenbus_device *dev)
+{
+       struct netfront_info *np = dev_get_drvdata(&dev->dev);
+       netfront_accel_vnic *vnic = (netfront_accel_vnic *)np->accel_priv;
+
+       DPRINTK("%s %s\n", __FUNCTION__, dev->nodename);
+
+       BUG_ON(vnic == NULL);
+
+       mutex_lock(&vnic->vnic_mutex);
+
+       /* Reject any attempts to connect. */
+       vnic->removing = 1;
+
+       /* Close any existing connection. */
+       if (vnic->frontend_state == XenbusStateConnected) {
+               vnic->frontend_state = XenbusStateClosing;
+               net_accel_update_state(dev, XenbusStateClosing);
+       }
+
+       mutex_unlock(&vnic->vnic_mutex);
+
+       DPRINTK("%s waiting for release of %s\n", __FUNCTION__, dev->nodename);
+
+       /*
+        * Wait for the xenbus watch to release the shared resources.
+        * This indicates that dom0 has made the transition
+        * Closing->Closed or that dom0 was in Closed or Init and no
+        * resources were mapped.
+        */
+       wait_event(vnic->state_wait_queue,
+                  !vnic->domU_state_is_setup);
+
+       /*
+        * Now we don't need this watch anymore it is safe to remove
+        * it (and so synchronise with it completing if outstanding)
+        */
+       DPRINTK("%s: unregistering xenbus accel watch\n",
+               __FUNCTION__);
+       unregister_xenbus_watch(&vnic->backend_accel_watch);
+       kfree(vnic->backend_accel_watch.node);
+
+       /* The destructor removes the MAC watch and frees the vnic. */
+       netfront_accel_vnic_dtor(vnic);
+
+       DPRINTK("%s done %s\n", __FUNCTION__, dev->nodename);
+
+       return 0;
+}
diff --git a/drivers/xen/sfc_netfront/ef_vi_falcon.h b/drivers/xen/sfc_netfront/ef_vi_falcon.h

new file mode 100644 (file)

index 0000000..9aaf4ca
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon.h
@@ -0,0 +1,172 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  slp
+ *  \brief  Falcon specific definitions
+ *   \date  2004/08
+ */
+
+#ifndef __EF_VI_FALCON_H__
+#define __EF_VI_FALCON_H__    
+
+#define EFHW_4K                0x00001000u
+#define EFHW_8K                0x00002000u
+
+/* include the autogenerated register definitions */
+
+#include "ef_vi_falcon_core.h"
+#include "ef_vi_falcon_desc.h"
+#include "ef_vi_falcon_event.h"
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Helpers to turn bit shifts into dword shifts and check that the bit fields 
+ * haven't overflown the dword etc. Aim is to preserve consistency with the 
+ * autogenerated headers - once stable we could hard code.
+ *
+ * In all of these LBN is the field's lowest bit number and WIDTH its width
+ * in bits.  Macro arguments are parenthesised wherever they are expanded so
+ * that expression arguments (e.g. "A + B") evaluate as intended.
+ *
+ *---------------------------------------------------------------------------*/
+
+/* mask constructors: WIDTH low bits set, in type T */
+#define __FALCON_MASK(WIDTH,T)  ((((T)1) << (WIDTH)) - 1)
+#define __EFVI_MASK32(WIDTH)  __FALCON_MASK((WIDTH),uint32_t)
+#define __EFVI_MASK64(WIDTH)  __FALCON_MASK((WIDTH),uint64_t)
+
+#define __EFVI_FALCON_MASKFIELD32(LBN, WIDTH)   ((uint32_t)  \
+                             (__EFVI_MASK32(WIDTH) << (LBN)))
+
+/* constructors for fields which span the first and second dwords */
+/* __LW: number of the field's bits that fall in the first dword */
+#define __LW(LBN) (32 - (LBN))
+#define LOW(v, LBN, WIDTH)   ((uint32_t)  \
+                               (((v) & __EFVI_MASK64(__LW((LBN)))) << (LBN)))
+#define HIGH(v, LBN, WIDTH)  ((uint32_t)(((v) >> __LW((LBN))) & \
+                                       __EFVI_MASK64(((WIDTH) - __LW((LBN))))))
+/* constructors for fields within the second dword */
+#define __DW2(LBN)       ((LBN) - 32)
+
+/* constructors for fields which span the second and third dwords */
+#define __LW2(LBN) (64 - (LBN))
+#define LOW2(v, LBN, WIDTH) ((uint32_t) \
+                       (((v) & __EFVI_MASK64(__LW2((LBN)))) << ((LBN) - 32)))
+#define HIGH2(v, LBN, WIDTH)  ((uint32_t) \
+             (((v) >> __LW2((LBN))) & __EFVI_MASK64(((WIDTH) - __LW2((LBN))))))
+
+/* constructors for fields within the third dword */
+#define __DW3(LBN)       ((LBN) - 64)
+
+                               
+/* constructors for fields which span the third and fourth dwords */
+#define __LW3(LBN) (96 - (LBN))
+#define LOW3(v, LBN, WIDTH)   ((uint32_t)    \
+              (((v) & __EFVI_MASK64(__LW3((LBN)))) << ((LBN) - 64)))
+/* the cast was misspelt "unit32_t", which broke any use of HIGH3 */
+#define HIGH3(v, LBN, WIDTH)  ((uint32_t)    \
+             (((v) >> __LW3((LBN))) & __EFVI_MASK64(((WIDTH) - __LW3((LBN))))))
+
+/* constructors for fields within the fourth dword */
+#define __DW4(LBN)       ((LBN) - 96)
+
+/* checks that the autogenerated headers are consistent with our model */
+#define WIDTHCHCK(a, b) ef_assert((a) == (b))
+#define RANGECHCK(v, WIDTH) \
+                ef_assert(((uint64_t)(v) & ~(__EFVI_MASK64((WIDTH)))) == 0)
+
+/* fields within the first dword */
+#define DWCHCK(LBN, WIDTH) ef_assert(((LBN) >= 0) &&(((LBN)+(WIDTH)) <= 32))
+
+/* fields which span the first and second dwords */
+#define LWCHK(LBN, WIDTH)  ef_assert((WIDTH) >= __LW(LBN))
+
+/*----------------------------------------------------------------------------
+ *
+ * Buffer virtual addresses (4K buffers) 
+ *
+ *---------------------------------------------------------------------------*/
+
+/* Form a buffer virtual address from buffer ID and offset.  If the offset
+** is larger than the buffer size, then the buffer indexed will be
+** calculated appropriately.  It is the responsibility of the caller to
+** ensure that they have valid buffers programmed at that address.
+*/
+/* number of offset bits below the buffer ID (2^12 = 4K) */
+#define EFVI_FALCON_VADDR_4K_S         (12)         
+#define EFVI_FALCON_VADDR_M       0xfffff              /* post shift mask  */
+
+
+/* buffer ID and offset -> buffer virtual address */
+#define EFVI_FALCON_BUFFER_4K_ADDR(id,off)      \
+  (((id) << EFVI_FALCON_VADDR_4K_S) + (off))
+
+/* buffer virtual address -> buffer ID (page) part */
+#define EFVI_FALCON_BUFFER_4K_PAGE(vaddr)                       \
+  (((vaddr) >> EFVI_FALCON_VADDR_4K_S) & EFVI_FALCON_VADDR_M)
+
+/* buffer virtual address -> offset within the 4K buffer */
+#define EFVI_FALCON_BUFFER_4K_OFF(vaddr)                \
+  ((vaddr) & __EFVI_MASK32(EFVI_FALCON_VADDR_4K_S))
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Clocks (this section was headed "Masks", but it only defines clock rates)
+ *
+ *---------------------------------------------------------------------------*/
+
+/*
+ * NOTE(review): despite the _HZ suffix these look like kHz values; the
+ * EFVI_FALCON_EVQTIMER_PERIOD_US formula below only yields microseconds
+ * if they are.  Confirm against the hardware documentation.
+ */
+#define EFVI_FALCON_CLOCK_ASIC_HZ    (125000)
+#define EFVI_FALCON_CLOCK_FPGA_HZ    (62500)
+#define EFVI_FALCON_CLOCK_HZ         EFVI_FALCON_CLOCK_ASIC_HZ
+
+
+/*----------------------------------------------------------------------------
+ *
+ * Timers
+ *
+ *---------------------------------------------------------------------------*/
+
+/* Event-Queue Timer granularity - measured in us 
+   Given by: 4096 * 3 cycle * clock period */
+
+#define EFVI_FALCON_EVQTIMER_PERIOD_US   ((4096 * 3 * 1000) / EFVI_FALCON_CLOCK_HZ)
+
+/* mode bits */
+#define EFVI_FALCON_TIMER_MODE_DIS     0     /* disabled */
+#define EFVI_FALCON_TIMER_MODE_RUN     1     /* started counting right away */
+#define EFVI_FALCON_TIMER_MODE_HOLD    2     /* trigger mode (user queues) */
+
+/* the mode values above shifted to TIMER_MODE_LBN (not defined in this header) */
+#define EFVI_FALCON_EVQTIMER_HOLD     (EFVI_FALCON_TIMER_MODE_HOLD << TIMER_MODE_LBN)
+#define EFVI_FALCON_EVQTIMER_RUN      (EFVI_FALCON_TIMER_MODE_RUN  << TIMER_MODE_LBN)
+#define EFVI_FALCON_EVQTIMER_DISABLE  (EFVI_FALCON_TIMER_MODE_DIS  << TIMER_MODE_LBN) 
+
+
+/* ---- ef_vi_event helpers --- */
+
+/*
+ * Extract the event-code bits (still in place, not shifted down) from an
+ * event's 64-bit value.  EFVI_FALCON_EVENT_CODE_MASK is defined further
+ * down; that is fine because macros are expanded at the point of use.
+ */
+#define EFVI_FALCON_EVENT_CODE(evp) \
+       ((evp)->u64 & EFVI_FALCON_EVENT_CODE_MASK)
+
+#define EFVI_FALCON_EVENT_SW_DATA_MASK    0x0000ffff
+
+/* WIDTH low bits set, as a 64-bit value */
+#define __EFVI_FALCON_OPEN_MASK(WIDTH)  ((((uint64_t)1) << (WIDTH)) - 1)
+
+/* EV_CODE_WIDTH bits starting at bit EV_CODE_LBN (neither is defined in this header) */
+#define EFVI_FALCON_EVENT_CODE_MASK \
+           (__EFVI_FALCON_OPEN_MASK(EV_CODE_WIDTH) << EV_CODE_LBN)
+
+
+#endif  /* __EF_VI_FALCON_H__ */
diff --git a/drivers/xen/sfc_netfront/ef_vi_falcon_core.h b/drivers/xen/sfc_netfront/ef_vi_falcon_core.h

new file mode 100644 (file)

index 0000000..089f42a
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_core.h
@@ -0,0 +1,1075 @@
+
+#define  EFVI_FALCON_EXTENDED_P_BAR 1 /* non-zero selects the 0x1xxxx variants of the *_KER_OFST table offsets in the #if blocks of this file */
+
+//////////////---- Bus Interface Unit Registers C Header ----//////////////
+#define IOM_IND_ADR_REG_OFST 0x0 // IO-mapped indirect access address register
+  #define IOM_AUTO_ADR_INC_EN_LBN 16 // <field>_LBN: bit position of the field's lowest bit (used as a shift count)
+  #define IOM_AUTO_ADR_INC_EN_WIDTH 1 // <field>_WIDTH: size of the field in bits
+  #define IOM_IND_ADR_LBN 0
+  #define IOM_IND_ADR_WIDTH 16
+#define IOM_IND_DAT_REG_OFST 0x4 // IO-mapped indirect access data register
+  #define IOM_IND_DAT_LBN 0
+  #define IOM_IND_DAT_WIDTH 32
+#define ADR_REGION_REG_KER_OFST 0x0 // Address region register
+#define ADR_REGION_REG_OFST 0x0 // Address region register (same offset as the _KER_ name)
+  #define ADR_REGION3_LBN 96 // four 18-bit fields, one at the bottom of each 32-bit word
+  #define ADR_REGION3_WIDTH 18
+  #define ADR_REGION2_LBN 64
+  #define ADR_REGION2_WIDTH 18
+  #define ADR_REGION1_LBN 32
+  #define ADR_REGION1_WIDTH 18
+  #define ADR_REGION0_LBN 0
+  #define ADR_REGION0_WIDTH 18
+#define INT_EN_REG_KER_OFST 0x10 // Kernel driver Interrupt enable register
+  #define KER_INT_CHAR_LBN 4
+  #define KER_INT_CHAR_WIDTH 1
+  #define KER_INT_KER_LBN 3
+  #define KER_INT_KER_WIDTH 1
+  #define ILL_ADR_ERR_INT_EN_KER_LBN 2
+  #define ILL_ADR_ERR_INT_EN_KER_WIDTH 1
+  #define SRM_PERR_INT_EN_KER_LBN 1
+  #define SRM_PERR_INT_EN_KER_WIDTH 1
+  #define DRV_INT_EN_KER_LBN 0
+  #define DRV_INT_EN_KER_WIDTH 1
+#define INT_EN_REG_CHAR_OFST 0x20 // Char Driver interrupt enable register (same bit layout as the kernel one above)
+  #define CHAR_INT_CHAR_LBN 4
+  #define CHAR_INT_CHAR_WIDTH 1
+  #define CHAR_INT_KER_LBN 3
+  #define CHAR_INT_KER_WIDTH 1
+  #define ILL_ADR_ERR_INT_EN_CHAR_LBN 2
+  #define ILL_ADR_ERR_INT_EN_CHAR_WIDTH 1
+  #define SRM_PERR_INT_EN_CHAR_LBN 1
+  #define SRM_PERR_INT_EN_CHAR_WIDTH 1
+  #define DRV_INT_EN_CHAR_LBN 0
+  #define DRV_INT_EN_CHAR_WIDTH 1
+#define INT_ADR_REG_KER_OFST 0x30 // Interrupt host address for Kernel driver
+  #define INT_ADR_KER_LBN 0 // 64-bit field; the single-bit fields below lie inside the same bit range
+  #define INT_ADR_KER_WIDTH 64
+  #define DRV_INT_KER_LBN 32
+  #define DRV_INT_KER_WIDTH 1
+  #define EV_FF_HALF_INT_KER_LBN 3
+  #define EV_FF_HALF_INT_KER_WIDTH 1
+  #define EV_FF_FULL_INT_KER_LBN 2
+  #define EV_FF_FULL_INT_KER_WIDTH 1
+  #define ILL_ADR_ERR_INT_KER_LBN 1
+  #define ILL_ADR_ERR_INT_KER_WIDTH 1
+  #define SRAM_PERR_INT_KER_LBN 0
+  #define SRAM_PERR_INT_KER_WIDTH 1
+#define INT_ADR_REG_CHAR_OFST 0x40 // Interrupt host address for Char driver (same bit layout as the kernel one above)
+  #define INT_ADR_CHAR_LBN 0
+  #define INT_ADR_CHAR_WIDTH 64
+  #define DRV_INT_CHAR_LBN 32
+  #define DRV_INT_CHAR_WIDTH 1
+  #define EV_FF_HALF_INT_CHAR_LBN 3
+  #define EV_FF_HALF_INT_CHAR_WIDTH 1
+  #define EV_FF_FULL_INT_CHAR_LBN 2
+  #define EV_FF_FULL_INT_CHAR_WIDTH 1
+  #define ILL_ADR_ERR_INT_CHAR_LBN 1
+  #define ILL_ADR_ERR_INT_CHAR_WIDTH 1
+  #define SRAM_PERR_INT_CHAR_LBN 0
+  #define SRAM_PERR_INT_CHAR_WIDTH 1
+#define INT_ISR0_B0_OFST 0x90 // B0 only
+#define INT_ISR1_B0_OFST 0xA0 // presumably B0 only as well, going by the _B0 suffix - confirm
+#define INT_ACK_REG_KER_A1_OFST 0x50 // Kernel interrupt acknowledge register
+  #define RESERVED_LBN 0
+  #define RESERVED_WIDTH 32
+#define INT_ACK_REG_CHAR_A1_OFST 0x60 // CHAR interrupt acknowledge register
+  #define RESERVED_LBN 0 // identical redefinition of the RESERVED_* pair above; legal in C because the replacement text matches
+  #define RESERVED_WIDTH 32
+//////////////---- Global CSR Registers C Header ----//////////////
+#define STRAP_REG_KER_OFST 0x200 // ASIC strap status register
+#define STRAP_REG_OFST 0x200 // ASIC strap status register (same offset as the _KER_ name)
+  #define ONCHIP_SRAM_LBN 16
+  #define ONCHIP_SRAM_WIDTH 0 // NOTE(review): a zero-width field yields an empty mask - confirm this is intended and not meant to be 1
+  #define STRAP_ISCSI_EN_LBN 3
+  #define STRAP_ISCSI_EN_WIDTH 1
+  #define STRAP_PINS_LBN 0
+  #define STRAP_PINS_WIDTH 3
+#define GPIO_CTL_REG_KER_OFST 0x210 // GPIO control register
+#define GPIO_CTL_REG_OFST 0x210 // GPIO control register (same offset as the _KER_ name)
+  #define GPIO_OEN_LBN 24 // four 4-bit fields, one at the bottom of each byte
+  #define GPIO_OEN_WIDTH 4
+  #define GPIO_OUT_LBN 16
+  #define GPIO_OUT_WIDTH 4
+  #define GPIO_IN_LBN 8
+  #define GPIO_IN_WIDTH 4
+  #define GPIO_PWRUP_VALUE_LBN 0
+  #define GPIO_PWRUP_VALUE_WIDTH 4
+#define GLB_CTL_REG_KER_OFST 0x220 // Global control register
+#define GLB_CTL_REG_OFST 0x220 // Global control register (same offset as the _KER_ name)
+  #define SWRST_LBN 0
+  #define SWRST_WIDTH 1
+#define FATAL_INTR_REG_KER_OFST 0x230 // Fatal interrupt register for Kernel; each of the 12 sources has an <src>_EN bit at 32+n and a matching <src> bit at n (n = 0..11)
+  #define PCI_BUSERR_INT_KER_EN_LBN 43
+  #define PCI_BUSERR_INT_KER_EN_WIDTH 1
+  #define SRAM_OOB_INT_KER_EN_LBN 42
+  #define SRAM_OOB_INT_KER_EN_WIDTH 1
+  #define BUFID_OOB_INT_KER_EN_LBN 41
+  #define BUFID_OOB_INT_KER_EN_WIDTH 1
+  #define MEM_PERR_INT_KER_EN_LBN 40
+  #define MEM_PERR_INT_KER_EN_WIDTH 1
+  #define RBUF_OWN_INT_KER_EN_LBN 39
+  #define RBUF_OWN_INT_KER_EN_WIDTH 1
+  #define TBUF_OWN_INT_KER_EN_LBN 38
+  #define TBUF_OWN_INT_KER_EN_WIDTH 1
+  #define RDESCQ_OWN_INT_KER_EN_LBN 37
+  #define RDESCQ_OWN_INT_KER_EN_WIDTH 1
+  #define TDESCQ_OWN_INT_KER_EN_LBN 36
+  #define TDESCQ_OWN_INT_KER_EN_WIDTH 1
+  #define EVQ_OWN_INT_KER_EN_LBN 35
+  #define EVQ_OWN_INT_KER_EN_WIDTH 1
+  #define EVFF_OFLO_INT_KER_EN_LBN 34
+  #define EVFF_OFLO_INT_KER_EN_WIDTH 1
+  #define ILL_ADR_INT_KER_EN_LBN 33
+  #define ILL_ADR_INT_KER_EN_WIDTH 1
+  #define SRM_PERR_INT_KER_EN_LBN 32
+  #define SRM_PERR_INT_KER_EN_WIDTH 1
+  #define PCI_BUSERR_INT_KER_LBN 11 // from here down: the non-_EN bit for each source, in the low word
+  #define PCI_BUSERR_INT_KER_WIDTH 1
+  #define SRAM_OOB_INT_KER_LBN 10
+  #define SRAM_OOB_INT_KER_WIDTH 1
+  #define BUFID_OOB_INT_KER_LBN 9
+  #define BUFID_OOB_INT_KER_WIDTH 1
+  #define MEM_PERR_INT_KER_LBN 8
+  #define MEM_PERR_INT_KER_WIDTH 1
+  #define RBUF_OWN_INT_KER_LBN 7
+  #define RBUF_OWN_INT_KER_WIDTH 1
+  #define TBUF_OWN_INT_KER_LBN 6
+  #define TBUF_OWN_INT_KER_WIDTH 1
+  #define RDESCQ_OWN_INT_KER_LBN 5
+  #define RDESCQ_OWN_INT_KER_WIDTH 1
+  #define TDESCQ_OWN_INT_KER_LBN 4
+  #define TDESCQ_OWN_INT_KER_WIDTH 1
+  #define EVQ_OWN_INT_KER_LBN 3
+  #define EVQ_OWN_INT_KER_WIDTH 1
+  #define EVFF_OFLO_INT_KER_LBN 2
+  #define EVFF_OFLO_INT_KER_WIDTH 1
+  #define ILL_ADR_INT_KER_LBN 1
+  #define ILL_ADR_INT_KER_WIDTH 1
+  #define SRM_PERR_INT_KER_LBN 0
+  #define SRM_PERR_INT_KER_WIDTH 1
+#define FATAL_INTR_REG_OFST 0x240 // Fatal interrupt register for Char; each of the 12 sources has an <src>_EN bit at 32+n and a matching <src> bit at n (n = 0..11)
+  #define PCI_BUSERR_INT_CHAR_EN_LBN 43
+  #define PCI_BUSERR_INT_CHAR_EN_WIDTH 1
+  #define SRAM_OOB_INT_CHAR_EN_LBN 42
+  #define SRAM_OOB_INT_CHAR_EN_WIDTH 1
+  #define BUFID_OOB_INT_CHAR_EN_LBN 41
+  #define BUFID_OOB_INT_CHAR_EN_WIDTH 1
+  #define MEM_PERR_INT_CHAR_EN_LBN 40
+  #define MEM_PERR_INT_CHAR_EN_WIDTH 1
+  #define RBUF_OWN_INT_CHAR_EN_LBN 39
+  #define RBUF_OWN_INT_CHAR_EN_WIDTH 1
+  #define TBUF_OWN_INT_CHAR_EN_LBN 38
+  #define TBUF_OWN_INT_CHAR_EN_WIDTH 1
+  #define RDESCQ_OWN_INT_CHAR_EN_LBN 37
+  #define RDESCQ_OWN_INT_CHAR_EN_WIDTH 1
+  #define TDESCQ_OWN_INT_CHAR_EN_LBN 36
+  #define TDESCQ_OWN_INT_CHAR_EN_WIDTH 1
+  #define EVQ_OWN_INT_CHAR_EN_LBN 35
+  #define EVQ_OWN_INT_CHAR_EN_WIDTH 1
+  #define EVFF_OFLO_INT_CHAR_EN_LBN 34
+  #define EVFF_OFLO_INT_CHAR_EN_WIDTH 1
+  #define ILL_ADR_INT_CHAR_EN_LBN 33
+  #define ILL_ADR_INT_CHAR_EN_WIDTH 1
+  #define SRM_PERR_INT_CHAR_EN_LBN 32
+  #define SRM_PERR_INT_CHAR_EN_WIDTH 1
+  #define FATAL_INTR_REG_EN_BITS    0xffffffffffffffffULL // all 64 bits set, a superset of the named _EN bits (32..43)
+  #define PCI_BUSERR_INT_CHAR_LBN 11 // from here down: the non-_EN bit for each source, in the low word
+  #define PCI_BUSERR_INT_CHAR_WIDTH 1
+  #define SRAM_OOB_INT_CHAR_LBN 10
+  #define SRAM_OOB_INT_CHAR_WIDTH 1
+  #define BUFID_OOB_INT_CHAR_LBN 9
+  #define BUFID_OOB_INT_CHAR_WIDTH 1
+  #define MEM_PERR_INT_CHAR_LBN 8
+  #define MEM_PERR_INT_CHAR_WIDTH 1
+  #define RBUF_OWN_INT_CHAR_LBN 7
+  #define RBUF_OWN_INT_CHAR_WIDTH 1
+  #define TBUF_OWN_INT_CHAR_LBN 6
+  #define TBUF_OWN_INT_CHAR_WIDTH 1
+  #define RDESCQ_OWN_INT_CHAR_LBN 5
+  #define RDESCQ_OWN_INT_CHAR_WIDTH 1
+  #define TDESCQ_OWN_INT_CHAR_LBN 4
+  #define TDESCQ_OWN_INT_CHAR_WIDTH 1
+  #define EVQ_OWN_INT_CHAR_LBN 3
+  #define EVQ_OWN_INT_CHAR_WIDTH 1
+  #define EVFF_OFLO_INT_CHAR_LBN 2
+  #define EVFF_OFLO_INT_CHAR_WIDTH 1
+  #define ILL_ADR_INT_CHAR_LBN 1
+  #define ILL_ADR_INT_CHAR_WIDTH 1
+  #define SRM_PERR_INT_CHAR_LBN 0
+  #define SRM_PERR_INT_CHAR_WIDTH 1
+#define DP_CTRL_REG_OFST 0x250 // Datapath control register
+  #define FLS_EVQ_ID_LBN 0
+  #define FLS_EVQ_ID_WIDTH 12
+#define MEM_STAT_REG_KER_OFST 0x260 // Memory status register
+#define MEM_STAT_REG_OFST 0x260 // Memory status register (same offset as the _KER_ name)
+  #define MEM_PERR_VEC_LBN 53 // bits 53..90: this field straddles the 64-bit boundary
+  #define MEM_PERR_VEC_WIDTH 38
+  #define MBIST_CORR_LBN 38
+  #define MBIST_CORR_WIDTH 15
+  #define MBIST_ERR_LBN 0
+  #define MBIST_ERR_WIDTH 38
+#define DEBUG_REG_KER_OFST 0x270 // Debug register
+#define DEBUG_REG_OFST 0x270 // Debug register (same offset as the _KER_ name); fields below are contiguous from bit 0 to bit 49
+  #define DEBUG_BLK_SEL2_LBN 47
+  #define DEBUG_BLK_SEL2_WIDTH 3
+  #define DEBUG_BLK_SEL1_LBN 44
+  #define DEBUG_BLK_SEL1_WIDTH 3
+  #define DEBUG_BLK_SEL0_LBN 41
+  #define DEBUG_BLK_SEL0_WIDTH 3
+  #define MISC_DEBUG_ADDR_LBN 36
+  #define MISC_DEBUG_ADDR_WIDTH 5
+  #define SERDES_DEBUG_ADDR_LBN 31
+  #define SERDES_DEBUG_ADDR_WIDTH 5
+  #define EM_DEBUG_ADDR_LBN 26
+  #define EM_DEBUG_ADDR_WIDTH 5
+  #define SR_DEBUG_ADDR_LBN 21
+  #define SR_DEBUG_ADDR_WIDTH 5
+  #define EV_DEBUG_ADDR_LBN 16
+  #define EV_DEBUG_ADDR_WIDTH 5
+  #define RX_DEBUG_ADDR_LBN 11
+  #define RX_DEBUG_ADDR_WIDTH 5
+  #define TX_DEBUG_ADDR_LBN 6
+  #define TX_DEBUG_ADDR_WIDTH 5
+  #define BIU_DEBUG_ADDR_LBN 1
+  #define BIU_DEBUG_ADDR_WIDTH 5
+  #define DEBUG_EN_LBN 0
+  #define DEBUG_EN_WIDTH 1
+#define DRIVER_REG0_KER_OFST 0x280 // Driver scratch register 0 (registers 0..7 sit 0x10 apart, 0x280..0x2F0, each with one 32-bit field at bit 0)
+#define DRIVER_REG0_OFST 0x280 // Driver scratch register 0
+  #define DRIVER_DW0_LBN 0
+  #define DRIVER_DW0_WIDTH 32
+#define DRIVER_REG1_KER_OFST 0x290 // Driver scratch register 1
+#define DRIVER_REG1_OFST 0x290 // Driver scratch register 1
+  #define DRIVER_DW1_LBN 0
+  #define DRIVER_DW1_WIDTH 32
+#define DRIVER_REG2_KER_OFST 0x2A0 // Driver scratch register 2
+#define DRIVER_REG2_OFST 0x2A0 // Driver scratch register 2
+  #define DRIVER_DW2_LBN 0
+  #define DRIVER_DW2_WIDTH 32
+#define DRIVER_REG3_KER_OFST 0x2B0 // Driver scratch register 3
+#define DRIVER_REG3_OFST 0x2B0 // Driver scratch register 3
+  #define DRIVER_DW3_LBN 0
+  #define DRIVER_DW3_WIDTH 32
+#define DRIVER_REG4_KER_OFST 0x2C0 // Driver scratch register 4
+#define DRIVER_REG4_OFST 0x2C0 // Driver scratch register 4
+  #define DRIVER_DW4_LBN 0
+  #define DRIVER_DW4_WIDTH 32
+#define DRIVER_REG5_KER_OFST 0x2D0 // Driver scratch register 5
+#define DRIVER_REG5_OFST 0x2D0 // Driver scratch register 5
+  #define DRIVER_DW5_LBN 0
+  #define DRIVER_DW5_WIDTH 32
+#define DRIVER_REG6_KER_OFST 0x2E0 // Driver scratch register 6
+#define DRIVER_REG6_OFST 0x2E0 // Driver scratch register 6
+  #define DRIVER_DW6_LBN 0
+  #define DRIVER_DW6_WIDTH 32
+#define DRIVER_REG7_KER_OFST 0x2F0 // Driver scratch register 7
+#define DRIVER_REG7_OFST 0x2F0 // Driver scratch register 7
+  #define DRIVER_DW7_LBN 0
+  #define DRIVER_DW7_WIDTH 32
+#define ALTERA_BUILD_REG_KER_OFST 0x300 // Altera build register (kernel-view name, matching the _KER_OFST/_OFST pairs of the neighbouring registers)
+#define ALTERA_BUILD_REG_OFST 0x300 // Altera build register
+  #define ALTERA_BUILD_VER_LBN 0 // single 32-bit field at bit 0
+  #define ALTERA_BUILD_VER_WIDTH 32
+
+/* So-called CSR spare register:
+    contains separate parity enable bits for the various internal memory blocks. */
+#define MEM_PARITY_ERR_EN_REG_KER 0x310 
+#define MEM_PARITY_ALL_BLOCKS_EN_LBN 64 // bits 64..101
+#define MEM_PARITY_ALL_BLOCKS_EN_WIDTH 38
+#define MEM_PARITY_TX_DATA_EN_LBN   72 // bits 72..73, inside the ALL_BLOCKS range above
+#define MEM_PARITY_TX_DATA_EN_WIDTH 2
+
+//////////////---- Event & Timer Module Registers C Header ----//////////////
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define EVQ_RPTR_REG_KER_OFST 0x11B00 // Event queue read pointer register (extended BAR: 0x10000 above the non-extended offset)
+#else
+#define EVQ_RPTR_REG_KER_OFST 0x1B00 // Event queue read pointer register
+#endif
+
+#define EVQ_RPTR_REG_OFST 0xFA0000 // Event queue read pointer register array.
+  #define EVQ_RPTR_LBN 0
+  #define EVQ_RPTR_WIDTH 15
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define EVQ_PTR_TBL_KER_OFST 0x11A00 // Event queue pointer table for kernel access (extended BAR: 0x10000 above the non-extended offset)
+#else
+#define EVQ_PTR_TBL_KER_OFST 0x1A00 // Event queue pointer table for kernel access
+#endif
+
+#define EVQ_PTR_TBL_CHAR_OFST 0xF60000 // Event queue pointer table for char direct access; fields below are contiguous over bits 0..39
+  #define EVQ_WKUP_OR_INT_EN_LBN 39
+  #define EVQ_WKUP_OR_INT_EN_WIDTH 1
+  #define EVQ_NXT_WPTR_LBN 24
+  #define EVQ_NXT_WPTR_WIDTH 15
+  #define EVQ_EN_LBN 23
+  #define EVQ_EN_WIDTH 1
+  #define EVQ_SIZE_LBN 20
+  #define EVQ_SIZE_WIDTH 3
+  #define EVQ_BUF_BASE_ID_LBN 0
+  #define EVQ_BUF_BASE_ID_WIDTH 20
+#define TIMER_CMD_REG_KER_OFST 0x420 // Timer table for kernel access. Page-mapped
+#define TIMER_CMD_REG_PAGE4_OFST 0x8420 // Timer table for user-level access. Page-mapped. For lowest 1K queues.
+#define TIMER_CMD_REG_PAGE123K_OFST 0x1000420 // Timer table for user-level access. Page-mapped. For upper 3K queues.
+#define TIMER_TBL_OFST 0xF70000 // Timer table for char driver direct access
+  #define TIMER_MODE_LBN 12 // 2-bit mode field directly above the 12-bit value field
+  #define TIMER_MODE_WIDTH 2
+  #define TIMER_VAL_LBN 0
+  #define TIMER_VAL_WIDTH 12
+  #define TIMER_MODE_INT_HLDOFF 2 // presumably a value for the TIMER_MODE field rather than a bit position (no _LBN/_WIDTH suffix) - confirm
+  #define EVQ_BUF_SIZE_LBN 0
+  #define EVQ_BUF_SIZE_WIDTH 1
+#define DRV_EV_REG_KER_OFST 0x440 // Driver generated event register
+#define DRV_EV_REG_OFST 0x440 // Driver generated event register (same offset as the _KER_ name)
+  #define DRV_EV_QID_LBN 64 // queue id sits directly above the 64-bit data field
+  #define DRV_EV_QID_WIDTH 12
+  #define DRV_EV_DATA_LBN 0
+  #define DRV_EV_DATA_WIDTH 64
+#define EVQ_CTL_REG_KER_OFST 0x450 // Event queue control register
+#define EVQ_CTL_REG_OFST 0x450 // Event queue control register (same offset as the _KER_ name)
+  #define RX_EVQ_WAKEUP_MASK_B0_LBN 15
+  #define RX_EVQ_WAKEUP_MASK_B0_WIDTH 6
+  #define EVQ_OWNERR_CTL_LBN 14
+  #define EVQ_OWNERR_CTL_WIDTH 1
+  #define EVQ_FIFO_AF_TH_LBN 8
+  #define EVQ_FIFO_AF_TH_WIDTH 6
+  #define EVQ_FIFO_NOTAF_TH_LBN 0
+  #define EVQ_FIFO_NOTAF_TH_WIDTH 6
+//////////////---- SRAM Module Registers C Header ----//////////////
+#define BUF_TBL_CFG_REG_KER_OFST 0x600 // Buffer table configuration register
+#define BUF_TBL_CFG_REG_OFST 0x600 // Buffer table configuration register
+  #define BUF_TBL_MODE_LBN 3
+  #define BUF_TBL_MODE_WIDTH 1
+#define SRM_RX_DC_CFG_REG_KER_OFST 0x610 // SRAM receive descriptor cache configuration register
+#define SRM_RX_DC_CFG_REG_OFST 0x610 // SRAM receive descriptor cache configuration register
+  #define SRM_RX_DC_BASE_ADR_LBN 0
+  #define SRM_RX_DC_BASE_ADR_WIDTH 21
+#define SRM_TX_DC_CFG_REG_KER_OFST 0x620 // SRAM transmit descriptor cache configuration register
+#define SRM_TX_DC_CFG_REG_OFST 0x620 // SRAM transmit descriptor cache configuration register
+  #define SRM_TX_DC_BASE_ADR_LBN 0
+  #define SRM_TX_DC_BASE_ADR_WIDTH 21
+#define SRM_CFG_REG_KER_OFST 0x630 // SRAM configuration register
+#define SRM_CFG_REG_OFST 0x630 // SRAM configuration register; fields below are contiguous over bits 0..5
+  #define SRAM_OOB_ADR_INTEN_LBN 5
+  #define SRAM_OOB_ADR_INTEN_WIDTH 1
+  #define SRAM_OOB_BUF_INTEN_LBN 4
+  #define SRAM_OOB_BUF_INTEN_WIDTH 1
+  #define SRAM_BT_INIT_EN_LBN 3
+  #define SRAM_BT_INIT_EN_WIDTH 1
+  #define SRM_NUM_BANK_LBN 2
+  #define SRM_NUM_BANK_WIDTH 1
+  #define SRM_BANK_SIZE_LBN 0
+  #define SRM_BANK_SIZE_WIDTH 2
+#define BUF_TBL_UPD_REG_KER_OFST 0x650 // Buffer table update register
+#define BUF_TBL_UPD_REG_OFST 0x650 // Buffer table update register
+  #define BUF_UPD_CMD_LBN 63
+  #define BUF_UPD_CMD_WIDTH 1
+  #define BUF_CLR_CMD_LBN 62
+  #define BUF_CLR_CMD_WIDTH 1
+  #define BUF_CLR_END_ID_LBN 32
+  #define BUF_CLR_END_ID_WIDTH 20
+  #define BUF_CLR_START_ID_LBN 0
+  #define BUF_CLR_START_ID_WIDTH 20
+#define SRM_UPD_EVQ_REG_KER_OFST 0x660 // Buffer table update register (NOTE(review): same description as 0x650; the name suggests an SRAM-update event queue register - confirm)
+#define SRM_UPD_EVQ_REG_OFST 0x660 // Buffer table update register
+  #define SRM_UPD_EVQ_ID_LBN 0
+  #define SRM_UPD_EVQ_ID_WIDTH 12
+#define SRAM_PARITY_REG_KER_OFST 0x670 // SRAM parity register.
+#define SRAM_PARITY_REG_OFST 0x670 // SRAM parity register.
+  #define FORCE_SRAM_PERR_LBN 0
+  #define FORCE_SRAM_PERR_WIDTH 1
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define BUF_HALF_TBL_KER_OFST 0x18000 // Buffer table in half buffer table mode direct access by kernel driver
+#else
+#define BUF_HALF_TBL_KER_OFST 0x8000 // Buffer table in half buffer table mode direct access by kernel driver
+#endif
+
+
+#define BUF_HALF_TBL_OFST 0x800000 // Buffer table in half buffer table mode direct access by char driver; two 32-bit entries (EVEN low, ODD high), each a 12-bit owner id plus a 20-bit address
+  #define BUF_ADR_HBUF_ODD_LBN 44
+  #define BUF_ADR_HBUF_ODD_WIDTH 20
+  #define BUF_OWNER_ID_HBUF_ODD_LBN 32
+  #define BUF_OWNER_ID_HBUF_ODD_WIDTH 12
+  #define BUF_ADR_HBUF_EVEN_LBN 12
+  #define BUF_ADR_HBUF_EVEN_WIDTH 20
+  #define BUF_OWNER_ID_HBUF_EVEN_LBN 0
+  #define BUF_OWNER_ID_HBUF_EVEN_WIDTH 12
+
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define BUF_FULL_TBL_KER_OFST 0x18000 // Buffer table in full buffer table mode direct access by kernel driver (same offset as the half-mode table)
+#else
+#define BUF_FULL_TBL_KER_OFST 0x8000 // Buffer table in full buffer table mode direct access by kernel driver (same offset as the half-mode table)
+#endif
+
+
+
+
+#define BUF_FULL_TBL_OFST 0x800000 // Buffer table in full buffer table mode direct access by char driver (same offset as the half-mode table); fields are contiguous over bits 0..50
+  #define IP_DAT_BUF_SIZE_LBN 50
+  #define IP_DAT_BUF_SIZE_WIDTH 1
+  #define BUF_ADR_REGION_LBN 48
+  #define BUF_ADR_REGION_WIDTH 2
+  #define BUF_ADR_FBUF_LBN 14
+  #define BUF_ADR_FBUF_WIDTH 34
+  #define BUF_OWNER_ID_FBUF_LBN 0
+  #define BUF_OWNER_ID_FBUF_WIDTH 14
+#define SRM_DBG_REG_OFST 0x3000000 // SRAM debug access
+  #define SRM_DBG_LBN 0
+  #define SRM_DBG_WIDTH 64
+//////////////---- RX Datapath Registers C Header ----//////////////
+
+#define RX_CFG_REG_KER_OFST 0x800 // Receive configuration register
+#define RX_CFG_REG_OFST 0x800 // Receive configuration register
+/* Unless FALCON_64K_RXFIFO or FALCON_PRE_02020029 is defined, default to the FALCON_128K_RXFIFO field layout. */
+#if !defined(FALCON_64K_RXFIFO) && !defined(FALCON_PRE_02020029)
+# if !defined(FALCON_128K_RXFIFO)
+#  define FALCON_128K_RXFIFO
+# endif
+#endif
+
+#if defined(FALCON_128K_RXFIFO)
+/* Layout 1 (default): 9-bit MAC thresholds, fields reach bit 48. */
+/* new for B0 */
+  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 48
+  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+  #define RX_INGR_EN_B0_LBN 47
+  #define RX_INGR_EN_B0_WIDTH 1
+  #define RX_TOEP_IPV4_B0_LBN 46
+  #define RX_TOEP_IPV4_B0_WIDTH 1
+  #define RX_HASH_ALG_B0_LBN 45
+  #define RX_HASH_ALG_B0_WIDTH 1
+  #define RX_HASH_INSERT_HDR_B0_LBN 44
+  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+  #define RX_DESC_PUSH_EN_B0_LBN 43
+  #define RX_DESC_PUSH_EN_B0_WIDTH 1
+  #define RX_RDW_PATCH_EN_LBN 42 /* Non head of line blocking */
+  #define RX_RDW_PATCH_EN_WIDTH 1
+  #define RX_PCI_BURST_SIZE_B0_LBN 39
+  #define RX_PCI_BURST_SIZE_B0_WIDTH 3
+  #define RX_OWNERR_CTL_B0_LBN 38
+  #define RX_OWNERR_CTL_B0_WIDTH 1
+  #define RX_XON_TX_TH_B0_LBN 33 
+  #define RX_XON_TX_TH_B0_WIDTH 5
+  #define RX_XOFF_TX_TH_B0_LBN 28 
+  #define RX_XOFF_TX_TH_B0_WIDTH 5
+  #define RX_USR_BUF_SIZE_B0_LBN 19
+  #define RX_USR_BUF_SIZE_B0_WIDTH 9
+  #define RX_XON_MAC_TH_B0_LBN 10
+  #define RX_XON_MAC_TH_B0_WIDTH 9
+  #define RX_XOFF_MAC_TH_B0_LBN 1
+  #define RX_XOFF_MAC_TH_B0_WIDTH 9
+  #define RX_XOFF_MAC_EN_B0_LBN 0
+  #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#elif !defined(FALCON_PRE_02020029) /* layout 2: reached when FALCON_64K_RXFIFO is defined; 8-bit MAC thresholds */
+/* new for B0 */
+  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 46
+  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+  #define RX_INGR_EN_B0_LBN 45
+  #define RX_INGR_EN_B0_WIDTH 1
+  #define RX_TOEP_IPV4_B0_LBN 44
+  #define RX_TOEP_IPV4_B0_WIDTH 1
+  #define RX_HASH_ALG_B0_LBN 43
+  #define RX_HASH_ALG_B0_WIDTH 1 /* was 41: single bit between HASH_INSERT_HDR (42) and TOEP_IPV4 (44), as in layout 1 */
+  #define RX_HASH_INSERT_HDR_B0_LBN 42
+  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+  #define RX_DESC_PUSH_EN_B0_LBN 41
+  #define RX_DESC_PUSH_EN_B0_WIDTH 1
+  #define RX_PCI_BURST_SIZE_B0_LBN 37
+  #define RX_PCI_BURST_SIZE_B0_WIDTH 3
+  #define RX_OWNERR_CTL_B0_LBN 36
+  #define RX_OWNERR_CTL_B0_WIDTH 1
+  #define RX_XON_TX_TH_B0_LBN 31
+  #define RX_XON_TX_TH_B0_WIDTH 5
+  #define RX_XOFF_TX_TH_B0_LBN 26
+  #define RX_XOFF_TX_TH_B0_WIDTH 5
+  #define RX_USR_BUF_SIZE_B0_LBN 17
+  #define RX_USR_BUF_SIZE_B0_WIDTH 9
+  #define RX_XON_MAC_TH_B0_LBN 9
+  #define RX_XON_MAC_TH_B0_WIDTH 8
+  #define RX_XOFF_MAC_TH_B0_LBN 1
+  #define RX_XOFF_MAC_TH_B0_WIDTH 8
+  #define RX_XOFF_MAC_EN_B0_LBN 0
+  #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#else /* layout 3: FALCON_PRE_02020029 defined; 7-bit MAC thresholds */
+/* new for B0 */
+  #define RX_TOEP_TCP_SUPPRESS_B0_LBN 44
+  #define RX_TOEP_TCP_SUPPRESS_B0_WIDTH 1
+  #define RX_INGR_EN_B0_LBN 43
+  #define RX_INGR_EN_B0_WIDTH 1
+  #define RX_TOEP_IPV4_B0_LBN 42
+  #define RX_TOEP_IPV4_B0_WIDTH 1
+  #define RX_HASH_ALG_B0_LBN 41
+  #define RX_HASH_ALG_B0_WIDTH 1 /* was 41: single bit between HASH_INSERT_HDR (40) and TOEP_IPV4 (42), as in layout 1 */
+  #define RX_HASH_INSERT_HDR_B0_LBN 40
+  #define RX_HASH_INSERT_HDR_B0_WIDTH 1
+/* moved for B0 */
+  #define RX_DESC_PUSH_EN_B0_LBN 35 /* NOTE(review): same bit as RX_PCI_BURST_SIZE_B0_LBN below; one of the two is probably wrong - confirm against the register spec */
+  #define RX_DESC_PUSH_EN_B0_WIDTH 1
+  #define RX_PCI_BURST_SIZE_B0_LBN 35
+  #define RX_PCI_BURST_SIZE_B0_WIDTH 2
+  #define RX_OWNERR_CTL_B0_LBN 34
+  #define RX_OWNERR_CTL_B0_WIDTH 1
+  #define RX_XON_TX_TH_B0_LBN 29
+  #define RX_XON_TX_TH_B0_WIDTH 5
+  #define RX_XOFF_TX_TH_B0_LBN 24
+  #define RX_XOFF_TX_TH_B0_WIDTH 5
+  #define RX_USR_BUF_SIZE_B0_LBN 15
+  #define RX_USR_BUF_SIZE_B0_WIDTH 9
+  #define RX_XON_MAC_TH_B0_LBN 8
+  #define RX_XON_MAC_TH_B0_WIDTH 7
+  #define RX_XOFF_MAC_TH_B0_LBN 1
+  #define RX_XOFF_MAC_TH_B0_WIDTH 7
+  #define RX_XOFF_MAC_EN_B0_LBN 0
+  #define RX_XOFF_MAC_EN_B0_WIDTH 1
+
+#endif
+
+/* A0/A1 layout of RX_CFG_REG: fields are contiguous over bits 0..33, bit 34 is unnamed, PUSH_EN is bit 35 */
+  #define RX_PUSH_EN_A1_LBN 35
+  #define RX_PUSH_EN_A1_WIDTH 1
+  #define RX_PCI_BURST_SIZE_A1_LBN 31
+  #define RX_PCI_BURST_SIZE_A1_WIDTH 3
+  #define RX_OWNERR_CTL_A1_LBN 30
+  #define RX_OWNERR_CTL_A1_WIDTH 1
+  #define RX_XON_TX_TH_A1_LBN 25
+  #define RX_XON_TX_TH_A1_WIDTH 5
+  #define RX_XOFF_TX_TH_A1_LBN 20
+  #define RX_XOFF_TX_TH_A1_WIDTH 5
+  #define RX_USR_BUF_SIZE_A1_LBN 11
+  #define RX_USR_BUF_SIZE_A1_WIDTH 9
+  #define RX_XON_MAC_TH_A1_LBN 6
+  #define RX_XON_MAC_TH_A1_WIDTH 5
+  #define RX_XOFF_MAC_TH_A1_LBN 1
+  #define RX_XOFF_MAC_TH_A1_WIDTH 5
+  #define RX_XOFF_MAC_EN_A1_LBN 0
+  #define RX_XOFF_MAC_EN_A1_WIDTH 1
+
+#define RX_FILTER_CTL_REG_OFST 0x810 // Receive filter control registers; the four *_SRCH_LIMIT fields are 8 bits wide and byte-aligned
+  #define SCATTER_ENBL_NO_MATCH_Q_B0_LBN 40
+  #define SCATTER_ENBL_NO_MATCH_Q_B0_WIDTH 1
+  #define UDP_FULL_SRCH_LIMIT_LBN 32
+  #define UDP_FULL_SRCH_LIMIT_WIDTH 8
+  #define NUM_KER_LBN 24
+  #define NUM_KER_WIDTH 2
+  #define UDP_WILD_SRCH_LIMIT_LBN 16
+  #define UDP_WILD_SRCH_LIMIT_WIDTH 8
+  #define TCP_WILD_SRCH_LIMIT_LBN 8
+  #define TCP_WILD_SRCH_LIMIT_WIDTH 8
+  #define TCP_FULL_SRCH_LIMIT_LBN 0
+  #define TCP_FULL_SRCH_LIMIT_WIDTH 8
+#define RX_FLUSH_DESCQ_REG_KER_OFST 0x820 // Receive flush descriptor queue register
+#define RX_FLUSH_DESCQ_REG_OFST 0x820 // Receive flush descriptor queue register (same offset as the _KER_ name)
+  #define RX_FLUSH_DESCQ_CMD_LBN 24
+  #define RX_FLUSH_DESCQ_CMD_WIDTH 1
+  #define RX_FLUSH_EVQ_ID_LBN 12
+  #define RX_FLUSH_EVQ_ID_WIDTH 12
+  #define RX_FLUSH_DESCQ_LBN 0
+  #define RX_FLUSH_DESCQ_WIDTH 12
+#define RX_DESC_UPD_REG_KER_OFST 0x830 // Kernel  receive descriptor update register. Page-mapped
+#define RX_DESC_UPD_REG_PAGE4_OFST 0x8830 // Char & user receive descriptor update register. Page-mapped. For lowest 1K queues.
+#define RX_DESC_UPD_REG_PAGE123K_OFST 0x1000830 // Char & user receive descriptor update register. Page-mapped. For upper 3K queues.
+  #define RX_DESC_WPTR_LBN 96
+  #define RX_DESC_WPTR_WIDTH 12
+  #define RX_DESC_PUSH_CMD_LBN 95
+  #define RX_DESC_PUSH_CMD_WIDTH 1
+  #define RX_DESC_LBN 0 // RX_DESC, RX_KER_DESC and RX_USR_DESC all start at bit 0; they differ only in width
+  #define RX_DESC_WIDTH 64
+  #define RX_KER_DESC_LBN 0
+  #define RX_KER_DESC_WIDTH 64
+  #define RX_USR_DESC_LBN 0
+  #define RX_USR_DESC_WIDTH 32
+#define RX_DC_CFG_REG_KER_OFST 0x840 // Receive descriptor cache configuration register
+#define RX_DC_CFG_REG_OFST 0x840 // Receive descriptor cache configuration register (same offset as the _KER_ name)
+  #define RX_DC_SIZE_LBN 0
+  #define RX_DC_SIZE_WIDTH 2
+#define RX_DC_PF_WM_REG_KER_OFST 0x850 // Receive descriptor cache pre-fetch watermark register
+#define RX_DC_PF_WM_REG_OFST 0x850 // Receive descriptor cache pre-fetch watermark register (same offset as the _KER_ name)
+  #define RX_DC_PF_LWM_LO_LBN 0
+  #define RX_DC_PF_LWM_LO_WIDTH 6
+
+#define RX_RSS_TKEY_B0_OFST 0x860 // RSS Toeplitz hash key (B0 only)
+
+#define RX_NODESC_DROP_REG 0x880 // register offset; note this name has no _OFST suffix, unlike its neighbours
+  #define RX_NODESC_DROP_CNT_LBN 0
+  #define RX_NODESC_DROP_CNT_WIDTH 16
+
+#define XM_TX_CFG_REG_OFST 0x1230 // the only XM_-prefixed register in this RX section
+  #define XM_AUTO_PAD_LBN 5
+  #define XM_AUTO_PAD_WIDTH 1
+
+#define RX_FILTER_TBL0_OFST 0xF00000 // Receive filter table - even entries; fields are contiguous over bits 0..110
+  #define RSS_EN_0_B0_LBN 110
+  #define RSS_EN_0_B0_WIDTH 1
+  #define SCATTER_EN_0_B0_LBN 109
+  #define SCATTER_EN_0_B0_WIDTH 1
+  #define TCP_UDP_0_LBN 108
+  #define TCP_UDP_0_WIDTH 1
+  #define RXQ_ID_0_LBN 96
+  #define RXQ_ID_0_WIDTH 12
+  #define DEST_IP_0_LBN 64
+  #define DEST_IP_0_WIDTH 32
+  #define DEST_PORT_TCP_0_LBN 48
+  #define DEST_PORT_TCP_0_WIDTH 16
+  #define SRC_IP_0_LBN 16
+  #define SRC_IP_0_WIDTH 32
+  #define SRC_TCP_DEST_UDP_0_LBN 0
+  #define SRC_TCP_DEST_UDP_0_WIDTH 16
+#define RX_FILTER_TBL1_OFST 0xF00010 // Receive filter table - odd entries; 0x10 above TBL0, with the same field layout under _1 names
+  #define RSS_EN_1_B0_LBN 110
+  #define RSS_EN_1_B0_WIDTH 1
+  #define SCATTER_EN_1_B0_LBN 109
+  #define SCATTER_EN_1_B0_WIDTH 1
+  #define TCP_UDP_1_LBN 108
+  #define TCP_UDP_1_WIDTH 1
+  #define RXQ_ID_1_LBN 96
+  #define RXQ_ID_1_WIDTH 12
+  #define DEST_IP_1_LBN 64
+  #define DEST_IP_1_WIDTH 32
+  #define DEST_PORT_TCP_1_LBN 48
+  #define DEST_PORT_TCP_1_WIDTH 16
+  #define SRC_IP_1_LBN 16
+  #define SRC_IP_1_WIDTH 32
+  #define SRC_TCP_DEST_UDP_1_LBN 0
+  #define SRC_TCP_DEST_UDP_1_WIDTH 16
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define RX_DESC_PTR_TBL_KER_OFST 0x11800 // Receive descriptor pointer kernel access (extended BAR: 0x10000 above the non-extended offset)
+#else
+#define RX_DESC_PTR_TBL_KER_OFST 0x1800 // Receive descriptor pointer kernel access
+#endif
+
+
+#define RX_DESC_PTR_TBL_OFST 0xF40000 // Receive descriptor pointer table; fields are contiguous over bits 0..88
+  #define RX_ISCSI_DDIG_EN_LBN 88
+  #define RX_ISCSI_DDIG_EN_WIDTH 1
+  #define RX_ISCSI_HDIG_EN_LBN 87
+  #define RX_ISCSI_HDIG_EN_WIDTH 1
+  #define RX_DESC_PREF_ACT_LBN 86
+  #define RX_DESC_PREF_ACT_WIDTH 1
+  #define RX_DC_HW_RPTR_LBN 80
+  #define RX_DC_HW_RPTR_WIDTH 6
+  #define RX_DESCQ_HW_RPTR_LBN 68
+  #define RX_DESCQ_HW_RPTR_WIDTH 12
+  #define RX_DESCQ_SW_WPTR_LBN 56
+  #define RX_DESCQ_SW_WPTR_WIDTH 12
+  #define RX_DESCQ_BUF_BASE_ID_LBN 36
+  #define RX_DESCQ_BUF_BASE_ID_WIDTH 20
+  #define RX_DESCQ_EVQ_ID_LBN 24
+  #define RX_DESCQ_EVQ_ID_WIDTH 12
+  #define RX_DESCQ_OWNER_ID_LBN 10
+  #define RX_DESCQ_OWNER_ID_WIDTH 14
+  #define RX_DESCQ_LABEL_LBN 5
+  #define RX_DESCQ_LABEL_WIDTH 5
+  #define RX_DESCQ_SIZE_LBN 3
+  #define RX_DESCQ_SIZE_WIDTH 2
+  #define RX_DESCQ_TYPE_LBN 2
+  #define RX_DESCQ_TYPE_WIDTH 1
+  #define RX_DESCQ_JUMBO_LBN 1
+  #define RX_DESCQ_JUMBO_WIDTH 1
+  #define RX_DESCQ_EN_LBN 0
+  #define RX_DESCQ_EN_WIDTH 1
+
+
+#define RX_RSS_INDIR_TBL_B0_OFST 0xFB0000 // RSS indirection table (B0 only)
+  #define RX_RSS_INDIR_ENT_B0_LBN 0 // one 6-bit entry field at bit 0
+  #define RX_RSS_INDIR_ENT_B0_WIDTH 6
+
+//////////////---- TX Datapath Registers C Header ----//////////////
+#define TX_FLUSH_DESCQ_REG_KER_OFST 0xA00 // Transmit flush descriptor queue register
+#define TX_FLUSH_DESCQ_REG_OFST 0xA00 // Transmit flush descriptor queue register (same offset as the _KER_ name)
+  #define TX_FLUSH_DESCQ_CMD_LBN 12
+  #define TX_FLUSH_DESCQ_CMD_WIDTH 1
+  #define TX_FLUSH_DESCQ_LBN 0
+  #define TX_FLUSH_DESCQ_WIDTH 12
+#define TX_DESC_UPD_REG_KER_OFST 0xA10 // Kernel transmit descriptor update register. Page-mapped
+#define TX_DESC_UPD_REG_PAGE4_OFST 0x8A10 // Char & user transmit descriptor update register. Page-mapped
+#define TX_DESC_UPD_REG_PAGE123K_OFST 0x1000A10 // Char & user transmit descriptor update register. Page-mapped
+  #define TX_DESC_WPTR_LBN 96
+  #define TX_DESC_WPTR_WIDTH 12
+  #define TX_DESC_PUSH_CMD_LBN 95
+  #define TX_DESC_PUSH_CMD_WIDTH 1
+  #define TX_DESC_LBN 0 // bits 0..94, directly below the push command bit
+  #define TX_DESC_WIDTH 95
+  #define TX_KER_DESC_LBN 0
+  #define TX_KER_DESC_WIDTH 64
+  #define TX_USR_DESC_LBN 0
+  #define TX_USR_DESC_WIDTH 64
+#define TX_DC_CFG_REG_KER_OFST 0xA20 // Transmit descriptor cache configuration register
+#define TX_DC_CFG_REG_OFST 0xA20 // Transmit descriptor cache configuration register (same offset as the _KER_ name)
+  #define TX_DC_SIZE_LBN 0
+  #define TX_DC_SIZE_WIDTH 2
+
+#if EFVI_FALCON_EXTENDED_P_BAR
+#define TX_DESC_PTR_TBL_KER_OFST 0x11900 // Transmit descriptor pointer (extended BAR: 0x10000 above the non-extended offset).
+#else
+#define TX_DESC_PTR_TBL_KER_OFST 0x1900 // Transmit descriptor pointer.
+#endif
+
+
+#define TX_DESC_PTR_TBL_OFST 0xF50000 // Transmit descriptor pointer; fields are contiguous over bits 0..91
+  #define TX_NON_IP_DROP_DIS_B0_LBN 91
+  #define TX_NON_IP_DROP_DIS_B0_WIDTH 1
+  #define TX_IP_CHKSM_DIS_B0_LBN 90
+  #define TX_IP_CHKSM_DIS_B0_WIDTH 1
+  #define TX_TCP_CHKSM_DIS_B0_LBN 89
+  #define TX_TCP_CHKSM_DIS_B0_WIDTH 1
+  #define TX_DESCQ_EN_LBN 88 // enable is bit 88 here, whereas RX_DESCQ_EN is bit 0 of the RX table
+  #define TX_DESCQ_EN_WIDTH 1
+  #define TX_ISCSI_DDIG_EN_LBN 87
+  #define TX_ISCSI_DDIG_EN_WIDTH 1
+  #define TX_ISCSI_HDIG_EN_LBN 86
+  #define TX_ISCSI_HDIG_EN_WIDTH 1
+  #define TX_DC_HW_RPTR_LBN 80
+  #define TX_DC_HW_RPTR_WIDTH 6
+  #define TX_DESCQ_HW_RPTR_LBN 68
+  #define TX_DESCQ_HW_RPTR_WIDTH 12
+  #define TX_DESCQ_SW_WPTR_LBN 56
+  #define TX_DESCQ_SW_WPTR_WIDTH 12
+  #define TX_DESCQ_BUF_BASE_ID_LBN 36
+  #define TX_DESCQ_BUF_BASE_ID_WIDTH 20
+  #define TX_DESCQ_EVQ_ID_LBN 24
+  #define TX_DESCQ_EVQ_ID_WIDTH 12
+  #define TX_DESCQ_OWNER_ID_LBN 10
+  #define TX_DESCQ_OWNER_ID_WIDTH 14
+  #define TX_DESCQ_LABEL_LBN 5
+  #define TX_DESCQ_LABEL_WIDTH 5
+  #define TX_DESCQ_SIZE_LBN 3
+  #define TX_DESCQ_SIZE_WIDTH 2
+  #define TX_DESCQ_TYPE_LBN 1
+  #define TX_DESCQ_TYPE_WIDTH 2
+  #define TX_DESCQ_FLUSH_LBN 0
+  #define TX_DESCQ_FLUSH_WIDTH 1
+#define TX_CFG_REG_KER_OFST 0xA50 // Transmit configuration register
+#define TX_CFG_REG_OFST 0xA50 // Transmit configuration register (same offset as the _KER_ name)
+  #define TX_IP_ID_P1_OFS_LBN 32
+  #define TX_IP_ID_P1_OFS_WIDTH 15
+  #define TX_IP_ID_P0_OFS_LBN 16
+  #define TX_IP_ID_P0_OFS_WIDTH 15
+  #define TX_TURBO_EN_LBN 3
+  #define TX_TURBO_EN_WIDTH 1 
+  #define TX_OWNERR_CTL_LBN 2
+  #define TX_OWNERR_CTL_WIDTH 1 // was 2, which overlapped TX_TURBO_EN at bit 3; the RX and EVQ OWNERR_CTL fields are also 1 bit wide
+  #define TX_NON_IP_DROP_DIS_LBN 1
+  #define TX_NON_IP_DROP_DIS_WIDTH 1
+  #define TX_IP_ID_REP_EN_LBN 0
+  #define TX_IP_ID_REP_EN_WIDTH 1
+#define TX_RESERVED_REG_KER_OFST 0xA80 // Transmit configuration register (NOTE(review): same description as TX_CFG_REG at 0xA50 - confirm)
+#define TX_RESERVED_REG_OFST 0xA80 // Transmit configuration register
+  #define TX_CSR_PUSH_EN_LBN 89
+  #define TX_CSR_PUSH_EN_WIDTH 1
+  #define TX_RX_SPACER_LBN 64
+  #define TX_RX_SPACER_WIDTH 8
+  #define TX_SW_EV_EN_LBN 59
+  #define TX_SW_EV_EN_WIDTH 1
+  #define TX_RX_SPACER_EN_LBN 57
+  #define TX_RX_SPACER_EN_WIDTH 1
+  #define TX_CSR_PREF_WD_TMR_LBN 24
+  #define TX_CSR_PREF_WD_TMR_WIDTH 16
+  #define TX_CSR_ONLY1TAG_LBN 21
+  #define TX_CSR_ONLY1TAG_WIDTH 1
+  #define TX_PREF_THRESHOLD_LBN 19
+  #define TX_PREF_THRESHOLD_WIDTH 2
+  #define TX_ONE_PKT_PER_Q_LBN 18
+  #define TX_ONE_PKT_PER_Q_WIDTH 1
+  #define TX_DIS_NON_IP_EV_LBN 17
+  #define TX_DIS_NON_IP_EV_WIDTH 1
+  #define TX_DMA_SPACER_LBN 8
+  #define TX_DMA_SPACER_WIDTH 8
+  #define TX_FLUSH_MIN_LEN_EN_B0_LBN 7 // bit 7 carries this _B0 field and the _A1 field TX_TCP_DIS_A1 below
+  #define TX_FLUSH_MIN_LEN_EN_B0_WIDTH 1
+  #define TX_TCP_DIS_A1_LBN 7
+  #define TX_TCP_DIS_A1_WIDTH 1
+  #define TX_IP_DIS_A1_LBN 6
+  #define TX_IP_DIS_A1_WIDTH 1
+  #define TX_MAX_CPL_LBN 2
+  #define TX_MAX_CPL_WIDTH 2
+  #define TX_MAX_PREF_LBN 0
+  #define TX_MAX_PREF_WIDTH 2
+#define TX_VLAN_REG_OFST 0xAE0 // Transmit VLAN tag register; eight 16-bit slots: TX_VLANn at bit 16n (12 bits), PORT0_EN at 16n+12, PORT1_EN at 16n+13
+  #define TX_VLAN_EN_LBN 127 // top bit of the 128-bit register
+  #define TX_VLAN_EN_WIDTH 1
+  #define TX_VLAN7_PORT1_EN_LBN 125
+  #define TX_VLAN7_PORT1_EN_WIDTH 1
+  #define TX_VLAN7_PORT0_EN_LBN 124
+  #define TX_VLAN7_PORT0_EN_WIDTH 1
+  #define TX_VLAN7_LBN 112
+  #define TX_VLAN7_WIDTH 12
+  #define TX_VLAN6_PORT1_EN_LBN 109
+  #define TX_VLAN6_PORT1_EN_WIDTH 1
+  #define TX_VLAN6_PORT0_EN_LBN 108
+  #define TX_VLAN6_PORT0_EN_WIDTH 1
+  #define TX_VLAN6_LBN 96
+  #define TX_VLAN6_WIDTH 12
+  #define TX_VLAN5_PORT1_EN_LBN 93
+  #define TX_VLAN5_PORT1_EN_WIDTH 1
+  #define TX_VLAN5_PORT0_EN_LBN 92
+  #define TX_VLAN5_PORT0_EN_WIDTH 1
+  #define TX_VLAN5_LBN 80
+  #define TX_VLAN5_WIDTH 12
+  #define TX_VLAN4_PORT1_EN_LBN 77
+  #define TX_VLAN4_PORT1_EN_WIDTH 1
+  #define TX_VLAN4_PORT0_EN_LBN 76
+  #define TX_VLAN4_PORT0_EN_WIDTH 1
+  #define TX_VLAN4_LBN 64
+  #define TX_VLAN4_WIDTH 12
+  #define TX_VLAN3_PORT1_EN_LBN 61
+  #define TX_VLAN3_PORT1_EN_WIDTH 1
+  #define TX_VLAN3_PORT0_EN_LBN 60
+  #define TX_VLAN3_PORT0_EN_WIDTH 1
+  #define TX_VLAN3_LBN 48
+  #define TX_VLAN3_WIDTH 12
+  #define TX_VLAN2_PORT1_EN_LBN 45
+  #define TX_VLAN2_PORT1_EN_WIDTH 1
+  #define TX_VLAN2_PORT0_EN_LBN 44
+  #define TX_VLAN2_PORT0_EN_WIDTH 1
+  #define TX_VLAN2_LBN 32
+  #define TX_VLAN2_WIDTH 12
+  #define TX_VLAN1_PORT1_EN_LBN 29
+  #define TX_VLAN1_PORT1_EN_WIDTH 1
+  #define TX_VLAN1_PORT0_EN_LBN 28
+  #define TX_VLAN1_PORT0_EN_WIDTH 1
+  #define TX_VLAN1_LBN 16
+  #define TX_VLAN1_WIDTH 12
+  #define TX_VLAN0_PORT1_EN_LBN 13
+  #define TX_VLAN0_PORT1_EN_WIDTH 1
+  #define TX_VLAN0_PORT0_EN_LBN 12
+  #define TX_VLAN0_PORT0_EN_WIDTH 1
+  #define TX_VLAN0_LBN 0
+  #define TX_VLAN0_WIDTH 12
+// TX_FIL_CTL_REG field layout, read off the values below (assuming *_LBN is
+// a field's lowest bit number and *_WIDTH its size in bits):
+//   - for IP filter n (n = 0..31), TX_IPFILn_PORT0_EN is bit 2*n and
+//     TX_IPFILn_PORT1_EN is bit 2*n+1;
+//   - TX_MADR0_FIL_EN is bit 64 and TX_MADR1_FIL_EN is bit 65.
+#define TX_FIL_CTL_REG_OFST 0xAF0 // Transmit filter control register
+  #define TX_MADR1_FIL_EN_LBN 65
+  #define TX_MADR1_FIL_EN_WIDTH 1
+  #define TX_MADR0_FIL_EN_LBN 64
+  #define TX_MADR0_FIL_EN_WIDTH 1
+  #define TX_IPFIL31_PORT1_EN_LBN 63
+  #define TX_IPFIL31_PORT1_EN_WIDTH 1
+  #define TX_IPFIL31_PORT0_EN_LBN 62
+  #define TX_IPFIL31_PORT0_EN_WIDTH 1
+  #define TX_IPFIL30_PORT1_EN_LBN 61
+  #define TX_IPFIL30_PORT1_EN_WIDTH 1
+  #define TX_IPFIL30_PORT0_EN_LBN 60
+  #define TX_IPFIL30_PORT0_EN_WIDTH 1
+  #define TX_IPFIL29_PORT1_EN_LBN 59
+  #define TX_IPFIL29_PORT1_EN_WIDTH 1
+  #define TX_IPFIL29_PORT0_EN_LBN 58
+  #define TX_IPFIL29_PORT0_EN_WIDTH 1
+  #define TX_IPFIL28_PORT1_EN_LBN 57
+  #define TX_IPFIL28_PORT1_EN_WIDTH 1
+  #define TX_IPFIL28_PORT0_EN_LBN 56
+  #define TX_IPFIL28_PORT0_EN_WIDTH 1
+  #define TX_IPFIL27_PORT1_EN_LBN 55
+  #define TX_IPFIL27_PORT1_EN_WIDTH 1
+  #define TX_IPFIL27_PORT0_EN_LBN 54
+  #define TX_IPFIL27_PORT0_EN_WIDTH 1
+  #define TX_IPFIL26_PORT1_EN_LBN 53
+  #define TX_IPFIL26_PORT1_EN_WIDTH 1
+  #define TX_IPFIL26_PORT0_EN_LBN 52
+  #define TX_IPFIL26_PORT0_EN_WIDTH 1
+  #define TX_IPFIL25_PORT1_EN_LBN 51
+  #define TX_IPFIL25_PORT1_EN_WIDTH 1
+  #define TX_IPFIL25_PORT0_EN_LBN 50
+  #define TX_IPFIL25_PORT0_EN_WIDTH 1
+  #define TX_IPFIL24_PORT1_EN_LBN 49
+  #define TX_IPFIL24_PORT1_EN_WIDTH 1
+  #define TX_IPFIL24_PORT0_EN_LBN 48
+  #define TX_IPFIL24_PORT0_EN_WIDTH 1
+  #define TX_IPFIL23_PORT1_EN_LBN 47
+  #define TX_IPFIL23_PORT1_EN_WIDTH 1
+  #define TX_IPFIL23_PORT0_EN_LBN 46
+  #define TX_IPFIL23_PORT0_EN_WIDTH 1
+  #define TX_IPFIL22_PORT1_EN_LBN 45
+  #define TX_IPFIL22_PORT1_EN_WIDTH 1
+  #define TX_IPFIL22_PORT0_EN_LBN 44
+  #define TX_IPFIL22_PORT0_EN_WIDTH 1
+  #define TX_IPFIL21_PORT1_EN_LBN 43
+  #define TX_IPFIL21_PORT1_EN_WIDTH 1
+  #define TX_IPFIL21_PORT0_EN_LBN 42
+  #define TX_IPFIL21_PORT0_EN_WIDTH 1
+  #define TX_IPFIL20_PORT1_EN_LBN 41
+  #define TX_IPFIL20_PORT1_EN_WIDTH 1
+  #define TX_IPFIL20_PORT0_EN_LBN 40
+  #define TX_IPFIL20_PORT0_EN_WIDTH 1
+  #define TX_IPFIL19_PORT1_EN_LBN 39
+  #define TX_IPFIL19_PORT1_EN_WIDTH 1
+  #define TX_IPFIL19_PORT0_EN_LBN 38
+  #define TX_IPFIL19_PORT0_EN_WIDTH 1
+  #define TX_IPFIL18_PORT1_EN_LBN 37
+  #define TX_IPFIL18_PORT1_EN_WIDTH 1
+  #define TX_IPFIL18_PORT0_EN_LBN 36
+  #define TX_IPFIL18_PORT0_EN_WIDTH 1
+  #define TX_IPFIL17_PORT1_EN_LBN 35
+  #define TX_IPFIL17_PORT1_EN_WIDTH 1
+  #define TX_IPFIL17_PORT0_EN_LBN 34
+  #define TX_IPFIL17_PORT0_EN_WIDTH 1
+  #define TX_IPFIL16_PORT1_EN_LBN 33
+  #define TX_IPFIL16_PORT1_EN_WIDTH 1
+  #define TX_IPFIL16_PORT0_EN_LBN 32
+  #define TX_IPFIL16_PORT0_EN_WIDTH 1
+  #define TX_IPFIL15_PORT1_EN_LBN 31
+  #define TX_IPFIL15_PORT1_EN_WIDTH 1
+  #define TX_IPFIL15_PORT0_EN_LBN 30
+  #define TX_IPFIL15_PORT0_EN_WIDTH 1
+  #define TX_IPFIL14_PORT1_EN_LBN 29
+  #define TX_IPFIL14_PORT1_EN_WIDTH 1
+  #define TX_IPFIL14_PORT0_EN_LBN 28
+  #define TX_IPFIL14_PORT0_EN_WIDTH 1
+  #define TX_IPFIL13_PORT1_EN_LBN 27
+  #define TX_IPFIL13_PORT1_EN_WIDTH 1
+  #define TX_IPFIL13_PORT0_EN_LBN 26
+  #define TX_IPFIL13_PORT0_EN_WIDTH 1
+  #define TX_IPFIL12_PORT1_EN_LBN 25
+  #define TX_IPFIL12_PORT1_EN_WIDTH 1
+  #define TX_IPFIL12_PORT0_EN_LBN 24
+  #define TX_IPFIL12_PORT0_EN_WIDTH 1
+  #define TX_IPFIL11_PORT1_EN_LBN 23
+  #define TX_IPFIL11_PORT1_EN_WIDTH 1
+  #define TX_IPFIL11_PORT0_EN_LBN 22
+  #define TX_IPFIL11_PORT0_EN_WIDTH 1
+  #define TX_IPFIL10_PORT1_EN_LBN 21
+  #define TX_IPFIL10_PORT1_EN_WIDTH 1
+  #define TX_IPFIL10_PORT0_EN_LBN 20
+  #define TX_IPFIL10_PORT0_EN_WIDTH 1
+  #define TX_IPFIL9_PORT1_EN_LBN 19
+  #define TX_IPFIL9_PORT1_EN_WIDTH 1
+  #define TX_IPFIL9_PORT0_EN_LBN 18
+  #define TX_IPFIL9_PORT0_EN_WIDTH 1
+  #define TX_IPFIL8_PORT1_EN_LBN 17
+  #define TX_IPFIL8_PORT1_EN_WIDTH 1
+  #define TX_IPFIL8_PORT0_EN_LBN 16
+  #define TX_IPFIL8_PORT0_EN_WIDTH 1
+  #define TX_IPFIL7_PORT1_EN_LBN 15
+  #define TX_IPFIL7_PORT1_EN_WIDTH 1
+  #define TX_IPFIL7_PORT0_EN_LBN 14
+  #define TX_IPFIL7_PORT0_EN_WIDTH 1
+  #define TX_IPFIL6_PORT1_EN_LBN 13
+  #define TX_IPFIL6_PORT1_EN_WIDTH 1
+  #define TX_IPFIL6_PORT0_EN_LBN 12
+  #define TX_IPFIL6_PORT0_EN_WIDTH 1
+  #define TX_IPFIL5_PORT1_EN_LBN 11
+  #define TX_IPFIL5_PORT1_EN_WIDTH 1
+  #define TX_IPFIL5_PORT0_EN_LBN 10
+  #define TX_IPFIL5_PORT0_EN_WIDTH 1
+  #define TX_IPFIL4_PORT1_EN_LBN 9
+  #define TX_IPFIL4_PORT1_EN_WIDTH 1
+  #define TX_IPFIL4_PORT0_EN_LBN 8
+  #define TX_IPFIL4_PORT0_EN_WIDTH 1
+  #define TX_IPFIL3_PORT1_EN_LBN 7
+  #define TX_IPFIL3_PORT1_EN_WIDTH 1
+  #define TX_IPFIL3_PORT0_EN_LBN 6
+  #define TX_IPFIL3_PORT0_EN_WIDTH 1
+  #define TX_IPFIL2_PORT1_EN_LBN 5
+  #define TX_IPFIL2_PORT1_EN_WIDTH 1
+  #define TX_IPFIL2_PORT0_EN_LBN 4
+  #define TX_IPFIL2_PORT0_EN_WIDTH 1
+  #define TX_IPFIL1_PORT1_EN_LBN 3
+  #define TX_IPFIL1_PORT1_EN_WIDTH 1
+  #define TX_IPFIL1_PORT0_EN_LBN 2
+  #define TX_IPFIL1_PORT0_EN_WIDTH 1
+  #define TX_IPFIL0_PORT1_EN_LBN 1
+  #define TX_IPFIL0_PORT1_EN_WIDTH 1
+  #define TX_IPFIL0_PORT0_EN_LBN 0
+  #define TX_IPFIL0_PORT0_EN_WIDTH 1
+// Per the values below, each table entry carries a 32-bit source address in
+// bits 0..31 and a 32-bit mask in bits 32..63 (assuming *_LBN is the lowest
+// bit number of a field). The number of entries is not stated here.
+#define TX_IPFIL_TBL_OFST 0xB00 // Transmit IP source address filter table
+  #define TX_IPFIL_MASK_LBN 32
+  #define TX_IPFIL_MASK_WIDTH 32
+  #define TX_IP_SRC_ADR_LBN 0
+  #define TX_IP_SRC_ADR_WIDTH 32
+// The register is given at two different offsets, suffixed _A1 and _B0
+// (presumably two hardware revisions -- confirm against the datasheet); the
+// four fields below are listed once and carry no revision suffix.
+// Fields occupy bits 0..28: BIN_TH 0..4, FB_BASE 5..8, SB_NOTAF 9..18,
+// SB_AF 19..28 (assuming *_LBN is the lowest bit number of a field).
+#define TX_PACE_REG_A1_OFST 0xF80000 // Transmit pace control register
+#define TX_PACE_REG_B0_OFST 0xA90    // Transmit pace control register
+  #define TX_PACE_SB_AF_LBN 19
+  #define TX_PACE_SB_AF_WIDTH 10
+  #define TX_PACE_SB_NOTAF_LBN 9
+  #define TX_PACE_SB_NOTAF_WIDTH 10
+  #define TX_PACE_FB_BASE_LBN 5
+  #define TX_PACE_FB_BASE_WIDTH 4
+  #define TX_PACE_BIN_TH_LBN 0
+  #define TX_PACE_BIN_TH_WIDTH 5
+// The pacing table is given at two offsets (_A1 and _B0), each paired with a
+// FIRST_QUEUE constant (4 for _A1, 0 for _B0) -- presumably the queue number
+// that table entry 0 corresponds to; confirm against the users of these
+// constants. Note that the _B0 table offset (0xF80000) equals
+// TX_PACE_REG_A1_OFST. Each entry holds a 5-bit TX_PACE value at bit 0.
+#define TX_PACE_TBL_A1_OFST 0xF80040 // Transmit pacing table
+#define TX_PACE_TBL_FIRST_QUEUE_A1 4
+#define TX_PACE_TBL_B0_OFST 0xF80000 // Transmit pacing table
+#define TX_PACE_TBL_FIRST_QUEUE_B0 0
+  #define TX_PACE_LBN 0
+  #define TX_PACE_WIDTH 5
+
+//////////////---- EE/Flash Registers C Header ----//////////////
+// The _KER_OFST and plain _OFST names are two aliases for the same offset.
+// All fields below fit in bits 0..31 (assuming *_LBN is the lowest bit number
+// of a field and *_WIDTH its size in bits).
+#define EE_SPI_HCMD_REG_KER_OFST 0x100 // SPI host command register
+#define EE_SPI_HCMD_REG_OFST 0x100 // SPI host command register
+  #define EE_SPI_HCMD_CMD_EN_LBN 31
+  #define EE_SPI_HCMD_CMD_EN_WIDTH 1
+  #define EE_WR_TIMER_ACTIVE_LBN 28
+  #define EE_WR_TIMER_ACTIVE_WIDTH 1
+  #define EE_SPI_HCMD_SF_SEL_LBN 24
+  #define EE_SPI_HCMD_SF_SEL_WIDTH 1
+  #define EE_SPI_HCMD_DABCNT_LBN 16
+  #define EE_SPI_HCMD_DABCNT_WIDTH 5
+  #define EE_SPI_HCMD_READ_LBN 15
+  #define EE_SPI_HCMD_READ_WIDTH 1
+  #define EE_SPI_HCMD_DUBCNT_LBN 12
+  #define EE_SPI_HCMD_DUBCNT_WIDTH 2
+  #define EE_SPI_HCMD_ADBCNT_LBN 8
+  #define EE_SPI_HCMD_ADBCNT_WIDTH 2
+  #define EE_SPI_HCMD_ENC_LBN 0
+  #define EE_SPI_HCMD_ENC_WIDTH 8
+// The _KER_OFST and plain _OFST names are two aliases for the same offset
+// (0X110 is simply 0x110 written with an upper-case prefix).
+// Layout per the values below: 24-bit ADR in bits 0..23, 8-bit DUBYTE in
+// bits 24..31.
+#define EE_SPI_HADR_REG_KER_OFST 0X110 // SPI host address register
+#define EE_SPI_HADR_REG_OFST 0X110 // SPI host address register
+  #define EE_SPI_HADR_DUBYTE_LBN 24
+  #define EE_SPI_HADR_DUBYTE_WIDTH 8
+  #define EE_SPI_HADR_ADR_LBN 0
+  #define EE_SPI_HADR_ADR_WIDTH 24
+// The _KER_OFST and plain _OFST names are two aliases for the same offset.
+// Layout per the values below: four 32-bit words, HDATAn at bit 32*n
+// (n = 0..3), covering bits 0..127.
+#define EE_SPI_HDATA_REG_KER_OFST 0x120 // SPI host data register
+#define EE_SPI_HDATA_REG_OFST 0x120 // SPI host data register
+  #define EE_SPI_HDATA3_LBN 96
+  #define EE_SPI_HDATA3_WIDTH 32
+  #define EE_SPI_HDATA2_LBN 64
+  #define EE_SPI_HDATA2_WIDTH 32
+  #define EE_SPI_HDATA1_LBN 32
+  #define EE_SPI_HDATA1_WIDTH 32
+  #define EE_SPI_HDATA0_LBN 0
+  #define EE_SPI_HDATA0_WIDTH 32
+// The _KER_OFST and plain _OFST names are two aliases for the same offset.
+// Layout per the values below: 13-bit EXPROM_MASK in bits 0..12 and 13-bit
+// EXP_ROM_WINDOW_BASE in bits 16..28.
+#define EE_BASE_PAGE_REG_KER_OFST 0x130 // Expansion ROM base mirror register
+#define EE_BASE_PAGE_REG_OFST 0x130 // Expansion ROM base mirror register
+  #define EE_EXP_ROM_WINDOW_BASE_LBN 16
+  #define EE_EXP_ROM_WINDOW_BASE_WIDTH 13
+  #define EE_EXPROM_MASK_LBN 0
+  #define EE_EXPROM_MASK_WIDTH 13
+// The _KER_OFST and plain _OFST names are two aliases for the same offset
+// (0X140 is simply 0x140 written with an upper-case prefix).
+// Fields below reach up to bit 127, i.e. this is described as a 128-bit
+// register (assuming *_LBN is the lowest bit number of a field and *_WIDTH
+// its size in bits).
+#define EE_VPD_CFG0_REG_KER_OFST 0X140 // SPI/VPD configuration register
+#define EE_VPD_CFG0_REG_OFST 0X140 // SPI/VPD configuration register
+  #define EE_SF_FASTRD_EN_LBN 127
+  #define EE_SF_FASTRD_EN_WIDTH 1
+  #define EE_SF_CLOCK_DIV_LBN 120
+  #define EE_SF_CLOCK_DIV_WIDTH 7
+  #define EE_VPD_WIP_POLL_LBN 119
+  #define EE_VPD_WIP_POLL_WIDTH 1
+  #define EE_VPDW_LENGTH_LBN 80
+  #define EE_VPDW_LENGTH_WIDTH 15
+  #define EE_VPDW_BASE_LBN 64
+  #define EE_VPDW_BASE_WIDTH 15
+  #define EE_VPD_WR_CMD_EN_LBN 56
+  #define EE_VPD_WR_CMD_EN_WIDTH 8
+  #define EE_VPD_BASE_LBN 32
+  #define EE_VPD_BASE_WIDTH 24
+  #define EE_VPD_LENGTH_LBN 16
+  #define EE_VPD_LENGTH_WIDTH 13
+  #define EE_VPD_AD_SIZE_LBN 8
+  #define EE_VPD_AD_SIZE_WIDTH 5
+  #define EE_VPD_ACCESS_ON_LBN 5
+  #define EE_VPD_ACCESS_ON_WIDTH 1
+// The _KER_OFST and plain _OFST names are two aliases for the same offset
+// (0X150 is simply 0x150 written with an upper-case prefix).
+// Layout per the values below: 15-bit CYC_ADR in bits 0..14, CYC_WRITE at
+// bit 28, CYCLE_PENDING at bit 31.
+#define EE_VPD_SW_CNTL_REG_KER_OFST 0X150 // VPD access SW control register
+#define EE_VPD_SW_CNTL_REG_OFST 0X150 // VPD access SW control register
+  #define EE_VPD_CYCLE_PENDING_LBN 31
+  #define EE_VPD_CYCLE_PENDING_WIDTH 1
+  #define EE_VPD_CYC_WRITE_LBN 28
+  #define EE_VPD_CYC_WRITE_WIDTH 1
+  #define EE_VPD_CYC_ADR_LBN 0
+  #define EE_VPD_CYC_ADR_WIDTH 15
+// The _KER_OFST and plain _OFST names are two aliases for the same offset.
+// Single 32-bit field CYC_DAT in bits 0..31.
+#define EE_VPD_SW_DATA_REG_KER_OFST 0x160 // VPD access SW data register
+#define EE_VPD_SW_DATA_REG_OFST 0x160 // VPD access SW data register
+  #define EE_VPD_CYC_DAT_LBN 0
+  #define EE_VPD_CYC_DAT_WIDTH 32
diff --git a/drivers/xen/sfc_netfront/ef_vi_falcon_desc.h b/drivers/xen/sfc_netfront/ef_vi_falcon_desc.h

new file mode 100644 (file)

index 0000000..8c8404c
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_desc.h
@@ -0,0 +1,43 @@
+//////////////---- Descriptors C Headers ----//////////////
+// Receive Kernel IP Descriptor
+// Layout per the values below (assuming *_LBN is a field's lowest bit number
+// and *_WIDTH its size in bits): BUF_ADR in bits 0..45, BUF_REGION in bits
+// 46..47, BUF_SIZE in bits 48..61. The REGIONn_DECODE constants enumerate
+// the four values of the 2-bit BUF_REGION field.
+  #define RX_KER_BUF_SIZE_LBN 48
+  #define RX_KER_BUF_SIZE_WIDTH 14
+  #define RX_KER_BUF_REGION_LBN 46
+  #define RX_KER_BUF_REGION_WIDTH 2
+      #define RX_KER_BUF_REGION0_DECODE 0
+      #define RX_KER_BUF_REGION1_DECODE 1
+      #define RX_KER_BUF_REGION2_DECODE 2
+      #define RX_KER_BUF_REGION3_DECODE 3
+  #define RX_KER_BUF_ADR_LBN 0
+  #define RX_KER_BUF_ADR_WIDTH 46
+// Receive User IP Descriptor
+// Layout per the values below: 20-bit BUF_ID in bits 0..19 and 12-bit
+// 2BYTE_OFS in bits 20..31 (the name suggests an offset counted in 2-byte
+// units -- confirm against the code that fills it in).
+  #define RX_USR_2BYTE_OFS_LBN 20
+  #define RX_USR_2BYTE_OFS_WIDTH 12
+  #define RX_USR_BUF_ID_LBN 0
+  #define RX_USR_BUF_ID_WIDTH 20
+// Transmit Kernel IP Descriptor
+// Layout per the values below (assuming *_LBN is a field's lowest bit number
+// and *_WIDTH its size in bits): BUF_ADR in bits 0..45, BUF_REGION in bits
+// 46..47, BYTE_CNT in bits 48..61, CONT at bit 62, PORT at bit 63. The
+// REGIONn_DECODE constants enumerate the four values of BUF_REGION.
+  #define TX_KER_PORT_LBN 63
+  #define TX_KER_PORT_WIDTH 1
+  #define TX_KER_CONT_LBN 62
+  #define TX_KER_CONT_WIDTH 1
+  #define TX_KER_BYTE_CNT_LBN 48
+  #define TX_KER_BYTE_CNT_WIDTH 14
+  #define TX_KER_BUF_REGION_LBN 46
+  #define TX_KER_BUF_REGION_WIDTH 2
+      #define TX_KER_BUF_REGION0_DECODE 0
+      #define TX_KER_BUF_REGION1_DECODE 1
+      #define TX_KER_BUF_REGION2_DECODE 2
+      #define TX_KER_BUF_REGION3_DECODE 3
+  #define TX_KER_BUF_ADR_LBN 0
+  #define TX_KER_BUF_ADR_WIDTH 46
+// Transmit User IP Descriptor
+// Layout per the values below: BYTE_OFS in bits 0..12, BUF_ID in bits
+// 13..32, BYTE_CNT in bits 33..45, CONT at bit 46, PORT at bit 47. Note that
+// BYTE_CNT is 13 bits wide here versus 14 bits in the kernel descriptor.
+  #define TX_USR_PORT_LBN 47
+  #define TX_USR_PORT_WIDTH 1
+  #define TX_USR_CONT_LBN 46
+  #define TX_USR_CONT_WIDTH 1
+  #define TX_USR_BYTE_CNT_LBN 33
+  #define TX_USR_BYTE_CNT_WIDTH 13
+  #define TX_USR_BUF_ID_LBN 13
+  #define TX_USR_BUF_ID_WIDTH 20
+  #define TX_USR_BYTE_OFS_LBN 0
+  #define TX_USR_BYTE_OFS_WIDTH 13
diff --git a/drivers/xen/sfc_netfront/ef_vi_falcon_event.h b/drivers/xen/sfc_netfront/ef_vi_falcon_event.h

new file mode 100644 (file)

index 0000000..abb63f3
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_falcon_event.h
@@ -0,0 +1,123 @@
+//////////////---- Events Format C Header ----//////////////
+//////////////---- Event entry ----//////////////
+// Per the values below an event is split into a 4-bit EV_CODE in bits 60..63
+// and 60 bits of EV_DATA in bits 0..59 (assuming *_LBN is a field's lowest
+// bit number). The *_EV_DECODE constants listed under EV_CODE are presumably
+// the values that field takes for each event class.
+  #define EV_CODE_LBN 60
+  #define EV_CODE_WIDTH 4
+      #define RX_IP_EV_DECODE 0
+      #define TX_IP_EV_DECODE 2
+      #define DRIVER_EV_DECODE 5
+      #define GLOBAL_EV_DECODE 6
+      #define DRV_GEN_EV_DECODE 7
+  #define EV_DATA_LBN 0
+  #define EV_DATA_WIDTH 60
+//////////////---- Receive IP events for both Kernel & User event queues ----//////////////
+// All fields below lie in bits 0..56 (assuming *_LBN is a field's lowest bit
+// number and *_WIDTH its size in bits). The indented *_DECODE constants are
+// presumably the values of the field they follow (PKT_TYPE, HDR_TYPE).
+// NOTE(review): fields are listed in descending bit order except RX_SOP
+// (bit 15), which appears between bits 31 and 30; also RX_iSCSI_PKT_OK uses
+// mixed case unlike its RX_ISCSI_* neighbours. Both are left as-is because
+// the names are part of the interface.
+  #define RX_EV_PKT_OK_LBN 56
+  #define RX_EV_PKT_OK_WIDTH 1
+  #define RX_EV_BUF_OWNER_ID_ERR_LBN 54
+  #define RX_EV_BUF_OWNER_ID_ERR_WIDTH 1
+  #define RX_EV_IP_HDR_CHKSUM_ERR_LBN 52
+  #define RX_EV_IP_HDR_CHKSUM_ERR_WIDTH 1
+  #define RX_EV_TCP_UDP_CHKSUM_ERR_LBN 51
+  #define RX_EV_TCP_UDP_CHKSUM_ERR_WIDTH 1
+  #define RX_EV_ETH_CRC_ERR_LBN 50
+  #define RX_EV_ETH_CRC_ERR_WIDTH 1
+  #define RX_EV_FRM_TRUNC_LBN 49
+  #define RX_EV_FRM_TRUNC_WIDTH 1
+  #define RX_EV_DRIB_NIB_LBN 48
+  #define RX_EV_DRIB_NIB_WIDTH 1
+  #define RX_EV_TOBE_DISC_LBN 47
+  #define RX_EV_TOBE_DISC_WIDTH 1
+  #define RX_EV_PKT_TYPE_LBN 44
+  #define RX_EV_PKT_TYPE_WIDTH 3
+      #define RX_EV_PKT_TYPE_ETH_DECODE 0
+      #define RX_EV_PKT_TYPE_LLC_DECODE 1
+      #define RX_EV_PKT_TYPE_JUMBO_DECODE 2
+      #define RX_EV_PKT_TYPE_VLAN_DECODE 3
+      #define RX_EV_PKT_TYPE_VLAN_LLC_DECODE 4
+      #define RX_EV_PKT_TYPE_VLAN_JUMBO_DECODE 5
+  #define RX_EV_HDR_TYPE_LBN 42
+  #define RX_EV_HDR_TYPE_WIDTH 2
+      #define RX_EV_HDR_TYPE_TCP_IPV4_DECODE 0
+      #define RX_EV_HDR_TYPE_UDP_IPV4_DECODE 1
+      #define RX_EV_HDR_TYPE_OTHER_IP_DECODE 2
+      #define RX_EV_HDR_TYPE_NON_IP_DECODE 3
+  #define RX_EV_DESC_Q_EMPTY_LBN 41
+  #define RX_EV_DESC_Q_EMPTY_WIDTH 1
+  #define RX_EV_MCAST_HASH_MATCH_LBN 40
+  #define RX_EV_MCAST_HASH_MATCH_WIDTH 1
+  #define RX_EV_MCAST_PKT_LBN 39
+  #define RX_EV_MCAST_PKT_WIDTH 1
+  #define RX_EV_Q_LABEL_LBN 32
+  #define RX_EV_Q_LABEL_WIDTH 5
+  #define RX_JUMBO_CONT_LBN 31
+  #define RX_JUMBO_CONT_WIDTH 1
+  #define RX_SOP_LBN 15
+  #define RX_SOP_WIDTH 1
+  #define RX_PORT_LBN 30
+  #define RX_PORT_WIDTH 1
+  #define RX_EV_BYTE_CNT_LBN 16
+  #define RX_EV_BYTE_CNT_WIDTH 14
+  #define RX_iSCSI_PKT_OK_LBN 14
+  #define RX_iSCSI_PKT_OK_WIDTH 1
+  #define RX_ISCSI_DDIG_ERR_LBN 13
+  #define RX_ISCSI_DDIG_ERR_WIDTH 1
+  #define RX_ISCSI_HDIG_ERR_LBN 12
+  #define RX_ISCSI_HDIG_ERR_WIDTH 1
+  #define RX_EV_DESC_PTR_LBN 0
+  #define RX_EV_DESC_PTR_WIDTH 12
+//////////////---- Transmit IP events for both Kernel & User event queues ----//////////////
+// All fields below lie in bits 0..38 (assuming *_LBN is a field's lowest bit
+// number). TX_EV_Q_LABEL and TX_EV_DESC_PTR have the same position and width
+// (bit 32 x 5, bit 0 x 12) as RX_EV_Q_LABEL and RX_EV_DESC_PTR.
+  #define TX_EV_PKT_ERR_LBN 38
+  #define TX_EV_PKT_ERR_WIDTH 1
+  #define TX_EV_PKT_TOO_BIG_LBN 37
+  #define TX_EV_PKT_TOO_BIG_WIDTH 1
+  #define TX_EV_Q_LABEL_LBN 32
+  #define TX_EV_Q_LABEL_WIDTH 5
+  #define TX_EV_PORT_LBN 16
+  #define TX_EV_PORT_WIDTH 1
+  #define TX_EV_WQ_FF_FULL_LBN 15
+  #define TX_EV_WQ_FF_FULL_WIDTH 1
+  #define TX_EV_BUF_OWNER_ID_ERR_LBN 14
+  #define TX_EV_BUF_OWNER_ID_ERR_WIDTH 1
+  #define TX_EV_COMP_LBN 12
+  #define TX_EV_COMP_WIDTH 1
+  #define TX_EV_DESC_PTR_LBN 0
+  #define TX_EV_DESC_PTR_WIDTH 12
+//////////////---- Char or Kernel driver events ----//////////////
+// DRIVER_EV_SUB_CODE is a 4-bit field at bits 56..59; the *_DECODE constants
+// listed under it are presumably its possible values. The four 12-bit ID
+// fields and DRIVER_EV_SRM_UPD all start at bit 0 and therefore overlap --
+// presumably which one applies depends on the sub-code (the mapping is not
+// given in this header; confirm against the event handling code).
+  #define DRIVER_EV_SUB_CODE_LBN 56
+  #define DRIVER_EV_SUB_CODE_WIDTH 4
+      #define TX_DESCQ_FLS_DONE_EV_DECODE 0x0
+      #define RX_DESCQ_FLS_DONE_EV_DECODE 0x1
+      #define EVQ_INIT_DONE_EV_DECODE 0x2
+      #define EVQ_NOT_EN_EV_DECODE 0x3
+      #define RX_DESCQ_FLSFF_OVFL_EV_DECODE 0x4
+      #define SRM_UPD_DONE_EV_DECODE 0x5
+      #define WAKE_UP_EV_DECODE 0x6
+      #define TX_PKT_NON_TCP_UDP_DECODE 0x9
+      #define TIMER_EV_DECODE 0xA
+      #define RX_DSC_ERROR_EV_DECODE 0xE
+  #define DRIVER_EV_TX_DESCQ_ID_LBN 0
+  #define DRIVER_EV_TX_DESCQ_ID_WIDTH 12
+  #define DRIVER_EV_RX_DESCQ_ID_LBN 0
+  #define DRIVER_EV_RX_DESCQ_ID_WIDTH 12
+  #define DRIVER_EV_EVQ_ID_LBN 0
+  #define DRIVER_EV_EVQ_ID_WIDTH 12
+  #define DRIVER_TMR_ID_LBN 0
+  #define DRIVER_TMR_ID_WIDTH 12
+  #define DRIVER_EV_SRM_UPD_LBN 0
+  #define DRIVER_EV_SRM_UPD_WIDTH 2
+      #define SRM_CLR_EV_DECODE 0
+      #define SRM_UPD_EV_DECODE 1
+      #define SRM_ILLCLR_EV_DECODE 2
+//////////////---- Global events. Sent to both event queue 0 and 4. ----//////////////
+// Four single-bit flags at bits 7..10 (assuming *_LBN is a field's lowest
+// bit number); going by the names, one interrupt indication per PHY type.
+  #define XFP_PHY_INTR_LBN 10
+  #define XFP_PHY_INTR_WIDTH 1
+  #define XG_PHY_INTR_LBN 9
+  #define XG_PHY_INTR_WIDTH 1
+  #define G_PHY1_INTR_LBN 8
+  #define G_PHY1_INTR_WIDTH 1
+  #define G_PHY0_INTR_LBN 7
+  #define G_PHY0_INTR_WIDTH 1
+//////////////---- Driver generated events ----//////////////
+// Same split as the generic event entry: a 4-bit code at bits 60..63 and
+// 60 bits of data at bits 0..59.
+  #define DRV_GEN_EV_CODE_LBN 60
+  #define DRV_GEN_EV_CODE_WIDTH 4
+  #define DRV_GEN_EV_DATA_LBN 0
+  #define DRV_GEN_EV_DATA_WIDTH 60
diff --git a/drivers/xen/sfc_netfront/ef_vi_internal.h b/drivers/xen/sfc_netfront/ef_vi_internal.h

new file mode 100644 (file)

index 0000000..396ae46
--- /dev/null
+++ b/drivers/xen/sfc_netfront/ef_vi_internal.h
@@ -0,0 +1,256 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Really-and-truly-honestly internal stuff for libef.
+ *   \date  2004/06/13
+ */
+
+/*! \cidoxg_include_ci_ul */
+#ifndef __CI_EF_VI_INTERNAL_H__
+#define __CI_EF_VI_INTERNAL_H__
+
+
+/* These flags share space with enum ef_vi_flags.
+ * 0x10000 is bit 16.  NOTE(review): enum ef_vi_flags is declared outside
+ * this header; presumably it leaves bit 16 unused -- confirm before adding
+ * new enum values.  What bug 5692 is, is not described here. */
+#define EF_VI_BUG5692_WORKAROUND  0x10000
+
+
+/* ***********************************************************************
+ * COMPILATION CONTROL FLAGS (see ef_vi.h for "workaround" controls)
+ */
+
+/* Non-zero: debug (non-NDEBUG) builds define EF_VI_MAGIC_SET() and
+ * EF_VI_MAGIC_CHECK() to store and verify a per-object magic word.
+ * Zero: those two macros expand to nothing.  Has no effect when NDEBUG is
+ * defined. */
+#define EF_VI_DO_MAGIC_CHECKS 1
+
+
+/**********************************************************************
+ * Headers
+ */
+
+#include <etherfabric/ef_vi.h>
+#include "sysdep.h"
+#include "ef_vi_falcon.h"
+
+
+/**********************************************************************
+ * Debugging.
+ */
+
+#ifndef NDEBUG
+
+/* Debug builds: a failed assertion is fatal (BUG).  The file/line
+ * arguments (and, for _ef_assert2, the two operand arguments) are accepted
+ * for interface compatibility but are not used by these implementations.
+ *
+ * Both helpers expand to exactly one statement with no trailing semicolon,
+ * so the caller's own ';' terminates them and they are safe inside an
+ * unbraced if/else. */
+# define _ef_assert(exp, file, line)  BUG_ON(!(exp))
+
+# define _ef_assert2(exp, x, y, file, line)  do {      \
+               if (unlikely(!(exp)))           \
+                       BUG();                          \
+       } while (0)
+
+#else
+
+/* NDEBUG builds: assertions expand to nothing; arguments are not
+ * evaluated. */
+# define _ef_assert(exp, file, line)
+# define _ef_assert2(e, x, y, file, line)
+
+#endif
+
+/* ef_assert(a): assert that a is true.
+ * ef_assert_<op>(a, b): assert the named relation between a and b
+ * (equal/eq, nequal/ne, lt, le, gt, ge).  Note that a and b each appear
+ * twice in the expansion of the relational forms, though only the
+ * comparison itself is evaluated by _ef_assert2. */
+#define ef_assert(a)          do{ _ef_assert((a),__FILE__,__LINE__); } while(0)
+#define ef_assert_equal(a,b)  _ef_assert2((a)==(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_eq          ef_assert_equal
+#define ef_assert_lt(a,b)     _ef_assert2((a)<(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_le(a,b)     _ef_assert2((a)<=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_nequal(a,b) _ef_assert2((a)!=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_ne          ef_assert_nequal
+#define ef_assert_ge(a,b)     _ef_assert2((a)>=(b),(a),(b),__FILE__,__LINE__)
+#define ef_assert_gt(a,b)     _ef_assert2((a)>(b),(a),(b),__FILE__,__LINE__)
+
+/**********************************************************************
+ * Debug checks. ******************************************************
+ **********************************************************************/
+
+#ifdef NDEBUG
+/* Release builds: every object check expands to nothing. */
+# define EF_VI_MAGIC_SET(p, type)
+# define EF_VI_CHECK_VI(p)
+# define EF_VI_CHECK_EVENT_Q(p)
+# define EF_VI_CHECK_IOBUFSET(p)
+# define EF_VI_CHECK_FILTER(p)
+# define EF_VI_CHECK_SHMBUF(p)
+# define EF_VI_CHECK_PT_EP(p)
+#else
+/* Object type tags that EF_VI_MAGIC() places in the top bits of the magic
+ * word.
+ * NOTE(review): EF_VI_MAGIC() shifts the tag left by 28 in an unsigned, so
+ * with a 32-bit unsigned only the low four bits of the tag survive and
+ * EF_SHMBUF (0x11) is effectively stored as tag 0x1.  The value is left
+ * unchanged here; confirm whether that is intended. */
+# define EF_VI                    0x3
+# define EF_EPLOCK                0x6
+# define EF_IOBUFSET              0x9
+# define EF_FILTER                0xa
+# define EF_SHMBUF                0x11
+
+/* Expected magic word for object p of the given type: the type tag shifted
+ * to bit 28, OR-ed with the low 28 bits of p's own address. */
+# define EF_VI_MAGIC(p, type)                          \
+       (((unsigned)(type) << 28) |                     \
+        (((unsigned)(intptr_t)(p)) & 0x0fffffffu))
+
+# if !EF_VI_DO_MAGIC_CHECKS
+/* Magic checks compiled out: set and check are no-ops. */
+#  define EF_VI_MAGIC_SET(p, type)
+#  define EF_VI_MAGIC_CHECK(p, type)
+# else
+/* Stamp (p)->magic with the value expected for this object and type. */
+#  define EF_VI_MAGIC_SET(p, type)                     \
+       do {                                            \
+               (p)->magic = EF_VI_MAGIC((p), (type));  \
+       } while (0)
+
+/* Non-zero iff (p)->magic holds the value expected for this object/type. */
+#  define EF_VI_MAGIC_OKAY(p, type)                    \
+       ((p)->magic == EF_VI_MAGIC((p), (type)))
+
+/* Assert (via ef_assert) that the magic word is the expected one. */
+#  define EF_VI_MAGIC_CHECK(p, type)                   \
+       ef_assert(EF_VI_MAGIC_OKAY((p), (type)))
+# endif /* EF_VI_DO_MAGIC_CHECKS */
+
+/* EF_VI_CHECK_*(p): assert that p is non-NULL, that its magic word matches
+ * the given type tag and, where listed, that further members are non-zero.
+ *
+ * Each macro is a single do { } while (0) statement without a trailing
+ * semicolon: the caller supplies the ';', and all of the checks stay
+ * together when the macro is the body of an unbraced if/else (previously
+ * only the first assertion was governed by the condition). */
+# define EF_VI_CHECK_VI(p)                             \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_VI);          \
+       } while (0)
+
+# define EF_VI_CHECK_EVENT_Q(p)                        \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_VI);          \
+               ef_assert((p)->evq_base);               \
+               ef_assert((p)->evq_mask);               \
+       } while (0)
+
+# define EF_VI_CHECK_PT_EP(p)                          \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_VI);          \
+               ef_assert((p)->ep_state);               \
+       } while (0)
+
+# define EF_VI_CHECK_IOBUFSET(p)                       \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_IOBUFSET);    \
+       } while (0)
+
+# define EF_VI_CHECK_FILTER(p)                         \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_FILTER);      \
+       } while (0)
+
+# define EF_VI_CHECK_SHMBUF(p)                         \
+       do {                                            \
+               ef_assert(p);                           \
+               EF_VI_MAGIC_CHECK((p), EF_SHMBUF);      \
+       } while (0)
+
+#endif
+
+#ifndef NDEBUG
+/* Value handed to EF_VI_MAGIC_CHECK() as the type when validating a driver
+ * object. */
+# define EF_DRIVER_MAGIC 0x00f00ba4
+
+/* Debug builds: assert that driver is non-NULL, that it passes
+ * EF_VI_MAGIC_CHECK() against EF_DRIVER_MAGIC, and that its init member is
+ * non-zero. */
+# define EF_ASSERT_THIS_DRIVER_VALID(driver)                   \
+       do {                                                    \
+               ef_assert(driver);                              \
+               EF_VI_MAGIC_CHECK((driver), EF_DRIVER_MAGIC);   \
+               ef_assert((driver)->init);                      \
+       } while (0)
+
+/* The same checks, applied to ci_driver. */
+# define EF_ASSERT_DRIVER_VALID()                              \
+       EF_ASSERT_THIS_DRIVER_VALID(&ci_driver)
+#else
+/* NDEBUG builds: both checks expand to nothing. */
+# define EF_ASSERT_THIS_DRIVER_VALID(driver)
+# define EF_ASSERT_DRIVER_VALID()
+#endif
+
+
+/* *************************************
+ * Power of 2 FIFO
+ */
+
+/* Wrap index x into the FIFO by masking it with fifo_mask. */
+#define EF_VI_FIFO2_M(f, x)                                            \
+       ((x) & ((f)->fifo_mask))
+
+/* Non-zero iff f is non-NULL, has a buffer and a non-zero mask, both
+ * indices lie within the mask, and fifo_mask + 1 is a power of two. */
+#define ef_vi_fifo2_valid(f)                                           \
+       ((f) && (f)->fifo && (f)->fifo_mask > 0 &&                      \
+        (f)->fifo_rd_i <= (f)->fifo_mask &&                            \
+        (f)->fifo_wr_i <= (f)->fifo_mask &&                            \
+        EF_VI_IS_POW2((f)->fifo_mask+1u))
+
+/* Reset both indices to zero and record cap as the mask; cap + 1 must be a
+ * power of two (asserted).  The fifo buffer pointer is not touched. */
+#define ef_vi_fifo2_init(f, cap)                                       \
+       do {                                                            \
+               ef_assert(EF_VI_IS_POW2((cap) + 1));                    \
+               (f)->fifo_rd_i = (f)->fifo_wr_i = 0u;                   \
+               (f)->fifo_mask = (cap);                                 \
+       } while (0)
+
+/* Simple accessors. */
+#define ef_vi_fifo2_is_empty(f)                                        \
+       ((f)->fifo_rd_i == (f)->fifo_wr_i)
+#define ef_vi_fifo2_capacity(f)                                        \
+       ((f)->fifo_mask)
+#define ef_vi_fifo2_buf_size(f)                                        \
+       ((f)->fifo_mask + 1u)
+#define ef_vi_fifo2_end(f)                                             \
+       ((f)->fifo + ef_vi_fifo2_buf_size(f))
+
+/* Slot at the read index / slot at the write index (both are lvalues). */
+#define ef_vi_fifo2_peek(f)                                            \
+       ((f)->fifo[(f)->fifo_rd_i])
+#define ef_vi_fifo2_poke(f)                                            \
+       ((f)->fifo[(f)->fifo_wr_i])
+
+/* Distance from the read index to the write index, wrapped by the mask. */
+#define ef_vi_fifo2_num(f)                                             \
+       EF_VI_FIFO2_M((f),(f)->fifo_wr_i-(f)->fifo_rd_i)
+
+/* Move the write index back / forward by one slot, with wrap-around. */
+#define ef_vi_fifo2_wr_prev(f)                                         \
+       do {                                                            \
+               (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i - 1u); \
+       } while (0)
+#define ef_vi_fifo2_wr_next(f)                                         \
+       do {                                                            \
+               (f)->fifo_wr_i = EF_VI_FIFO2_M((f), (f)->fifo_wr_i + 1u); \
+       } while (0)
+
+/* Move the read index forward by n, back by one, or forward by one slot,
+ * with wrap-around. */
+#define ef_vi_fifo2_rd_adv(f, n)                                       \
+       do {                                                            \
+               (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + (n)); \
+       } while (0)
+#define ef_vi_fifo2_rd_prev(f)                                         \
+       do {                                                            \
+               (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i - 1u); \
+       } while (0)
+#define ef_vi_fifo2_rd_next(f)                                         \
+       do {                                                            \
+               (f)->fifo_rd_i = EF_VI_FIFO2_M((f), (f)->fifo_rd_i + 1u); \
+       } while (0)
+
+/* Store v at the write index and advance it; no fullness check is made. */
+#define ef_vi_fifo2_put(f, v)                                          \
+       do {                                                            \
+               ef_vi_fifo2_poke(f) = (v);                              \
+               ef_vi_fifo2_wr_next(f);                                 \
+       } while (0)
+
+/* Copy the slot at the read index into *pv and advance the read index; no
+ * emptiness check is made. */
+#define ef_vi_fifo2_get(f, pv)                                         \
+       do {                                                            \
+               *(pv) = ef_vi_fifo2_peek(f);                            \
+               ef_vi_fifo2_rd_next(f);                                 \
+       } while (0)
+
+
+/* *********************************************************************
+ * Eventq handling
+ */
+
+/* One event-queue entry: 64 bits, accessible either whole (u64) or as two
+ * 32-bit words (opaque.a, opaque.b).  Which word overlays the low half of
+ * u64 depends on the byte order of the machine. */
+typedef union {
+       uint64_t    u64;
+       struct {
+               uint32_t  a;
+               uint32_t  b;
+       } opaque;
+} ef_vi_event;
+
+
+/* Byte offset of the entry that lies i entries before evq_state->evq_ptr,
+ * wrapped by AND-ing with evq_mask (so evq_ptr and evq_mask are used here as
+ * byte quantities, not entry counts). */
+#define EF_VI_EVENT_OFFSET(q, i)                                       \
+       (((q)->evq_state->evq_ptr - (i) * sizeof(ef_vi_event)) & (q)->evq_mask)
+
+/* Pointer to that entry: evq_base plus the byte offset above.
+ * NOTE(review): this assumes evq_base is a byte-granular (char-like)
+ * pointer; its declaration is not visible in this header. */
+#define EF_VI_EVENT_PTR(q, i)                                           \
+       ((ef_vi_event*) ((q)->evq_base + EF_VI_EVENT_OFFSET((q), (i))))
+
+/* *********************************************************************
+ * Miscellaneous goodies
+ */
+/* EF_VI_DEBUG(x): expands to x in debug builds and to nothing when NDEBUG
+ * is defined, so x is compiled only into debug builds. */
+#ifdef NDEBUG
+# define EF_VI_DEBUG(x)
+#else
+# define EF_VI_DEBUG(x)            x
+#endif
+
+/* Alignment helpers.  align must be a power of two.
+ *
+ * EF_VI_ALIGN_MASK(x, align) is the mask that clears the low log2(align)
+ * bits, computed in the common arithmetic type of x, align and unsigned int.
+ * Computing it from align alone is not enough: with a 32-bit align the
+ * complement ~(align - 1u) is a 32-bit value that zero-extends when combined
+ * with a 64-bit operand and silently clears that operand's upper 32 bits.
+ * __typeof__ does not evaluate its operand, so each macro evaluates its
+ * arguments exactly as often as before. */
+#define EF_VI_ALIGN_MASK(x, align)                                     \
+       (~((__typeof__((x) + (align) + 0u))(align) - 1u))
+
+/* Round i (or p) up to the next multiple of align. */
+#define EF_VI_ROUND_UP(i, align)                                       \
+       (((i) + (align) - 1u) & EF_VI_ALIGN_MASK((i), (align)))
+#define EF_VI_ALIGN_FWD(p, align)                                      \
+       (((p) + (align) - 1u) & EF_VI_ALIGN_MASK((p), (align)))
+/* Round p down to a multiple of align. */
+#define EF_VI_ALIGN_BACK(p, align)                                     \
+       ((p) & EF_VI_ALIGN_MASK((p), (align)))
+/* Pointer flavour of EF_VI_ALIGN_BACK; the result is a char pointer. */
+#define EF_VI_PTR_ALIGN_BACK(p, align)                                 \
+       ((char*)EF_VI_ALIGN_BACK(((intptr_t)(p)), ((intptr_t)(align))))
+/* Non-zero iff x is a non-zero power of two. */
+#define EF_VI_IS_POW2(x)           ((x) && ! ((x) & ((x) - 1)))
+
+
+/* ******************************************************************** 
+ */
+
+/* Declarations only; the definitions live in other source files.
+ * NOTE(review): EF_VI_HF is not defined in this header -- presumably it
+ * comes from <etherfabric/ef_vi.h> (included above); confirm what attribute
+ * it expands to. */
+extern void falcon_vi_init(ef_vi*, void* vvis ) EF_VI_HF;
+extern void ef_eventq_state_init(ef_vi* evq) EF_VI_HF;
+extern void __ef_init(void) EF_VI_HF;
+
+
+#endif  /* __CI_EF_VI_INTERNAL_H__ */
+
diff --git a/drivers/xen/sfc_netfront/etherfabric/ef_vi.h b/drivers/xen/sfc_netfront/etherfabric/ef_vi.h

new file mode 100644 (file)

index 0000000..6b1bef0
--- /dev/null
+++ b/drivers/xen/sfc_netfront/etherfabric/ef_vi.h
@@ -0,0 +1,647 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ *  \brief  Virtual Interface
+ *   \date  2007/05/16
+ */
+
+#ifndef __EFAB_EF_VI_H__
+#define __EFAB_EF_VI_H__
+
+
+/**********************************************************************
+ * Primitive types ****************************************************
+ **********************************************************************/
+
+/* We standardise on the types from stdint.h and synthesise these types
+ * for compilers/platforms that don't provide them */
+
+#  include <linux/types.h>
+# define EF_VI_ALIGN(x) __attribute__ ((aligned (x)))
+# define ef_vi_inline static inline
+
+
+
+/**********************************************************************
+ * Types **************************************************************
+ **********************************************************************/
+
+/* Type of the event queue read position (ef_eventq_state.evq_ptr). */
+typedef uint32_t                ef_eventq_ptr;
+
+/* 64-bit address type used for buffer addresses passed to the receive
+ * and transmit calls and for ef_iovec.iov_base. */
+typedef uint64_t                ef_addr;
+/* Pointer type used for register/doorbell locations such as
+ * evq_timer_reg. */
+typedef char*                   ef_vi_ioaddr_t;
+
+/**********************************************************************
+ * ef_event ***********************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi A DMA request identifier.
+**
+** This is an integer token specified by the transport and associated
+** with a DMA request.  It is returned to the VI user with DMA completion
+** events.  It is typically used to identify the buffer associated with
+** the transfer.
+*/
+typedef int                    ef_request_id;
+
+/* A 64-bit quantity accessible as one 64-bit word (u64[0]) or as two
+ * 32-bit words (u32[0], u32[1]). */
+typedef union {
+       uint64_t  u64[1];
+       uint32_t  u32[2];
+} ef_vi_qword;
+
+/* Raw hardware event as stored at the start of every ef_event variant. */
+typedef ef_vi_qword             ef_hw_event;
+
+/* Width in bits of a request id, and the mask selecting that many
+ * low-order bits (0xffff). */
+#define EF_REQUEST_ID_BITS      16u
+#define EF_REQUEST_ID_MASK      ((1u << EF_REQUEST_ID_BITS) - 1u)
+
+/*! \i_ef_event An [ef_event] is a token that identifies something that
+** has happened.  Examples include packets received, packets transmitted
+** and errors.
+**
+** Every variant starts with the raw hardware event [ev] followed by a
+** 16-bit [type] tag (one of the EF_EVENT_TYPE_* values), so the tag can
+** always be read through [generic].  The remaining 16-bit fields are
+** variant specific.
+*/
+typedef union {
+       /* Fields common to all variants. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+       } generic;
+       /* Received packet: queue label, byte count and EF_EVENT_FLAG_*
+        * bits. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
+               unsigned       q_id       :16;
+               unsigned       len        :16;
+               unsigned       flags      :16;
+       } rx;
+       /* Discarded receive: as [rx] plus an EF_EVENT_RX_DISCARD_*
+        * subtype. */
+       struct {  /* This *must* have same layout as [rx]. */
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
+               unsigned       q_id       :16;
+               unsigned       len        :16;
+               unsigned       flags      :16;
+               unsigned       subtype    :16;
+       } rx_discard;
+       /* Transmit completion: queue label only. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
+               unsigned       q_id       :16;
+       } tx;
+       /* Transmit error: queue label plus an EF_EVENT_TX_ERROR_*
+        * subtype. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               /*ef_request_id  request_id :EF_REQUEST_ID_BITS;*/
+               unsigned       q_id       :16;
+               unsigned       subtype    :16;
+       } tx_error;
+       /* Receive truncated for lack of descriptors: queue label only. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               unsigned       q_id       :16;
+       } rx_no_desc_trunc;
+       /* Software generated event carrying a full-width [data] word. */
+       struct {
+               ef_hw_event    ev;
+               unsigned       type       :16;
+               unsigned       data;
+       } sw;
+} ef_event;
+
+
+/* Read the type tag of ef_event [e]; the value is one of the
+ * EF_EVENT_TYPE_* constants below. */
+#define EF_EVENT_TYPE(e)        ((e).generic.type)
+enum {
+       /** Good data was received. */
+       EF_EVENT_TYPE_RX,
+       /** Packets have been sent. */
+       EF_EVENT_TYPE_TX,
+       /** Data received and buffer consumed, but something is wrong. */
+       EF_EVENT_TYPE_RX_DISCARD,
+       /** Transmit of packet failed. */
+       EF_EVENT_TYPE_TX_ERROR,
+       /** Received packet was truncated due to lack of descriptors. */
+       EF_EVENT_TYPE_RX_NO_DESC_TRUNC,
+       /** Software generated event. */
+       EF_EVENT_TYPE_SW,
+       /** Event queue overflow. */
+       EF_EVENT_TYPE_OFLOW,
+};
+
+/* Accessors for the [rx] variant of an ef_event.  The _CONT, _SOP and
+ * _ISCSI_OKAY forms yield the masked flag bit (zero or the flag value),
+ * not a normalised 0/1. */
+#define EF_EVENT_RX_BYTES(e)    ((e).rx.len)
+#define EF_EVENT_RX_Q_ID(e)     ((e).rx.q_id)
+#define EF_EVENT_RX_CONT(e)     ((e).rx.flags & EF_EVENT_FLAG_CONT)
+#define EF_EVENT_RX_SOP(e)      ((e).rx.flags & EF_EVENT_FLAG_SOP)
+#define EF_EVENT_RX_ISCSI_OKAY(e) ((e).rx.flags & EF_EVENT_FLAG_ISCSI_OK)
+/* Bits stored in the [flags] field of rx / rx_discard events. */
+#define EF_EVENT_FLAG_SOP       0x1
+#define EF_EVENT_FLAG_CONT      0x2
+#define EF_EVENT_FLAG_ISCSI_OK  0x4
+
+/* Queue label of a [tx] event. */
+#define EF_EVENT_TX_Q_ID(e)     ((e).tx.q_id)
+
+/* Accessors for the [rx_discard] variant; _TYPE yields one of the
+ * EF_EVENT_RX_DISCARD_* subtype values below. */
+#define EF_EVENT_RX_DISCARD_Q_ID(e)  ((e).rx_discard.q_id)
+#define EF_EVENT_RX_DISCARD_LEN(e)   ((e).rx_discard.len)
+#define EF_EVENT_RX_DISCARD_TYPE(e)  ((e).rx_discard.subtype)
+/* Subtypes stored in rx_discard.subtype. */
+enum {
+       EF_EVENT_RX_DISCARD_CSUM_BAD,
+       EF_EVENT_RX_DISCARD_CRC_BAD,
+       EF_EVENT_RX_DISCARD_TRUNC,
+       EF_EVENT_RX_DISCARD_RIGHTS,
+       EF_EVENT_RX_DISCARD_OTHER,
+};
+
+/* Accessors for the [tx_error] variant; _TYPE yields one of the
+ * EF_EVENT_TX_ERROR_* subtype values below. */
+#define EF_EVENT_TX_ERROR_Q_ID(e)    ((e).tx_error.q_id)
+#define EF_EVENT_TX_ERROR_TYPE(e)    ((e).tx_error.subtype)
+/* Subtypes stored in tx_error.subtype. */
+enum {
+       EF_EVENT_TX_ERROR_RIGHTS,
+       EF_EVENT_TX_ERROR_OFLOW,
+       EF_EVENT_TX_ERROR_2BIG,
+       EF_EVENT_TX_ERROR_BUS,
+};
+
+/* Queue label of an [rx_no_desc_trunc] event. */
+#define EF_EVENT_RX_NO_DESC_TRUNC_Q_ID(e)  ((e).rx_no_desc_trunc.q_id)
+
+/* Data word of a [sw] event.  Note that EF_EVENT_SW_DATA() returns the
+ * whole [data] field and does not itself apply EF_EVENT_SW_DATA_MASK. */
+#define EF_EVENT_SW_DATA_MASK   0xffff
+#define EF_EVENT_SW_DATA(e)     ((e).sw.data)
+
+/* printf-style format and matching argument list for dumping an event:
+ * the type tag followed by the high and then the low 32-bit word of the
+ * raw hardware event. */
+#define EF_EVENT_FMT            "[ev:%x:%08x:%08x]"
+#define EF_EVENT_PRI_ARG(e)     (unsigned) (e).generic.type,    \
+               (unsigned) (e).generic.ev.u32[1],               \
+               (unsigned) (e).generic.ev.u32[0]
+
+/* Access the raw hardware event embedded in ef_event [e]: by value, by
+ * address, or as its 64-bit word. */
+#define EF_GET_HW_EV(e)         ((e).generic.ev)
+#define EF_GET_HW_EV_PTR(e)     (&(e).generic.ev)
+#define EF_GET_HW_EV_U64(e)     ((e).generic.ev.u64[0])
+
+
+/* ***************** */
+
+/*! Used by netif shared state. Must use types of explicit size.
+**
+** One instance is kept per RX queue label (see ef_eventq_state) and is
+** updated by the RX event decoding code to detect repeated descriptor
+** pointers and inconsistent SOP flags.
+*/
+typedef struct {
+       uint16_t              rx_last_desc_ptr;   /* for RX duplicates       */
+       uint8_t               bad_sop;            /* bad SOP detected        */
+       uint8_t               frag_num;           /* next fragment #, 0=>SOP */
+} ef_rx_dup_state_t;
+
+
+/* Max number of ports on any SF NIC.
+ * Also sizes the rx_dup_state[] array below, which is indexed by the
+ * queue label extracted from RX events. */
+#define EFAB_DMAQS_PER_EVQ_MAX 32
+
+/* Software state of an event queue. */
+typedef struct {
+       /* Current read position in the event ring, in bytes; advanced by
+        * the size of one event for each event consumed. */
+       ef_eventq_ptr           evq_ptr;
+       int32_t               trashed;
+       /* Per-queue-label duplicate / SOP tracking state. */
+       ef_rx_dup_state_t     rx_dup_state[EFAB_DMAQS_PER_EVQ_MAX];
+} ef_eventq_state;
+
+
+/*! \i_ef_base [ef_iovec] is similar to the standard [struct iovec].  An
+** array of these is used to designate a scatter/gather list of I/O
+** buffers.  [iov_base] is an ef_addr aligned to 8 bytes; [iov_len] is
+** the length of the buffer.
+*/
+typedef struct {
+       ef_addr                       iov_base EF_VI_ALIGN(8);
+       unsigned                      iov_len;
+} ef_iovec;
+
+/* Falcon constants */
+#define TX_EV_DESC_PTR_LBN 0
+
+
+/**********************************************************************
+ * ef_vi **************************************************************
+ **********************************************************************/
+
+/* Option flags for a virtual interface.  Each enumerator is a distinct
+ * single bit, so values can be OR-ed together; the combination is stored
+ * in ef_vi.vi_flags. */
+enum ef_vi_flags {
+       EF_VI_RX_SCATTER        = 0x1,
+       EF_VI_ISCSI_RX_HDIG     = 0x2,
+       EF_VI_ISCSI_TX_HDIG     = 0x4,
+       EF_VI_ISCSI_RX_DDIG     = 0x8,
+       EF_VI_ISCSI_TX_DDIG     = 0x10,
+       EF_VI_TX_PHYS_ADDR      = 0x20,
+       EF_VI_RX_PHYS_ADDR      = 0x40,
+       EF_VI_TX_IP_CSUM_DIS    = 0x80,
+       EF_VI_TX_TCPUDP_CSUM_DIS= 0x100,
+       EF_VI_TX_TCPUDP_ONLY    = 0x200,
+       /* Flags in range 0xXXXX0000 are for internal use. */
+};
+
+/* Running counters for the TX descriptor ring.  The fill level is
+ * computed as (added - removed) in unsigned 32-bit arithmetic. */
+typedef struct {
+       uint32_t  added;
+       uint32_t  removed;
+} ef_vi_txq_state;
+
+/* Running counters for the RX descriptor ring.  The fill level is
+ * computed as (added - removed) in unsigned 32-bit arithmetic. */
+typedef struct {
+       uint32_t  added;
+       uint32_t  removed;
+} ef_vi_rxq_state;
+
+/* TX DMA queue description.  [mask] is the value reported by
+ * ef_vi_transmit_capacity() and is the bound used when computing free
+ * space in the ring. */
+typedef struct {
+       uint32_t         mask;
+       void*            doorbell;
+       void*            descriptors;
+       uint16_t*        ids;
+       unsigned         misalign_mask;
+} ef_vi_txq;
+
+/* RX DMA queue description.  [mask] is the value reported by
+ * ef_vi_receive_capacity(), is the bound used when computing free space
+ * in the ring, and is applied to the low event word by
+ * ef_vi_receive_request_id(). */
+typedef struct {
+       uint32_t         mask;
+       void*            doorbell;
+       void*            descriptors;
+       uint16_t*        ids;
+} ef_vi_rxq;
+
+/* Mutable state of a virtual interface: event queue state plus the
+ * added/removed counters of the TX and RX rings. */
+typedef struct {
+       ef_eventq_state  evq;
+       ef_vi_txq_state  txq;
+       ef_vi_rxq_state  rxq;
+       /* Followed by request id fifos. */
+} ef_vi_state;
+
+/*! \i_ef_vi  A virtual interface.
+**
+** An [ef_vi] represents a virtual interface on a specific NIC.  A
+** virtual interface is a collection of an event queue and two DMA queues
+** used to pass Ethernet frames between the transport implementation and
+** the network.
+*/
+typedef struct ef_vi {
+       unsigned                        magic;
+
+       /* Returned by ef_vi_resource_id(). */
+       unsigned                      vi_resource_id;
+       unsigned                      vi_resource_handle_hack;
+       /* Instance id, returned by ef_vi_instance(). */
+       unsigned                      vi_i;
+
+       char*                           vi_mem_mmap_ptr;
+       int                           vi_mem_mmap_bytes;
+       char*                           vi_io_mmap_ptr;
+       int                           vi_io_mmap_bytes;
+
+       /* Event queue: software state, base address of the ring, and the
+        * byte mask used to wrap offsets into the ring. */
+       ef_eventq_state*              evq_state;
+       char*                         evq_base;
+       unsigned                      evq_mask;
+       ef_vi_ioaddr_t                evq_timer_reg;
+
+       /* TX and RX DMA queues and their shared mutable state. */
+       ef_vi_txq                     vi_txq;
+       ef_vi_rxq                     vi_rxq;
+       ef_vi_state*                  ep_state;
+       /* Returned by ef_vi_flags(). */
+       enum ef_vi_flags              vi_flags;
+} ef_vi;
+
+
+/* NIC architectures known to this library; only Falcon is defined. */
+enum ef_vi_arch {
+       EF_VI_ARCH_FALCON,
+};
+
+
+/* Identifies a NIC model by architecture, variant and revision.
+ * NOTE(review): [arch] presumably holds an enum ef_vi_arch value -
+ * confirm against the code that fills this structure in. */
+struct ef_vi_nic_type {
+       unsigned char  arch;
+       char           variant;
+       unsigned char  revision;
+};
+
+
+/* This structure is opaque to the client & used to pass mapping data
+ * from the resource manager to the ef_vi lib. for ef_vi_init().
+ */
+struct vi_mappings {
+       /* Expected to hold VI_MAPPING_SIGNATURE, which embeds the layout
+        * version below in its low-order byte. */
+       uint32_t         signature;
+# define VI_MAPPING_VERSION   0x02  /*Byte: Increment me if struct altered*/
+# define VI_MAPPING_SIGNATURE (0xBA1150 + VI_MAPPING_VERSION)
+
+       struct ef_vi_nic_type nic_type;
+
+       int              vi_instance;
+
+       /* Event queue mapping. */
+       unsigned         evq_bytes;
+       char*            evq_base;
+       ef_vi_ioaddr_t   evq_timer_reg;
+
+       /* RX DMA queue mapping. */
+       unsigned         rx_queue_capacity;
+       ef_vi_ioaddr_t   rx_dma_ef1;
+       char*            rx_dma_falcon;
+       ef_vi_ioaddr_t   rx_bell;
+
+       /* TX DMA queue mapping. */
+       unsigned         tx_queue_capacity;
+       ef_vi_ioaddr_t   tx_dma_ef1;
+       char*            tx_dma_falcon;
+       ef_vi_ioaddr_t   tx_bell;
+};
+/* This is used by clients to allocate a suitably sized buffer for the 
+ * resource manager to fill & ef_vi_init() to use. */
+#define VI_MAPPINGS_SIZE (sizeof(struct vi_mappings))
+
+
+/**********************************************************************
+ * ef_config **********************************************************
+ **********************************************************************/
+
+/* Library-wide configuration; currently only a logging level. */
+struct ef_config_t {
+       int   log;                    /* debug logging level          */
+};
+
+/* The single configuration object; declared here, defined elsewhere. */
+extern struct ef_config_t  ef_config;
+
+
+/**********************************************************************
+ * ef_vi **************************************************************
+ **********************************************************************/
+
+/* Initialise [data_area] with information required to initialise an ef_vi.
+ * In the following, an unused param should be set to NULL. Note the case
+ * marked (*) of [iobuf_mmap] for falcon/driver; for normal driver this
+ * must be NULL.
+ *
+ * \param  data_area     [in,out] required, must ref at least VI_MAPPINGS_SIZE 
+ *                                bytes
+ * \param  evq_capacity  [in] number of events in event queue.  Specify 0 for
+ *                            no event queue.
+ * \param  rxq_capacity  [in] number of descriptors in RX DMA queue.  Specify
+ *                            0 for no RX queue.
+ * \param  txq_capacity  [in] number of descriptors in TX DMA queue.  Specify
+ *                            0 for no TX queue.
+ * \param  mmap_info     [in] mem-map info for resource
+ * \param  io_mmap       [in] ef1,    required
+ *                            falcon, required
+ * \param  iobuf_mmap    [in] ef1,    UL: unused
+ *                            falcon, UL: required
+ */
+extern void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type,
+                                  unsigned rxq_capacity,
+                                  unsigned txq_capacity, int instance,
+                                  void* io_mmap, void* iobuf_mmap_rx,
+                                  void* iobuf_mmap_tx, enum ef_vi_flags);
+
+
+extern void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type,
+                                   int instance, unsigned evq_bytes,
+                                   void* base, void* timer_reg);
+
+/*! \i_ef_vi Returns the vi_resource_id field of [vi]. */
+ef_vi_inline unsigned ef_vi_resource_id(ef_vi* vi)
+{
+       unsigned resource_id = vi->vi_resource_id;
+
+       return resource_id;
+}
+
+/*! \i_ef_vi Returns the flags stored in [vi]. */
+ef_vi_inline enum ef_vi_flags ef_vi_flags(ef_vi* vi)
+{
+       enum ef_vi_flags current = vi->vi_flags;
+
+       return current;
+}
+
+
+/**********************************************************************
+ * Receive interface **************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Number of free slots in the RX descriptor ring.
+**
+** Computed as the ring mask minus the current fill level
+** (added - removed).
+**
+** \return the amount of space in the queue.
+*/
+ef_vi_inline int ef_vi_receive_space(ef_vi* vi)
+{
+       const ef_vi_rxq_state* rx_state = &vi->ep_state->rxq;
+       uint32_t in_use = rx_state->added - rx_state->removed;
+
+       return vi->vi_rxq.mask - in_use;
+}
+
+
+/*! \i_ef_vi Number of descriptors currently held in the RX ring.
+**
+** This is the difference between the ring's added and removed counters.
+**
+** \return the fill level of the queue.
+*/
+ef_vi_inline int ef_vi_receive_fill_level(ef_vi* vi)
+{
+       const ef_vi_rxq_state* rx_state = &vi->ep_state->rxq;
+       uint32_t in_use = rx_state->added - rx_state->removed;
+
+       return in_use;
+}
+
+
+/*! \i_ef_vi Returns the capacity of the RX descriptor ring, which is
+**           the RX queue mask.
+*/
+ef_vi_inline int ef_vi_receive_capacity(ef_vi* vi)
+{
+       uint32_t rx_mask = vi->vi_rxq.mask;
+
+       return rx_mask;
+}
+
+/*! \i_ef_vi  Complete a receive operation.
+**
+** When a receive completion event is received, it should be passed to
+** this function.  The request-id for the buffer that the packet was
+** delivered to is returned.
+**
+** After this function returns, more space may be available in the
+** receive queue.
+*/
+extern ef_request_id ef_vi_receive_done(const ef_vi*, const ef_event*);
+
+/*! \i_ef_vi  Return request ID indicated by a receive event.
+**
+** The value is the low 32-bit word of the raw hardware event masked
+** with the RX queue mask.
+*/
+ef_vi_inline ef_request_id ef_vi_receive_request_id(const ef_vi* vi,
+                                                    const ef_event* ef_ev)
+{
+       uint32_t low_word = ef_ev->generic.ev.u32[0];
+
+       return low_word & vi->vi_rxq.mask;
+}
+  
+
+/*! \i_ef_vi  Form a receive descriptor.
+**
+** If \c initial_rx_bytes is zero use a reception size at least as large
+** as an MTU.
+**
+**   \param vi                the virtual interface
+**   \param addr              address of the receive buffer
+**   \param dma_id            request id to associate with the buffer
+**   \param initial_rx_bytes  reception size, or zero as described above
+*/
+extern int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
+                              int initial_rx_bytes);
+
+/*! \i_ef_vi  Submit initialised receive descriptors to the NIC. */
+extern void ef_vi_receive_push(ef_vi* vi);
+
+/*! \i_ef_vi  Post a buffer on the receive queue.
+**
+**   \return 0 on success, or -EAGAIN if the receive queue is full
+*/
+extern int ef_vi_receive_post(ef_vi*, ef_addr addr,
+                             ef_request_id dma_id);
+
+/**********************************************************************
+ * Transmit interface *************************************************
+ **********************************************************************/
+
+/*! \i_ef_vi Number of free descriptors in the transmit queue.
+**
+** Computed as the ring mask minus the current fill level
+** (added - removed).
+**
+** \return the amount of space in the queue (in descriptors)
+*/
+ef_vi_inline int ef_vi_transmit_space(ef_vi* vi)
+{
+       const ef_vi_txq_state* tx_state = &vi->ep_state->txq;
+       uint32_t in_use = tx_state->added - tx_state->removed;
+
+       return vi->vi_txq.mask - in_use;
+}
+
+
+/*! \i_ef_vi Number of descriptors currently held in the TX ring.
+**
+** This is the difference between the ring's added and removed counters.
+**
+** \return the fill level of the queue.
+*/
+ef_vi_inline int ef_vi_transmit_fill_level(ef_vi* vi)
+{
+       const ef_vi_txq_state* tx_state = &vi->ep_state->txq;
+       uint32_t in_use = tx_state->added - tx_state->removed;
+
+       return in_use;
+}
+
+
+/*! \i_ef_vi Returns the total capacity of the TX descriptor ring, which
+**           is the TX queue mask.
+**
+** \return the capacity of the queue.
+*/
+ef_vi_inline int ef_vi_transmit_capacity(ef_vi* vi)
+{
+       uint32_t tx_mask = vi->vi_txq.mask;
+
+       return tx_mask;
+}
+
+
+/*! \i_ef_vi  Transmit a packet.
+**
+**   \param bytes must be greater than ETH_ZLEN.
+**   \return -EAGAIN if the transmit queue is full, or 0 on success
+*/
+extern int ef_vi_transmit(ef_vi*, ef_addr, int bytes, ef_request_id dma_id);
+
+/*! \i_ef_vi  Transmit a packet using a gather list.
+**
+**   \param iov_len must be greater than zero
+**   \param iov the first must be non-zero in length (but others need not)
+**
+**   \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmitv(ef_vi*, const ef_iovec* iov, int iov_len,
+                           ef_request_id dma_id);
+
+/*! \i_ef_vi  Initialise a DMA request.
+**
+** \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmit_init(ef_vi*, ef_addr, int bytes,
+                               ef_request_id dma_id);
+
+/*! \i_ef_vi  Initialise a DMA request.
+**
+** \return -EAGAIN if the queue is full, or 0 on success
+*/
+extern int ef_vi_transmitv_init(ef_vi*, const ef_iovec*, int iov_len,
+                                ef_request_id dma_id);
+
+/*! \i_ef_vi  Submit DMA requests to the NIC.
+**
+** The DMA requests must have been initialised using
+** ef_vi_transmit_init() or ef_vi_transmitv_init().
+*/
+extern void ef_vi_transmit_push(ef_vi*);
+
+
+/*! \i_ef_vi Maximum number of transmit completions per transmit event. */
+#define EF_VI_TRANSMIT_BATCH  64
+
+/*! \i_ef_vi Determine the set of [ef_request_id]s for each DMA request
+**           which has been completed by a given transmit completion
+**           event.
+**
+** \param ids must point to an array of length EF_VI_TRANSMIT_BATCH
+** \return the number of valid [ef_request_id]s (can be zero)
+*/
+extern int ef_vi_transmit_unbundle(ef_vi* ep, const ef_event*,
+                                   ef_request_id* ids);
+
+
+/*! \i_ef_event Returns true if ef_eventq_poll() will return event(s). */
+extern int ef_eventq_has_event(ef_vi* vi);
+
+/*! \i_ef_event Returns true if there are quite a few events in the event
+** queue.
+**
+** This looks ahead in the event queue, so has the property that it will
+** not ping-pong a cache-line when it is called concurrently with events
+** being delivered.
+*/
+extern int ef_eventq_has_many_events(ef_vi* evq, int look_ahead);
+
+/*! Type of function to handle unknown events arriving on event queue
+**  Return CI_TRUE iff the event has been handled.
+*/
+typedef int/*bool*/ ef_event_handler_fn(void* priv, ef_vi* evq, ef_event* ev);
+
+/*! Standard poll exception routine */
+extern int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq,
+                                            ef_event* ev);
+
+/*! \i_ef_event  Retrieve events from the event queue, handle RX/TX events
+**  and pass any others to an exception handler function
+**
+**   \return The number of events retrieved.
+*/
+extern int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
+                              ef_event_handler_fn *exception, void *expt_priv);
+
+/*! \i_ef_event  Retrieve events from the event queue.
+**
+** Convenience wrapper around ef_eventq_poll_evs() that installs
+** ef_eventq_poll_exception() as the exception handler with a null
+** private pointer.
+**
+**   \return The number of events retrieved.
+*/
+ef_vi_inline int ef_eventq_poll(ef_vi* evq, ef_event* evs, int evs_len)
+{
+       ef_event_handler_fn* on_exception = &ef_eventq_poll_exception;
+       void* no_priv = (void*)0;
+
+       return ef_eventq_poll_evs(evq, evs, evs_len, on_exception, no_priv);
+}
+
+/*! \i_ef_event Returns the capacity of an event queue: the ring size in
+**              bytes (evq_mask + 1) divided by the size of one hardware
+**              event.
+*/
+ef_vi_inline int ef_eventq_capacity(ef_vi* vi)
+{
+       unsigned ring_bytes = vi->evq_mask + 1u;
+
+       return ring_bytes / sizeof(ef_hw_event);
+}
+
+/* Returns the instance ID of [vi] (its vi_i field). */
+ef_vi_inline unsigned ef_vi_instance(ef_vi* vi)
+{
+       unsigned instance = vi->vi_i;
+
+       return instance;
+}
+
+
+/**********************************************************************
+ * Initialisation *****************************************************
+ **********************************************************************/
+
+/*! Return size of state buffer of an initialised VI. */
+extern int ef_vi_state_bytes(ef_vi*);
+
+/*! Return size of buffer needed for VI state given sizes of RX and TX
+** DMA queues.  Queue sizes must be legal sizes (power of 2), or 0 (no
+** queue).
+*/
+extern int ef_vi_calc_state_bytes(int rxq_size, int txq_size);
+
+/*! Initialise [ef_vi] from the provided resources. [vvis] must have been
+** created by ef_make_vi_data() & remains owned by the caller.
+*/
+extern void ef_vi_init(ef_vi*, void* vi_info, ef_vi_state* state,
+                       ef_eventq_state* evq_state, enum ef_vi_flags);
+
+extern void ef_vi_state_init(ef_vi*);
+extern void ef_eventq_state_init(ef_vi*);
+
+/*! Convert an efhw device arch to ef_vi_arch, or returns -1 if not
+** recognised.
+*/
+extern int  ef_vi_arch_from_efhw_arch(int efhw_arch);
+
+
+#endif /* __EFAB_EF_VI_H__ */
diff --git a/drivers/xen/sfc_netfront/falcon_event.c b/drivers/xen/sfc_netfront/falcon_event.c

new file mode 100644 (file)

index 0000000..dd9cc15
--- /dev/null
+++ b/drivers/xen/sfc_netfront/falcon_event.c
@@ -0,0 +1,346 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Routine to poll event queues.
+ *   \date  2003/03/04
+ */
+
+/*! \cidoxg_lib_ef */
+#include "ef_vi_internal.h"
+
+/* Be worried about this on byteswapped machines */
+/* An empty ring slot has both 32-bit halves set to all-ones.  Due to
+** crazy chipsets the two halves of an event may be written in arbitrary
+** order (bug4539), so a slot only counts as holding an event once
+** neither half is still all-ones.
+*/
+# define EF_VI_IS_EVENT(evp)                                           \
+       ( ! (((evp)->opaque.a == (uint32_t)-1) ||                       \
+            ((evp)->opaque.b == (uint32_t)-1)) )
+
+
+/* IS_DEBUG is a compile-time constant usable in ordinary expressions:
+ * 0 when NDEBUG is defined, 1 otherwise. */
+#ifdef NDEBUG
+# define IS_DEBUG 0
+#else
+# define IS_DEBUG 1
+#endif
+
+
+/*! Detect RX events whose SOP/CONT flags are inconsistent.
+**
+** Updates the duplicate-detection state of the queue whose label is
+** carried in [ev].  A bad-SOP indication stays latched until an event
+** with SOP set arrives.  Fragment numbering is only tracked when the
+** EF_VI_BUG5692_WORKAROUND flag is set on [vi] or in debug builds.
+**
+** \return non-zero if this event should be discarded
+*/
+ef_vi_inline int ef_eventq_is_rx_sop_cont_bad_efab(ef_vi* vi,
+                                                  const ef_vi_qword* ev)
+{
+       unsigned label = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+       unsigned sop   = QWORD_TEST_BIT(RX_SOP, *ev);
+       ef_rx_dup_state_t* dup;
+       int latched;
+
+       ef_assert(vi);
+       ef_assert_lt(label, EFAB_DMAQS_PER_EVQ_MAX);
+
+       dup = &vi->evq_state->rx_dup_state[label];
+
+       /* bad_sop should latch till the next sop */
+       latched = (dup->bad_sop && !sop);
+
+       if( (vi->vi_flags & EF_VI_BUG5692_WORKAROUND) || IS_DEBUG ) {
+               unsigned cont = QWORD_TEST_BIT(RX_JUMBO_CONT, *ev);
+               int expect_sop = (dup->frag_num == 0);
+
+               /* SOP must be set exactly when fragment 0 is expected. */
+               if( !!sop != expect_sop )
+                       latched = 1;
+
+               /* The byte count is not checked against the fragment
+                * number and user RX buffer size here because the buffer
+                * size is not known - that check probably belongs in
+                * the nearest calling code.
+                */
+               dup->frag_num = cont ? (dup->frag_num + 1) : 0;
+       }
+
+       dup->bad_sop = latched;
+       return dup->bad_sop;
+}
+
+
+/*! Detect a repeated RX descriptor pointer on the event's queue.
+**
+** Compares the descriptor pointer carried in [ev] with the last one
+** recorded for the queue label carried in the same event.  If it
+** differs, it is recorded and 0 is returned.  If it repeats, the queue's
+** SOP tracking state is reset (bad_sop set, frag_num cleared) and
+** [ev_out] is filled in as an EF_EVENT_TYPE_RX_NO_DESC_TRUNC event.
+**
+** \param evq     event queue whose per-label state is consulted/updated
+** \param ev_out  event to fill in when a duplicate is found
+** \param ev      raw hardware RX event
+** \return 1 if [ev_out] was filled in as a truncation event, else 0
+*/
+ef_vi_inline int falcon_rx_check_dup(ef_vi* evq, ef_event* ev_out,
+                                    const ef_vi_qword* ev)
+{
+       unsigned q_id = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+       uint16_t desc_ptr = QWORD_GET_U(RX_EV_DESC_PTR, *ev);
+       ef_rx_dup_state_t* rx_dup_state;
+
+       /* q_id indexes a fixed-size array; same check as the SOP/CONT
+        * consistency test performs on the label. */
+       ef_assert_lt(q_id, EFAB_DMAQS_PER_EVQ_MAX);
+       rx_dup_state = &evq->evq_state->rx_dup_state[q_id];
+
+       if(likely( desc_ptr != rx_dup_state->rx_last_desc_ptr )) {
+               rx_dup_state->rx_last_desc_ptr = desc_ptr;
+               return 0;
+       }
+
+       /* Duplicate: rx_last_desc_ptr already equals desc_ptr. */
+       rx_dup_state->bad_sop = 1;
+       /* frag_num is also tracked in non-debug builds when the
+        * EF_VI_BUG5692_WORKAROUND flag is set, so reset it
+        * unconditionally rather than only when NDEBUG is undefined. */
+       rx_dup_state->frag_num = 0;
+
+       /* A duplicate must be a truncated, not-OK, zero-length event. */
+       BUG_ON(!QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev));
+       BUG_ON( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev));
+       BUG_ON( QWORD_GET_U(RX_EV_BYTE_CNT, *ev) != 0);
+       ev_out->rx_no_desc_trunc.type = EF_EVENT_TYPE_RX_NO_DESC_TRUNC;
+       ev_out->rx_no_desc_trunc.q_id = q_id;
+       return 1;
+}
+
+
+/* Translate the raw Falcon RX event [ev] into [ev_out].  When the PKT_OK
+ * bit is set the result is an EF_EVENT_TYPE_RX event; otherwise it is an
+ * EF_EVENT_TYPE_RX_DISCARD event whose subtype names the first error bit
+ * found. */
+ef_vi_inline void falcon_rx_event(ef_event* ev_out, const ef_vi_qword* ev)
+{
+       unsigned flags = 0;
+       unsigned reason;
+
+       /* The SOP/CONT/iSCSI flags are reported the same way for good and
+        * discarded packets (on discards this is a hack for ptloop
+        * compatibility: ?? TODO purge). */
+       if( QWORD_TEST_BIT(RX_SOP, *ev) )
+               flags |= EF_EVENT_FLAG_SOP;
+       if( QWORD_TEST_BIT(RX_JUMBO_CONT, *ev) )
+               flags |= EF_EVENT_FLAG_CONT;
+       if( QWORD_TEST_BIT(RX_iSCSI_PKT_OK, *ev) )
+               flags |= EF_EVENT_FLAG_ISCSI_OK;
+
+       if(likely( QWORD_TEST_BIT(RX_EV_PKT_OK, *ev) )) {
+               ev_out->rx.type  = EF_EVENT_TYPE_RX;
+               ev_out->rx.q_id  = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+               ev_out->rx.len   = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+               ev_out->rx.flags = flags;
+               return;
+       }
+
+       ev_out->rx_discard.type  = EF_EVENT_TYPE_RX_DISCARD;
+       ev_out->rx_discard.q_id  = QWORD_GET_U(RX_EV_Q_LABEL, *ev);
+       ev_out->rx_discard.len   = QWORD_GET_U(RX_EV_BYTE_CNT, *ev);
+       ev_out->rx_discard.flags = flags;
+
+       /* Order matters here: more fundamental errors first. */
+       if( QWORD_TEST_BIT(RX_EV_BUF_OWNER_ID_ERR, *ev) )
+               reason = EF_EVENT_RX_DISCARD_RIGHTS;
+       else if( QWORD_TEST_BIT(RX_EV_FRM_TRUNC, *ev) )
+               reason = EF_EVENT_RX_DISCARD_TRUNC;
+       else if( QWORD_TEST_BIT(RX_EV_ETH_CRC_ERR, *ev) )
+               reason = EF_EVENT_RX_DISCARD_CRC_BAD;
+       else if( QWORD_TEST_BIT(RX_EV_IP_HDR_CHKSUM_ERR, *ev) ||
+                QWORD_TEST_BIT(RX_EV_TCP_UDP_CHKSUM_ERR, *ev) )
+               reason = EF_EVENT_RX_DISCARD_CSUM_BAD;
+       else
+               reason = EF_EVENT_RX_DISCARD_OTHER;
+
+       ev_out->rx_discard.subtype = reason;
+}
+
+
+/* Translate the raw Falcon TX event [ev] into [ev_out]: an
+ * EF_EVENT_TYPE_TX event when the TX_EV_COMP bit is set, otherwise an
+ * EF_EVENT_TYPE_TX_ERROR event with a subtype for the first error bit
+ * found.
+ *
+ * Danger danger!  No matter what is asked for wrt batching, a batched
+ * event arrives every 16 descriptors, and dma-queue-empty events are
+ * delivered as well, i.e. duplicates are expected.  In addition, if it
+ * has been requested in the descriptor, there is an event per
+ * descriptor (this is not currently requested).
+ */
+ef_vi_inline void falcon_tx_event(ef_event* ev_out, const ef_vi_qword* ev)
+{
+       unsigned q_label = QWORD_GET_U(TX_EV_Q_LABEL, *ev);
+
+       if(likely( QWORD_TEST_BIT(TX_EV_COMP, *ev) )) {
+               ev_out->tx.type = EF_EVENT_TYPE_TX;
+               ev_out->tx.q_id = q_label;
+               return;
+       }
+
+       ev_out->tx_error.type = EF_EVENT_TYPE_TX_ERROR;
+       ev_out->tx_error.q_id = q_label;
+
+       /* NOTE(review): if none of the error bits below is set, subtype
+        * is not written and keeps whatever the caller's buffer held. */
+       if(likely( QWORD_TEST_BIT(TX_EV_BUF_OWNER_ID_ERR, *ev) )) {
+               ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_RIGHTS;
+               return;
+       }
+       if(likely( QWORD_TEST_BIT(TX_EV_WQ_FF_FULL, *ev) )) {
+               ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_OFLOW;
+               return;
+       }
+       if(likely( QWORD_TEST_BIT(TX_EV_PKT_TOO_BIG, *ev) )) {
+               ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_2BIG;
+               return;
+       }
+       if(likely( QWORD_TEST_BIT(TX_EV_PKT_ERR, *ev) ))
+               ev_out->tx_error.subtype = EF_EVENT_TX_ERROR_BUS;
+}
+
+
+/* Clear the RX_EV_PKT_OK bit in the raw hardware event held in [ev], so
+ * that later decoding treats the event as not OK. */
+static void mark_bad(ef_event* ev)
+{
+       const uint64_t pkt_ok_bit = (uint64_t) 1u << RX_EV_PKT_OK_LBN;
+
+       ev->generic.ev.u64[0] = ev->generic.ev.u64[0] & ~pkt_ok_bit;
+}
+
+
+/* Poll the event queue of [evq], decoding up to [evs_len] events into
+ * the array [evs].
+ *
+ * Returns the number of entries written to [evs].  If the slot at
+ * EF_VI_EVENT_PTR(evq, 1) already holds an event the queue is treated
+ * as overflowed: one EF_EVENT_TYPE_OFLOW entry is written to evs[0] and
+ * 1 is returned.
+ *
+ * Only RX_IP_EV_DECODE and TX_IP_EV_DECODE events produce an output
+ * entry; an event with any other code is still consumed from the ring
+ * but is not reported.
+ * NOTE(review): [exception] and [expt_priv] are never referenced in
+ * this function -- confirm whether unhandled events were meant to be
+ * passed to the handler.
+ */
+int ef_eventq_poll_evs(ef_vi* evq, ef_event* evs, int evs_len,
+                      ef_event_handler_fn *exception, void *expt_priv)
+{
+       int evs_len_orig = evs_len;
+
+       EF_VI_CHECK_EVENT_Q(evq);
+       ef_assert(evs);
+       ef_assert_gt(evs_len, 0);
+
+       if(unlikely( EF_VI_IS_EVENT(EF_VI_EVENT_PTR(evq, 1)) ))
+               goto overflow;
+
+       do {
+               { /* Read the event out of the ring, then fiddle with
+                  * copied version.  Reason is that the ring is
+                  * likely to get pushed out of cache by another
+                  * event being delivered by hardware. */
+                       ef_vi_event* ev = EF_VI_EVENT_PTR(evq, 0);
+                       if( ! EF_VI_IS_EVENT(ev) )
+                               break;
+                       /* NOTE(review): this converts a value read from
+                        * the ring, so le64_to_cpu() would name the
+                        * direction better; the byte swap performed is
+                        * the same either way. */
+                       evs->generic.ev.u64[0] = cpu_to_le64 (ev->u64);
+                       /* Advance the read pointer and overwrite the
+                        * consumed slot with all-ones. */
+                       evq->evq_state->evq_ptr += sizeof(ef_vi_event);
+                       ev->u64 = (uint64_t)(int64_t) -1;
+               }
+
+               /* Ugly: Exploit the fact that event code lies in top
+                * bits of event. */
+               ef_assert_ge(EV_CODE_LBN, 32u);
+               switch( evs->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) {
+               case RX_IP_EV_DECODE:
+                       /* Look for duplicate desc_ptr: it signals
+                        * that a jumbo frame was truncated because we
+                        * ran out of descriptors. */
+                       if(unlikely( falcon_rx_check_dup
+                                          (evq, evs, &evs->generic.ev) )) {
+                               /* The entry is reported as-is, without
+                                * going through falcon_rx_event();
+                                * presumably falcon_rx_check_dup() has
+                                * already filled in *evs -- TODO confirm. */
+                               --evs_len;
+                               ++evs;
+                               break;
+                       }
+                       else {
+                               /* Cope with FalconA1 bugs where RX
+                                * gives inconsistent RX events.  Mark
+                                * events as bad until SOP becomes
+                                * consistent again.
+                                * ef_eventq_is_rx_sop_cont_bad() has
+                                * side effects - order is important
+                                */
+                               if(unlikely
+                                  (ef_eventq_is_rx_sop_cont_bad_efab
+                                   (evq, &evs->generic.ev) )) {
+                                       mark_bad(evs);
+                               }
+                       }
+                       falcon_rx_event(evs, &evs->generic.ev);
+                       --evs_len;      
+                       ++evs;
+                       break;
+
+               case TX_IP_EV_DECODE:
+                       falcon_tx_event(evs, &evs->generic.ev);
+                       --evs_len;
+                       ++evs;
+                       break;
+
+               default:
+                       /* Any other event code: the slot has already
+                        * been consumed above; nothing is reported and
+                        * the output slot is reused for the next event. */
+                       break;
+               }
+       } while( evs_len );
+
+       return evs_len_orig - evs_len;
+
+
+ overflow:
+       evs->generic.type = EF_EVENT_TYPE_OFLOW;
+       evs->generic.ev.u64[0] = (uint64_t)((int64_t)-1);
+       return 1;
+}
+
+
+/* Decide whether the raw event in [ev] is one that can be ignored.
+ *
+ * Returns 1 only for a DRIVER_EV_DECODE event whose sub-code is
+ * EVQ_INIT_DONE_EV_DECODE; every other event yields 0.  [priv] and
+ * [evq] are not used here.
+ */
+int/*bool*/ ef_eventq_poll_exception(void* priv, ef_vi* evq, ef_event* ev)
+{
+       int /*bool*/ handled = 0;
+  
+       /* The event code is taken from the upper 32-bit word, in the same
+        * way as in ef_eventq_poll_evs(). */
+       switch( ev->generic.ev.u32[1] >> (EV_CODE_LBN - 32u) ) {
+       case DRIVER_EV_DECODE:
+               if( QWORD_GET_U(DRIVER_EV_SUB_CODE, ev->generic.ev) ==
+                   EVQ_INIT_DONE_EV_DECODE )
+                       /* EVQ initialised event: ignore. */
+                       handled = 1;
+               break;
+       }
+       return handled;
+}
+
+
+/* Visit the slots of the event queue of [vi] without consuming them.
+ *
+ * Slots are examined at offsets 0, -1, -2, ... as passed to
+ * EF_VI_EVENT_PTR(), for at most (evq_mask + 1) / sizeof(ef_vi_event)
+ * slots.  For every slot that holds an event, [fn] is called with
+ * [arg], [vi], the iteration index (rel_pos), the slot index derived
+ * from EF_VI_EVENT_OFFSET() (abs_pos) and a pointer to the event.
+ * When [stop_at_end] is non-zero the walk ends at the first slot that
+ * does not hold an event; otherwise such slots are skipped.
+ */
+void ef_eventq_iterate(ef_vi* vi,
+                      void (*fn)(void* arg, ef_vi*, int rel_pos,
+                                 int abs_pos, void* event),
+                      void* arg, int stop_at_end)
+{
+       int i, size_evs = (vi->evq_mask + 1) / sizeof(ef_vi_event);
+
+       for( i = 0; i < size_evs; ++i ) {
+               ef_vi_event* e = EF_VI_EVENT_PTR(vi, -i);
+               if( EF_VI_IS_EVENT(e) )
+                       fn(arg, vi, i, 
+                          EF_VI_EVENT_OFFSET(vi, -i) / sizeof(ef_vi_event),
+                          e);
+               else if( stop_at_end )
+                       break;
+       }
+}
+
+
+/* Returns the result of EF_VI_IS_EVENT() on the slot at offset 0 of the
+ * event queue of [vi], i.e. the slot ef_eventq_poll_evs() reads next. */
+int ef_eventq_has_event(ef_vi* vi)
+{
+       return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, 0));
+}
+
+
+/* Returns the result of EF_VI_IS_EVENT() on the slot [look_ahead]
+ * positions beyond the current one (offset -look_ahead).  [look_ahead]
+ * must not be negative. */
+int ef_eventq_has_many_events(ef_vi* vi, int look_ahead)
+{
+       ef_assert_ge(look_ahead, 0);
+       return EF_VI_IS_EVENT(EF_VI_EVENT_PTR(vi, -look_ahead));
+}
+
+
+/* Count the RX events currently pending in the event queue of [vi].
+ *
+ * Slots are examined at offsets 0, -1, -2, ... from the current read
+ * position and the walk stops at the first slot that does not hold an
+ * event.  The walk is also capped at the number of slots in the ring
+ * (computed as in ef_eventq_iterate()), so a completely full ring can
+ * no longer keep the loop running without end.
+ *
+ * Returns the number of events whose code equals EF_EVENT_TYPE_RX, so
+ * a non-zero result means "has an RX event".
+ * NOTE(review): the hardware event code is compared with an
+ * EF_EVENT_TYPE_* value rather than a *_EV_DECODE value -- confirm the
+ * two encodings agree.
+ */
+int ef_eventq_has_rx_event(ef_vi* vi)
+{
+       int i, n_evs = 0;
+       int size_evs = (vi->evq_mask + 1) / sizeof(ef_vi_event);
+
+       for( i = 0; i < size_evs; ++i ) {
+               ef_vi_event* ev = EF_VI_EVENT_PTR(vi, -i);
+               if( ! EF_VI_IS_EVENT(ev) )
+                       break;
+               if( EFVI_FALCON_EVENT_CODE(ev) == EF_EVENT_TYPE_RX )
+                       n_evs++;
+       }
+       return n_evs;
+}
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netfront/falcon_vi.c b/drivers/xen/sfc_netfront/falcon_vi.c

new file mode 100644 (file)

index 0000000..b6880e9
--- /dev/null
+++ b/drivers/xen/sfc_netfront/falcon_vi.c
@@ -0,0 +1,473 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr, stg
+ *  \brief  Falcon-specific VI
+ *   \date  2006/11/30
+ */
+
+#include "ef_vi_internal.h"
+
+
+/* Value passed as the [frag] argument of the TX descriptor builders by
+ * ef_vi_transmitv_init() for every descriptor except the last one of a
+ * request. */
+#define EFVI_FALCON_DMA_TX_FRAG                1
+
+
+/* TX descriptor for both physical and virtual packet transfers
+ * (two 32-bit words). */
+typedef union {
+       uint32_t        dword[2];
+} ef_vi_falcon_dma_tx_buf_desc;
+typedef ef_vi_falcon_dma_tx_buf_desc ef_vi_falcon_dma_tx_phys_desc;
+
+
+/* RX descriptor for physical addressed transfers (two 32-bit words) */
+typedef union {
+       uint32_t        dword[2];
+} ef_vi_falcon_dma_rx_phys_desc;
+
+
+/* RX descriptor for virtual packet transfers (one 32-bit word) */
+typedef struct {
+       uint32_t        dword[1];
+} ef_vi_falcon_dma_rx_buf_desc;
+
+/* Buffer table index */
+typedef uint32_t               ef_vi_buffer_addr_t;
+
+/* Mask [src_dma_addr] with __FALCON_MASK(46, int64_t); callers use the
+ * result as the lower 46 bits of the DMA address. */
+ef_vi_inline int64_t dma_addr_to_u46(int64_t src_dma_addr)
+{
+       return (src_dma_addr & __FALCON_MASK(46, int64_t));
+}
+
+/*! Set up a physical address based RX descriptor with a specified length.
+**
+**  dest_pa  DMA address of the receive buffer (masked to 46 bits)
+**  desc     descriptor to fill in; must not be NULL
+**  bytes    value for the RX_KER_BUF_SIZE field
+**
+**  The region field is always written as 0.  DWCHCK/LWCHK/RANGECHCK
+**  are defined elsewhere; presumably they validate field placement and
+**  value ranges -- TODO confirm.
+*/
+ef_vi_inline void
+__falcon_dma_rx_calc_ip_phys(ef_vi_dma_addr_t dest_pa, 
+                            ef_vi_falcon_dma_rx_phys_desc *desc,
+                            int bytes)
+{
+       int region = 0;                 /* TODO fixme */
+       int64_t dest    = dma_addr_to_u46(dest_pa); /* lower 46 bits */
+
+       DWCHCK(__DW2(RX_KER_BUF_SIZE_LBN),  RX_KER_BUF_SIZE_WIDTH);
+       DWCHCK(__DW2(RX_KER_BUF_REGION_LBN),RX_KER_BUF_REGION_WIDTH);
+
+       LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
+
+       RANGECHCK(bytes,  RX_KER_BUF_SIZE_WIDTH);
+       RANGECHCK(region, RX_KER_BUF_REGION_WIDTH);
+
+       ef_assert(desc);
+
+       /* Upper word: size, region and the high part of the address. */
+       desc->dword[1] = ((bytes << __DW2(RX_KER_BUF_SIZE_LBN)) |
+                         (region << __DW2(RX_KER_BUF_REGION_LBN)) |
+                         (HIGH(dest,
+                               RX_KER_BUF_ADR_LBN, 
+                               RX_KER_BUF_ADR_WIDTH)));
+
+       /* Lower word: the low part of the address. */
+       desc->dword[0] = LOW(dest, 
+                            RX_KER_BUF_ADR_LBN, 
+                            RX_KER_BUF_ADR_WIDTH);
+}
+
+/*! Set up a virtual buffer descriptor for an IPMODE transfer.
+**
+**  buf_id   buffer table index (TX_USR_BUF_ID field)
+**  buf_ofs  byte offset within the buffer (TX_USR_BYTE_OFS field)
+**  bytes    byte count (TX_USR_BYTE_CNT field)
+**  port     value for the TX_USR_PORT field
+**  frag     value for the TX_USR_CONT field
+**  desc     descriptor to fill in; must not be NULL
+*/
+ef_vi_inline void
+__falcon_dma_tx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, unsigned bytes,
+                           int port, int frag, 
+                           ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+       DWCHCK(__DW2(TX_USR_PORT_LBN), TX_USR_PORT_WIDTH);
+       DWCHCK(__DW2(TX_USR_CONT_LBN), TX_USR_CONT_WIDTH);
+       DWCHCK(__DW2(TX_USR_BYTE_CNT_LBN), TX_USR_BYTE_CNT_WIDTH);
+       /* NOTE(review): this checks the RX_KER_BUF_ADR field although the
+        * field split across the two words below is TX_USR_BUF_ID --
+        * looks like a copy-paste from the RX physical builder; confirm. */
+       LWCHK(RX_KER_BUF_ADR_LBN, RX_KER_BUF_ADR_WIDTH);
+       DWCHCK(TX_USR_BYTE_OFS_LBN, TX_USR_BYTE_OFS_WIDTH);
+
+       RANGECHCK(bytes,   TX_USR_BYTE_CNT_WIDTH);
+       RANGECHCK(port,    TX_USR_PORT_WIDTH);
+       RANGECHCK(frag,    TX_USR_CONT_WIDTH);
+       RANGECHCK(buf_id,  TX_USR_BUF_ID_WIDTH);
+       RANGECHCK(buf_ofs, TX_USR_BYTE_OFS_WIDTH);
+
+       ef_assert(desc);
+
+       /* Upper word: port, continuation flag, byte count and the high
+        * part of the buffer id. */
+       desc->dword[1] = ((port   <<  __DW2(TX_USR_PORT_LBN))      | 
+                         (frag   <<  __DW2(TX_USR_CONT_LBN))      | 
+                         (bytes  <<  __DW2(TX_USR_BYTE_CNT_LBN))  |
+                         (HIGH(buf_id, 
+                               TX_USR_BUF_ID_LBN,
+                               TX_USR_BUF_ID_WIDTH)));
+
+       /* Lower word: the low part of the buffer id and the byte offset. */
+       desc->dword[0] =  ((LOW(buf_id,
+                               TX_USR_BUF_ID_LBN,
+                               (TX_USR_BUF_ID_WIDTH))) |
+                          (buf_ofs << TX_USR_BYTE_OFS_LBN));
+}
+
+/* Build a TX buffer descriptor from a combined 32-bit buffer address:
+ * [buf_vaddr] is split into a buffer id and an offset with the
+ * EFVI_FALCON_BUFFER_4K_* macros and handed to
+ * __falcon_dma_tx_calc_ip_buf(). */
+ef_vi_inline void
+falcon_dma_tx_calc_ip_buf_4k(unsigned buf_vaddr, unsigned bytes,
+                            int port, int frag, 
+                            ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+       /* TODO FIXME [buf_vaddr] consists of the buffer index in the
+       ** high bits, and an offset in the low bits. Assumptions
+       ** permeate the code that these can be rolled into one 32bit
+       ** value, so this is currently preserved for Falcon. But we
+       ** should change to support 8K pages
+       */
+       unsigned buf_id =  EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
+       unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
+
+       __falcon_dma_tx_calc_ip_buf( buf_id, buf_ofs, bytes, port, frag, desc);
+}
+
+/* Build a TX buffer descriptor for [buf_vaddr]; currently a plain
+ * forward to the 4K-page variant. */
+ef_vi_inline void
+falcon_dma_tx_calc_ip_buf(unsigned buf_vaddr, unsigned bytes, int port, 
+                         int frag, ef_vi_falcon_dma_tx_buf_desc *desc)
+{
+       falcon_dma_tx_calc_ip_buf_4k(buf_vaddr, bytes, port, frag, desc);
+}
+
+/*! Set up a virtual buffer based RX descriptor.
+**
+**  buf_id   buffer table index (RX_USR_BUF_ID field)
+**  buf_ofs  byte offset within the buffer; must be even, it is stored
+**           halved in the RX_USR_2BYTE_OFS field
+**  desc     descriptor to fill in; must not be NULL
+*/
+ef_vi_inline void
+__falcon_dma_rx_calc_ip_buf(unsigned buf_id, unsigned buf_ofs, 
+                           ef_vi_falcon_dma_rx_buf_desc *desc)
+{ 
+       /* check alignment of buffer offset and pack */
+       ef_assert((buf_ofs & 0x1) == 0);
+
+       buf_ofs >>= 1;
+
+       DWCHCK(RX_USR_2BYTE_OFS_LBN, RX_USR_2BYTE_OFS_WIDTH);
+       DWCHCK(RX_USR_BUF_ID_LBN, RX_USR_BUF_ID_WIDTH);
+
+       RANGECHCK(buf_ofs, RX_USR_2BYTE_OFS_WIDTH);
+       RANGECHCK(buf_id,  RX_USR_BUF_ID_WIDTH);
+
+       ef_assert(desc);
+
+       desc->dword[0] = ((buf_ofs << RX_USR_2BYTE_OFS_LBN) | 
+                         (buf_id  << RX_USR_BUF_ID_LBN));
+}
+
+/* Build an RX buffer descriptor from a combined 32-bit buffer address:
+ * [buf_vaddr] is split into a buffer id and an offset with the
+ * EFVI_FALCON_BUFFER_4K_* macros and handed to
+ * __falcon_dma_rx_calc_ip_buf(). */
+ef_vi_inline void
+falcon_dma_rx_calc_ip_buf_4k(unsigned buf_vaddr, 
+                            ef_vi_falcon_dma_rx_buf_desc *desc)
+{ 
+       /* TODO FIXME [buf_vaddr] consists of the buffer index in the
+       ** high bits, and an offset in the low bits. Assumptions
+       ** permeate the code that these can be rolled into one 32bit
+       ** value, so this is currently preserved for Falcon. But we
+       ** should change to support 8K pages
+       */
+       unsigned buf_id =  EFVI_FALCON_BUFFER_4K_PAGE(buf_vaddr);
+       unsigned buf_ofs = EFVI_FALCON_BUFFER_4K_OFF(buf_vaddr);
+
+       __falcon_dma_rx_calc_ip_buf(buf_id, buf_ofs, desc);
+}
+
+/* Build an RX buffer descriptor for [buf_vaddr]; currently a plain
+ * forward to the 4K-page variant. */
+ef_vi_inline void
+falcon_dma_rx_calc_ip_buf(unsigned buf_vaddr, 
+                         ef_vi_falcon_dma_rx_buf_desc *desc)
+{ 
+       falcon_dma_rx_calc_ip_buf_4k(buf_vaddr, desc);
+}
+
+
+/*! Convert an ef_addr to a DMA address; this is a plain cast. */
+ef_vi_inline ef_vi_dma_addr_t ef_physaddr(ef_addr efaddr)
+{
+       return (ef_vi_dma_addr_t) efaddr;
+}
+
+
+/*! Convert an ef_addr to a buffer table index.
+**  Asserts that the value fits in 32 bits, i.e. that this was not a
+**  physical address; the result is the value cast to
+**  ef_vi_buffer_addr_t.
+*/
+ef_vi_inline ef_vi_buffer_addr_t ef_bufaddr(ef_addr efaddr)
+{
+       ef_assert(efaddr < ((uint64_t)1 << 32) );
+
+       return (ef_vi_buffer_addr_t) efaddr;
+}
+
+
+/*! Set up a physical address based descriptor for an IPMODE transfer.
+**
+**  src_dma_addr  DMA address of the data (masked to 46 bits for the
+**                upper word)
+**  bytes         byte count (TX_KER_BYTE_CNT field)
+**  port          value for the TX_KER_PORT field
+**  frag          value for the TX_KER_CONT field
+**  desc          descriptor to fill in; must not be NULL
+**
+**  The region field is always written as 0.
+*/
+ef_vi_inline void
+falcon_dma_tx_calc_ip_phys(ef_vi_dma_addr_t src_dma_addr, unsigned bytes, 
+                          int port, int frag,
+                          ef_vi_falcon_dma_tx_phys_desc *desc)
+{
+
+       int region = 0; /* FIXME */
+       int64_t src    = dma_addr_to_u46(src_dma_addr); /* lower 46 bits */
+
+       DWCHCK(__DW2(TX_KER_PORT_LBN),      TX_KER_PORT_WIDTH);
+       DWCHCK(__DW2(TX_KER_CONT_LBN),      TX_KER_CONT_WIDTH);
+       DWCHCK(__DW2(TX_KER_BYTE_CNT_LBN),  TX_KER_BYTE_CNT_WIDTH);
+       DWCHCK(__DW2(TX_KER_BUF_REGION_LBN),TX_KER_BUF_REGION_WIDTH);
+
+       LWCHK(TX_KER_BUF_ADR_LBN, TX_KER_BUF_ADR_WIDTH);
+
+       RANGECHCK(port,   TX_KER_PORT_WIDTH);
+       RANGECHCK(frag,   TX_KER_CONT_WIDTH);
+       RANGECHCK(bytes,  TX_KER_BYTE_CNT_WIDTH);
+       RANGECHCK(region, TX_KER_BUF_REGION_WIDTH);
+
+       /* Same NULL check as the other descriptor builders in this file. */
+       ef_assert(desc);
+
+       /* Upper word: port, continuation flag, byte count, region and the
+        * high part of the address. */
+       desc->dword[1] = ((port   <<  __DW2(TX_KER_PORT_LBN))      | 
+                         (frag   <<  __DW2(TX_KER_CONT_LBN))      | 
+                         (bytes  <<  __DW2(TX_KER_BYTE_CNT_LBN))  | 
+                         (region << __DW2(TX_KER_BUF_REGION_LBN)) |
+                         (HIGH(src,
+                               TX_KER_BUF_ADR_LBN, 
+                               TX_KER_BUF_ADR_WIDTH)));
+
+       /* The address field starts at bit 0, so the lower word is simply
+        * the low 32 bits of the address. */
+       ef_assert_equal(TX_KER_BUF_ADR_LBN, 0);
+       desc->dword[0] = (uint32_t) src_dma_addr;
+}
+
+
+/* Initialise the Falcon-specific parts of [vi] from the mappings in
+ * [vvis], which must point to a struct vi_mappings carrying
+ * VI_MAPPING_SIGNATURE and arch EF_VI_ARCH_FALCON.
+ *
+ * For each DMA queue with a non-zero capacity this sets the queue mask
+ * (capacity - 1), doorbell address, descriptor ring pointer and the
+ * request-id array.  The id arrays are laid out directly after
+ * *vi->ep_state: TX ids first (if any), then RX ids.
+ */
+void falcon_vi_init(ef_vi* vi, void* vvis)
+{
+       struct vi_mappings *vm = (struct vi_mappings*)vvis;
+       uint16_t* ids;
+
+       ef_assert(vi);
+       ef_assert(vvis);
+       ef_assert_equal(vm->signature, VI_MAPPING_SIGNATURE);
+       ef_assert_equal(vm->nic_type.arch, EF_VI_ARCH_FALCON);
+
+       /* Initialise masks to zero, so that ef_vi_state_init() will
+       ** not do any harm when we don't have DMA queues. */
+       vi->vi_rxq.mask = vi->vi_txq.mask = 0;
+
+       /* Used for BUG5391_WORKAROUND. */
+       vi->vi_txq.misalign_mask = 0;
+
+       /* Initialise doorbell addresses to a distinctive small value
+       ** which will cause a segfault, to trap doorbell pushes to VIs
+       ** without DMA queues. */
+       vi->vi_rxq.doorbell = vi->vi_txq.doorbell = (ef_vi_ioaddr_t)0xdb;
+
+       /* The id arrays start immediately after the state structure. */
+       ids = (uint16_t*) (vi->ep_state + 1);
+
+       if( vm->tx_queue_capacity ) {
+               vi->vi_txq.mask = vm->tx_queue_capacity - 1;
+               /* NOTE(review): the meaning of the +12 offset into the
+                * doorbell mapping is not visible here -- confirm
+                * against the register layout. */
+               vi->vi_txq.doorbell = vm->tx_bell + 12;
+               vi->vi_txq.descriptors = vm->tx_dma_falcon;
+               vi->vi_txq.ids = ids;
+               /* Step past the TX ids so the RX ids follow them. */
+               ids += vi->vi_txq.mask + 1;
+               /* Check that the id fifo fits in the space allocated. */
+               ef_assert_le((char*) (vi->vi_txq.ids + vm->tx_queue_capacity),
+                            (char*) vi->ep_state
+                            + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
+                                                     vm->tx_queue_capacity));
+       }
+       if( vm->rx_queue_capacity ) {
+               vi->vi_rxq.mask = vm->rx_queue_capacity - 1;
+               vi->vi_rxq.doorbell = vm->rx_bell + 12;
+               vi->vi_rxq.descriptors = vm->rx_dma_falcon;
+               vi->vi_rxq.ids = ids;
+               /* Check that the id fifo fits in the space allocated. */
+               ef_assert_le((char*) (vi->vi_rxq.ids + vm->rx_queue_capacity),
+                            (char*) vi->ep_state
+                            + ef_vi_calc_state_bytes(vm->rx_queue_capacity,
+                                                     vm->tx_queue_capacity));
+       }
+
+       /* Variant 'A' NICs get the TX misalignment workaround and the
+        * BUG5692 workaround flag. */
+       if( vm->nic_type.variant == 'A' ) {
+               vi->vi_txq.misalign_mask = 15;    /* BUG5391_WORKAROUND */
+               vi->vi_flags |= EF_VI_BUG5692_WORKAROUND;
+       }
+}
+
+
+/* Write TX descriptors for the [iov_len] segments of [iov] into the TX
+ * ring of [vi], without ringing the doorbell (see
+ * ef_vi_transmit_push()).
+ *
+ * Each segment is cut so that no descriptor crosses a 4K address
+ * boundary, and is cut further by the BUG5391 workaround when
+ * q->misalign_mask is set.  [dma_id] is recorded against the last
+ * descriptor written; it must fit in EF_REQUEST_ID_MASK and must not be
+ * 0xffff, the value that marks a free id slot.
+ *
+ * Returns 0 on success.  Returns -EAGAIN if the ring fills up part way
+ * through, in which case qs->added is restored to its value on entry.
+ */
+int ef_vi_transmitv_init(ef_vi* vi, const ef_iovec* iov, int iov_len,
+                        ef_request_id dma_id)
+{
+       ef_vi_txq* q = &vi->vi_txq;
+       ef_vi_txq_state* qs = &vi->ep_state->txq;
+       ef_vi_falcon_dma_tx_buf_desc* dp;
+       unsigned len, dma_len, di;
+       unsigned added_save = qs->added;
+       ef_addr dma_addr;
+       unsigned last_len = 0;
+
+       ef_assert(iov_len > 0);
+       ef_assert(iov);
+       ef_assert_equal((dma_id & EF_REQUEST_ID_MASK), dma_id);
+       ef_assert_nequal(dma_id, 0xffff);
+
+       dma_addr = iov->iov_base;
+       len = iov->iov_len;
+
+       if( vi->vi_flags & EF_VI_ISCSI_TX_DDIG ) {
+               /* Last 4 bytes of placeholder for digest must be
+                * removed for h/w */
+               ef_assert(len > 4);
+               last_len = iov[iov_len - 1].iov_len;
+               if( last_len <= 4 ) {
+                       /* The final segment holds only (part of) the
+                        * placeholder: drop it and trim the remainder
+                        * from the segment before it. */
+                       ef_assert(iov_len > 1);
+                       --iov_len;
+                       last_len = iov[iov_len - 1].iov_len - (4 - last_len);
+               }
+               else {
+                       last_len = iov[iov_len - 1].iov_len - 4;
+               }
+               if( iov_len == 1 )
+                       len = last_len;
+       }
+
+       while( 1 ) {
+               /* Ring full: undo the descriptors claimed so far. */
+               if( qs->added - qs->removed >= q->mask ) {
+                       qs->added = added_save;
+                       return -EAGAIN;
+               }
+
+               /* Bytes from dma_addr up to the next 4K boundary. */
+               dma_len = (~((unsigned) dma_addr) & 0xfff) + 1;
+               if( dma_len > len )  dma_len = len;
+               { /* BUG5391_WORKAROUND */
+                       unsigned misalign = 
+                               (unsigned) dma_addr & q->misalign_mask;
+                       if( misalign && dma_len + misalign > 512 )
+                               dma_len = 512 - misalign;
+               }
+
+               di = qs->added++ & q->mask;
+               dp = (ef_vi_falcon_dma_tx_buf_desc*) q->descriptors + di;
+               /* The frag argument is 0 only for the descriptor that
+                * finishes the last segment. */
+               if( vi->vi_flags & EF_VI_TX_PHYS_ADDR )
+                       falcon_dma_tx_calc_ip_phys
+                               (ef_physaddr(dma_addr), dma_len, /*port*/ 0,
+                                (iov_len == 1 && dma_len == len) ? 0 :
+                                EFVI_FALCON_DMA_TX_FRAG, dp);
+               else
+                       falcon_dma_tx_calc_ip_buf
+                               (ef_bufaddr(dma_addr), dma_len, /*port*/ 0,
+                                (iov_len == 1 && dma_len == len) ? 0 :
+                                EFVI_FALCON_DMA_TX_FRAG, dp);
+
+               dma_addr += dma_len;
+               len -= dma_len;
+
+               if( len == 0 ) {
+                       /* Segment finished: move to the next one, or
+                        * stop after the last. */
+                       if( --iov_len == 0 )  break;
+                       ++iov;
+                       dma_addr = iov->iov_base;
+                       len = iov->iov_len;
+                       if( (vi->vi_flags & EF_VI_ISCSI_TX_DDIG) &&
+                           (iov_len == 1) )
+                               len = last_len;
+               }
+       }
+
+       /* Record the request id against the last descriptor written. */
+       q->ids[di] = (uint16_t) dma_id;
+       return 0;
+}
+
+
+/* Ring the TX doorbell of [vi]: after ef_vi_wiob(), write the current
+ * txq.added count (masked to the ring size and shifted into the
+ * TX_DESC_WPTR field) to the doorbell address. */
+void ef_vi_transmit_push(ef_vi* vi)
+{
+       ef_vi_wiob();
+       writel((vi->ep_state->txq.added & vi->vi_txq.mask) <<
+               __DW4(TX_DESC_WPTR_LBN),
+               vi->vi_txq.doorbell);
+}
+
+
+/*! Write one RX descriptor for the buffer at [addr] into the RX ring of
+**  [vi] and record [dma_id] against it, without ringing the doorbell
+**  (see ef_vi_receive_push()).
+**
+**  The value of initial_rx_bytes is used to set RX_KER_BUF_SIZE in an initial
+**  receive descriptor here if physical addressing is being used. A value of
+**  zero represents 16384 bytes.  This is okay, because caller must provide a
+**  buffer that is > MTU, and mac should filter anything bigger than that.
+**
+**  Returns 0 on success, or -EAGAIN when ef_vi_receive_space() reports
+**  no room in the ring.
+*/
+int ef_vi_receive_init(ef_vi* vi, ef_addr addr, ef_request_id dma_id,
+                      int initial_rx_bytes)
+{
+       ef_vi_rxq* q = &vi->vi_rxq;
+       ef_vi_rxq_state* qs = &vi->ep_state->rxq;
+       unsigned di;
+
+       if( ef_vi_receive_space(vi) ) {
+               di = qs->added++ & q->mask;
+               /* The slot must be free (0xffff) before it is reused. */
+               ef_assert_equal(q->ids[di], 0xffff);
+               q->ids[di] = (uint16_t) dma_id;
+
+               if( ! (vi->vi_flags & EF_VI_RX_PHYS_ADDR) ) {
+                       /* Buffer-table addressing: one-word descriptor. */
+                       ef_vi_falcon_dma_rx_buf_desc* dp;
+                       dp = (ef_vi_falcon_dma_rx_buf_desc*) 
+                               q->descriptors + di;
+                       falcon_dma_rx_calc_ip_buf(ef_bufaddr(addr), dp);
+               }
+               else {
+                       /* Physical addressing: two-word descriptor. */
+                       ef_vi_falcon_dma_rx_phys_desc* dp;
+                       dp = (ef_vi_falcon_dma_rx_phys_desc*) 
+                               q->descriptors + di;
+                       __falcon_dma_rx_calc_ip_phys(addr, dp,
+                                                    initial_rx_bytes);
+               }
+
+               return 0;
+       }
+
+       return -EAGAIN;
+}
+
+
+/* Post one RX buffer: ef_vi_receive_init() with initial_rx_bytes 0,
+ * followed by ef_vi_receive_push() when that succeeds.  Returns the
+ * result of ef_vi_receive_init(). */
+int ef_vi_receive_post(ef_vi* vi, ef_addr addr, ef_request_id dma_id)
+{
+  int rc = ef_vi_receive_init(vi, addr, dma_id, 0);
+  if( rc == 0 )  ef_vi_receive_push(vi);
+  return rc;
+}
+
+
+/* Ring the RX doorbell of [vi]: after ef_vi_wiob(), write the current
+ * rxq.added count (masked to the ring size and shifted into the
+ * RX_DESC_WPTR field) to the doorbell address. */
+void ef_vi_receive_push(ef_vi* vi)
+{
+       ef_vi_wiob();
+       writel ((vi->ep_state->rxq.added & vi->vi_rxq.mask) <<
+               __DW4(RX_DESC_WPTR_LBN),
+               vi->vi_rxq.doorbell);
+}
+
+
+/* Complete the RX descriptor named by the RX or RX_DISCARD event
+ * [ef_ev].
+ *
+ * The descriptor index is the low word of the hardware event masked to
+ * the ring size.  The request id stored for that descriptor is
+ * returned, its slot is marked free (0xffff) and rxq.removed is
+ * advanced by one.  Note that state reachable through [vi] is modified
+ * even though [vi] is declared const.
+ */
+ef_request_id ef_vi_receive_done(const ef_vi* vi, const ef_event* ef_ev)
+{
+       const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*ef_ev);
+       unsigned di = ev->u32[0] & vi->vi_rxq.mask;
+       ef_request_id rq_id;
+
+       ef_assert(EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX ||
+                 EF_EVENT_TYPE(*ef_ev) == EF_EVENT_TYPE_RX_DISCARD);
+
+       /* Detect spurious / duplicate RX events.  We may need to modify this
+       ** code so that we are robust if they happen. */
+       ef_assert_equal(di, vi->ep_state->rxq.removed & vi->vi_rxq.mask);
+
+       /* We only support 1 port: so events should be in order. */
+       ef_assert(vi->vi_rxq.ids[di] != 0xffff);
+
+       rq_id = vi->vi_rxq.ids[di];
+       vi->vi_rxq.ids[di] = 0xffff;
+       ++vi->ep_state->rxq.removed;
+       return rq_id;
+}
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netfront/pt_tx.c b/drivers/xen/sfc_netfront/pt_tx.c

new file mode 100644 (file)

index 0000000..bdc1f88
--- /dev/null
+++ b/drivers/xen/sfc_netfront/pt_tx.c
@@ -0,0 +1,91 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Packet-mode transmit interface.
+ *   \date  2003/04/02
+ */
+
+/*! \cidoxg_lib_ef */
+#include "ef_vi_internal.h"
+
+
+/* Single-buffer form of ef_vi_transmitv_init(): wraps [base]/[len] in a
+ * one-element iovec.  Does not ring the doorbell.  Returns the result
+ * of ef_vi_transmitv_init(). */
+int ef_vi_transmit_init(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+       ef_iovec iov = { base, len };
+       return ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+}
+
+
+/* Queue a single buffer for transmit and, when that succeeds, ring the
+ * TX doorbell.  Returns the result of ef_vi_transmitv_init(). */
+int ef_vi_transmit(ef_vi* vi, ef_addr base, int len, ef_request_id dma_id)
+{
+       ef_iovec iov = { base, len };
+       int rc = ef_vi_transmitv_init(vi, &iov, 1, dma_id);
+       if( rc == 0 )  ef_vi_transmit_push(vi);
+       return rc;
+}
+
+
+/* Queue the [iov_len] segments of [iov] for transmit and, when that
+ * succeeds, ring the TX doorbell.  Returns the result of
+ * ef_vi_transmitv_init(). */
+int ef_vi_transmitv(ef_vi* vi, const ef_iovec* iov, int iov_len,
+                    ef_request_id dma_id)
+{
+       int rc = ef_vi_transmitv_init(vi, iov, iov_len, dma_id);
+       if( rc == 0 )  ef_vi_transmit_push(vi);
+       return rc;
+}
+
+
+/* Collect the request ids completed by the TX or TX_ERROR event [__ev].
+ *
+ * Walks the TX ring from txq.removed up to and including the descriptor
+ * index carried in the low word of the event, advancing txq.removed as
+ * it goes.  Every id slot that is not free (0xffff) is copied to [ids]
+ * and then marked free.  [ids] must have room for
+ * EF_VI_TRANSMIT_BATCH entries.
+ *
+ * Returns the number of ids written to [ids].
+ */
+int ef_vi_transmit_unbundle(ef_vi* vi, const ef_event* __ev,
+                           ef_request_id* ids)
+{
+       ef_request_id* ids_in = ids;
+       ef_vi_txq* q = &vi->vi_txq;
+       ef_vi_txq_state* qs = &vi->ep_state->txq;
+       const ef_vi_qword* ev = EF_GET_HW_EV_PTR(*__ev);
+       /* [stop] is one past the descriptor index named by the event. */
+       unsigned i, stop = (ev->u32[0] + 1) & q->mask;
+
+       ef_assert(EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX ||
+                 EF_EVENT_TYPE(*__ev) == EF_EVENT_TYPE_TX_ERROR);
+
+       /* Shouldn't be batching more than 64 descriptors, and should not go
+       ** backwards. */
+       ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask), 64);
+       /* Should not complete more than we've posted. */
+       ef_assert_le((((ev->u32[0] + 1) - qs->removed) & q->mask),
+                    qs->added - qs->removed);
+
+       /* The loop increment advances qs->removed itself. */
+       for( i = qs->removed & q->mask; i != stop; i = ++qs->removed & q->mask )
+               if( q->ids[i] != 0xffff ) {
+                       *ids++ = q->ids[i];
+                       q->ids[i] = 0xffff;
+               }
+
+       ef_assert_le(ids - ids_in, EF_VI_TRANSMIT_BATCH);
+
+       return (int) (ids - ids_in);
+}
+
+/*! \cidoxg_end */
diff --git a/drivers/xen/sfc_netfront/sysdep.h b/drivers/xen/sfc_netfront/sysdep.h

new file mode 100644 (file)

index 0000000..dc2234e
--- /dev/null
+++ b/drivers/xen/sfc_netfront/sysdep.h
@@ -0,0 +1,185 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  stg
+ *  \brief  System dependent support for ef vi lib
+ *   \date  2007/05/10
+ */
+
+/*! \cidoxg_include_ci_ul */
+#ifndef __CI_CIUL_SYSDEP_LINUX_H__
+#define __CI_CIUL_SYSDEP_LINUX_H__
+
+
+/* Barrier issued before each doorbell write; maps to mmiowb(), for
+ * which a fallback is provided further down when the kernel headers do
+ * not define it. */
+#define ef_vi_wiob()  mmiowb()
+
+
+/**********************************************************************
+ * Kernel version compatibility
+ */
+
+#if defined(__GNUC__)
+
+/* Linux kernel doesn't have stdint.h or [u]intptr_t. */
+# if !defined(LINUX_VERSION_CODE)
+#  include <linux/version.h>
+# endif
+# include <asm/io.h>
+
+/* In Linux 2.6.24, linux/types.h has uintptr_t */
+# if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+#  if BITS_PER_LONG == 32
+   typedef __u32         uintptr_t;
+#  else
+   typedef __u64         uintptr_t;
+#  endif
+# endif
+
+/* But even 2.6.24 doesn't define intptr_t */
+# if BITS_PER_LONG == 32
+   typedef __s32         intptr_t;
+# else
+   typedef __s64         intptr_t;
+# endif
+
+/* printf length modifier + conversion for a 64-bit hex value */
+# if defined(__ia64__)
+#  define EF_VI_PRIx64  "lx"
+# else
+#  define EF_VI_PRIx64  "llx"
+# endif
+
+/* Both expand to the GCC "hidden" symbol visibility attribute. */
+# define EF_VI_HF __attribute__((visibility("hidden")))
+# define EF_VI_HV __attribute__((visibility("hidden")))
+
+/* In this section ef_vi_dma_addr_t is only defined for x86 targets. */
+# if defined(__i386__) || defined(__x86_64__)  /* GCC x86/x64 */
+   typedef unsigned long long ef_vi_dma_addr_t; 
+# endif
+#endif
+
+/* Fallback for mmiowb(): empty on x86, an "mf.a" instruction on ia64,
+ * and a build error on any other architecture. */
+#ifndef mmiowb
+# if defined(__i386__) || defined(__x86_64__)
+#  define mmiowb()
+# elif defined(__ia64__)
+#  ifndef ia64_mfa
+#   define ia64_mfa() asm volatile ("mf.a" ::: "memory")
+#  endif
+#  define mmiowb ia64_mfa
+# else
+#  error "Need definition for mmiowb"
+# endif
+#endif
+
+#ifdef EFX_NOT_UPSTREAM
+
+/* Stuff for architectures/compilers not officially supported */
+
+/* NOTE(review): the first branch below tests !defined(__GNUC__), yet its
+ * inner comments and #error text refer to GCC.  As written, a compiler
+ * that defines __GNUC__ skips that branch and reaches the final
+ * "#error Unknown compiler" unless __PGI or __INTEL_COMPILER is also
+ * defined, while the PGI/Intel branches cannot be reached by a compiler
+ * that does not define __GNUC__.  The test looks inverted -- confirm
+ * before defining EFX_NOT_UPSTREAM. */
+#if !defined(__GNUC__)
+# if defined(__PPC__)  /* GCC, PPC */
+   typedef unsigned long     ef_vi_dma_addr_t;
+
+#  ifdef __powerpc64__
+#   ifdef CONFIG_SMP
+#    define CI_SMP_SYNC        "\n   eieio     \n"         /* memory cache sync */
+#    define CI_SMP_ISYNC       "\n   isync     \n"         /* instr cache sync */
+#   else
+#    define CI_SMP_SYNC
+#    define CI_SMP_ISYNC
+#   endif
+#  else         /* for ppc32 systems */
+#   ifdef CONFIG_SMP
+#    define CI_SMP_SYNC        "\n   eieio     \n"
+#    define CI_SMP_ISYNC       "\n   sync      \n"
+#   else
+#    define CI_SMP_SYNC
+#    define CI_SMP_ISYNC
+#   endif
+#  endif
+
+# elif defined(__ia64__)  /* GCC, IA64 */
+   typedef unsigned long     ef_vi_dma_addr_t;
+# else
+#  error Unknown processor - GNU C
+# endif
+
+#elif defined(__PGI)
+# error PGI not supported 
+
+#elif defined(__INTEL_COMPILER)
+
+/* Intel compilers v7 claim to be very gcc compatible. */
+# if __INTEL_COMPILER >= 700
+#  if __GNUC__ >= 3 || (__GNUC__ == 2 && __GNUC_MINOR__ > 91)
+#   define EF_VI_LIKELY(t)    __builtin_expect((t), 1)
+#   define EF_VI_UNLIKELY(t)  __builtin_expect((t), 0)
+#  endif
+# else
+#  error Old Intel compiler not supported.
+# endif
+
+#else
+# error Unknown compiler.
+#endif
+
+#endif
+
+
+# include <linux/errno.h>
+
+
+/**********************************************************************
+ * Extracting bit fields.
+ *
+ * A field F is described by the constants F_LBN (lowest bit number
+ * within the 64-bit word) and F_WIDTH (width in bits).  [v] is a value
+ * with u32[2] and u64[1] members viewing the same word.
+ */
+
+/* Field lies entirely in u32[0]. */
+#define _QWORD_GET_LOW(f, v)                                    \
+  (((v).u32[0] >> (f##_LBN)) & ((1u << f##_WIDTH) - 1u))
+/* Field lies entirely in u32[1]. */
+#define _QWORD_GET_HIGH(f, v)                                           \
+  (((v).u32[1] >> (f##_LBN - 32u)) & ((1u << f##_WIDTH) - 1u))
+/* Field may straddle the two halves: extract from the 64-bit view. */
+#define _QWORD_GET_ANY(f, v)                                            \
+  (((v).u64[0] >> f##_LBN) & (((uint64_t) 1u << f##_WIDTH) - 1u))
+
+/* Pick the cheapest extraction for the field's position.
+ * NOTE(review): the LOW/HIGH forms shift 1u by F_WIDTH, so they rely on
+ * F_WIDTH being less than 32 -- confirm no 32-bit-wide field is used. */
+#define QWORD_GET(f, v)                                                     \
+  ((f##_LBN + f##_WIDTH) <= 32u                                             \
+   ? _QWORD_GET_LOW(f, (v))                                                 \
+   : ((f##_LBN >= 32u) ? _QWORD_GET_HIGH(f, (v)) : _QWORD_GET_ANY(f, (v))))
+
+/* As QWORD_GET, with the result cast to unsigned. */
+#define QWORD_GET_U(f, v)  ((unsigned) QWORD_GET(f, (v)))
+
+/* Single-bit tests: the result is non-zero (not necessarily 1) when the
+ * bit at F_LBN is set. */
+#define _QWORD_TEST_BIT_LOW(f, v)   ((v).u32[0] & (1u << (f##_LBN)))
+#define _QWORD_TEST_BIT_HIGH(f, v)  ((v).u32[1] & (1u << (f##_LBN - 32u)))
+
+#define QWORD_TEST_BIT(f, v)                                                  \
+  (f##_LBN < 32 ? _QWORD_TEST_BIT_LOW(f, (v)) : _QWORD_TEST_BIT_HIGH(f, (v)))
+
+
+
+
+#ifndef DECLSPEC_NORETURN
+/* normally defined on Windows to expand to a declaration that the
+   function will not return */
+# define DECLSPEC_NORETURN
+#endif
+
+#endif  /* __CI_CIUL_SYSDEP_LINUX_H__ */
diff --git a/drivers/xen/sfc_netfront/vi_init.c b/drivers/xen/sfc_netfront/vi_init.c

new file mode 100644 (file)

index 0000000..4e7e19b
--- /dev/null
+++ b/drivers/xen/sfc_netfront/vi_init.c
@@ -0,0 +1,183 @@
+/****************************************************************************
+ * Copyright 2002-2005: Level 5 Networks Inc.
+ * Copyright 2005-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications
+ *  <linux-xen-drivers@solarflare.com>
+ *  <onload-dev@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * \author  djr
+ *  \brief  Initialisation of VIs.
+ *   \date  2007/06/08
+ */
+
+#include "ef_vi_internal.h"
+
+/* sizeof(ef_vi_state) plus one uint16_t for each RX and TX queue entry. */
+#define EF_VI_STATE_BYTES(rxq_sz, txq_sz)                      \
+       (sizeof(ef_vi_state) + (rxq_sz) * sizeof(uint16_t)      \
+        + (txq_sz) * sizeof(uint16_t))
+
+int ef_vi_calc_state_bytes(int rxq_sz, int txq_sz)
+{
+       ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
+       ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
+       return (int) EF_VI_STATE_BYTES(rxq_sz, txq_sz);
+}
+
+
+int ef_vi_state_bytes(ef_vi* vi)
+{
+       /* Each queue size is capacity + 1 when the capacity is non-zero,
+        * otherwise 0; the asserts require zero or a power of two. */
+       int rxq_sz = ef_vi_receive_capacity(vi)
+               ? ef_vi_receive_capacity(vi) + 1 : 0;
+       int txq_sz = ef_vi_transmit_capacity(vi)
+               ? ef_vi_transmit_capacity(vi) + 1 : 0;
+
+       ef_assert(rxq_sz == 0 || EF_VI_IS_POW2(rxq_sz));
+       ef_assert(txq_sz == 0 || EF_VI_IS_POW2(txq_sz));
+       return EF_VI_STATE_BYTES(rxq_sz, txq_sz);
+}
+
+
+void ef_eventq_state_init(ef_vi* evq)
+{
+       ef_rx_dup_state_t *dup = evq->evq_state->rx_dup_state;
+       int q = 0;
+
+       /* Reset every rx_dup_state entry, then rewind the event pointer. */
+       while (q < EFAB_DMAQS_PER_EVQ_MAX) {
+               dup[q].bad_sop = 0;
+               dup[q].rx_last_desc_ptr = -1;
+               dup[q].frag_num = 0;
+               ++q;
+       }
+       evq->evq_state->evq_ptr = 0;
+}
+
+
+void ef_vi_state_init(ef_vi* vi)
+{
+       ef_vi_state* st = vi->ep_state;
+       unsigned slot;
+
+       /* Both queues start with nothing added and nothing removed. */
+       st->txq.added = st->txq.removed = 0;
+       st->rxq.added = st->rxq.removed = 0;
+       /* A queue whose mask is zero is skipped; otherwise every id slot
+        * (indices 0..mask) is set to the all-ones uint16_t value. */
+       for( slot = 0; vi->vi_rxq.mask && slot <= vi->vi_rxq.mask; ++slot )
+               vi->vi_rxq.ids[slot] = (uint16_t) -1;
+       for( slot = 0; vi->vi_txq.mask && slot <= vi->vi_txq.mask; ++slot )
+               vi->vi_txq.ids[slot] = (uint16_t) -1;
+}
+
+
+void ef_vi_init_mapping_evq(void* data_area, struct ef_vi_nic_type nic_type,
+                            int instance, unsigned evq_bytes, void* base,
+                            void* timer_reg)
+{
+       /* data_area is filled in as a struct vi_mappings. */
+       struct vi_mappings* map = data_area;
+       map->signature     = VI_MAPPING_SIGNATURE;
+       map->nic_type      = nic_type;
+       map->vi_instance   = instance;
+       map->evq_base      = base;
+       map->evq_bytes     = evq_bytes;
+       map->evq_timer_reg = timer_reg;
+}
+
+
+void ef_vi_init(ef_vi* vi, void* vvis, ef_vi_state* state,
+                ef_eventq_state* evq_state, enum ef_vi_flags vi_flags)
+{
+       struct vi_mappings* map = vvis;
+
+       /* Generic fields first; falcon_vi_init() runs with these set. */
+       vi->vi_flags = vi_flags;
+       vi->ep_state = state;
+       vi->vi_i = map->vi_instance;
+
+       /* Only the Falcon architecture is handled here. */
+       if( map->nic_type.arch == EF_VI_ARCH_FALCON ) {
+               falcon_vi_init(vi, vvis);
+       } else {
+               /* ?? TODO: We should return an error code. */
+               ef_assert(0);
+       }
+
+       /* Event-queue fields are filled in only when the mapping
+        * carries a non-zero evq_bytes. */
+       if( map->evq_bytes != 0 ) {
+               vi->evq_timer_reg = map->evq_timer_reg;
+               vi->evq_base = map->evq_base;
+               vi->evq_mask = map->evq_bytes - 1u;
+               vi->evq_state = evq_state;
+       }
+       EF_VI_MAGIC_SET(vi, EF_VI);
+}
+
+
+/* Initialise [data_area] with information required to initialise an ef_vi.
+ * If [iobuf_mmap_rx] and [iobuf_mmap_tx] are the same pointer, the TX area
+ * is taken to start at the page-rounded end of the RX queue within it.
+ *
+ * \param  data_area     [in,out] required, must ref at least VI_MAPPING_SIZE
+ *                                bytes
+ * \param  io_mmap       [in] ef1,    required
+ *                            falcon, required
+ * \param  iobuf_mmap_rx [in] ef1,    unused
+ * \param  iobuf_mmap_tx      falcon, at least one of the two is required
+ * \param  vi_flags      [in] EF_VI_RX_PHYS_ADDR selects 8-byte RX descriptors
+ */
+void ef_vi_init_mapping_vi(void* data_area, struct ef_vi_nic_type nic_type,
+                           unsigned rxq_capacity, unsigned txq_capacity,
+                           int instance, void* io_mmap,
+                           void* iobuf_mmap_rx, void* iobuf_mmap_tx,
+                           enum ef_vi_flags vi_flags)
+{
+       struct vi_mappings* vm = (struct vi_mappings*) data_area;
+       int rx_desc_bytes, rxq_bytes;
+
+       ef_assert(rxq_capacity > 0 || txq_capacity > 0);
+       ef_assert(vm);
+       ef_assert(io_mmap);
+       ef_assert(iobuf_mmap_rx || iobuf_mmap_tx);
+
+       vm->signature = VI_MAPPING_SIGNATURE;
+       vm->vi_instance = instance;
+       vm->nic_type = nic_type;
+       /* RX queue bytes: descriptor size times capacity, rounded up to a page */
+       rx_desc_bytes = (vi_flags & EF_VI_RX_PHYS_ADDR) ? 8 : 4;
+       rxq_bytes = rxq_capacity * rx_desc_bytes;
+       rxq_bytes = (rxq_bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+       /* One shared mapping: TX follows RX at the next page boundary */
+       if( iobuf_mmap_rx == iobuf_mmap_tx )
+               iobuf_mmap_tx = (char*) iobuf_mmap_rx + rxq_bytes;
+       /* Bell addresses: io_mmap plus the low 12 bits of the register offset */
+       vm->rx_queue_capacity = rxq_capacity;
+       vm->rx_dma_falcon = iobuf_mmap_rx;
+       vm->rx_bell       = (char*) io_mmap + (RX_DESC_UPD_REG_KER_OFST & 4095);
+       vm->tx_queue_capacity = txq_capacity;
+       vm->tx_dma_falcon = iobuf_mmap_tx;
+       vm->tx_bell       = (char*) io_mmap + (TX_DESC_UPD_REG_KER_OFST & 4095);
+}
diff --git a/drivers/xen/sfc_netutil/Makefile b/drivers/xen/sfc_netutil/Makefile

new file mode 100644 (file)

index 0000000..3fce370
--- /dev/null
+++ b/drivers/xen/sfc_netutil/Makefile
@@ -0,0 +1,11 @@
+EXTRA_CFLAGS += -Idrivers/xen/sfc_netutil
+EXTRA_CFLAGS += -Werror
+# Optional gcov instrumentation: define GCOV in the build to enable it.
+ifdef GCOV
+EXTRA_CFLAGS += -fprofile-arcs -ftest-coverage -DEFX_GCOV
+endif
+
+obj-$(CONFIG_XEN_NETDEV_ACCEL_SFC_UTIL) := sfc_netutil.o
+
+sfc_netutil-objs := accel_cuckoo_hash.o accel_msg_iface.o accel_util.o
+
diff --git a/drivers/xen/sfc_netutil/accel_cuckoo_hash.c b/drivers/xen/sfc_netutil/accel_cuckoo_hash.c

new file mode 100644 (file)

index 0000000..00454cb
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_cuckoo_hash.c
@@ -0,0 +1,649 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/types.h> /* needed for linux/random.h */
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include "accel_cuckoo_hash.h"
+#include "accel_util.h"
+
+/* Return 1 when the first key_length bytes of both keys match, else 0. */
+static inline int cuckoo_hash_key_compare(cuckoo_hash_table *hashtab,
+               cuckoo_hash_key *key1, cuckoo_hash_key *key2)
+{
+       return memcmp(key1, key2, hashtab->key_length) == 0;
+}
+
+
+static inline void
+cuckoo_hash_key_set(cuckoo_hash_key *key1, cuckoo_hash_key *key2)
+{
+       key1[0] = key2[0];      /* key1 is the destination, key2 the source */
+}
+
+
+/*
+ * Pick fresh multipliers a0 and a1 for the two hash functions.  Each is
+ * filled with key_length random bytes and forced odd, so 0 < a < 2^w where
+ * w is the key length; the draw is repeated until the two values differ.
+ */
+static void set_hash_parameters(cuckoo_hash_table *hashtab)
+{
+       do {
+               /* Clear first: only key_length bytes get randomised */
+               hashtab->a1 = 0;
+               hashtab->a0 = 0;
+
+               /* Make sure random */
+               get_random_bytes(&hashtab->a0, hashtab->key_length);
+               get_random_bytes(&hashtab->a1, hashtab->key_length);
+
+               /* Make sure odd */
+               hashtab->a0 |= 1;
+               hashtab->a1 |= 1;
+
+               /* Being different is good */
+       } while (hashtab->a0 == hashtab->a1);
+}
+
+/* Allocate both sub-tables (2^length_bits entries each) and pick hash seeds. */
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+                    unsigned key_length)
+{
+       cuckoo_hash_entry *table_mem;
+       unsigned length;
+
+       BUG_ON(length_bits >= sizeof(unsigned) * 8);
+       BUG_ON(key_length > sizeof(cuckoo_hash_key));
+
+       /* Shift only after the range check, and in unsigned arithmetic */
+       length = 1u << length_bits;
+       /* One zeroed block holds both tables; kcalloc rejects size overflow */
+       table_mem = kcalloc(length, 2 * sizeof(cuckoo_hash_entry), GFP_KERNEL);
+       if (table_mem == NULL)
+               return -ENOMEM;
+
+       hashtab->length = length;
+       hashtab->length_bits = length_bits;
+       hashtab->key_length = key_length;
+       hashtab->entries = 0;
+       hashtab->table0 = table_mem;
+       hashtab->table1 = table_mem + length;
+
+       set_hash_parameters(hashtab);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_init);
+
+/* Free the table memory; table0 and table1 share a single allocation. */
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab)
+{
+       kfree(hashtab->table0);         /* kfree(NULL) is a no-op */
+       hashtab->table0 = hashtab->table1 = NULL;  /* no stale pointers */
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_destroy);
+
+/*
+ * This computes one u32 word of hash from key x and multiplier a.  Not
+ * all of its bits will necessarily be used, but the hash function throws
+ * away any that aren't
+ */
+static inline void cuckoo_compute_hash_helper(cuckoo_hash_table *hashtab,
+                                             cuckoo_hash_key *a,
+                                             cuckoo_hash_key *x,
+                                             cuckoo_hash *result) 
+{
+       u64 multiply_result = 0, a_temp, x_temp;
+       u32 carry = 0;
+       u32 *a_words;
+       u32 *x_words;
+       int i;
+
+       /*
+        * As the mod and div operations in the function effectively
+        * reduce and shift the bits of the product down to a single
+        * word, only that word is computed.  Note that each pass
+        * overwrites multiply_result: word i contributes a[i] * x[i]
+        * plus the carry (high 32 bits) from word i - 1; cross terms
+        * a[i] * x[j] are not accumulated.
+        */
+
+       /* This assumes things about the sizes of the key and hash */
+       BUG_ON(hashtab->key_length % sizeof(u32) != 0);
+       BUG_ON(sizeof(cuckoo_hash) != sizeof(u32));
+
+       a_words = (u32 *)a;
+       x_words = (u32 *)x;
+
+       for (i = 0; i < hashtab->key_length / sizeof(u32); i++) {
+               a_temp = a_words[i];
+               x_temp = x_words[i];
+               
+               multiply_result = (a_temp * x_temp) + carry;
+               carry = (multiply_result >> 32) & 0xffffffff;
+       }
+       
+       *result = multiply_result & 0xffffffff;
+}
+
+
+/*
+ * Want to implement (ax mod 2^w) div 2^(w-q) for odd a, 0 < a < 2^w;
+ * w is the length of the key, q is the length of the hash, I think.
+ * See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
+ * Returns an index in [0, hashtab->length) for the table hashed with a.
+ */
+static cuckoo_hash cuckoo_compute_hash(cuckoo_hash_table *hashtab,
+                                      cuckoo_hash_key *key,
+                                      cuckoo_hash_key *a)
+{
+       unsigned q = hashtab->length_bits;
+       cuckoo_hash hash;
+
+       cuckoo_compute_hash_helper(hashtab, a, key, &hash);
+
+       /*
+        * Keep the top q bits of the 32-bit hash.  A plain unsigned
+        * right shift does this without building a mask by left-shifting
+        * a signed int into the sign bit; q == 0 (single-entry table)
+        * is special-cased because shifting by 32 is undefined.
+        */
+       hash = q ? ((u32)hash >> (32 - q)) : 0;
+
+       BUG_ON(hash >= hashtab->length);
+       return hash;
+}
+
+
+/* Probe table0 only: on a hit store the value in *value and return 1. */
+static int cuckoo_hash_lookup0(cuckoo_hash_table *hashtab,
+                              cuckoo_hash_key *key,
+                              cuckoo_hash_value *value)
+{
+       cuckoo_hash idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+       cuckoo_hash_entry *slot = &hashtab->table0[idx];
+
+       if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
+               return 0;
+       if (!cuckoo_hash_key_compare(hashtab, &slot->key, key))
+               return 0;
+       *value = slot->value;
+       return 1;
+}
+
+/* Probe table1 only: on a hit store the value in *value and return 1. */
+static int cuckoo_hash_lookup1(cuckoo_hash_table *hashtab,
+                              cuckoo_hash_key *key,
+                              cuckoo_hash_value *value)
+{
+       cuckoo_hash idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+       cuckoo_hash_entry *slot = &hashtab->table1[idx];
+
+       if (slot->state != CUCKOO_HASH_STATE_OCCUPIED)
+               return 0;
+       if (!cuckoo_hash_key_compare(hashtab, &slot->key, key))
+               return 0;
+       *value = slot->value;
+       return 1;
+}
+
+/* Look in table0 first and in table1 only on a miss; 1 if found, else 0. */
+int cuckoo_hash_lookup(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+                      cuckoo_hash_value *value)
+{
+       return cuckoo_hash_lookup0(hashtab, key, value) ? 1 :
+               (cuckoo_hash_lookup1(hashtab, key, value) != 0);
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_lookup);
+
+
+/*
+ * Re-insert every occupied entry of "old_table" into hashtab (rehashing
+ * disabled); stops at, and returns, the first cuckoo_hash_add() error.
+ */
+static int cuckoo_hash_transfer_entries(cuckoo_hash_table *hashtab,
+                                       cuckoo_hash_entry *old_table,
+                                       unsigned capacity)
+{
+       cuckoo_hash_entry *src = old_table;
+       cuckoo_hash_entry *end = old_table + capacity;
+
+       hashtab->entries = 0;
+       for (; src != end; ++src) {
+               int rc;
+
+               if (src->state != CUCKOO_HASH_STATE_OCCUPIED)
+                       continue;
+               rc = cuckoo_hash_add(hashtab, &src->key, src->value, 0);
+               if (rc != 0)
+                       return rc;
+       }
+       return 0;
+}
+
+/* Re-seed (and if needed double) the table; 0, or -errno with it restored. */
+int cuckoo_hash_rehash(cuckoo_hash_table *hashtab)
+{
+       cuckoo_hash_entry *new_table;
+       cuckoo_hash_table old_hashtab;
+       int resize = 0, rc, rehash_count;
+
+       /*
+        * Snapshot the current table descriptor: it is both the source of
+        * the entries to copy and what is restored if the rehash fails
+        */
+       memcpy(&old_hashtab, hashtab, sizeof(cuckoo_hash_table));
+
+       /* Grow if more than half full: length counts one of the two tables */
+       if (old_hashtab.entries > old_hashtab.length &&
+           old_hashtab.length_bits < 32)
+               resize = 1;
+
+ resize:
+       if (resize) {
+               new_table = kmalloc(sizeof(cuckoo_hash_entry) * 4 * hashtab->length,
+                                   GFP_ATOMIC);
+               if (new_table == NULL) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+
+               hashtab->length = 2 * hashtab->length;
+               hashtab->length_bits++;
+       } else {
+               new_table = kmalloc(sizeof(cuckoo_hash_entry) * 2 * hashtab->length,
+                                   GFP_ATOMIC);
+               if (new_table == NULL) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+       }
+    
+       /*
+        * Point hashtab at the new allocation (table1 directly after
+        * table0) so that cuckoo_hash_add() fills the new tables
+        */
+       hashtab->table0 = new_table;
+       hashtab->table1 = (cuckoo_hash_entry *)
+               ((char *)new_table + hashtab->length * sizeof(cuckoo_hash_entry));
+  
+       rehash_count = 0;
+
+ again:
+       /* Zero the new tables (also wipes any failed earlier attempt) */
+       memset(new_table, 0, hashtab->length * 2 * sizeof(cuckoo_hash_entry));
+
+       /* Choose new parameters for the hash functions */
+       set_hash_parameters(hashtab);
+
+       /*
+        * The capacity passed is old_hashtab.length * 2, as length refers
+        * to each table and there are two of them.  This assumes that
+        * they are arranged sequentially in memory, so assert it
+        */
+       BUG_ON(((char *)old_hashtab.table1) != 
+              ((char *)old_hashtab.table0 + old_hashtab.length
+               * sizeof(cuckoo_hash_entry)));
+       rc = cuckoo_hash_transfer_entries(hashtab, old_hashtab.table0, 
+                                         old_hashtab.length * 2);
+       if (rc < 0) {
+               /* -ENOSPC gets a retry; any other error is fatal */
+               if (rc == -ENOSPC) {
+                       ++rehash_count;
+                       if (rehash_count < CUCKOO_HASH_MAX_LOOP) {
+                               /*
+                                * Retry with new hash parameters; rather
+                                * than recurse we can just loop here
+                                */
+                               goto again;
+                       } else {
+                               /*
+                                * Didn't manage to rehash, so let's
+                                * go up a size (if we haven't already
+                                * and there's space)
+                                */
+                               if (!resize && hashtab->length_bits < 32) {
+                                       resize = 1;
+                                       kfree(new_table);
+                                       goto resize;
+                               }
+                               else
+                                       goto err;
+                       }
+               }
+               else
+                       goto err;
+       }
+
+       /* All entries transferred.  Free up the old table */
+       kfree(old_hashtab.table0);
+  
+       /* We should have put all the entries from old table in the new one */
+       BUG_ON(hashtab->entries != old_hashtab.entries);
+
+       return 0;
+ err:
+       EPRINTK("%s: Rehash failed, giving up\n", __FUNCTION__);
+       /* Give up, but at least restore the table to how it was on entry */
+       memcpy(hashtab, &old_hashtab, sizeof(cuckoo_hash_table));
+       if (new_table)
+               kfree(new_table);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_rehash);
+
+
+/* Put key/value in table[hash]: returns 1 if the slot was vacant, else 0
+ * after handing the evicted pair back via displaced_key/displaced_value. */
+static int
+cuckoo_hash_insert_or_displace(cuckoo_hash_entry *table, unsigned hash,
+                              cuckoo_hash_key *key, cuckoo_hash_value value,
+                              cuckoo_hash_key *displaced_key,
+                              cuckoo_hash_value *displaced_value)
+{
+       cuckoo_hash_entry *slot = &table[hash];
+       int was_vacant = (slot->state == CUCKOO_HASH_STATE_VACANT);
+
+       if (!was_vacant) {
+               /* Evict: export the old pair before it is replaced */
+               cuckoo_hash_key_set(displaced_key, &slot->key);
+               *displaced_value = slot->value;
+       }
+       cuckoo_hash_key_set(&slot->key, key);
+       slot->value = value;
+       if (was_vacant)
+               slot->state = CUCKOO_HASH_STATE_OCCUPIED;
+       return was_vacant;
+}
+
+/* Insert key/value, evicting occupants between the tables; 0 or -errno. */
+int cuckoo_hash_add(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+                    cuckoo_hash_value value, int can_rehash)
+{
+       cuckoo_hash hash0, hash1;
+       int i, rc;
+       cuckoo_hash_key key1, key2;
+
+       cuckoo_hash_key_set(&key1, key);
+
+ again:
+       i = 0;
+       do {
+               hash0 = cuckoo_compute_hash(hashtab, &key1, &hashtab->a0);
+               if (cuckoo_hash_insert_or_displace(hashtab->table0, hash0, 
+                                                  &key1, value, &key2,
+                                                  &value)) {
+                       /* Slot was vacant: nothing left to place */
+                       hashtab->entries++;
+                       return 0;
+               }
+               /* key2/value now hold the pair evicted from table0 */
+               hash1 = cuckoo_compute_hash(hashtab, &key2, &hashtab->a1);
+               if (cuckoo_hash_insert_or_displace(hashtab->table1, hash1,
+                                                  &key2, value, &key1,
+                                                  &value)) {
+                       /* Slot was vacant: nothing left to place */
+                       hashtab->entries++;
+                       return 0;
+               }
+       } while (++i < CUCKOO_HASH_MAX_LOOP);
+
+       if (can_rehash) {
+               if ((rc = cuckoo_hash_rehash(hashtab)) < 0) {
+                       /*
+                        * Give up - this will drop whichever
+                        * key/value pair we have currently displaced
+                        * (held in key1/value) on the floor
+                        */
+                       return rc;
+               }
+               goto again;
+       }
+  
+       EPRINTK("%s: failed hash add\n", __FUNCTION__);
+       /*
+        * Couldn't do it - bad as we've now removed some random thing
+        * from the table, and will just drop it on the floor.  Better
+        * would be to somehow revert the table to the state it was in
+        * at the start
+        */
+       return -ENOSPC;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_add);
+
+
+int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
+                         cuckoo_hash_key *key, cuckoo_hash_value value,
+                         int can_rehash)
+{
+       int existing;   /* receives the stored value on a hit; not used */
+
+       /* Refuse to add a key that is already present in either table */
+       if (cuckoo_hash_lookup(hashtab, key, &existing) != 0)
+               return -EBUSY;
+       return cuckoo_hash_add(hashtab, key, value, can_rehash);
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_add_check);
+
+
+/* Remove key from whichever table holds it; -EINVAL if it is in neither. */
+int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key)
+{
+       cuckoo_hash_entry *slot;
+       cuckoo_hash idx;
+
+       /* Candidate position in table0 */
+       idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+       slot = &hashtab->table0[idx];
+       if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
+           !cuckoo_hash_key_compare(hashtab, &slot->key, key)) {
+               /* Miss: fall back to the candidate position in table1 */
+               idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+               slot = &hashtab->table1[idx];
+               if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
+                   !cuckoo_hash_key_compare(hashtab, &slot->key, key))
+                       return -EINVAL;
+       }
+
+       /* Only the state changes; key and value are left in the slot */
+       slot->state = CUCKOO_HASH_STATE_VACANT;
+       hashtab->entries--;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_remove);
+
+
+/* Overwrite the value stored for an existing key; -EINVAL if not found. */
+int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+                      cuckoo_hash_value value)
+{
+       cuckoo_hash_entry *slot;
+       cuckoo_hash idx;
+
+       /* Candidate position in table0 */
+       idx = cuckoo_compute_hash(hashtab, key, &hashtab->a0);
+       slot = &hashtab->table0[idx];
+       if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
+           !cuckoo_hash_key_compare(hashtab, &slot->key, key)) {
+               /* Miss: fall back to the candidate position in table1 */
+               idx = cuckoo_compute_hash(hashtab, key, &hashtab->a1);
+               slot = &hashtab->table1[idx];
+               if (slot->state != CUCKOO_HASH_STATE_OCCUPIED ||
+                   !cuckoo_hash_key_compare(hashtab, &slot->key, key))
+                       return -EINVAL;
+       }
+
+       slot->value = value;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_update);
+
+
+/* Restart iteration: the next cuckoo_hash_iterate() begins at table0[0]. */
+void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab)
+{
+       hashtab->iterate_index = 0;
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_iterate_reset);
+
+/*
+ * Return the next occupied entry through *key and *value (0), or -ENOSPC
+ * once both tables are exhausted.  iterate_index is a cursor over table0
+ * (0 .. length-1) followed by table1 (length .. 2*length-1).
+ */
+int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
+                       cuckoo_hash_key *key, cuckoo_hash_value *value)
+{
+       cuckoo_hash_entry *slot;
+       unsigned pos;
+
+       for (;;) {
+               pos = hashtab->iterate_index;
+               if (pos < hashtab->length)
+                       slot = &hashtab->table0[pos];
+               else if (pos < hashtab->length * 2)
+                       slot = &hashtab->table1[pos - hashtab->length];
+               else
+                       return -ENOSPC;
+               /* Advance past this slot whether or not it is occupied */
+               hashtab->iterate_index = pos + 1;
+               if (slot->state == CUCKOO_HASH_STATE_OCCUPIED) {
+                       *key = slot->key;
+                       *value = slot->value;
+                       return 0;
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_iterate);
+
+
+#if 0
+void cuckoo_hash_valid(cuckoo_hash_table *hashtab)
+{
+       int i, entry_count = 0;
+
+       for (i=0; i < hashtab->length; i++) {
+               EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
+                          hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+               if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       entry_count++;
+               EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
+                          hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+               if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       entry_count++;  
+       }
+       
+       if (entry_count != hashtab->entries) {
+               EPRINTK("%s: bad count\n", __FUNCTION__);
+               cuckoo_hash_dump(hashtab);
+               return;
+       }
+
+       for (i=0; i< hashtab->length; i++) {
+               if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       if (i != cuckoo_compute_hash(hashtab, 
+                                                    &hashtab->table0[i].key, 
+                                                    &hashtab->a0)) {
+                               EPRINTK("%s: Bad key table 0 index %d\n",
+                                       __FUNCTION__, i);
+                               cuckoo_hash_dump(hashtab);
+                               return;
+                       }
+               if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       if (i != cuckoo_compute_hash(hashtab, 
+                                                    &hashtab->table1[i].key, 
+                                                    &hashtab->a1)) {
+                               EPRINTK("%s: Bad key table 1 index %d\n",
+                                       __FUNCTION__, i);
+                               cuckoo_hash_dump(hashtab);
+                               return;
+                       }
+       }
+
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_valid);
+
+
+void cuckoo_hash_dump(cuckoo_hash_table *hashtab)
+{
+       int i, entry_count;
+
+       entry_count = 0;
+       for (i=0; i < hashtab->length; i++) {
+               EPRINTK_ON(hashtab->table0[i].state != CUCKOO_HASH_STATE_VACANT &&
+                          hashtab->table0[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+               if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       entry_count++;
+               EPRINTK_ON(hashtab->table1[i].state != CUCKOO_HASH_STATE_VACANT &&
+                          hashtab->table1[i].state != CUCKOO_HASH_STATE_OCCUPIED);
+               if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+                       entry_count++;  
+       }
+
+       EPRINTK("======================\n");
+       EPRINTK("Cuckoo hash table dump\n");
+       EPRINTK("======================\n");
+       EPRINTK("length: %d; length_bits: %d; key_length: %d\n", hashtab->length,
+               hashtab->length_bits, hashtab->key_length);
+       EPRINTK("Recorded entries: %d\n", hashtab->entries);
+       EPRINTK("Counted entries: %d\n", entry_count);
+       EPRINTK("a0: %llx; a1: %llx\n", hashtab->a0, hashtab->a1);
+       EPRINTK("-----------------------------------------\n");
+       EPRINTK("Index  Occupied  Key  Value Index0 Index1\n");
+       EPRINTK("-----------------------------------------\n");         
+       for (i=0; i< hashtab->length; i++) {
+               if (hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+               EPRINTK("%d %d %llx %d %d %d\n", i,
+                       hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED,
+                       hashtab->table0[i].key, hashtab->table0[i].value,
+                       cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, 
+                                           &hashtab->a0),
+                       cuckoo_compute_hash(hashtab, &hashtab->table0[i].key, 
+                                           &hashtab->a1));
+               else
+               EPRINTK("%d %d - - - -\n", i,
+                       hashtab->table0[i].state == CUCKOO_HASH_STATE_OCCUPIED);
+                       
+       }
+       EPRINTK("-----------------------------------------\n");
+       EPRINTK("Index  Occupied  Key  Value Index0 Index1\n");
+       EPRINTK("-----------------------------------------\n");
+       for (i=0; i< hashtab->length; i++) {
+               if (hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED)
+               EPRINTK("%d %d %llx %d %d %d\n", i,
+                       hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED,
+                       hashtab->table1[i].key, hashtab->table1[i].value,
+                       cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, 
+                                           &hashtab->a0),
+                       cuckoo_compute_hash(hashtab, &hashtab->table1[i].key, 
+                                           &hashtab->a1));
+               else
+               EPRINTK("%d %d - - - -\n", i,
+                       hashtab->table1[i].state == CUCKOO_HASH_STATE_OCCUPIED);
+       } 
+       EPRINTK("======================\n");
+}
+EXPORT_SYMBOL_GPL(cuckoo_hash_dump);
+#endif
diff --git a/drivers/xen/sfc_netutil/accel_cuckoo_hash.h b/drivers/xen/sfc_netutil/accel_cuckoo_hash.h

new file mode 100644 (file)

index 0000000..83518f9
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_cuckoo_hash.h
@@ -0,0 +1,227 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+/*
+ * A cuckoo hash table consists of two sub tables.  Each entry can
+ * hash to a position in each table.  If, on entry, its position is
+ * found to be occupied, the existing element is moved to its other
+ * location.  This recurses until success or a loop is found.  If a
+ * loop is found the table is rehashed.
+ *
+ *  See http://www.it-c.dk/people/pagh/papers/cuckoo-jour.pdf
+ */
+
+#ifndef NET_ACCEL_CUCKOO_HASH_H
+#define NET_ACCEL_CUCKOO_HASH_H
+
+/*! Type used for hash table keys of ip pairs */
+typedef struct {
+       u32 local_ip;
+       //u32 remote_ip;
+       u16 local_port;
+       //u16 remote_port;
+       /* Technically only 1 bit, but use 16 to make key a round
+          number size */
+       u16 proto;
+} cuckoo_hash_ip_key;
+
+/*! Type used for hash table keys of mac addresses */
+typedef u64 cuckoo_hash_mac_key;
+
+/*! This type is designed to be large enough to hold all supported key
+ *  sizes to avoid having to malloc storage for them.
+ */
+typedef u64 cuckoo_hash_key;
+
+/*! Type used for the values stored in the hash table */
+typedef int cuckoo_hash_value;
+
+/*! Type used for the hash used to index the table */
+typedef u32 cuckoo_hash;
+
+/*! How long to spend displacing values when adding before giving up
+ *  and rehashing.
+ *
+ *  NB. this expands to an expression that reads a variable named
+ *  "hashtab", so it can only be used where a pointer of that name
+ *  (e.g. a cuckoo_hash_table *) is in scope. */
+#define CUCKOO_HASH_MAX_LOOP (hashtab->length)
+
+/*! State of hash table entry */
+typedef enum {
+       CUCKOO_HASH_STATE_VACANT = 0,
+       CUCKOO_HASH_STATE_OCCUPIED 
+} cuckoo_hash_state;
+
+/*! An entry in the hash table */
+typedef struct {
+       cuckoo_hash_state state;
+       cuckoo_hash_key key;
+       cuckoo_hash_value value;
+} cuckoo_hash_entry;
+
+/*! A cuckoo hash table */
+typedef struct {
+       /*! The length of each table (NB. there are two tables of this
+        *  length) */
+       unsigned length; 
+       /*! The length of each table in bits */
+       unsigned length_bits;
+       /*! The length of the key in bytes */ 
+       unsigned key_length; 
+       /*! The number of entries currently stored in the table */
+       unsigned entries;
+       /*! Index into table used by cuckoo_hash_iterate */
+       unsigned iterate_index; 
+
+       /* parameter of hash functions */
+       /*! The "a" parameter of the first hash function */
+       cuckoo_hash_key a0; 
+       /*! The "a" parameter of the second hash function */
+       cuckoo_hash_key a1; 
+
+       /*! The first table */
+       cuckoo_hash_entry *table0; 
+       /*! The second table */
+       cuckoo_hash_entry *table1; 
+} cuckoo_hash_table;
+
+/*! Initialise the cuckoo hash table
+ *
+ * \param hashtab A pointer to an uninitialised hash table structure
+ * \param length_bits The number of elements in each table equals
+ * 2**length_bits
+ * \param key_length The length of the key in bytes
+ *
+ * \return 0 on success, -ENOMEM if it couldn't allocate the tables
+ */
+extern
+int cuckoo_hash_init(cuckoo_hash_table *hashtab, unsigned length_bits,
+                    unsigned key_length);
+
+
+/*! Destroy a hash table
+ *
+ * \param hashtab A hash table that has previously been passed to a
+ * successful call of cuckoo_hash_init()
+ */
+extern
+void cuckoo_hash_destroy(cuckoo_hash_table *hashtab);
+
+
+/*! Lookup an entry in the hash table 
+ *
+ * \param hashtab The hash table in which to look.
+ * \param key Pointer to a mac address to use as the key
+ * \param value On exit set to the value stored if key was present
+ *
+ * \return 0 if not present in the table, non-zero if it is (and value
+ * is set accordingly)
+ */
+extern
+int cuckoo_hash_lookup(cuckoo_hash_table *hashtab,
+                      cuckoo_hash_key *key,
+                      cuckoo_hash_value *value);
+
+/*! Add an entry to the hash table.  Key must not be a duplicate of
+ * anything already in the table.  If this is a risk, see
+ * cuckoo_hash_add_check
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store 
+ * \param can_rehash Flag to allow the add function to rehash the
+ * table if necessary
+ *
+ * \return 0 on success, non-zero on failure.  -ENOSPC means it just
+ * couldn't find anywhere to put it - this is bad and probably means
+ * an entry has been dropped on the floor (but the entry you just
+ * tried to add may now be included)
+ */
+extern
+int cuckoo_hash_add(cuckoo_hash_table *hashtab,
+                   cuckoo_hash_key *key, 
+                   cuckoo_hash_value value,
+                   int can_rehash);
+
+/*! Same as cuckoo_hash_add but first checks to ensure entry is not
+ * already there
+ * \return -EBUSY if already there
+ */
+
+extern
+int cuckoo_hash_add_check(cuckoo_hash_table *hashtab,
+                         cuckoo_hash_key *key, 
+                         cuckoo_hash_value value,
+                         int can_rehash);
+/*! Remove an entry from the table 
+ *
+ * \param hashtab The hash table to remove the entry from
+ * \param key The key that was used to previously add the entry
+ *
+ * \return 0 on success, -EINVAL if the entry couldn't be found 
+ */
+extern
+int cuckoo_hash_remove(cuckoo_hash_table *hashtab, cuckoo_hash_key *key);
+
+
+/*! Pack a 6-byte MAC address into a hash table key.
+ *
+ * \param mac Pointer to the six bytes of the MAC address
+ *
+ * \return key holding mac[0] in bits 0-7 up to mac[5] in bits 40-47;
+ * no higher bits are set
+ */
+static inline cuckoo_hash_mac_key cuckoo_mac_to_key(const u8 *mac)
+{
+       cuckoo_hash_mac_key key = 0;
+       int idx;
+
+       /* Fold the bytes in from mac[5] down so that mac[0] ends up lowest */
+       for (idx = 5; idx >= 0; idx--)
+               key = (key << 8) | mac[idx];
+
+       return key;
+}
+
+
+/*! Update an entry already in the hash table to take a new value 
+ *
+ * \param hashtab The hash table to add the entry to
+ * \param key Pointer to a mac address to use as a key
+ * \param value The value to store 
+ *
+ * \return 0 on success, non-zero on failure. 
+ */
+int cuckoo_hash_update(cuckoo_hash_table *hashtab, cuckoo_hash_key *key,
+                      cuckoo_hash_value value);
+
+
+/*! Go through the hash table and return all used entries (one per call)
+ *
+ * \param hashtab The hash table to iterate over 
+ * \param key Pointer to a key to take the returned key
+ * \param value Pointer to a value to take the returned value
+ *
+ * \return 0 on success (key, value set), non-zero on failure.
+ */
+int cuckoo_hash_iterate(cuckoo_hash_table *hashtab,
+                       cuckoo_hash_key *key, cuckoo_hash_value *value);
+void cuckoo_hash_iterate_reset(cuckoo_hash_table *hashtab);
+
+/* debug, not compiled by default */
+void cuckoo_hash_valid(cuckoo_hash_table *hashtab);
+void cuckoo_hash_dump(cuckoo_hash_table *hashtab);
+
+#endif /* NET_ACCEL_CUCKOO_HASH_H */
diff --git a/drivers/xen/sfc_netutil/accel_msg_iface.c b/drivers/xen/sfc_netutil/accel_msg_iface.c

new file mode 100644 (file)

index 0000000..e52de14
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_msg_iface.c
@@ -0,0 +1,301 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <xen/evtchn.h>
+
+#include "accel_util.h"
+#include "accel_msg_iface.h"
+
+#define NET_ACCEL_MSG_Q_SIZE (1024)
+#define NET_ACCEL_MSG_Q_MASK (NET_ACCEL_MSG_Q_SIZE - 1)
+
+#ifdef NDEBUG
+#define NET_ACCEL_CHECK_MAGIC(_p, _errval)
+#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id)
+#else
+/*
+ * Return _errval from the *calling* function if the shared page _p does
+ * not carry NET_ACCEL_MSG_MAGIC.  Wrapped in do { } while (0) so that it
+ * behaves as a single statement (safe inside an unbraced if/else).
+ */
+#define NET_ACCEL_CHECK_MAGIC(_p, _errval)                             \
+       do {                                                            \
+               if ((_p)->magic != NET_ACCEL_MSG_MAGIC) {               \
+                       pr_err("%s: passed invalid shared page %p!\n",  \
+                              __func__, (_p));                         \
+                       return _errval;                                 \
+               }                                                       \
+       } while (0)
+/*
+ * Debug print of a queue's indices; _t is a string literal prefix.  The
+ * caller supplies the terminating semicolon.
+ * NOTE(review): this is not used in this file, and it reads base/limit
+ * members from _q - confirm the queue type it is meant for has them.
+ */
+#define NET_ACCEL_SHOW_QUEUE(_t, _q, _id)                              \
+       printk(_t ": queue %d write %x read %x base %x limit %x\n",     \
+              (_id), (_q)->write, (_q)->read, (_q)->base, (_q)->limit)
+#endif
+
+/*
+ * Initialise the control structure at the start of a shared message area.
+ *
+ * The caller is expected to hand over at least 2 pages: 1 control page
+ * and 1 or more data pages.  len is not examined here.
+ *
+ * mem must be aligned to NET_ACCEL_MSG_Q_SIZE, otherwise -EINVAL is
+ * returned and nothing is written.  On success the magic is stamped,
+ * aflags is cleared, net_dev_up is set from up and 0 is returned.
+ */
+int net_accel_msg_init_page(void *mem, int len, int up)
+{
+       struct net_accel_shared_page *sp = mem;
+
+       if (((unsigned long)mem & NET_ACCEL_MSG_Q_MASK) != 0)
+               return -EINVAL;
+
+       sp->magic = NET_ACCEL_MSG_MAGIC;
+       sp->aflags = 0;
+       sp->net_dev_up = up;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_init_page);
+
+
+/*
+ * Set up a local FIFO descriptor over a message ring.
+ *
+ * Points queue->fifo at base, initialises queue->lock, and passes size-1
+ * together with the addresses of indices->read and indices->write to
+ * sh_fifo2_init().
+ * NOTE(review): size-1 looks like an index mask, which would require size
+ * to be a power of two - confirm against sh_fifo2_init().
+ */
+void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
+                             struct net_accel_msg_queue *indices,
+                             struct net_accel_msg *base, int size)
+{
+       queue->fifo = base;
+       spin_lock_init(&queue->lock);
+       sh_fifo2_init(queue, size-1, &indices->read, &indices->write);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_init_queue);
+
+
+/*
+ * Copy *msg into the next slot of queue.  Every caller in this file holds
+ * the queue lock around this call.
+ *
+ * Ordinary sends (is_reply == 0) are refused with -ENOSPC unless
+ * sh_fifo2_not_half_full() holds.  Replies are always queued; a full
+ * FIFO is only reported through EPRINTK_ON().
+ *
+ * Returns 0 on success, -ENOSPC as above, or (unless NDEBUG is defined)
+ * -EINVAL when sp fails the magic check.
+ */
+static inline int _net_accel_msg_send(struct net_accel_shared_page *sp,
+                                     sh_msg_fifo2 *queue,
+                                     struct net_accel_msg *msg,
+                                     int is_reply)
+{
+       int ret = 0;
+
+       NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+       rmb();
+       if (is_reply) {
+               EPRINTK_ON(sh_fifo2_is_full(queue));
+               sh_fifo2_put(queue, *msg);
+       } else if (sh_fifo2_not_half_full(queue)) {
+               sh_fifo2_put(queue, *msg);
+       } else {
+               ret = -ENOSPC;
+       }
+       wmb();
+       return ret;
+}
+
+/*
+ * Notify after a batch of messages have been sent: passes irq straight
+ * to notify_remote_via_irq().
+ */
+void net_accel_msg_notify(int irq)
+{
+       notify_remote_via_irq(irq);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_notify);
+
+/*
+ * Queue a copy of *msg on FIFO q as an ordinary (non-reply) message.
+ * The queue lock is held for the duration of the copy.
+ * Returns 0 on success, -errno on failure.
+ */
+int net_accel_msg_send(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, 
+                      struct net_accel_msg *msg)
+{
+       unsigned long irq_flags;
+       int ret;
+
+       net_accel_msg_lock_queue(q, &irq_flags);
+       ret = _net_accel_msg_send(sp, q, msg, 0);
+       net_accel_msg_unlock_queue(q, &irq_flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_send);
+
+
+/*
+ * Send msg exactly as net_accel_msg_send() does and, when that did not
+ * fail, post a notification to the far end via irq.  Returns the result
+ * of the send.
+ */
+int net_accel_msg_send_notify(struct net_accel_shared_page *sp, int irq, 
+                             sh_msg_fifo2 *q, struct net_accel_msg *msg)
+{
+       int rc = net_accel_msg_send(sp, q, msg);
+
+       if (rc < 0)
+               return rc;
+
+       notify_remote_via_irq(irq);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_send_notify);
+
+
+/*
+ * Queue a copy of *msg on FIFO q as a reply.  The message is passed to
+ * _net_accel_msg_send() with is_reply set, so the half-full limit used
+ * for ordinary sends is not applied.
+ * Returns 0 on success, -errno on failure.
+ */
+int net_accel_msg_reply(struct net_accel_shared_page *sp, sh_msg_fifo2 *q, 
+                      struct net_accel_msg *msg)
+{
+       unsigned long irq_flags;
+       int ret;
+
+       net_accel_msg_lock_queue(q, &irq_flags);
+       ret = _net_accel_msg_send(sp, q, msg, 1);
+       net_accel_msg_unlock_queue(q, &irq_flags);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_reply);
+
+
+/*
+ * As net_accel_msg_reply but also posts a notification to the far end
+ * when the reply was queued without error.  Returns the result of the
+ * reply.
+ */
+int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, int irq, 
+                             sh_msg_fifo2 *q, struct net_accel_msg *msg)
+{
+       int rc = net_accel_msg_reply(sp, q, msg);
+
+       if (rc < 0)
+               return rc;
+
+       notify_remote_via_irq(irq);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_reply_notify);
+
+
+/*
+ * Copy the message at the head of queue into *msg without consuming it,
+ * so the caller can decide whether to read it now or not.
+ *
+ * *cookie is set to the current read index; it is a debug aid that
+ * net_accel_msg_recv_next() checks when the message is finally consumed.
+ *
+ * Returns 0 when a message was copied, -ENOENT when the queue is empty
+ * (msg and cookie are then left untouched), or (unless NDEBUG is defined)
+ * -EINVAL when sp fails the magic check.
+ */
+int net_accel_msg_peek(struct net_accel_shared_page *sp, 
+                      sh_msg_fifo2 *queue, 
+                      struct net_accel_msg *msg, int *cookie)
+{
+       unsigned long irq_flags;
+       int ret = -ENOENT;
+
+       NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+       net_accel_msg_lock_queue(queue, &irq_flags);
+       rmb();
+       if (sh_fifo2_is_empty(queue))
+               goto unlock;
+
+       *msg = sh_fifo2_peek(queue);
+       *cookie = *(queue->fifo_rd_i);
+       ret = 0;
+unlock:
+       net_accel_msg_unlock_queue(queue, &irq_flags);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_peek);
+
+
+/*
+ * Move the queue onto the next element, used after finished with a
+ * peeked msg.
+ *
+ * cookie must be the value handed back by net_accel_msg_peek(); an empty
+ * queue or a cookie that no longer matches the read index triggers
+ * BUG_ON().  Returns 0, or (unless NDEBUG is defined) -EINVAL when sp
+ * fails the magic check.
+ */
+int net_accel_msg_recv_next(struct net_accel_shared_page *sp, 
+                           sh_msg_fifo2 *queue, int cookie)
+{
+       unsigned long flags;
+       NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+       net_accel_msg_lock_queue(queue, &flags);
+       rmb();
+       /* Mustn't be empty */
+       BUG_ON(sh_fifo2_is_empty(queue));
+       /* 
+        * Check cookie matches, i.e. we're advancing over the same message
+        * as was got using peek 
+        */
+       BUG_ON(cookie != *(queue->fifo_rd_i));
+       sh_fifo2_rd_next(queue);
+       wmb();
+       net_accel_msg_unlock_queue(queue, &flags);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_recv_next);
+
+
+/*
+ * Take the message at the head of queue and store it in *msg.
+ *
+ * Returns 0 on success, -ENOENT when the queue is empty (*msg is then
+ * left untouched), or (unless NDEBUG is defined) -EINVAL when sp fails
+ * the magic check.
+ */
+int net_accel_msg_recv(struct net_accel_shared_page *sp, sh_msg_fifo2 *queue, 
+                      struct net_accel_msg *msg)
+{
+       unsigned long irq_flags;
+       int ret = -ENOENT;
+
+       NET_ACCEL_CHECK_MAGIC(sp, -EINVAL);
+       net_accel_msg_lock_queue(queue, &irq_flags);
+       rmb();
+       if (sh_fifo2_is_empty(queue))
+               goto done;
+
+       sh_fifo2_get(queue, msg);
+       ret = 0;
+done:
+       wmb();
+       net_accel_msg_unlock_queue(queue, &irq_flags);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_recv);
+
+
+/*
+ * Start sending a message without copying.
+ *
+ * On success returns a pointer to the slot in the FIFO that the caller
+ * fills out in place; the queue is left locked (state saved in *flags)
+ * until the send is completed or aborted.
+ *
+ * Returns NULL, with the queue unlocked again, when the FIFO is not less
+ * than half full.  Unless NDEBUG is defined NULL is also returned, without
+ * the lock ever being taken, when sp fails the magic check.
+ */
+struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
+                                              sh_msg_fifo2 *queue, unsigned long *flags)
+{
+       struct net_accel_msg *slot = NULL;
+
+       NET_ACCEL_CHECK_MAGIC(sp, NULL);
+       net_accel_msg_lock_queue(queue, flags);
+       rmb();
+       if (sh_fifo2_not_half_full(queue)) {
+               slot = sh_fifo2_pokep(queue);
+       } else {
+               net_accel_msg_unlock_queue(queue, flags);
+       }
+       return slot;
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_start_send);
+
+
+/*
+ * Publish a message that was filled out in place after
+ * net_accel_msg_start_send() and release the queue lock taken there.
+ * sp is not used.
+ */
+static inline void _msg_complete(struct net_accel_shared_page *sp,
+                                sh_msg_fifo2 *queue,
+                                unsigned long *flags)
+{
+       /*
+        * The message body was written straight into the shared ring by the
+        * caller.  Order those stores before the write index moves so that
+        * the far end cannot observe the new index ahead of the message
+        * contents.
+        */
+       wmb();
+       sh_fifo2_wr_next(queue);
+       net_accel_msg_unlock_queue(queue, flags);
+}
+
+/*
+ * Complete the sending of a message started with net_accel_msg_start_send. The 
+ * message is implicit since the queue was locked by _start.
+ *
+ * flags must be the pointer that was passed to net_accel_msg_start_send();
+ * the write index is advanced and the queue is unlocked on return.
+ */
+void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
+                                sh_msg_fifo2 *queue,
+                                unsigned long *flags)
+{
+       _msg_complete(sp, queue, flags);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_complete_send);
+
+/*
+ * As net_accel_msg_complete_send but does the notify: once the queue has
+ * been unlocked, notify_remote_via_irq(irq) is called unconditionally.
+ */
+void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp, 
+                                       sh_msg_fifo2 *queue, 
+                                       unsigned long *flags, int irq)
+{
+       _msg_complete(sp, queue, flags);
+       notify_remote_via_irq(irq);
+}
+EXPORT_SYMBOL_GPL(net_accel_msg_complete_send_notify);
diff --git a/drivers/xen/sfc_netutil/accel_msg_iface.h b/drivers/xen/sfc_netutil/accel_msg_iface.h

new file mode 100644 (file)

index 0000000..0483a56
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_msg_iface.h
@@ -0,0 +1,415 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NET_ACCEL_MSG_IFACE_H
+#define NET_ACCEL_MSG_IFACE_H
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include "accel_shared_fifo.h"
+
+#define NET_ACCEL_MSG_MAGIC (0x85465479)
+
+/*! We talk version 0.010 of the interdomain protocol */
+#define NET_ACCEL_MSG_VERSION (0x00001000)
+
+/*! Shared memory portion of inter-domain FIFO */
+struct net_accel_msg_queue {
+       u32 read;
+       u32 write;
+};
+
+
+/*
+ * The aflags in the following structure is used as follows:
+ *
+ *  - each bit is set when one of the corresponding variables is
+ *  changed by either end.
+ *
+ *  - the end that has made the change then forwards an IRQ to the
+ *  other
+ *
+ *  - the IRQ handler deals with these bits either on the fast path, or
+ *  for less common changes, by jumping onto the slow path.
+ *
+ *  - once it has seen a change, it clears the relevant bit.
+ *
+ * aflags is accessed atomically using clear_bit, test_bit,
+ * test_and_set_bit etc
+ */
+
+/*
+ * The following used to signify to the other domain when the queue
+ * they want to use is full, and when it is no longer full.  Could be
+ * compressed to use fewer bits but done this way for simplicity and
+ * clarity
+ */
+
+/* "dom0->domU queue" is full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL      0x1 
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0FULL_B    0
+/* "dom0->domU queue" is not full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL   0x2 
+#define NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL_B 1
+/* "domU->dom0 queue" is full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL      0x4 
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUFULL_B    2
+/* "domU->dom0 queue" is not full */
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL   0x8
+#define NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL_B 3
+/* dom0 -> domU net_dev up/down events */
+#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN  0x10
+#define NET_ACCEL_MSG_AFLAGS_NETUPDOWN_B       4
+
+/*
+ * Masks used to test if there are any messages for domU and dom0
+ * respectively
+ */
+#define NET_ACCEL_MSG_AFLAGS_TO_DOMU_MASK      \
+       (NET_ACCEL_MSG_AFLAGS_QUEUE0FULL    |   \
+        NET_ACCEL_MSG_AFLAGS_QUEUEUNOTFULL |   \
+        NET_ACCEL_MSG_AFLAGS_NETUPDOWN)
+#define NET_ACCEL_MSG_AFLAGS_TO_DOM0_MASK      \
+       (NET_ACCEL_MSG_AFLAGS_QUEUE0NOTFULL |   \
+        NET_ACCEL_MSG_AFLAGS_QUEUEUFULL)
+
+/*! The shared data structure used for inter-VM communication. */
+struct net_accel_shared_page {
+       /*! Sanity check */
+       u32 magic;          
+       /*! Used by host/Dom0 */
+       struct net_accel_msg_queue queue0;
+       /*! Used by guest/DomU */
+       struct net_accel_msg_queue queue1;
+       /*! Atomic flags, used to communicate simple state changes */
+       u32 aflags;     
+       /*! State of net_dev used for acceleration */     
+       u32 net_dev_up; 
+};
+
+
+enum net_accel_hw_type {
+       /*! Not a virtualisable NIC: use slow path. */
+       NET_ACCEL_MSG_HWTYPE_NONE = 0,
+       /*! NIC is Falcon-based */
+       NET_ACCEL_MSG_HWTYPE_FALCON_A = 1,
+       NET_ACCEL_MSG_HWTYPE_FALCON_B = 2,
+       NET_ACCEL_MSG_HWTYPE_SIENA_A = 3,
+};
+
+/*! The maximum number of pages used by an event queue. */
+#define EF_HW_FALCON_EVQ_PAGES 8
+
+struct net_accel_hw_falcon_b {
+       /* VI */
+       /*! Grant for Tx DMA Q */
+       u32 txdmaq_gnt;   
+       /*! Grant for Rx DMA Q */
+       u32 rxdmaq_gnt;   
+       /*! Machine frame number for Tx/Rx doorbell page */
+       u32 doorbell_mfn; 
+       /*! Grant for Tx/Rx doorbell page */
+       u32 doorbell_gnt;
+
+       /* Event Q */
+       /*! Grants for the pages of the EVQ */
+       u32 evq_mem_gnts[EF_HW_FALCON_EVQ_PAGES]; 
+       u32 evq_offs;
+       /*! log2(pages in event Q) */
+       u32 evq_order;    
+       /*! Capacity in events */
+       u32 evq_capacity; 
+       /*! Eventq pointer register physical address */
+       u32 evq_rptr; 
+       /*! Interface instance */
+       u32 instance; 
+       /*! Capacity of RX queue */
+       u32 rx_capacity;
+       /*! Capacity of TX queue */
+       u32 tx_capacity;
+
+       /* NIC */
+       s32 nic_arch;
+       s32 nic_revision;
+       u8 nic_variant;
+};
+
+struct net_accel_hw_falcon_a {
+       struct net_accel_hw_falcon_b common;
+       u32 evq_rptr_gnt;
+};
+
+
+/*! Description of the hardware that the DomU is being given. */
+struct net_accel_msg_hw {
+       u32 type;               /*!< Hardware type */
+       union {
+               struct net_accel_hw_falcon_a falcon_a;
+               struct net_accel_hw_falcon_b falcon_b;
+       } resources;
+};
+
+/*! Start-of-day handshake message. Dom0 fills in its version and
+ * sends, DomU checks, inserts its version and replies
+ */
+struct net_accel_msg_hello {
+       /*! Sender's version (set by each side in turn) */
+       u32 version;    
+       /*! max pages allocated/allowed for buffers */
+       u32 max_pages;      
+};
+
+/*! Maximum number of page requests that can fit in a message. */
+#define NET_ACCEL_MSG_MAX_PAGE_REQ (8)
+
+/*! Request for NIC buffers. DomU fills out pages and grants (and
+ *  optionally reqid), dom0 fills out buf and sends reply
+ */
+struct net_accel_msg_map_buffers {
+       u32 reqid;      /*!< Optional request ID */
+       u32 pages;      /*!< Number of pages to map */
+       u32 grants[NET_ACCEL_MSG_MAX_PAGE_REQ];  /*!< Grant ids to map */ 
+       u32 buf;          /*!< NIC buffer address of pages obtained */
+};
+
+/*! Notification of a change to local mac address, used to filter
+  locally destined packets off the fast path */
+struct net_accel_msg_localmac {
+       u32 flags;      /*!< Should this be added or removed? */
+       u8 mac[ETH_ALEN]; /*!< The mac address to filter onto slow path */
+};
+
+struct net_accel_msg_fastpath {
+       u32 flags;      /*!< Should this be added or removed? */
+       u8  mac[ETH_ALEN];/*!< The mac address to filter onto fast path */
+       u16 port;        /*!< The port of the connection */
+       u32 ip;    /*!< The IP address of the connection */
+       u8  proto;      /*!< The protocol of connection (TCP/UDP) */
+};
+
+/*! Values for struct net_accel_msg_localmac/fastpath.flags */
+#define NET_ACCEL_MSG_ADD    0x1
+#define NET_ACCEL_MSG_REMOVE 0x2
+
+/*! Overall message structure */
+struct net_accel_msg {
+       /*! ID specifying type of message */
+       u32 id;              
+       union {
+               /*! handshake */
+               struct net_accel_msg_hello hello;  
+               /*! hardware description */
+               struct net_accel_msg_hw hw;     
+               /*! buffer map request */
+               struct net_accel_msg_map_buffers mapbufs; 
+               /*! mac address of a local interface */
+               struct net_accel_msg_localmac localmac; 
+               /*! address of a new fastpath connection */
+               struct net_accel_msg_fastpath fastpath; 
+               /*! make the message a fixed size */
+               u8 pad[128 - sizeof(u32)]; 
+       }  u;
+};
+
+
+#define NET_ACCEL_MSG_HW_TO_MSG(_u) container_of(_u, struct net_accel_msg, u.hw)
+
+/*! Inter-domain message FIFO */
+typedef struct {
+       struct net_accel_msg *fifo;
+       u32 fifo_mask;
+       u32 *fifo_rd_i;
+       u32 *fifo_wr_i;
+       spinlock_t lock;
+       u32 is_locked; /* Debug flag */
+} sh_msg_fifo2;
+
+
+#define NET_ACCEL_MSG_OFFSET_MASK PAGE_MASK
+
+/* Modifiers */
+#define NET_ACCEL_MSG_REPLY    (0x80000000)
+#define NET_ACCEL_MSG_ERROR    (0x40000000)
+
+/* Dom0 -> DomU and reply. Handshake/version check. */
+#define NET_ACCEL_MSG_HELLO    (0x00000001)
+/* Dom0 -> DomU : hardware setup (VI info.) */
+#define NET_ACCEL_MSG_SETHW    (0x00000002)
+/*
+ * Dom0 -> DomU. Notification of a local mac to add/remove from slow
+ * path filter
+ */
+#define NET_ACCEL_MSG_LOCALMAC (0x00000003)
+/* 
+ * DomU -> Dom0 and reply. Request for buffer table entries for
+ * preallocated pages.
+ */
+#define NET_ACCEL_MSG_MAPBUF   (0x00000004)
+/* 
+ * Dom0 -> DomU. Notification of a local mac to add/remove from fast
+ * path filter
+ */
+#define NET_ACCEL_MSG_FASTPATH (0x00000005)
+
+/*! Initialise a message and set the type
+ * \param msg : the message
+ * \param code : the message type 
+ *
+ * Only msg->id is written; the payload union is left untouched.
+ */
+static inline void net_accel_msg_init(struct net_accel_msg *msg, int code) {
+       msg->id = (u32)code;
+}
+
+/*! initialise a shared page structure
+ * \param shared_page : mapped memory in which the structure resides
+ * \param len : size of the message FIFO area that follows
+ * \param up : initial up/down state of netdev 
+ * \return 0 or an error code
+ */
+extern int net_accel_msg_init_page(void *shared_page, int len, int up);
+
+/*! initialise a message queue 
+ * \param queue : the message FIFO to initialise 
+ * \param indices : the read and write indices in shared memory
+ * \param base : the start of the memory area for the FIFO
+ * \param size : the size of the FIFO in bytes
+ */
+extern void net_accel_msg_init_queue(sh_msg_fifo2 *queue,
+                                    struct net_accel_msg_queue *indices,
+                                    struct net_accel_msg *base, int size);
+
+/* Notify after a batch of messages have been sent */
+extern void net_accel_msg_notify(int irq);
+
+/*! Send a message on the specified FIFO. The message is copied to the 
+ *  current slot of the FIFO.
+ * \param sp : pointer to shared page
+ * \param q : pointer to message FIFO to use
+ * \param msg : pointer to message 
+ * \return 0 on success, -errno on failure
+ */ 
+extern int net_accel_msg_send(struct net_accel_shared_page *sp,
+                             sh_msg_fifo2 *q, 
+                             struct net_accel_msg *msg);
+extern int net_accel_msg_reply(struct net_accel_shared_page *sp,
+                             sh_msg_fifo2 *q, 
+                             struct net_accel_msg *msg);
+
+/*! As net_accel_msg_send but also posts a notification to the far end. */
+extern int net_accel_msg_send_notify(struct net_accel_shared_page *sp, 
+                                    int irq, sh_msg_fifo2 *q, 
+                                    struct net_accel_msg *msg);
+/*! As net_accel_msg_reply but also posts a notification to the far end. */
+extern int net_accel_msg_reply_notify(struct net_accel_shared_page *sp, 
+                                     int irq, sh_msg_fifo2 *q, 
+                                     struct net_accel_msg *msg);
+
+/*! Receive a message on the specified FIFO. Returns 0 on success,
+ *  -errno on failure.
+ */
+extern int net_accel_msg_recv(struct net_accel_shared_page *sp,
+                             sh_msg_fifo2 *q,
+                             struct net_accel_msg *msg);
+
+/*! Look at a received message, if any, so a decision can be made
+ *  about whether to read it now or not.  Cookie is a bit of debug
+ *  which is set here and checked when passed to
+ *  net_accel_msg_recv_next()
+ */
+extern int net_accel_msg_peek(struct net_accel_shared_page *sp,
+                             sh_msg_fifo2 *queue, 
+                             struct net_accel_msg *msg, int *cookie);
+/*! Move the queue onto the next element, used after finished with a
+ *  peeked msg 
+ */
+extern int net_accel_msg_recv_next(struct net_accel_shared_page *sp,
+                                  sh_msg_fifo2 *queue, int cookie);
+
+/*! Start sending a message without copying. returns a pointer to a
+ *  message that will be filled out in place. The queue is locked
+ *  until the message is sent.
+ */
+extern 
+struct net_accel_msg *net_accel_msg_start_send(struct net_accel_shared_page *sp,
+                                              sh_msg_fifo2 *queue,
+                                              unsigned long *flags);
+
+
+/*! Complete the sending of a message started with
+ *  net_accel_msg_start_send. The message is implicit since the queue
+ *  was locked by _start 
+ */
+extern void net_accel_msg_complete_send(struct net_accel_shared_page *sp,
+                                       sh_msg_fifo2 *queue,
+                                       unsigned long *flags);
+
+/*! As net_accel_msg_complete_send but does the notify. */
+extern void net_accel_msg_complete_send_notify(struct net_accel_shared_page *sp, 
+                                              sh_msg_fifo2 *queue,
+                                              unsigned long *flags, int irq);
+
+/*! Lock the queue so that multiple "_locked" functions can be called
+ *  without the queue being modified by others 
+ *
+ * \param queue : the FIFO to lock
+ * \param flags : receives the saved interrupt state; hand the same
+ *  pointer to net_accel_msg_unlock_queue()
+ */
+static inline
+void net_accel_msg_lock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+       spin_lock_irqsave(&queue->lock, (*flags));
+       rmb();
+       /* is_locked is a debug flag: trap a queue that is locked twice */
+       BUG_ON(queue->is_locked);
+       queue->is_locked = 1;
+}
+
+/*! Unlock the queue
+ *
+ * \param queue : a FIFO previously locked with net_accel_msg_lock_queue()
+ * \param flags : the saved interrupt state filled in when locking
+ */
+static inline
+void net_accel_msg_unlock_queue(sh_msg_fifo2 *queue, unsigned long *flags)
+{
+       /* is_locked is a debug flag: trap an unlock without a prior lock */
+       BUG_ON(!queue->is_locked);
+       queue->is_locked = 0;
+       wmb();
+       spin_unlock_irqrestore(&queue->lock, (*flags));
+}
+
+/*! Give up without sending a message that was started with
+ *  net_accel_msg_start_send() 
+ *
+ *  Only unlocks the queue; the write index is not advanced.  sp is not
+ *  used.
+ */
+static inline 
+void net_accel_msg_abort_send(struct net_accel_shared_page *sp,
+                             sh_msg_fifo2 *queue, unsigned long *flags)
+{
+       net_accel_msg_unlock_queue(queue, flags);
+}
+
+/*! Test the queue to ensure there is sufficient space
+ *
+ * \return non-zero when sh_fifo2_space(queue) is at least space, else 0.
+ *  The queue lock is not taken here.
+ */
+static inline
+int net_accel_msg_check_space(sh_msg_fifo2 *queue, unsigned space)
+{
+       return sh_fifo2_space(queue) >= space;
+}
+
+#endif /* NET_ACCEL_MSG_IFACE_H */
diff --git a/drivers/xen/sfc_netutil/accel_shared_fifo.h b/drivers/xen/sfc_netutil/accel_shared_fifo.h

new file mode 100644 (file)

index 0000000..a55608a
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_shared_fifo.h
@@ -0,0 +1,127 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NET_ACCEL_SHARED_FIFO_H
+#define NET_ACCEL_SHARED_FIFO_H
+
+/*
+ * This is based on fifo.h, but handles sharing between address spaces
+ * that don't trust each other, by splitting out the read and write
+ * indices. This costs at least one pointer indirection more than the
+ * vanilla version per access.
+ */
+
+typedef struct {
+       char*    fifo;             /* element storage, indexed via SH_FIFO2_M() */
+       unsigned      fifo_mask;   /* index mask; sh_fifo2_init() sets it to cap */
+       unsigned      *fifo_rd_i;  /* points at the read index */
+       unsigned      *fifo_wr_i;  /* points at the write index */
+} sh_byte_fifo2;
+
+#define SH_FIFO2_M(f, x)     ((x) & ((f)->fifo_mask))
+
+static inline unsigned log2_ge(unsigned long n, unsigned min_order) {
+       unsigned order = min_order; /* smallest order >= min_order, 2^order >= n */
+       while (order < 8 * sizeof(n) - 1 && (1ul << order) < n) ++order; /* shift stays below type width */
+       return order;
+}
+
+static inline unsigned long pow2(unsigned order) { /* 2^order as unsigned long */
+       return (1ul << order); /* order must be below the width of unsigned long */
+}
+
+#define is_pow2(x)  (pow2(log2_ge((x), 0)) == (x))
+
+#define sh_fifo2_valid(f)  ((f) && (f)->fifo && (f)->fifo_mask > 0 &&   \
+                           is_pow2((f)->fifo_mask+1u))
+
+#define sh_fifo2_init(f, cap, _rptr, _wptr)            \
+       do {                                            \
+               BUG_ON(!is_pow2((cap) + 1));            \
+               (f)->fifo_rd_i = _rptr;                 \
+               (f)->fifo_wr_i = _wptr;                 \
+               *(f)->fifo_rd_i = *(f)->fifo_wr_i = 0u; \
+               (f)->fifo_mask = (cap);                 \
+       } while(0)
+
+#define sh_fifo2_num(f)      SH_FIFO2_M((f),*(f)->fifo_wr_i - *(f)->fifo_rd_i)
+#define sh_fifo2_space(f)    SH_FIFO2_M((f),*(f)->fifo_rd_i - *(f)->fifo_wr_i-1u)
+#define sh_fifo2_is_empty(f)  (sh_fifo2_num(f)==0)
+#define sh_fifo2_not_empty(f) (sh_fifo2_num(f)!=0)
+#define sh_fifo2_is_full(f)   (sh_fifo2_space(f)==0u)
+#define sh_fifo2_not_full(f)  (sh_fifo2_space(f)!=0u)
+#define sh_fifo2_buf_size(f) ((f)->fifo_mask + 1u)
+#define sh_fifo2_capacity(f) ((f)->fifo_mask)
+#define sh_fifo2_end(f)      ((f)->fifo + sh_fifo2_buf_size(f))
+#define sh_fifo2_not_half_full(f) (sh_fifo2_space(f) > (sh_fifo2_capacity(f) >> 1))
+
+#define sh_fifo2_peek(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i)])
+#define sh_fifo2_peekp(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_rd_i))
+#define sh_fifo2_poke(f)     ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i)])
+#define sh_fifo2_pokep(f)    ((f)->fifo + SH_FIFO2_M((f), *(f)->fifo_wr_i))
+#define sh_fifo2_peek_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_rd_i+(i))])
+#define sh_fifo2_poke_i(f,i) ((f)->fifo[SH_FIFO2_M((f), *(f)->fifo_wr_i+(i))])
+
+#define sh_fifo2_rd_next(f)                                    \
+       do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + 1u;} while(0)
+#define sh_fifo2_wr_next(f)                                    \
+       do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + 1u;} while(0)
+#define sh_fifo2_rd_adv(f, n)                                  \
+       do {*(f)->fifo_rd_i = *(f)->fifo_rd_i + (n);} while(0)
+#define sh_fifo2_wr_adv(f, n)                                  \
+       do {*(f)->fifo_wr_i = *(f)->fifo_wr_i + (n);} while(0)
+
+#define sh_fifo2_put(f, v)                                             \
+       do {sh_fifo2_poke(f) = (v); wmb(); sh_fifo2_wr_next(f);} while(0)
+
+#define sh_fifo2_get(f, pv)                                            \
+       do {*(pv) = sh_fifo2_peek(f); mb(); sh_fifo2_rd_next(f);} while(0)
+
+static inline unsigned sh_fifo2_contig_num(sh_byte_fifo2 *f)
+{
+       unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+       unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+       /* Entries readable without wrapping; both indices are masked first */
+       return (fifo_wr_i >= fifo_rd_i)
+               ? fifo_wr_i - fifo_rd_i
+               : f->fifo_mask + 1u - fifo_rd_i;
+}
+
+static inline unsigned sh_fifo2_contig_space(sh_byte_fifo2 *f)
+{
+       unsigned fifo_wr_i = SH_FIFO2_M(f, *f->fifo_wr_i);
+       unsigned fifo_rd_i = SH_FIFO2_M(f, *f->fifo_rd_i);
+       /* Free entries writable from the masked write index without wrapping */
+       return (fifo_rd_i > fifo_wr_i)
+               ? fifo_rd_i - fifo_wr_i - 1
+               : (f->fifo_mask + 1u - fifo_wr_i
+                  /*
+                   * The last byte can't be used if the read pointer
+                   * is at zero.
+                   */
+                  - (fifo_rd_i==0));
+}
+
+
+#endif /* NET_ACCEL_SHARED_FIFO_H */
diff --git a/drivers/xen/sfc_netutil/accel_util.c b/drivers/xen/sfc_netutil/accel_util.c

new file mode 100644 (file)

index 0000000..1db241d
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_util.c
@@ -0,0 +1,336 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#include <linux/slab.h>
+#include <linux/if_ether.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hypercall.h>
+#include <xen/xenbus.h>
+#include <xen/gnttab.h>
+
+#include "accel_util.h"
+
+#ifdef EFX_GCOV /* module init/exit hooks exist only in gcov builds */
+#include "gcov.h"
+/* Module init: hand this module to the gcov provider */
+static int __init net_accel_init(void)
+{
+       gcov_provider_init(THIS_MODULE);
+       return 0;
+}
+module_init(net_accel_init);
+/* Module exit: undo the gcov provider setup done at init */
+static void __exit net_accel_exit(void)
+{
+       gcov_provider_fini(THIS_MODULE);
+}
+module_exit(net_accel_exit);
+#endif
+
+/* Shutdown remote domain that is misbehaving; returns the hypercall result */
+int net_accel_shutdown_remote(int domain)
+{
+       struct sched_remote_shutdown sched_shutdown = {
+               .domain_id = domain,
+               .reason = SHUTDOWN_crash
+       };
+
+       EPRINTK("Crashing domain %d\n", domain);
+
+       return HYPERVISOR_sched_op(SCHEDOP_remote_shutdown, &sched_shutdown);
+}
+EXPORT_SYMBOL(net_accel_shutdown_remote);
+
+
+/* Map one grant reference from the other end at vaddr.
+ * Based on xenbus_backend_client.c:xenbus_map_ring().
+ * Returns 0 and stores the grant handle in *handle on success (plus the
+ * bus address in *dev_bus_addr when that pointer is non-NULL); on failure
+ * reports through xenbus_dev_error() and returns -EINVAL.
+ */
+static int net_accel_map_grant(struct xenbus_device *dev, int gnt_ref,
+                              grant_handle_t *handle, void *vaddr,
+                              u64 *dev_bus_addr, unsigned flags)
+{
+       struct gnttab_map_grant_ref op;
+
+       gnttab_set_map_op(&op, (unsigned long)vaddr, flags,
+                         gnt_ref, dev->otherend_id);
+       gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op);
+
+       if (op.status != GNTST_okay) {
+               xenbus_dev_error(dev, op.status,
+                        "failed mapping in shared page %d from domain %d\n",
+                        gnt_ref, dev->otherend_id);
+               return -EINVAL;
+       }
+
+       *handle = op.handle;
+       if (dev_bus_addr != NULL)
+               *dev_bus_addr = op.dev_bus_addr;
+       return 0;
+}
+
+
+/* Undo a net_accel_map_grant(); based on xenbus_unmap_ring() in
+ * xenbus_backend_client.c.  Returns 0, or -EINVAL if status != GNTST_okay. */
+static int net_accel_unmap_grant(struct xenbus_device *dev,
+                                grant_handle_t handle,
+                                void *vaddr, u64 dev_bus_addr,
+                                unsigned flags)
+{
+       struct gnttab_unmap_grant_ref op;
+
+       gnttab_set_unmap_op(&op, (unsigned long)vaddr, flags, handle);
+       if (dev_bus_addr)
+               op.dev_bus_addr = dev_bus_addr;
+
+       BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1));
+       if (op.status == GNTST_okay)
+               return 0;
+
+       xenbus_dev_error(dev, op.status,
+                        "failed unmapping page at handle %d error %d\n",
+                        handle, op.status);
+       return -EINVAL;
+}
+
+
+int net_accel_map_device_page(struct xenbus_device *dev,  
+                             int gnt_ref, grant_handle_t *handle,
+                             u64 *dev_bus_addr)
+{      /* GNTMAP_device_map mapping with vaddr 0; result of net_accel_map_grant() */
+       return net_accel_map_grant(dev, gnt_ref, handle, 0, dev_bus_addr,
+                                  GNTMAP_device_map);
+}
+EXPORT_SYMBOL_GPL(net_accel_map_device_page);
+
+ 
+int net_accel_unmap_device_page(struct xenbus_device *dev,
+                               grant_handle_t handle, u64 dev_bus_addr)
+{      /* counterpart of net_accel_map_device_page(): same flag, vaddr 0 */
+       return net_accel_unmap_grant(dev, handle, 0, dev_bus_addr, 
+                                    GNTMAP_device_map);
+}
+EXPORT_SYMBOL_GPL(net_accel_unmap_device_page);
+
+
+struct net_accel_valloc_grant_mapping {
+       struct vm_struct *vm;            /* area obtained from alloc_vm_area() */
+       int pages;                       /* number of entries in grant_handles */
+       grant_handle_t grant_handles[0]; /* one handle per mapped page */
+};
+
+/* Map npages grants into one contiguous vm area; returns NULL on failure */
+static void *net_accel_map_grants_valloc(struct xenbus_device *dev, 
+                                        unsigned *grants, int npages, 
+                                        unsigned flags, void **priv)
+{
+       struct net_accel_valloc_grant_mapping *map;
+       struct vm_struct *vm;
+       void *addr;
+       int i, j, rc;
+
+       vm  = alloc_vm_area(PAGE_SIZE * npages, NULL);
+       if (vm == NULL) {
+               EPRINTK("No memory from alloc_vm_area.\n");
+               return NULL;
+       }
+       /* 
+        * Get a structure in which we will record all the info needed
+        * to undo the mapping.
+        */
+       map = kzalloc(sizeof(struct net_accel_valloc_grant_mapping)  + 
+                     npages * sizeof(grant_handle_t), GFP_KERNEL);
+       if (map == NULL) {
+               EPRINTK("No memory for net_accel_valloc_grant_mapping\n");
+               free_vm_area(vm);
+               return NULL;
+       }
+       map->vm = vm;
+       map->pages = npages;
+
+       /* Map grant i at vm->addr + i * PAGE_SIZE, recording its handle */
+       addr = vm->addr;
+
+       for (i = 0; i < npages; i++) {
+               rc = net_accel_map_grant(dev, grants[i], map->grant_handles + i, 
+                                        addr, NULL, flags);
+               if (rc < 0)
+                       goto undo;
+               addr = (void*)((unsigned long)addr + PAGE_SIZE);
+       }
+       /* NOTE(review): with priv == NULL the handle record is freed here */
+       if (priv)
+               *priv = (void *)map;
+       else
+               kfree(map);
+
+       return vm->addr;
+       /* Failure: unmap the i pages mapped so far, then free vm and map */
+ undo:
+       EPRINTK("Aborting contig map due to single map failure %d (%d of %d)\n",
+               rc, i+1, npages);
+       for (j = 0; j < i; j++) {
+               addr = (void*)((unsigned long)vm->addr + (j * PAGE_SIZE));
+               net_accel_unmap_grant(dev, map->grant_handles[j], addr, 0,
+                                     flags);
+       }
+       free_vm_area(vm);
+       kfree(map);
+       return NULL;
+}
+
+/* Undo the result of the mapping: unmap every granted page recorded in
+ * priv (as handed back by net_accel_map_grants_valloc), then release the
+ * vm area and the bookkeeping structure itself. */
+static void net_accel_unmap_grants_vfree(struct xenbus_device *dev,
+                                        unsigned flags, void *priv)
+{
+       struct net_accel_valloc_grant_mapping *map = priv;
+       unsigned long base = (unsigned long)map->vm->addr;
+       int idx;
+
+       /* Release the pages in mapping order, one PAGE_SIZE step at a time */
+       for (idx = 0; idx < map->pages; idx++)
+               net_accel_unmap_grant(dev, map->grant_handles[idx],
+                                     (void *)(base + idx * PAGE_SIZE), 0,
+                                     flags);
+
+       free_vm_area(map->vm);
+       kfree(map);
+}
+
+
+void *net_accel_map_grants_contig(struct xenbus_device *dev,
+                               unsigned *grants, int npages, 
+                               void **priv)
+{   /* GNTMAP_host_map variant of net_accel_map_grants_valloc() */
+    return net_accel_map_grants_valloc(dev, grants, npages, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_map_grants_contig);
+
+
+void net_accel_unmap_grants_contig(struct xenbus_device *dev,
+                                  void *priv)
+{      /* priv is the value stored by net_accel_map_grants_contig() */
+       net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_unmap_grants_contig);
+
+
+void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
+                            void **priv)
+{      /* single-page case: maps just gnt_ref with GNTMAP_host_map */
+       return net_accel_map_grants_valloc(dev, &gnt_ref, 1, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_map_iomem_page);
+
+
+void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv)
+{      /* priv is the value stored by net_accel_map_iomem_page() */
+       net_accel_unmap_grants_vfree(dev, GNTMAP_host_map, priv);
+}
+EXPORT_SYMBOL(net_accel_unmap_iomem_page);
+
+
+int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn,
+                        int is_iomem)
+{
+       int gflags = is_iomem ? GTF_PCD : 0; /* GTF_PCD only for I/O memory */
+       int rc = gnttab_grant_foreign_access(dev->otherend_id, mfn, gflags);
+       if (rc < 0)
+               xenbus_dev_error(dev, rc, "failed granting access to page\n");
+       return rc; /* the grant call's result, passed back unchanged */
+}
+EXPORT_SYMBOL_GPL(net_accel_grant_page);
+
+
+int net_accel_ungrant_page(grant_ref_t gntref)
+{      /* returns -EBUSY, leaving the grant in place, while it is still in use */
+       if (unlikely(gnttab_query_foreign_access(gntref) != 0)) {
+               EPRINTK("%s: remote domain still using grant %d\n", __FUNCTION__, 
+                       gntref);
+               return -EBUSY;
+       }
+       /* No foreign access reported: end the grant */
+       gnttab_end_foreign_access(gntref, 0);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(net_accel_ungrant_page);
+
+
+int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[])
+{
+       char *cursor, *end, *macstr;
+       int octet, rc = -ENOENT;
+       /* Read the "mac" node; the string is freed before returning */
+       macstr = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL);
+       if (IS_ERR(macstr))
+               return PTR_ERR(macstr);
+       /* Expect ETH_ALEN hex fields separated by ':' and ended by NUL */
+       for (cursor = macstr, octet = 0; octet < ETH_ALEN; octet++) {
+               char want = (octet == ETH_ALEN-1) ? '\0' : ':';
+               mac[octet] = simple_strtoul(cursor, &end, 16);
+               if (cursor == end || *end != want)
+                       goto out;
+               cursor = end+1;
+       }
+       rc = 0;
+ out:
+       kfree(macstr);
+       return rc;
+}
+EXPORT_SYMBOL_GPL(net_accel_xen_net_read_mac);
+
+
+/* Write "accelstate" = state under dev->nodename in a xenstore transaction,
+ * retrying on -EAGAIN.  Errors are not reported to the caller. */
+void net_accel_update_state(struct xenbus_device *dev, int state)
+{
+       struct xenbus_transaction tr;
+       int err;
+
+       DPRINTK("%s: setting accelstate to %s\n", __FUNCTION__,
+               xenbus_strstate(state));
+       /* Only touch accelstate while the device's xenstore node exists */
+       if (xenbus_exists(XBT_NIL, dev->nodename, "")) {
+               VPRINTK("%s: nodename %s\n", __FUNCTION__, dev->nodename);
+       again:
+               err = xenbus_transaction_start(&tr);
+               if (err != 0)
+                       return; /* tr was never set up: nothing to end */
+               err = xenbus_printf(tr, dev->nodename, "accelstate",
+                                   "%d", state);
+               if (err != 0)
+                       xenbus_transaction_end(tr, 1); /* abort the write */
+               else if (xenbus_transaction_end(tr, 0) == -EAGAIN)
+                       goto again; /* commit asked for a retry */
+       }
+}
+EXPORT_SYMBOL_GPL(net_accel_update_state);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/xen/sfc_netutil/accel_util.h b/drivers/xen/sfc_netutil/accel_util.h

new file mode 100644 (file)

index 0000000..66f96d8
--- /dev/null
+++ b/drivers/xen/sfc_netutil/accel_util.h
@@ -0,0 +1,124 @@
+/****************************************************************************
+ * Solarflare driver for Xen network acceleration
+ *
+ * Copyright 2006-2008: Solarflare Communications Inc,
+ *                      9501 Jeronimo Road, Suite 250,
+ *                      Irvine, CA 92618, USA
+ *
+ * Maintained by Solarflare Communications <linux-xen-drivers@solarflare.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ ****************************************************************************
+ */
+
+#ifndef NETBACK_ACCEL_UTIL_H
+#define NETBACK_ACCEL_UTIL_H
+
+#ifdef DPRINTK
+#undef DPRINTK
+#endif
+/* Last path component of __FILE__; parenthesised so it is safe in any expression */
+#define FILE_LEAF (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+
+#if 1
+#define VPRINTK(_f, _a...) 
+#else
+#define VPRINTK(_f, _a...)                     \
+       printk("(file=%s, line=%d) " _f,        \
+              FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#if 1
+#define DPRINTK(_f, _a...) 
+#else
+#define DPRINTK(_f, _a...)                     \
+       printk("(file=%s, line=%d) " _f,        \
+              FILE_LEAF , __LINE__ , ## _a )
+#endif
+
+#define EPRINTK(_f, _a...)                     \
+       printk("(file=%s, line=%d) " _f,        \
+              FILE_LEAF , __LINE__ , ## _a )
+
+#define EPRINTK_ON(exp)                                                        \
+       do {                                                            \
+               if (exp)                                                \
+                       EPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+       } while(0)
+
+#define DPRINTK_ON(exp)                                                        \
+       do {                                                            \
+               if (exp)                                                \
+                       DPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \
+       } while(0)
+
+#include <xen/xenbus.h>
+
+/*! Map a set of pages from another domain
+ * \param dev The xenbus device context
+ * \param priv Receives the private data needed later to undo the mapping
+ */
+extern 
+void *net_accel_map_grants_contig(struct xenbus_device *dev, 
+                                 unsigned *grants, int npages, 
+                                 void **priv);
+
+/*! Unmap a set of pages mapped using net_accel_map_grants_contig.
+ * \param dev The xenbus device context
+ * \param priv The private data returned by the mapping function 
+ */
+extern 
+void net_accel_unmap_grants_contig(struct xenbus_device *dev, void *priv);
+
+/*! Read the MAC address of a device from xenstore */
+extern
+int net_accel_xen_net_read_mac(struct xenbus_device *dev, u8 mac[]);
+
+/*! Update the accelstate field for a device in xenstore */
+extern
+void net_accel_update_state(struct xenbus_device *dev, int state);
+
+/* These four map/unmap functions are based on
+ * xenbus_backend_client.c:xenbus_map_ring().  However, they are not
+ * used for ring buffers, instead just to map pages between domains,
+ * or to map a page so that it is accessible by a device
+ */
+extern
+int net_accel_map_device_page(struct xenbus_device *dev,  
+                             int gnt_ref, grant_handle_t *handle,
+                             u64 *dev_bus_addr);
+extern
+int net_accel_unmap_device_page(struct xenbus_device *dev,
+                               grant_handle_t handle, u64 dev_bus_addr);
+extern
+void *net_accel_map_iomem_page(struct xenbus_device *dev, int gnt_ref,
+                            void **priv);
+extern
+void net_accel_unmap_iomem_page(struct xenbus_device *dev, void *priv);
+
+/*! Grant a page to remote domain */
+extern
+int net_accel_grant_page(struct xenbus_device *dev, unsigned long mfn, 
+                        int is_iomem);
+/*! Undo a net_accel_grant_page */
+extern
+int net_accel_ungrant_page(grant_ref_t gntref);
+
+
+/*! Shutdown remote domain that is misbehaving */
+extern
+int net_accel_shutdown_remote(int domain);
+
+
+#endif
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c

index fdb6d22..3f0283e 100644 (file)
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -12,14 +12,20 @@
  #include <linux/module.h>
  #include <linux/kobject.h>
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <asm/hypervisor.h>
+#else
  #include <asm/xen/hypervisor.h>
  #include <asm/xen/hypercall.h>
+#endif
  
  #include <xen/xen.h>
  #include <xen/xenbus.h>
  #include <xen/interface/xen.h>
  #include <xen/interface/version.h>
  
+#include "xenbus/xenbus_comms.h"
+
  #define HYPERVISOR_ATTR_RO(_name) \
  static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
  
@@ -118,9 +124,8 @@ static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
  {
         char *vm, *val;
         int ret;
-       extern int xenstored_ready;
  
-       if (!xenstored_ready)
+       if (!is_xenstored_ready())
                 return -EBUSY;
  
         vm = xenbus_read(XBT_NIL, "vm", "", NULL);
@@ -355,6 +360,35 @@ static void xen_properties_destroy(void)
         sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
  }
  
+#if defined(CONFIG_XEN) && defined(CONFIG_KEXEC)
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
+{      /* prints "<paddr_vmcoreinfo_xen> <vmcoreinfo_size_xen>\n" in hex */
+       return sprintf(page, "%lx %zx\n",
+                      paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_sysfs_vmcoreinfo_init(void)
+{      /* the file is created only when vmcoreinfo_size_xen is non-zero */
+       if (!vmcoreinfo_size_xen)
+               return 0;
+       return sysfs_create_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+static void xen_sysfs_vmcoreinfo_destroy(void)
+{      /* same size check as init, so removal matches creation */
+       if (vmcoreinfo_size_xen)
+               sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+#else
+static inline int __init xen_sysfs_vmcoreinfo_init(void) { return 0; }
+static inline void xen_sysfs_vmcoreinfo_destroy(void) { }
+#endif
+
  static int __init hyper_sysfs_init(void)
  {
         int ret;
@@ -377,9 +411,11 @@ static int __init hyper_sysfs_init(void)
         ret = xen_properties_init();
         if (ret)
                 goto prop_out;
+       ret = xen_sysfs_vmcoreinfo_init();
+       if (!ret)
+               goto out;
  
-       goto out;
-
+       xen_properties_destroy();
  prop_out:
         xen_sysfs_uuid_destroy();
  uuid_out:
@@ -394,6 +430,7 @@ out:
  
  static void __exit hyper_sysfs_exit(void)
  {
+       xen_sysfs_vmcoreinfo_destroy();
         xen_properties_destroy();
         xen_compilation_destroy();
         xen_sysfs_uuid_destroy();
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c

index dcb7952..1d3c155 100644 (file)
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -18,27 +18,14 @@
  
  #include <xen/xen.h>
  #include <xen/interface/xen.h>
+#include <xen/interface/tmem.h>
+#ifdef CONFIG_PARAVIRT_XEN
  #include <asm/xen/hypercall.h>
  #include <asm/xen/page.h>
  #include <asm/xen/hypervisor.h>
-
-#define TMEM_CONTROL               0
-#define TMEM_NEW_POOL              1
-#define TMEM_DESTROY_POOL          2
-#define TMEM_NEW_PAGE              3
-#define TMEM_PUT_PAGE              4
-#define TMEM_GET_PAGE              5
-#define TMEM_FLUSH_PAGE            6
-#define TMEM_FLUSH_OBJECT          7
-#define TMEM_READ                  8
-#define TMEM_WRITE                 9
-#define TMEM_XCHG                 10
-
-/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
-#define TMEM_POOL_PERSIST          1
-#define TMEM_POOL_SHARED           2
-#define TMEM_POOL_PAGESIZE_SHIFT   4
-#define TMEM_VERSION_SHIFT        24
+#else
+#include <asm/hypervisor.h>
+#endif
  
  
  struct tmem_pool_uuid {
@@ -73,7 +60,7 @@ static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid,
         op.u.gen.tmem_offset = tmem_offset;
         op.u.gen.pfn_offset = pfn_offset;
         op.u.gen.len = len;
-       set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn);
+       op.u.gen.cmfn = gmfn;
         rc = HYPERVISOR_tmem_op(&op);
         return rc;
  }
@@ -87,11 +74,11 @@ static int xen_tmem_new_pool(struct tmem_pool_uuid uuid,
         for (pageshift = 0; pagesize != 1; pageshift++)
                 pagesize >>= 1;
         flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT;
-       flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT;
+       flags |= TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT;
         op.cmd = TMEM_NEW_POOL;
-       op.u.new.uuid[0] = uuid.uuid_lo;
-       op.u.new.uuid[1] = uuid.uuid_hi;
-       op.u.new.flags = flags;
+       op.u.creat.uuid[0] = uuid.uuid_lo;
+       op.u.creat.uuid[1] = uuid.uuid_hi;
+       op.u.creat.flags = flags;
         rc = HYPERVISOR_tmem_op(&op);
         return rc;
  }
@@ -388,7 +375,7 @@ static int __init xen_tmem_init(void)
         }
  #endif
  #ifdef CONFIG_CLEANCACHE
-       BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
+       BUILD_BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
         if (tmem_enabled && use_cleancache) {
                 char *s = "";
                 struct cleancache_ops old_ops =
diff --git a/drivers/xen/tpmback/Makefile b/drivers/xen/tpmback/Makefile

new file mode 100644 (file)

index 0000000..d5865c4
--- /dev/null
+++ b/drivers/xen/tpmback/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_XEN_TPMDEV_BACKEND)       += tpmbk.o
+
+tpmbk-y += tpmback.o interface.o xenbus.o
diff --git a/drivers/xen/tpmback/common.h b/drivers/xen/tpmback/common.h

new file mode 100644 (file)

index 0000000..c5ec097
--- /dev/null
+++ b/drivers/xen/tpmback/common.h
@@ -0,0 +1,93 @@
+/******************************************************************************
+ * drivers/xen/tpmback/common.h
+ */
+
+#ifndef __TPM__BACKEND__COMMON_H__
+#define __TPM__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <xen/xenbus.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/io/tpmif.h>
+
+#define DPRINTK(_f, _a...)                     \
+       pr_debug("(file=%s, line=%d) " _f,      \
+                __FILE__ , __LINE__ , ## _a )
+
+struct backend_info
+{
+       struct xenbus_device *dev;
+
+       /* our communications channel */
+       struct tpmif_st *tpmif;
+
+       long int frontend_id;
+       long int instance; // instance of TPM
+       u8 is_instance_set;// whether instance number has been set
+
+       /* watch front end for changes */
+       struct xenbus_watch backend_watch;
+};
+
+typedef struct tpmif_st {
+       struct list_head tpmif_list;    /* link in the global tpmif_list */
+       /* Unique identifier for this interface. */
+       domid_t domid;
+       unsigned int handle;
+
+       /* Physical parameters of the comms window. */
+       unsigned int irq;       /* 0 until tpmif_map() binds the event channel */
+
+       /* The shared rings and indexes. */
+       tpmif_tx_interface_t *tx;       /* address of the mapped ring (tx_area->addr) */
+       struct vm_struct *tx_area;      /* area from xenbus_map_ring_valloc() */
+
+       /* Miscellaneous private stuff. */
+       enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+       int active;
+
+       struct tpmif_st *hash_next;
+       struct list_head list;  /* scheduling list */
+       atomic_t refcnt;        /* changed via tpmif_get() / tpmif_put() */
+
+       struct backend_info *bi;
+
+       struct page **mmap_pages;       /* TPMIF_TX_RING_SIZE pages, see alloc_tpmif() */
+
+       char devname[20];       /* "tpmif<domid>", passed when binding the irq */
+} tpmif_t;
+
+void tpmif_disconnect_complete(tpmif_t * tpmif);
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi);
+int tpmif_interface_init(void);
+void tpmif_interface_exit(void);
+void tpmif_schedule_work(tpmif_t * tpmif);
+void tpmif_deschedule_work(tpmif_t * tpmif);
+int tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
+int tpmif_map(tpmif_t *, grant_ref_t, evtchn_port_t);
+irqreturn_t tpmif_be_int(int irq, void *dev_id);
+
+long int tpmback_get_instance(struct backend_info *bi);
+
+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs);
+
+
+#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define tpmif_put(_b)                                  \
+       do {                                            \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       tpmif_disconnect_complete(_b);  \
+       } while (0)
+
+extern int num_frontends;
+
+static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx)
+{      /* kernel virtual address of page idx in t->mmap_pages */
+       return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx]));
+}
+
+#endif /* __TPM__BACKEND__COMMON_H__ */
diff --git a/drivers/xen/tpmback/interface.c b/drivers/xen/tpmback/interface.c

new file mode 100644 (file)

index 0000000..37850c8
--- /dev/null
+++ b/drivers/xen/tpmback/interface.c
@@ -0,0 +1,133 @@
+ /*****************************************************************************
+ * drivers/xen/tpmback/interface.c
+ *
+ * Virtual TPM interface management.
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ *
+ * This code has been derived from drivers/xen/netback/interface.c
+ * Copyright (c) 2004, Keir Fraser
+ */
+
+#include "common.h"
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+
+static struct kmem_cache *tpmif_cachep;
+int num_frontends = 0;
+
+LIST_HEAD(tpmif_list);
+
+/* Allocate an interface for (domid, bi) with refcnt initialised to 1 and add
+ * it to tpmif_list.  Returns ERR_PTR(-ENOMEM) if either allocation fails. */
+static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi)
+{
+       tpmif_t *nif = kmem_cache_zalloc(tpmif_cachep, GFP_KERNEL);
+
+       if (!nif)
+               goto fail;
+
+       nif->domid = domid;
+       nif->status = DISCONNECTED;
+       nif->bi = bi;
+       snprintf(nif->devname, sizeof(nif->devname), "tpmif%d", domid);
+       atomic_set(&nif->refcnt, 1);
+
+       nif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE);
+       if (!nif->mmap_pages)
+               goto fail_free;
+
+       list_add(&nif->tpmif_list, &tpmif_list);
+       num_frontends++;
+       return nif;
+
+ fail_free:
+       kmem_cache_free(tpmif_cachep, nif);
+ fail:
+       pr_err("%s: out of memory\n", __FUNCTION__);
+       return ERR_PTR(-ENOMEM);
+}
+
+static void free_tpmif(tpmif_t * tpmif)
+{      /* reverses alloc_tpmif(): count, list entry, pages, then the object */
+       num_frontends--;
+       list_del(&tpmif->tpmif_list);
+       free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE);
+       kmem_cache_free(tpmif_cachep, tpmif);
+}
+
+/* Look up the interface for bi (taking a reference) or allocate a new one */
+tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi)
+{
+       tpmif_t *cur;
+
+       list_for_each_entry(cur, &tpmif_list, tpmif_list) {
+               if (cur->bi != bi)
+                       continue;
+               /* Same backend but a different domain id is a clash */
+               if (cur->domid != domid)
+                       return ERR_PTR(-EEXIST);
+               tpmif_get(cur);
+               return cur;
+       }
+
+       return alloc_tpmif(domid, bi);
+}
+
+int tpmif_map(tpmif_t *tpmif, grant_ref_t ring_ref, evtchn_port_t evtchn)
+{
+       struct vm_struct *area;
+       int err;
+       /* Already connected (irq bound): nothing to do */
+       if (tpmif->irq)
+               return 0;
+       /* Map the frontend's ring page, then bind its event channel */
+       area = xenbus_map_ring_valloc(tpmif->bi->dev, ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       tpmif->tx_area = area;
+       tpmif->tx = (tpmif_tx_interface_t *)area->addr;
+       clear_page(tpmif->tx);
+
+       err = bind_interdomain_evtchn_to_irqhandler(
+               tpmif->domid, evtchn, tpmif_be_int, 0, tpmif->devname, tpmif);
+       if (err < 0) {
+               /* Drop the stale pointers so teardown cannot unmap twice */
+               xenbus_unmap_ring_vfree(tpmif->bi->dev, area);
+               tpmif->tx = NULL;
+               tpmif->tx_area = NULL;
+               return err;
+       }
+       tpmif->irq = err;
+       tpmif->active = 1;
+       return 0;
+}
+
+void tpmif_disconnect_complete(tpmif_t *tpmif)
+{      /* called from tpmif_put() when the reference count reaches zero */
+       if (tpmif->irq)
+               unbind_from_irqhandler(tpmif->irq, tpmif);
+       /* A non-NULL tx means the ring is still mapped */
+       if (tpmif->tx)
+               xenbus_unmap_ring_vfree(tpmif->bi->dev, tpmif->tx_area);
+       /* Finally unlink and free the interface itself */
+       free_tpmif(tpmif);
+}
+
+int __init tpmif_interface_init(void)
+{      /* creates the slab cache for tpmif_t; -ENOMEM if that fails */
+       tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t),
+                                        0, 0, NULL);
+       return tpmif_cachep ? 0 : -ENOMEM;
+}
+
+void tpmif_interface_exit(void)
+{      /* destroys the cache created by tpmif_interface_init() */
+       kmem_cache_destroy(tpmif_cachep);
+}
diff --git a/drivers/xen/tpmback/tpmback.c b/drivers/xen/tpmback/tpmback.c

new file mode 100644 (file)

index 0000000..0a80f83
--- /dev/null
+++ b/drivers/xen/tpmback/tpmback.c
@@ -0,0 +1,947 @@
+/******************************************************************************
+ * drivers/xen/tpmback/tpmback.c
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from drivers/xen/netback/netback.c
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ */
+
+#include "common.h"
+#include <xen/evtchn.h>
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <xen/xenbus.h>
+#include <xen/interface/grant_table.h>
+#include <xen/gnttab.h>
+
+/* local data structures */
+struct data_exchange {
+       struct list_head pending_pak;   /* queued, not yet fully read */
+       struct list_head current_pak;   /* fully read, response awaited */
+       unsigned int copied_so_far;     /* bytes of head packet already read */
+       u8 has_opener:1;                /* device node is currently open */
+       u8 aborted:1;                   /* makes the next read fail with -EIO */
+       rwlock_t pak_lock;      // protects all of the previous fields
+       wait_queue_head_t wait_queue;   /* readers wait for pending_pak */
+};
+
+struct vtpm_resp_hdr {         /* leading bytes of a vtpm_op_write() buffer */
+       uint32_t instance_no;   /* vTPM instance, network byte order */
+       uint16_t tag_no;
+       uint32_t len_no;        /* response length, network byte order */
+       uint32_t ordinal_no;
+} __attribute__ ((packed));
+
+struct packet {
+       struct list_head next;          /* link in pending_pak/current_pak */
+       unsigned int data_len;          /* size passed to packet_alloc() */
+       u8 *data_buffer;                /* optional; kfree'd with the packet */
+       tpmif_t *tpmif;                 /* originating interface (ref held) */
+       u32 tpm_instance;               /* from tpmback_get_instance() */
+       u8 req_tag;                     /* tag used for fail responses */
+       u32 last_read;                  /* read cursor, incl. 4-byte prefix */
+       u8 flags;                       /* PACKET_FLAG_* */
+       struct timer_list processing_timer;     /* runs processing_timeout() */
+};
+
+enum {
+       PACKET_FLAG_DISCARD_RESPONSE = 1,       /* ack, but never send, a reply */
+};
+
+/* local variables: the single exchange state shared by all file operations */
+static struct data_exchange dataex;
+
+/* local function prototypes */
+static int _packet_write(struct packet *pak,
+                        const char *data, size_t size, int userbuffer);
+static void processing_timeout(unsigned long ptr);
+static int packet_read_shmem(struct packet *pak,
+                            tpmif_t * tpmif,
+                            u32 offset,
+                            char *buffer, int isuserbuffer, u32 left);
+static int vtpm_queue_packet(struct packet *pak);
+
+/***************************************************************
+ Buffer copying for user and kernel space buffers.
+***************************************************************/
+static inline int copy_from_buffer(void *to,
+                                  const void *from, unsigned long size,
+                                  int isuserbuffer)
+{
+       /* Kernel-resident source: a plain copy is all it takes. */
+       if (!isuserbuffer) {
+               memcpy(to, from, size);
+               return 0;
+       }
+       /* User-space source: any uncopied byte means -EFAULT. */
+       return copy_from_user(to, (void __user *)from, size) ? -EFAULT : 0;
+}
+
+static inline int copy_to_buffer(void *to,
+                                const void *from, unsigned long size,
+                                int isuserbuffer)
+{
+       /* Kernel-resident destination: a plain copy is all it takes. */
+       if (!isuserbuffer) {
+               memcpy(to, from, size);
+               return 0;
+       }
+       /* User-space destination: any uncopied byte means -EFAULT. */
+       return copy_to_user((void __user *)to, from, size) ? -EFAULT : 0;
+}
+
+
+static void dataex_init(struct data_exchange *dataex)
+{
+       rwlock_init(&dataex->pak_lock);
+       init_waitqueue_head(&dataex->wait_queue);
+       INIT_LIST_HEAD(&dataex->pending_pak);
+       INIT_LIST_HEAD(&dataex->current_pak);
+       dataex->has_opener = 0; /* nobody has the device open yet */
+}
+
+/***************************************************************
+ Packet-related functions
+***************************************************************/
+
+static struct packet *packet_find_instance(struct list_head *head,
+                                          u32 tpm_instance)
+{
+       struct packet *cur;
+
+       /*
+        * Scan the list front to back and hand out the first packet
+        * that was created for the requested vTPM instance.
+        */
+       list_for_each_entry(cur, head, next) {
+               if (cur->tpm_instance != tpm_instance)
+                       continue;
+
+               return cur;
+       }
+
+       /* Nothing on this list belongs to that instance. */
+       return NULL;
+}
+
+static struct packet *packet_find_packet(struct list_head *head, void *packet)
+{
+       struct packet *cur;
+
+       /*
+        * Membership test by address: report 'packet' itself if it is
+        * currently linked into the given list.
+        */
+       list_for_each_entry(cur, head, next) {
+               if (cur != packet)
+                       continue;
+
+               return cur;
+       }
+
+       /* 'packet' is not on this list. */
+       return NULL;
+}
+
+static struct packet *packet_alloc(tpmif_t * tpmif,
+                                  u32 size, u8 req_tag, u8 flags)
+{
+       struct packet *pak = NULL;
+       pak = kzalloc(sizeof (struct packet), GFP_ATOMIC);
+       if (NULL != pak) {
+               if (tpmif) {
+                       pak->tpmif = tpmif;
+                       pak->tpm_instance = tpmback_get_instance(tpmif->bi);
+                       tpmif_get(tpmif);       /* dropped in packet_free() */
+               }
+               pak->data_len = size;
+               pak->req_tag = req_tag;
+               pak->last_read = 0;
+               pak->flags = flags;
+
+               /*
+                * Only bind the timeout handler to this packet here; the
+                * timer itself is armed when the packet gets queued.
+                */
+               init_timer(&pak->processing_timer);
+               pak->processing_timer.function = processing_timeout;
+               pak->processing_timer.data = (unsigned long)pak;
+       }
+       return pak;     /* NULL if the atomic allocation failed */
+}
+
+static void inline packet_reset(struct packet *pak)
+{
+       pak->last_read = 0;     /* next packet_read() restarts at the prefix */
+}
+
+static void packet_free(struct packet *pak)
+{
+       /*
+        * Callers must have stopped the processing timer first: its
+        * handler is handed this very packet as its argument.
+        */
+       BUG_ON(timer_pending(&pak->processing_timer));
+
+       /* Drop the interface reference taken in packet_alloc(). */
+       if (pak->tpmif)
+               tpmif_put(pak->tpmif);
+
+       kfree(pak->data_buffer);
+       kfree(pak);
+}
+
+
+/*
+ * Deliver a response to the front-end, unless the packet has been
+ * flagged to have its response discarded.
+ */
+static int packet_write(struct packet *pak,
+                       const char *data, size_t size, int isuserbuffer)
+{
+       /*
+        * A discarded response is not sent at all; it is only
+        * acknowledged by reporting the whole buffer as written.
+        */
+       if (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)
+               return size;
+
+       /* Normal case: copy into shared memory and notify the FE. */
+       return _packet_write(pak, data, size, isuserbuffer);
+}
+
+int _packet_write(struct packet *pak,
+                 const char *data, size_t size, int isuserbuffer)
+{
+       /*
+        * Copy 'data' page by page into the buffers granted by the front
+        * end and notify it. Returns the bytes written or -EFAULT.
+        */
+       tpmif_t *tpmif = pak->tpmif;
+       grant_handle_t handle;
+       int rc = 0;
+       unsigned int i = 0;
+       unsigned int offset = 0;
+
+       if (tpmif == NULL) {
+               return -EFAULT;
+       }
+
+       if (tpmif->status == DISCONNECTED) {
+               return size;
+       }
+
+       while (offset < size && i < TPMIF_TX_RING_SIZE) {
+               unsigned int tocopy;
+               struct gnttab_map_grant_ref map_op;
+               struct gnttab_unmap_grant_ref unmap_op;
+               tpmif_tx_request_t *tx;
+
+               tx = &tpmif->tx->ring[i].req;
+
+               if (0 == tx->addr) {
+                       DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i);
+                       return 0;
+               }
+
+               gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+                                 GNTMAP_host_map, tx->ref, tpmif->domid);
+
+               gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map_op);
+
+               if (map_op.status != GNTST_okay) {
+                       DPRINTK(" Grant table operation failure !\n");
+                       return 0;
+               }
+
+               handle = map_op.handle;
+
+               tocopy = min_t(size_t, size - offset, PAGE_SIZE);
+
+               /* Copy first, but unmap the grant even if the copy faults. */
+               rc = copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) |
+                                              (tx->addr & ~PAGE_MASK)),
+                                     &data[offset], tocopy, isuserbuffer);
+               if (!rc)
+                       tx->size = tocopy;
+
+               gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+                                   GNTMAP_host_map, handle);
+
+               if (unlikely
+                   (HYPERVISOR_grant_table_op
+                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+                       BUG();
+               }
+               if (rc)         /* no tpmif_put(): packet_free() owns the ref */
+                       return rc;
+               offset += tocopy;
+               i++;
+       }
+
+       rc = offset;
+       DPRINTK("Notifying frontend via irq %d\n", tpmif->irq);
+       notify_remote_via_irq(tpmif->irq);
+
+       return rc;
+}
+
+/*
+ * Copy the next part of a packet into 'buffer': first the 4-byte
+ * instance prefix, then the payload. Advances pak->last_read and
+ * returns the bytes copied by this call or a negative errno.
+ */
+static int packet_read(struct packet *pak, size_t numbytes,
+                      char *buffer, size_t buffersize, int isuserbuffer)
+{
+       tpmif_t *tpmif = pak->tpmif;
+
+       /*
+        * 'numbytes' only bounds the prefix copy below; the payload
+        * copy is bounded by 'buffersize'. The prefix is the instance
+        * number in network byte order.
+        */
+       u32 to_copy;
+       u32 offset = 0;
+       u32 room_left = buffersize;
+
+       if (pak->last_read < 4) {
+               /*
+                * copy the not yet delivered part of the instance number
+                */
+               u32 instance_no = htonl(pak->tpm_instance);
+               u32 last_read = pak->last_read;
+
+               to_copy = min_t(size_t, 4 - last_read, numbytes);
+
+               if (copy_to_buffer(&buffer[0],
+                                  &(((u8 *) & instance_no)[last_read]),
+                                  to_copy, isuserbuffer)) {
+                       return -EFAULT;
+               }
+
+               pak->last_read += to_copy;
+               offset += to_copy;
+               room_left -= to_copy;
+       }
+
+       /*
+        * Payload: from pak->data_buffer if set, else from the shared ring.
+        */
+
+       if (room_left > 0) {
+               if (pak->data_buffer) {
+                       u32 to_copy = min_t(u32, pak->data_len - offset, room_left);
+                       u32 last_read = pak->last_read - 4;
+
+                       if (copy_to_buffer(&buffer[offset],
+                                          &pak->data_buffer[last_read],
+                                          to_copy, isuserbuffer)) {
+                               return -EFAULT;
+                       }
+                       pak->last_read += to_copy;
+                       offset += to_copy;
+               } else {
+                       offset = packet_read_shmem(pak,
+                                                  tpmif,
+                                                  offset,
+                                                  buffer,
+                                                  isuserbuffer, room_left);
+               }
+       }
+       return offset;  /* a negative packet_read_shmem() result passes through */
+}
+
+static int packet_read_shmem(struct packet *pak,
+                            tpmif_t * tpmif,
+                            u32 offset, char *buffer, int isuserbuffer,
+                            u32 room_left)
+{
+       u32 last_read = pak->last_read - 4;
+       u32 i = (last_read / PAGE_SIZE);
+       u32 pg_offset = last_read & (PAGE_SIZE - 1);
+       u32 to_copy;
+       grant_handle_t handle;
+       tpmif_tx_request_t *tx;
+       int err;
+
+       /*
+        * Start copying data at the ring page with index 'i' and
+        * within that page at offset 'pg_offset'. Copy a maximum of
+        * 'room_left' bytes, appending at buffer[offset], and never
+        * walk past the last ring entry, whatever sizes the front
+        * end advertises. Returns the new offset or -EFAULT.
+        */
+       to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left);
+       while (to_copy > 0 && i < TPMIF_TX_RING_SIZE) {
+               void *src;
+               struct gnttab_map_grant_ref map_op;
+               struct gnttab_unmap_grant_ref unmap_op;
+
+               tx = &tpmif->tx->ring[i].req;
+
+               gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i),
+                                 GNTMAP_host_map, tx->ref, tpmif->domid);
+
+               gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &map_op);
+
+               if (map_op.status != GNTST_okay) {
+                       DPRINTK(" Grant table operation failure !\n");
+                       return -EFAULT;
+               }
+
+               handle = map_op.handle;
+
+               /* Never hand out more than this ring entry holds. */
+               to_copy = min_t(u32, tx->size, to_copy);
+
+               DPRINTK("Copying from mapped memory at %08lx\n",
+                       (unsigned long)(idx_to_kaddr(tpmif, i) |
+                                       (tx->addr & ~PAGE_MASK)));
+
+               src = (void *)(idx_to_kaddr(tpmif, i) |
+                              ((tx->addr & ~PAGE_MASK) + pg_offset));
+               /* Copy first, but unmap the grant even if the copy faults. */
+               err = copy_to_buffer(&buffer[offset],
+                                    src, to_copy, isuserbuffer);
+
+               gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i),
+                                   GNTMAP_host_map, handle);
+
+               if (unlikely
+                   (HYPERVISOR_grant_table_op
+                    (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) {
+                       BUG();
+               }
+
+               if (err)
+                       return err;
+
+               DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n",
+                       tpmif->domid, buffer[offset], buffer[offset + 1],
+                       buffer[offset + 2], buffer[offset + 3]);
+
+               offset += to_copy;
+               pg_offset = 0;
+               last_read += to_copy;
+               room_left -= to_copy;
+
+               to_copy = min_t(u32, PAGE_SIZE, room_left);
+               i++;
+       }                       /* while (to_copy > 0 && i in ring) */
+
+       /*
+        * Adjust the last_read pointer
+        */
+       pak->last_read = last_read + 4;
+       return offset;
+}
+
+/* ============================================================
+ * The file layer for reading data from this device
+ * ============================================================
+ */
+static int vtpm_op_open(struct inode *inode, struct file *f)
+{
+       unsigned long flags;
+       int rc = -EPERM;
+
+       /* Exclusive device: only claim it if nobody holds it open. */
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       if (!dataex.has_opener) {
+               dataex.has_opener = 1;
+               rc = 0;
+       }
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+       return rc;
+}
+
+static ssize_t vtpm_op_read(struct file *file,
+                           char __user * data, size_t size, loff_t * offset)
+{
+       int ret_size = -ENODATA;
+       struct packet *pak = NULL;
+       unsigned long flags;
+
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       if (dataex.aborted) {   /* set by vtpm_release_packets() */
+               dataex.aborted = 0;
+               dataex.copied_so_far = 0;
+               write_unlock_irqrestore(&dataex.pak_lock, flags);
+               return -EIO;
+       }
+
+       if (list_empty(&dataex.pending_pak)) {
+               write_unlock_irqrestore(&dataex.pak_lock, flags);
+               wait_event_interruptible(dataex.wait_queue,
+                                        !list_empty(&dataex.pending_pak));
+               write_lock_irqsave(&dataex.pak_lock, flags);
+               dataex.copied_so_far = 0;
+       }
+       /* An interrupted wait leaves the list empty: -ENODATA is returned. */
+       if (!list_empty(&dataex.pending_pak)) {
+               unsigned int left;
+
+               pak = list_entry(dataex.pending_pak.next, struct packet, next);
+               left = pak->data_len - dataex.copied_so_far;
+               list_del(&pak->next);
+               write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+               DPRINTK("size given by app: %zu, available: %u\n", size, left);
+
+               ret_size = min_t(size_t, size, left);
+               /* The packet is off the list: copy to user space unlocked. */
+               ret_size = packet_read(pak, ret_size, data, size, 1);
+
+               write_lock_irqsave(&dataex.pak_lock, flags);
+               /* NOTE(review): del_*_sync() below runs under pak_lock - verify. */
+               if (ret_size < 0) {
+                       del_singleshot_timer_sync(&pak->processing_timer);
+                       packet_free(pak);
+                       dataex.copied_so_far = 0;
+               } else {
+                       DPRINTK("Copied %d bytes to user buffer\n", ret_size);
+
+                       dataex.copied_so_far += ret_size;
+                       if (dataex.copied_so_far >= pak->data_len + 4) {
+                               DPRINTK("All data from this packet given to app.\n");
+                               /* All data given to app: await the response */
+
+                               del_singleshot_timer_sync(&pak->
+                                                         processing_timer);
+                               list_add_tail(&pak->next, &dataex.current_pak);
+                               /*
+                                * The more frontends that are handled at the same time,
+                                * the more time we give the TPM to process the request.
+                                */
+                               mod_timer(&pak->processing_timer,
+                                         jiffies + (num_frontends * 60 * HZ));
+                               dataex.copied_so_far = 0;
+                       } else {        /* partial read: back to the list head */
+                               list_add(&pak->next, &dataex.pending_pak);
+                       }
+               }
+       }
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+       DPRINTK("Returning result from read to app: %d\n", ret_size);
+
+       return ret_size;
+}
+
+/*
+ * Write a TPM response - only works for a request that was read before!
+ */
+static ssize_t vtpm_op_write(struct file *file,
+                            const char __user * data, size_t size,
+                            loff_t * offset)
+{
+       struct packet *pak;
+       int rc = 0;
+       unsigned int off = 4;   /* size of the instance number prefix */
+       unsigned long flags;
+       struct vtpm_resp_hdr vrh;
+
+       /*
+        * Minimum required packet size is:
+        * 4 bytes for instance number
+        * 2 bytes for tag
+        * 4 bytes for paramSize
+        * 4 bytes for the ordinal
+        * sum: 14 bytes
+        */
+       if (size < sizeof (vrh))
+               return -EFAULT;
+
+       if (copy_from_user(&vrh, data, sizeof (vrh)))
+               return -EFAULT;
+
+       /* malformed packet? length must cover all but the prefix */
+       if ((off + ntohl(vrh.len_no)) != size)
+               return -EFAULT;
+
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       pak = packet_find_instance(&dataex.current_pak,
+                                  ntohl(vrh.instance_no));
+
+       if (pak == NULL) {
+               write_unlock_irqrestore(&dataex.pak_lock, flags);
+               DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n",
+                       ntohl(vrh.instance_no));
+               return -EFAULT;
+       }
+
+       del_singleshot_timer_sync(&pak->processing_timer);
+       list_del(&pak->next);
+
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+       /*
+        * The first 'off' bytes must be the instance number - skip them.
+        */
+       size -= off;
+
+       rc = packet_write(pak, &data[off], size, 1);
+
+       if (rc > 0) {
+               /* account for the skipped 4-byte prefix */
+               rc += off;
+       }
+       packet_free(pak);       /* the request has been answered */
+       return rc;
+}
+
+static int vtpm_op_release(struct inode *inode, struct file *file)
+{
+       unsigned long flags;
+
+       vtpm_release_packets(NULL, 1);  /* all interfaces, with fail replies */
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       dataex.has_opener = 0;          /* let the next open() succeed */
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+       return 0;
+}
+
+static unsigned int vtpm_op_poll(struct file *file,
+                                struct poll_table_struct *pts)
+{
+       unsigned int mask = POLLOUT | POLLWRNORM;       /* always writable */
+
+       poll_wait(file, &dataex.wait_queue, pts);
+       /* Readable only while a request is waiting to be picked up. */
+       if (!list_empty(&dataex.pending_pak))
+               mask |= POLLIN | POLLRDNORM;
+       return mask;
+}
+
+static const struct file_operations vtpm_ops = {
+       .owner = THIS_MODULE,
+       .llseek = no_llseek,
+       .open = vtpm_op_open,           /* single opener only */
+       .read = vtpm_op_read,           /* hands out pending requests */
+       .write = vtpm_op_write,         /* takes the matching responses */
+       .release = vtpm_op_release,     /* drops all outstanding packets */
+       .poll = vtpm_op_poll,
+};
+
+static struct miscdevice vtpms_miscdevice = {
+       .minor = 225,           /* fixed (not dynamically assigned) minor */
+       .name = "vtpm",
+       .fops = &vtpm_ops,
+};
+
+/***************************************************************
+ Utility functions
+***************************************************************/
+
+static int tpm_send_fail_message(struct packet *pak, u8 req_tag)
+{
+       int rc;
+       static const unsigned char tpm_error_message_fail[] = {
+               0x00, 0x00,             /* tag; low byte patched below */
+               0x00, 0x00, 0x00, 0x0a, /* length of this message: 10 bytes */
+               0x00, 0x00, 0x00, 0x09  /* TPM_FAIL */
+       };
+       unsigned char buffer[sizeof (tpm_error_message_fail)];
+
+       memcpy(buffer, tpm_error_message_fail,
+              sizeof (tpm_error_message_fail));
+       /*
+        * Insert the right response tag depending on the given tag.
+        * All response tags are '+3' to the request tag.
+        */
+       buffer[1] = req_tag + 3;
+
+       /*
+        * Write the kernel-side buffer to shared memory and notify the FE
+        */
+       rc = packet_write(pak, buffer, sizeof (buffer), 0);
+
+       return rc;
+}
+
+static int _vtpm_release_packets(struct list_head *head,
+                                tpmif_t * tpmif, int send_msgs)
+{
+       int aborted = 0;        /* set if the list's first packet was released */
+       int c = 0;              /* 1-based position of 'pak' in the list */
+       struct packet *pak;
+       struct list_head *pos, *tmp;
+
+       list_for_each_safe(pos, tmp, head) {
+               pak = list_entry(pos, struct packet, next);
+               c += 1;
+               /* A NULL tpmif matches the packets of every interface. */
+               if (tpmif == NULL || pak->tpmif == tpmif) {
+                       int can_send = 0;
+
+                       del_singleshot_timer_sync(&pak->processing_timer);
+                       list_del(&pak->next);
+                       /* A fail reply only makes sense on a live interface. */
+                       if (pak->tpmif && pak->tpmif->status == CONNECTED) {
+                               can_send = 1;
+                       }
+
+                       if (send_msgs && can_send) {
+                               tpm_send_fail_message(pak, pak->req_tag);
+                       }
+                       packet_free(pak);
+                       if (c == 1)
+                               aborted = 1;
+               }
+       }
+       return aborted;
+}
+
+int vtpm_release_packets(tpmif_t * tpmif, int send_msgs)
+{
+       unsigned long flags;
+
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       /* Losing the head of pending_pak makes the next read return -EIO. */
+       dataex.aborted = _vtpm_release_packets(&dataex.pending_pak,
+                                              tpmif,
+                                              send_msgs);
+       _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs);
+
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+       return 0;       /* no failure is reported to the caller */
+}
+
+static int vtpm_queue_packet(struct packet *pak)
+{
+       unsigned long flags;
+
+       /* Without a reader on the device nobody would ever pick it up. */
+       if (!dataex.has_opener)
+               return -EFAULT;
+
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       list_add_tail(&pak->next, &dataex.pending_pak);
+       /* give the TPM some time to pick up the request */
+       mod_timer(&pak->processing_timer, jiffies + (30 * HZ));
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+
+       /* Let a reader sleeping in vtpm_op_read() know there is work. */
+       wake_up_interruptible(&dataex.wait_queue);
+
+       return 0;
+}
+
+static int vtpm_receive(tpmif_t * tpmif, u32 size)
+{
+       int rc = -EINVAL;       /* also reported when the header read fails */
+       unsigned char buffer[10] = { 0 };       /* never leak stack bytes */
+       __be32 *native_size;
+       struct packet *pak = packet_alloc(tpmif, size, 0, 0);
+
+       if (!pak)
+               return -ENOMEM;
+       /*
+        * Read 10 bytes (4-byte instance prefix, 2-byte tag, 4-byte
+        * size) from the received buffer to test its content for validity.
+        */
+       if (sizeof (buffer) != packet_read(pak,
+                                          sizeof (buffer), buffer,
+                                          sizeof (buffer), 0)) {
+               goto failexit;
+       }
+       /*
+        * Reset the packet read pointer so we can read all its
+        * contents again.
+        */
+       packet_reset(pak);
+
+       native_size = (__force __be32 *) (&buffer[4 + 2]);
+       /*
+        * Verify that the size of the packet is correct
+        * as indicated and that there's actually someone reading packets.
+        * The minimum size of the packet is '10' for tag, size indicator
+        * and ordinal.
+        */
+       if (size < 10 ||
+           be32_to_cpu(*native_size) != size ||
+           0 == dataex.has_opener || tpmif->status != CONNECTED) {
+               rc = -EINVAL;
+               goto failexit;
+       } else {
+               rc = vtpm_queue_packet(pak);
+               if (rc < 0)
+                       goto failexit;
+       }
+       return 0;
+
+      failexit:
+       /* Tell the front end right away; the tag byte is 0 if unread. */
+       tpm_send_fail_message(pak, buffer[4 + 1]);
+       packet_free(pak);
+
+       return rc;
+}
+
+/*
+ * Timeout function that gets invoked when a packet has not been processed
+ * during the timeout period.
+ * The packet must be on a list when this function is invoked. This
+ * also means that once it is taken off a list, the timer must be
+ * stopped as well.
+ */
+static void processing_timeout(unsigned long ptr)
+{
+       struct packet *pak = (struct packet *)ptr;
+       unsigned long flags;
+
+       write_lock_irqsave(&dataex.pak_lock, flags);
+       /*
+        * Only act if the packet is still linked into one of the
+        * two lists.
+        */
+       if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
+           pak == packet_find_packet(&dataex.current_pak, pak)) {
+               if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
+                       tpm_send_fail_message(pak, pak->req_tag);
+               }
+               /* stays queued; a late response is acked but dropped */
+               pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
+       }
+
+       write_unlock_irqrestore(&dataex.pak_lock, flags);
+}
+
+static void tpm_tx_action(unsigned long unused);
+static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
+/* Interfaces with work queued for tpm_tx_action(), and the list's lock. */
+static struct list_head tpm_schedule_list;
+static spinlock_t tpm_schedule_list_lock;
+
+static inline void maybe_schedule_tx_action(void)
+{
+       smp_mb();       /* NOTE(review): presumably publishes the list update */
+       tasklet_schedule(&tpm_tx_tasklet);      /* runs tpm_tx_action() */
+}
+
+static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
+{
+       return tpmif->list.next != NULL;        /* NULLed again on removal */
+}
+
+static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
+{
+       spin_lock_irq(&tpm_schedule_list_lock);
+       if (likely(__on_tpm_schedule_list(tpmif))) {
+               list_del(&tpmif->list);
+               tpmif->list.next = NULL;        /* marks "not scheduled" */
+               tpmif_put(tpmif);       /* ref taken when it was added */
+       }
+       spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
+{
+       if (__on_tpm_schedule_list(tpmif))      /* unlocked fast path */
+               return;
+       /* NOTE(review): called from tpmif_be_int(); is _irq (not _irqsave) OK? */
+       spin_lock_irq(&tpm_schedule_list_lock);
+       if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
+               list_add_tail(&tpmif->list, &tpm_schedule_list);
+               tpmif_get(tpmif);       /* the list holds its own reference */
+       }
+       spin_unlock_irq(&tpm_schedule_list_lock);
+}
+
+void tpmif_schedule_work(tpmif_t * tpmif)
+{
+       add_to_tpm_schedule_list_tail(tpmif);   /* no-op if already queued */
+       maybe_schedule_tx_action();
+}
+
+void tpmif_deschedule_work(tpmif_t * tpmif)
+{
+       remove_from_tpm_schedule_list(tpmif);   /* also drops the list's ref */
+}
+
+static void tpm_tx_action(unsigned long unused)
+{
+       struct list_head *ent;
+       tpmif_t *tpmif;
+       tpmif_tx_request_t *tx;
+
+       DPRINTK("%s: Getting data from front-end(s)!\n", __FUNCTION__);
+
+       while (!list_empty(&tpm_schedule_list)) {
+               /* Get a tpmif from the list with work to do. */
+               ent = tpm_schedule_list.next;
+               tpmif = list_entry(ent, tpmif_t, list);
+               tpmif_get(tpmif);       /* held while the list's ref is dropped */
+               remove_from_tpm_schedule_list(tpmif);
+
+               tx = &tpmif->tx->ring[0].req;
+
+               /* pass it up: slot 0's size is taken as the request length */
+               vtpm_receive(tpmif, tx->size);
+
+               tpmif_put(tpmif);
+       }
+}
+
+irqreturn_t tpmif_be_int(int irq, void *dev_id)
+{
+       tpmif_t *tpmif = (tpmif_t *) dev_id;
+
+       add_to_tpm_schedule_list_tail(tpmif);
+       maybe_schedule_tx_action();     /* the tasklet does the real work */
+       return IRQ_HANDLED;
+}
+
+static int __init tpmback_init(void)
+{
+       int rc;
+
+       /* Set up all shared state before the device node can be opened. */
+       dataex_init(&dataex);
+       spin_lock_init(&tpm_schedule_list_lock);
+       INIT_LIST_HEAD(&tpm_schedule_list);
+
+       rc = misc_register(&vtpms_miscdevice);
+       if (rc) {
+               pr_alert("Could not register misc device for TPM BE\n");
+               return rc;
+       }
+
+       rc = tpmif_interface_init();
+       if (!rc) {
+               rc = tpmif_xenbus_init();
+               if (rc)
+                       tpmif_interface_exit();
+       }
+       if (rc) {
+               misc_deregister(&vtpms_miscdevice);
+               return rc;
+       }
+
+       pr_alert("Successfully initialized TPM backend driver\n");
+       return 0;
+}
+module_init(tpmback_init);
+
+static void __exit tpmback_exit(void)
+{
+       vtpm_release_packets(NULL, 0);  /* drop all packets, send no replies */
+       tpmif_xenbus_exit();
+       tpmif_interface_exit();
+       misc_deregister(&vtpms_miscdevice);
+}
+module_exit(tpmback_exit)
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vtpm");
diff --git a/drivers/xen/tpmback/xenbus.c b/drivers/xen/tpmback/xenbus.c

new file mode 100644 (file)

index 0000000..5132c4b
--- /dev/null
+++ b/drivers/xen/tpmback/xenbus.c
@@ -0,0 +1,268 @@
+/*  Xenbus code for tpmif backend
+    Copyright (C) 2005 IBM Corporation
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+#include <stdarg.h>
+#include <xen/xenbus.h>
+#include "common.h"
+
+static void maybe_connect(struct backend_info *be);
+static void connect(struct backend_info *be);
+static int connect_ring(struct backend_info *be);
+static void backend_changed(struct xenbus_watch *watch,
+                           const char **vec, unsigned int len);
+static void frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state);
+
+/*
+ * Return the instance number recorded for @bi, or -1 when @bi is NULL or no
+ * instance has been set yet.
+ */
+long int tpmback_get_instance(struct backend_info *bi)
+{
+       if (!bi || !bi->is_instance_set)
+               return -1;
+
+       return bi->instance;
+}
+
+/*
+ * Tear down the per-device backend state: unregister the "instance" watch,
+ * detach and release the tpmif (if one was created), free the backend_info
+ * and clear the device's driver data.  Returns 0; does nothing when no
+ * backend_info is attached to @dev.
+ */
+static int tpmback_remove(struct xenbus_device *dev)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+       if (!be) return 0;
+
+       /* A non-NULL node is used here as the "watch is registered" marker. */
+       if (be->backend_watch.node) {
+               unregister_xenbus_watch(&be->backend_watch);
+               kfree(be->backend_watch.node);
+               be->backend_watch.node = NULL;
+       }
+       if (be->tpmif) {
+               /* Break the back-pointer before 'be' is freed below. */
+               be->tpmif->bi = NULL;
+               vtpm_release_packets(be->tpmif, 0);
+               tpmif_put(be->tpmif);
+               be->tpmif = NULL;
+       }
+       kfree(be);
+       dev_set_drvdata(&dev->dev, NULL);
+       return 0;
+}
+
+/*
+ * Probe a new vtpm backend device: allocate its backend_info, attach it to
+ * the device, watch the "instance" node below the device's xenstore path and
+ * move to InitWait.  On any failure after the allocation the state is torn
+ * down again through tpmback_remove().
+ */
+static int tpmback_probe(struct xenbus_device *dev,
+                        const struct xenbus_device_id *id)
+{
+       struct backend_info *be;
+       int err;
+
+       be = kzalloc(sizeof(struct backend_info), GFP_KERNEL);
+       if (be == NULL) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                                "allocating backend structure");
+               return -ENOMEM;
+       }
+
+       be->is_instance_set = 0;
+       be->dev = dev;
+       dev_set_drvdata(&dev->dev, be);
+
+       err = xenbus_watch_path2(dev, dev->nodename,
+                                "instance", &be->backend_watch,
+                                backend_changed);
+       if (!err)
+               err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (!err)
+               return 0;
+
+       /* Either the watch or the state switch failed: undo everything. */
+       tpmback_remove(dev);
+       return err;
+}
+
+
+/*
+ * Watch callback for the backend's "instance" node.  Reads the instance
+ * number from xenstore and stores it in the owning backend_info the first
+ * time a value is read successfully; once is_instance_set is non-zero later
+ * values are ignored.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+                           const char **vec, unsigned int len)
+{
+       int err;
+       long instance;
+       struct backend_info *be
+               = container_of(watch, struct backend_info, backend_watch);
+       struct xenbus_device *dev = be->dev;
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                          "instance","%li", &instance);
+       /* NOTE(review): presumably "node does not exist (yet)" -- confirm. */
+       if (XENBUS_EXIST_ERR(err)) {
+               return;
+       }
+
+       /* Anything other than exactly one converted field is an error. */
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading instance");
+               return;
+       }
+
+       if (be->is_instance_set == 0) {
+               be->instance = instance;
+               be->is_instance_set = 1;
+       }
+}
+
+
+/*
+ * Xenbus ->otherend_changed callback: react to a state change of the
+ * frontend.
+ *
+ *  - Initialising/Initialised: nothing to do.
+ *  - Connected: map the ring (connect_ring) and, if that worked, try to
+ *    complete the connection (maybe_connect).
+ *  - Closing: invalidate the instance number and follow to Closing.
+ *  - Closed/Unknown: switch to Closed and unregister the device.
+ *  - anything else: report a fatal error.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+                            enum xenbus_state frontend_state)
+{
+       struct backend_info *be = dev_get_drvdata(&dev->dev);
+       int err;
+
+       switch (frontend_state) {
+       case XenbusStateInitialising:
+       case XenbusStateInitialised:
+               break;
+
+       case XenbusStateConnected:
+               err = connect_ring(be);
+               if (err) {
+                       return;
+               }
+               maybe_connect(be);
+               break;
+
+       case XenbusStateClosing:
+               be->instance = -1;
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateUnknown: /* keep it here */
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               /*
+                * Unregistering the device runs the driver's ->remove hook
+                * (tpmback_remove), which frees 'be' and clears the driver
+                * data.  Do not call tpmback_remove() a second time: once
+                * device_unregister() has dropped its reference, 'dev' may
+                * no longer be valid.  Use 'dev' directly rather than going
+                * through 'be' to reach the struct device.
+                */
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL,
+                                "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+
+
+/*
+ * Complete the connection if there is a tpmif and it is not already marked
+ * CONNECTED; otherwise do nothing.
+ */
+static void maybe_connect(struct backend_info *be)
+{
+       if (be->tpmif != NULL && be->tpmif->status != CONNECTED)
+               connect(be);
+}
+
+
+/*
+ * Announce readiness to the frontend: write "ready" = 1 below the backend's
+ * xenstore node inside a transaction (retried on -EAGAIN), then switch the
+ * device to Connected and mark the tpmif CONNECTED.
+ *
+ * Any xenstore failure is reported through xenbus_dev_fatal() and aborts the
+ * connection attempt; in particular the device is NOT switched to Connected
+ * when the transaction could not be committed.
+ */
+static void connect(struct backend_info *be)
+{
+       struct xenbus_transaction xbt;
+       int err;
+       struct xenbus_device *dev = be->dev;
+       unsigned long ready = 1;
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               return;
+       }
+
+       err = xenbus_printf(xbt, dev->nodename,
+                           "ready", "%lu", ready);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "writing 'ready'");
+               goto abort;
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err) {
+               /* "ready" was not committed: do not claim to be connected. */
+               xenbus_dev_fatal(dev, err, "end of transaction");
+               return;
+       }
+
+       err = xenbus_switch_state(dev, XenbusStateConnected);
+       if (!err)
+               be->tpmif->status = CONNECTED;
+       return;
+abort:
+       xenbus_transaction_end(xbt, 1);
+}
+
+
+/*
+ * Read "ring-ref" and "event-channel" from the frontend's xenstore area,
+ * look up (or create, via tpmif_find) the tpmif for this backend and map the
+ * shared ring / event channel into it.  Returns 0 on success or the error
+ * that was reported to xenbus.
+ */
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *xdev = be->dev;
+       unsigned int ring_ref, evtchn;
+       int rc;
+
+       rc = xenbus_gather(XBT_NIL, xdev->otherend,
+                          "ring-ref", "%u", &ring_ref,
+                          "event-channel", "%u", &evtchn, NULL);
+       if (rc) {
+               xenbus_dev_error(xdev, rc,
+                                "reading %s/ring-ref and event-channel",
+                                xdev->otherend);
+               return rc;
+       }
+
+       if (be->tpmif == NULL) {
+               be->tpmif = tpmif_find(xdev->otherend_id, be);
+               if (IS_ERR(be->tpmif)) {
+                       rc = PTR_ERR(be->tpmif);
+                       be->tpmif = NULL;
+                       xenbus_dev_fatal(xdev, rc, "creating vtpm interface");
+                       return rc;
+               }
+       }
+
+       /* Nothing to map when there is still no interface. */
+       if (be->tpmif == NULL)
+               return 0;
+
+       rc = tpmif_map(be->tpmif, ring_ref, evtchn);
+       if (rc)
+               xenbus_dev_error(xdev, rc,
+                                "mapping shared-frame %u port %u",
+                                ring_ref, evtchn);
+
+       return rc;
+}
+
+
+/* Device types handled by this driver; the empty string ends the table. */
+static const struct xenbus_device_id tpmback_ids[] = {
+       { "vtpm" },
+       { "" }
+};
+
+/* Defines the driver object registered as tpmback_driver in tpmif_xenbus_init(). */
+static DEFINE_XENBUS_DRIVER(tpmback, ,
+       .probe = tpmback_probe,
+       .remove = tpmback_remove,
+       .otherend_changed = frontend_changed,
+);
+
+
+/* Register the tpmback xenbus backend driver; returns the registration result. */
+int tpmif_xenbus_init(void)
+{
+       return xenbus_register_backend(&tpmback_driver);
+}
+
+/* Unregister the tpmback xenbus driver (counterpart of tpmif_xenbus_init). */
+void tpmif_xenbus_exit(void)
+{
+       xenbus_unregister_driver(&tpmback_driver);
+}
diff --git a/drivers/xen/usbback/Makefile b/drivers/xen/usbback/Makefile

new file mode 100644 (file)

index 0000000..a7548cb
--- /dev/null
+++ b/drivers/xen/usbback/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_XEN_USB_BACKEND) := usbbk.o
+
+usbbk-y   := usbstub.o xenbus.o interface.o usbback.o
+
diff --git a/drivers/xen/usbback/interface.c b/drivers/xen/usbback/interface.c

new file mode 100644 (file)

index 0000000..bd22277
--- /dev/null
+++ b/drivers/xen/usbback/interface.c
@@ -0,0 +1,190 @@
+/*
+ * interface.c
+ *
+ * Xen USB backend interface management.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/vmalloc.h>
+#include "usbback.h"
+#include <xen/evtchn.h>
+
+/* All usbif instances created by usbif_alloc(); protected by usbif_list_lock. */
+static LIST_HEAD(usbif_list);
+static DEFINE_SPINLOCK(usbif_list_lock);
+
+/*
+ * Look up the usbif registered for (@domid, @handle) on the global list.
+ * Returns the matching entry, or NULL if there is none.  The list lock is
+ * only held for the duration of the search.
+ */
+usbif_t *find_usbif(domid_t domid, unsigned int handle)
+{
+       usbif_t *pos;
+       usbif_t *match = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&usbif_list_lock, flags);
+       list_for_each_entry(pos, &usbif_list, usbif_list) {
+               if (pos->domid != domid || pos->handle != handle)
+                       continue;
+               match = pos;
+               break;
+       }
+       spin_unlock_irqrestore(&usbif_list_lock, flags);
+
+       return match;
+}
+
+/*
+ * Allocate and initialise a usbif for (@domid, @handle) and add it to the
+ * global usbif list.  Returns the new object, or NULL if the allocation
+ * failed.
+ */
+usbif_t *usbif_alloc(domid_t domid, unsigned int handle)
+{
+       usbif_t *usbif = kzalloc(sizeof(usbif_t), GFP_KERNEL);
+       unsigned long flags;
+       int slot;
+
+       if (usbif == NULL)
+               return NULL;
+
+       /* Identity. */
+       usbif->domid = domid;
+       usbif->handle = handle;
+
+       /* Ring locks, reference count and wait queues. */
+       spin_lock_init(&usbif->urb_ring_lock);
+       spin_lock_init(&usbif->conn_ring_lock);
+       atomic_set(&usbif->refcnt, 0);
+       init_waitqueue_head(&usbif->wq);
+       init_waitqueue_head(&usbif->waiting_to_free);
+
+       /* Stub list and address table. */
+       spin_lock_init(&usbif->stub_lock);
+       INIT_LIST_HEAD(&usbif->stub_list);
+       spin_lock_init(&usbif->addr_lock);
+       for (slot = 0; slot < USB_DEV_ADDR_SIZE; slot++)
+               usbif->addr_table[slot] = NULL;
+
+       /* Publish on the global list. */
+       spin_lock_irqsave(&usbif_list_lock, flags);
+       list_add(&usbif->usbif_list, &usbif_list);
+       spin_unlock_irqrestore(&usbif_list_lock, flags);
+
+       return usbif;
+}
+
+/*
+ * Map the frontend's urb and conn rings and bind its event channel.
+ *
+ * @usbif:         interface to connect
+ * @urb_ring_ref:  grant reference of the urb ring page
+ * @conn_ring_ref: grant reference of the conn ring page
+ * @evtchn:        remote event channel port
+ *
+ * Returns 0 on success (or when usbif->irq is already set), otherwise a
+ * negative error code; on failure any ring mapped so far is unmapped again.
+ */
+int usbif_map(usbif_t *usbif, grant_ref_t urb_ring_ref,
+             grant_ref_t conn_ring_ref, evtchn_port_t evtchn)
+{
+       int err = -ENOMEM;
+       struct vm_struct *area;
+       usbif_urb_sring_t *urb_sring;
+       usbif_conn_sring_t *conn_sring;
+
+       /* A non-zero irq is treated as "already mapped". */
+       if (usbif->irq)
+               return 0;
+
+       area = xenbus_map_ring_valloc(usbif->xbdev, urb_ring_ref);
+       if (IS_ERR(area))
+               return PTR_ERR(area);
+       usbif->urb_ring_area = area;
+       area = xenbus_map_ring_valloc(usbif->xbdev, conn_ring_ref);
+       if (IS_ERR(area)) {
+               err = PTR_ERR(area);
+               goto fail_alloc;
+       }
+       usbif->conn_ring_area = area;
+
+       /* A non-negative return value is the irq number. */
+       err = bind_interdomain_evtchn_to_irqhandler(
+                       usbif->domid, evtchn, usbbk_be_int, 0,
+                       "usbif-backend", usbif);
+       if (err < 0)
+               goto fail_evtchn;
+       usbif->irq = err;
+
+       urb_sring = (usbif_urb_sring_t *) usbif->urb_ring_area->addr;
+       BACK_RING_INIT(&usbif->urb_ring, urb_sring, PAGE_SIZE);
+
+       conn_sring = (usbif_conn_sring_t *) usbif->conn_ring_area->addr;
+       BACK_RING_INIT(&usbif->conn_ring, conn_sring, PAGE_SIZE);
+
+       return 0;
+
+       /* Unwind in reverse order of the mappings above. */
+fail_evtchn:
+       xenbus_unmap_ring_vfree(usbif->xbdev, usbif->conn_ring_area);
+fail_alloc:
+       xenbus_unmap_ring_vfree(usbif->xbdev, usbif->urb_ring_area);
+
+       return err;
+}
+
+/*
+ * Disconnect @usbif from its frontend: stop the xenusbd thread, unlink the
+ * URBs of every attached stub and detach it, wait until the reference count
+ * has dropped to zero, then release the irq and unmap both rings.
+ */
+void usbif_disconnect(usbif_t *usbif)
+{
+       struct usbstub *stub, *tmp;
+       unsigned long flags;
+
+       if (usbif->xenusbd) {
+               kthread_stop(usbif->xenusbd);
+               usbif->xenusbd = NULL;
+       }
+
+       /* _safe iteration: detach_device_without_lock() is called per entry. */
+       spin_lock_irqsave(&usbif->stub_lock, flags);
+       list_for_each_entry_safe(stub, tmp, &usbif->stub_list, dev_list) {
+               usbbk_unlink_urbs(stub);
+               detach_device_without_lock(usbif, stub);
+       }
+       spin_unlock_irqrestore(&usbif->stub_lock, flags);
+
+       /* Sleep until every outstanding reference has been dropped. */
+       wait_event(usbif->waiting_to_free, atomic_read(&usbif->refcnt) == 0);
+
+       if (usbif->irq) {
+               unbind_from_irqhandler(usbif->irq, usbif);
+               usbif->irq = 0;
+       }
+
+       /* urb_ring.sring doubles as the "rings are mapped" marker. */
+       if (usbif->urb_ring.sring) {
+               xenbus_unmap_ring_vfree(usbif->xbdev, usbif->urb_ring_area);
+               xenbus_unmap_ring_vfree(usbif->xbdev, usbif->conn_ring_area);
+               usbif->urb_ring.sring = NULL;
+               usbif->conn_ring.sring = NULL;
+       }
+}
+
+/* Remove @usbif from the global usbif list and free it. */
+void usbif_free(usbif_t *usbif)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&usbif_list_lock, flags);
+       list_del(&usbif->usbif_list);
+       spin_unlock_irqrestore(&usbif_list_lock, flags);
+       kfree(usbif);
+}
diff --git a/drivers/xen/usbback/usbback.c b/drivers/xen/usbback/usbback.c

new file mode 100644 (file)

index 0000000..b604f5d
--- /dev/null
+++ b/drivers/xen/usbback/usbback.c
@@ -0,0 +1,1198 @@
+/*
+ * usbback.c
+ *
+ * Xen USB backend driver
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <xen/balloon.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include "usbback.h"
+
+#if 0
+#include "../../usb/core/hub.h"
+#endif
+
+/* Module parameter "reqs" (permissions 0); defaults to USBIF_BACK_MAX_PENDING_REQS. */
+int usbif_reqs = USBIF_BACK_MAX_PENDING_REQS;
+module_param_named(reqs, usbif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of usbback requests to allocate");
+
+/* Backend-private copy of one request segment's offset and length within its page. */
+struct pending_req_segment {
+       uint16_t offset;
+       uint16_t length;
+};
+
+/*
+ * Backend bookkeeping for one in-flight request.  Entries live in the
+ * pending_reqs array; free ones are chained on pending_free through
+ * free_list, submitted ones on their stub's submitting_list through
+ * urb_list.
+ */
+typedef struct {
+       usbif_t *usbif;
+
+       uint16_t id; /* request id */
+
+       struct usbstub *stub;
+       struct list_head urb_list;
+
+       /* urb */
+       struct urb *urb;
+       void *buffer;
+       dma_addr_t transfer_dma;
+       struct usb_ctrlrequest *setup;
+
+       /* request segments */
+       uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */
+       uint16_t nr_extra_segs; /* number of iso_frame_desc segments (ISO) */
+       struct pending_req_segment *seg;
+
+       struct list_head free_list;
+} pending_req_t;
+
+/* Request pool and the list of currently unused entries (pending_free_lock). */
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static DEFINE_SPINLOCK(pending_free_lock);
+/* URBs queued by usbbk_free_urb() until usbbk_free_urbs() releases them. */
+static LIST_HEAD(pending_urb_free);
+static DEFINE_SPINLOCK(urb_free_lock);
+/* Woken by free_req() when pending_free goes from empty to non-empty. */
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+/* Marks a slot in pending_grant_handles that holds no mapping. */
+#define USBBACK_INVALID_HANDLE (~0)
+
+/* Per-segment pages and grant handles, both indexed by vaddr_pagenr(). */
+static struct page **pending_pages;
+static grant_handle_t *pending_grant_handles;
+
+/*
+ * Index of segment @seg of request @req within pending_pages /
+ * pending_grant_handles: each request owns a run of
+ * USBIF_MAX_SEGMENTS_PER_REQUEST consecutive slots.
+ */
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+       return (req - pending_reqs) * USBIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+/* Kernel virtual address of the page backing segment @seg of request @req. */
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+       struct page *pg = pending_pages[vaddr_pagenr(req, seg)];
+       unsigned long frame = page_to_pfn(pg);
+
+       return (unsigned long)pfn_to_kaddr(frame);
+}
+
+/* Grant handle slot (an lvalue) for segment _seg of request _req. */
+#define pending_handle(_req, _seg) \
+       (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
+/*
+ * Take one entry off the free list.  Returns NULL when the list is empty;
+ * the caller is expected to handle that case.
+ */
+static pending_req_t *alloc_req(void)
+{
+       pending_req_t *picked = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       if (!list_empty(&pending_free)) {
+               picked = list_first_entry(&pending_free, pending_req_t,
+                                         free_list);
+               list_del(&picked->free_list);
+       }
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+
+       return picked;
+}
+
+/*
+ * Return @req to the free list.  Waiters on pending_free_wq are woken only
+ * when the list was empty before, i.e. when this entry is the first one to
+ * become available again.
+ */
+static void free_req(pending_req_t *req)
+{
+       unsigned long flags;
+       int need_wakeup;
+
+       spin_lock_irqsave(&pending_free_lock, flags);
+       need_wakeup = list_empty(&pending_free);
+       list_add(&req->free_list, &pending_free);
+       spin_unlock_irqrestore(&pending_free_lock, flags);
+
+       if (need_wakeup)
+               wake_up(&pending_free_wq);
+}
+
+/* Append @pending_req to @stub's submitting_list under submitting_lock. */
+static inline void add_req_to_submitting_list(struct usbstub *stub, pending_req_t *pending_req)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&stub->submitting_lock, flags);
+       list_add_tail(&pending_req->urb_list, &stub->submitting_list);
+       spin_unlock_irqrestore(&stub->submitting_lock, flags);
+}
+
+/*
+ * Unlink @pending_req from @stub's submitting_list under submitting_lock.
+ * list_del_init() leaves urb_list as an empty, reusable list head.
+ */
+static inline void remove_req_from_submitting_list(struct usbstub *stub, pending_req_t *pending_req)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&stub->submitting_lock, flags);
+       list_del_init(&pending_req->urb_list);
+       spin_unlock_irqrestore(&stub->submitting_lock, flags);
+}
+
+/*
+ * Ask the USB core to unlink every URB currently on @stub's
+ * submitting_list.
+ *
+ * NOTE(review): usbbk_urb_complete() takes submitting_lock through
+ * remove_req_from_submitting_list(); if usb_unlink_urb() can ever run the
+ * completion handler synchronously on this path, that would deadlock on the
+ * lock held here -- confirm against the host controller drivers in use.
+ */
+void usbbk_unlink_urbs(struct usbstub *stub)
+{
+       pending_req_t *req, *tmp;
+       unsigned long flags;
+
+       spin_lock_irqsave(&stub->submitting_lock, flags);
+       list_for_each_entry_safe(req, tmp, &stub->submitting_list, urb_list) {
+               usb_unlink_urb(req->urb);
+       }
+       spin_unlock_irqrestore(&stub->submitting_lock, flags);
+}
+
+/*
+ * Unmap every grant still mapped for @pending_req and free its segment
+ * array.  Slots already marked USBBACK_INVALID_HANDLE are skipped; unmapped
+ * slots are marked invalid.  Does nothing for a request without segments.
+ */
+static void fast_flush_area(pending_req_t *pending_req)
+{
+       struct gnttab_unmap_grant_ref unmap[USBIF_MAX_SEGMENTS_PER_REQUEST];
+       unsigned int seg, total, nr_unmap = 0;
+       int rc;
+
+       total = pending_req->nr_buffer_segs + pending_req->nr_extra_segs;
+       if (!total)
+               return;
+
+       for (seg = 0; seg < total; seg++) {
+               grant_handle_t handle = pending_handle(pending_req, seg);
+
+               if (handle == USBBACK_INVALID_HANDLE)
+                       continue;
+
+               gnttab_set_unmap_op(&unmap[nr_unmap], vaddr(pending_req, seg),
+                                   GNTMAP_host_map, handle);
+               pending_handle(pending_req, seg) = USBBACK_INVALID_HANDLE;
+               nr_unmap++;
+       }
+
+       rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+                                      unmap, nr_unmap);
+       BUG_ON(rc);
+
+       kfree(pending_req->seg);
+}
+
+/*
+ * Scatter the contiguous buffer @buff into the mapped pages of
+ * @pending_req, segments @start .. @start + @nr_pages - 1, using each
+ * segment's recorded offset and length.
+ */
+static void copy_buff_to_pages(void *buff, pending_req_t *pending_req,
+               int start, int nr_pages)
+{
+       void *src = buff;
+       int end = start + nr_pages;
+       int idx;
+
+       for (idx = start; idx < end; idx++) {
+               struct pending_req_segment *seg = &pending_req->seg[idx];
+               void *dst = (void *) vaddr(pending_req, idx) + seg->offset;
+
+               memcpy(dst, src, seg->length);
+               src += seg->length;
+       }
+}
+
+/*
+ * Gather the mapped pages of @pending_req, segments @start ..
+ * @start + @nr_pages - 1, into the contiguous buffer @buff, using each
+ * segment's recorded offset and length.
+ */
+static void copy_pages_to_buff(void *buff, pending_req_t *pending_req,
+               int start, int nr_pages)
+{
+       void *dst = buff;
+       int end = start + nr_pages;
+       int idx;
+
+       for (idx = start; idx < end; idx++) {
+               struct pending_req_segment *seg = &pending_req->seg[idx];
+               void *src = (void *) vaddr(pending_req, idx) + seg->offset;
+
+               memcpy(dst, src, seg->length);
+               dst += seg->length;
+       }
+}
+
+/*
+ * Allocate the resources needed to submit @req: the urb itself (with room
+ * for the requested number of packets on isochronous pipes), a coherent
+ * transfer buffer when buffer_length is non-zero, and a setup packet for
+ * control pipes.  The results are stored in @pending_req.
+ *
+ * Returns 0 on success or -ENOMEM, in which case everything allocated here
+ * has been released again.
+ *
+ * NOTE(review): req->buffer_length is read again on the error path; if @req
+ * points at memory the frontend can still modify, the size passed to
+ * usb_free_coherent() could differ from the allocated one -- confirm.
+ */
+static int usbbk_alloc_urb(usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+       int ret;
+
+       if (usb_pipeisoc(req->pipe))
+               pending_req->urb = usb_alloc_urb(req->u.isoc.number_of_packets, GFP_KERNEL);
+       else
+               pending_req->urb = usb_alloc_urb(0, GFP_KERNEL);
+       if (!pending_req->urb) {
+               pr_err("usbback: can't alloc urb\n");
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       if (req->buffer_length) {
+               pending_req->buffer = usb_alloc_coherent(pending_req->stub->udev,
+                               req->buffer_length, GFP_KERNEL,
+                               &pending_req->transfer_dma);
+               if (!pending_req->buffer) {
+                       pr_err("usbback: can't alloc urb buffer\n");
+                       ret = -ENOMEM;
+                       goto fail_free_urb;
+               }
+       }
+
+       if (usb_pipecontrol(req->pipe)) {
+               pending_req->setup = kmalloc(sizeof(struct usb_ctrlrequest),
+                                            GFP_KERNEL);
+               if (!pending_req->setup) {
+                       pr_err("usbback: can't alloc usb_ctrlrequest\n");
+                       ret = -ENOMEM;
+                       goto fail_free_buffer;
+               }
+       }
+
+       return 0;
+
+       /* Unwind in reverse order of allocation. */
+fail_free_buffer:
+       if (req->buffer_length)
+               usb_free_coherent(pending_req->stub->udev,
+                                 req->buffer_length,
+                                 pending_req->buffer,
+                                 pending_req->transfer_dma);
+fail_free_urb:
+       usb_free_urb(pending_req->urb);
+fail:
+       return ret;
+}
+
+/*
+ * Queue @urb on pending_urb_free instead of freeing it directly; the actual
+ * release happens later in usbbk_free_urbs().  Called from the completion
+ * handler (usbbk_urb_complete).
+ */
+static void usbbk_free_urb(struct urb *urb)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&urb_free_lock, flags);
+       list_add(&urb->urb_list, &pending_urb_free);
+       spin_unlock_irqrestore(&urb_free_lock, flags);
+}
+
+/*
+ * Release everything attached to @urb: the setup packet of a control urb,
+ * the coherent transfer buffer when transfer_buffer_length is non-zero, and
+ * finally the urb itself.
+ */
+static void _usbbk_free_urb(struct urb *urb)
+{
+       if (usb_pipecontrol(urb->pipe))
+               kfree(urb->setup_packet);
+       if (urb->transfer_buffer_length)
+               usb_free_coherent(urb->dev, urb->transfer_buffer_length,
+                                 urb->transfer_buffer, urb->transfer_dma);
+       barrier();
+       usb_free_urb(urb);
+}
+
+/*
+ * Release all URBs that usbbk_free_urb() queued on pending_urb_free.  The
+ * queue is moved to a private list under urb_free_lock and the URBs are
+ * then freed without holding the lock.
+ */
+static void usbbk_free_urbs(void)
+{
+       struct list_head doomed;
+       struct urb *urb, *next;
+       unsigned long flags;
+
+       /* Unlocked peek: nothing queued means nothing to do. */
+       if (list_empty(&pending_urb_free))
+               return;
+
+       INIT_LIST_HEAD(&doomed);
+
+       spin_lock_irqsave(&urb_free_lock, flags);
+       list_splice_init(&pending_urb_free, &doomed);
+       spin_unlock_irqrestore(&urb_free_lock, flags);
+
+       list_for_each_entry_safe(urb, next, &doomed, urb_list) {
+               list_del(&urb->urb_list);
+               _usbbk_free_urb(urb);
+       }
+}
+
+/* Flag that @usbif has requests waiting and wake whoever sleeps on usbif->wq. */
+static void usbbk_notify_work(usbif_t *usbif)
+{
+       usbif->waiting_reqs = 1;
+       wake_up(&usbif->wq);
+}
+
+/* Interrupt handler bound in usbif_map(); @dev_id is the usbif to notify. */
+irqreturn_t usbbk_be_int(int irq, void *dev_id)
+{
+       usbbk_notify_work(dev_id);
+       return IRQ_HANDLED;
+}
+
+/*
+ * Place a response for @pending_req on its usbif's urb ring and notify the
+ * frontend through the interface's irq if the ring macros say a
+ * notification is needed.
+ *
+ * @status, @actual_length, @error_count and @start_frame are copied into
+ * the response as given; the response id is taken from @pending_req.
+ */
+static void usbbk_do_response(pending_req_t *pending_req, int32_t status,
+                                       int32_t actual_length, int32_t error_count, uint16_t start_frame)
+{
+       usbif_t *usbif = pending_req->usbif;
+       usbif_urb_response_t *res;
+       unsigned long flags;
+       int notify;
+
+       /* urb_ring_lock serialises producers of the response ring. */
+       spin_lock_irqsave(&usbif->urb_ring_lock, flags);
+       res = RING_GET_RESPONSE(&usbif->urb_ring, usbif->urb_ring.rsp_prod_pvt);
+       res->id = pending_req->id;
+       res->status = status;
+       res->actual_length = actual_length;
+       res->error_count = error_count;
+       res->start_frame = start_frame;
+       usbif->urb_ring.rsp_prod_pvt++;
+       barrier();
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&usbif->urb_ring, notify);
+       spin_unlock_irqrestore(&usbif->urb_ring_lock, flags);
+
+       if (notify)
+               notify_remote_via_irq(usbif->irq);
+}
+
+/*
+ * URB completion handler; urb->context is the owning pending_req_t.
+ *
+ * Copies received data (IN pipes with a successful, non-empty transfer) and,
+ * for isochronous pipes, the frame descriptors back into the request's
+ * mapped pages, unmaps those pages, sends the response, and then releases
+ * the request: off the stub's submitting list, urb queued for freeing,
+ * usbif reference dropped, pending_req returned to the free list.
+ */
+static void usbbk_urb_complete(struct urb *urb)
+{
+       pending_req_t *pending_req = (pending_req_t *)urb->context;
+
+       if (usb_pipein(urb->pipe) && urb->status == 0 && urb->actual_length > 0)
+               copy_buff_to_pages(pending_req->buffer, pending_req,
+                                       0, pending_req->nr_buffer_segs);
+
+       /* The extra segments follow the buffer segments in the seg array. */
+       if (usb_pipeisoc(urb->pipe))
+               copy_buff_to_pages(&urb->iso_frame_desc[0], pending_req,
+                                       pending_req->nr_buffer_segs, pending_req->nr_extra_segs);
+
+       barrier();
+
+       /* The copies above must be done before the pages are unmapped. */
+       fast_flush_area(pending_req);
+
+       usbbk_do_response(pending_req, urb->status, urb->actual_length,
+                                       urb->error_count, urb->start_frame);
+
+       remove_req_from_submitting_list(pending_req->stub, pending_req);
+
+       barrier();
+       usbbk_free_urb(urb);
+       usbif_put(pending_req->usbif);
+       /* pending_req must not be touched after this point. */
+       free_req(pending_req);
+}
+
+/*
+ * Map the grant references of @req into the pages reserved for
+ * @pending_req and record each segment's offset/length in
+ * pending_req->seg.
+ *
+ * The segment counts are taken from @pending_req (nr_buffer_segs followed by
+ * nr_extra_segs), never re-read from @req, so that the bounds check, the
+ * map[] setup and the hypercall all agree on the same numbers.  Buffer
+ * segments of OUT pipes are mapped read-only; extra segments are always
+ * mapped writable.
+ *
+ * Returns 0 on success, -EINVAL when the segment count exceeds
+ * USBIF_MAX_SEGMENTS_PER_REQUEST, and -ENOMEM when the segment array cannot
+ * be allocated or a mapping / segment description is invalid (in which case
+ * everything mapped so far has been unmapped again via fast_flush_area()).
+ */
+static int usbbk_gnttab_map(usbif_t *usbif,
+                       usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+       int i, ret;
+       unsigned int nr_segs;
+       uint32_t flags;
+       struct gnttab_map_grant_ref map[USBIF_MAX_SEGMENTS_PER_REQUEST];
+
+       nr_segs = pending_req->nr_buffer_segs + pending_req->nr_extra_segs;
+
+       if (nr_segs > USBIF_MAX_SEGMENTS_PER_REQUEST) {
+               pr_err("Bad number of segments in request\n");
+               ret = -EINVAL;
+               goto fail;
+       }
+
+       if (nr_segs) {
+               pending_req->seg = kmalloc(sizeof(struct pending_req_segment)
+                               * nr_segs, GFP_KERNEL);
+               if (!pending_req->seg) {
+                       ret = -ENOMEM;
+                       goto fail;
+               }
+
+               if (pending_req->nr_buffer_segs) {
+                       flags = GNTMAP_host_map;
+                       if (usb_pipeout(req->pipe))
+                               flags |= GNTMAP_readonly;
+                       for (i = 0; i < pending_req->nr_buffer_segs; i++)
+                               gnttab_set_map_op(&map[i], vaddr(
+                                               pending_req, i), flags,
+                                               req->seg[i].gref,
+                                               usbif->domid);
+               }
+
+               if (pending_req->nr_extra_segs) {
+                       flags = GNTMAP_host_map;
+                       /*
+                        * Start from the cached count, not req->nr_buffer_segs:
+                        * otherwise a disagreement between the two would leave
+                        * map[] entries uninitialised for the hypercall below.
+                        */
+                       for (i = pending_req->nr_buffer_segs; i < nr_segs; i++)
+                               gnttab_set_map_op(&map[i], vaddr(
+                                               pending_req, i), flags,
+                                               req->seg[i].gref,
+                                               usbif->domid);
+               }
+
+               ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+                                       map, nr_segs);
+               BUG_ON(ret);
+
+               for (i = 0; i < nr_segs; i++) {
+                       /* Make sure that none of the map ops failed with GNTST_eagain */
+                       if (unlikely(map[i].status == GNTST_eagain))
+                               gnttab_check_GNTST_eagain_while(GNTTABOP_map_grant_ref, &map[i]);
+
+                       if (unlikely(map[i].status != GNTST_okay)) {
+                               pr_err("usbback: invalid buffer -- could not remap it\n");
+                               map[i].handle = USBBACK_INVALID_HANDLE;
+                               ret |= 1;
+                       }
+
+                       /* Always record the handle so fail_flush can unmap it. */
+                       pending_handle(pending_req, i) = map[i].handle;
+
+                       if (ret)
+                               continue;
+
+                       set_phys_to_machine(__pa(vaddr(
+                               pending_req, i)) >> PAGE_SHIFT,
+                               FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
+
+                       /* Take a private copy before validating it. */
+                       pending_req->seg[i].offset = req->seg[i].offset;
+                       pending_req->seg[i].length = req->seg[i].length;
+
+                       barrier();
+
+                       if (pending_req->seg[i].offset >= PAGE_SIZE ||
+                                       pending_req->seg[i].length > PAGE_SIZE ||
+                                       pending_req->seg[i].offset + pending_req->seg[i].length > PAGE_SIZE)
+                                       ret |= 1;
+               }
+
+               if (ret)
+                       goto fail_flush;
+       }
+
+       return 0;
+
+fail_flush:
+       fast_flush_area(pending_req);
+       ret = -ENOMEM;
+
+fail:
+       return ret;
+}
+
+/*
+ * Fill in pending_req->urb from the frontend request.
+ *
+ * The pipe is rebuilt for the backend's usb_device from the transfer type,
+ * direction and endpoint encoded in req->pipe; buffer, length, completion
+ * handler and the per-type fields are then set.  When the request carries a
+ * data buffer, the urb is pointed at pending_req->transfer_dma and flagged
+ * URB_NO_TRANSFER_DMA_MAP.
+ */
+static void usbbk_init_urb(usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+       struct urb *urb = pending_req->urb;
+       struct usb_device *udev = pending_req->stub->udev;
+       unsigned int pipe;
+
+       switch (usb_pipetype(req->pipe)) {
+       case PIPE_ISOCHRONOUS:
+               pipe = usb_pipein(req->pipe)
+                       ? usb_rcvisocpipe(udev, usb_pipeendpoint(req->pipe))
+                       : usb_sndisocpipe(udev, usb_pipeendpoint(req->pipe));
+
+               /* isochronous urbs are filled in field by field */
+               urb->dev = udev;
+               urb->pipe = pipe;
+               urb->transfer_flags = req->transfer_flags | URB_ISO_ASAP;
+               urb->transfer_buffer = pending_req->buffer;
+               urb->transfer_buffer_length = req->buffer_length;
+               urb->complete = usbbk_urb_complete;
+               urb->context = pending_req;
+               urb->interval = req->u.isoc.interval;
+               urb->start_frame = req->u.isoc.start_frame;
+               urb->number_of_packets = req->u.isoc.number_of_packets;
+               break;
+
+       case PIPE_INTERRUPT:
+               pipe = usb_pipein(req->pipe)
+                       ? usb_rcvintpipe(udev, usb_pipeendpoint(req->pipe))
+                       : usb_sndintpipe(udev, usb_pipeendpoint(req->pipe));
+
+               usb_fill_int_urb(urb, udev, pipe,
+                                pending_req->buffer, req->buffer_length,
+                                usbbk_urb_complete,
+                                pending_req, req->u.intr.interval);
+               /*
+                * The frontend has already encoded the interval (high speed
+                * interrupt endpoints use a logarithmic encoding), and
+                * usb_fill_int_urb() encodes whatever it is given, so at this
+                * point urb->interval holds a doubly encoded value.  Put the
+                * frontend's value back.
+                */
+               urb->interval = req->u.intr.interval;
+               urb->transfer_flags = req->transfer_flags;
+               break;
+
+       case PIPE_CONTROL:
+               /* control transfers always target endpoint 0 */
+               pipe = usb_pipein(req->pipe)
+                       ? usb_rcvctrlpipe(udev, 0)
+                       : usb_sndctrlpipe(udev, 0);
+
+               usb_fill_control_urb(urb, udev, pipe,
+                                    (unsigned char *) pending_req->setup,
+                                    pending_req->buffer, req->buffer_length,
+                                    usbbk_urb_complete, pending_req);
+               /* the 8-byte setup packet travels inside the request */
+               memcpy(pending_req->setup, req->u.ctrl, 8);
+               urb->transfer_flags = req->transfer_flags;
+               break;
+
+       case PIPE_BULK:
+               pipe = usb_pipein(req->pipe)
+                       ? usb_rcvbulkpipe(udev, usb_pipeendpoint(req->pipe))
+                       : usb_sndbulkpipe(udev, usb_pipeendpoint(req->pipe));
+
+               usb_fill_bulk_urb(urb, udev, pipe,
+                                 pending_req->buffer, req->buffer_length,
+                                 usbbk_urb_complete, pending_req);
+               urb->transfer_flags = req->transfer_flags;
+               break;
+
+       default:
+               break;
+       }
+
+       if (req->buffer_length != 0) {
+               urb->transfer_dma = pending_req->transfer_dma;
+               urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+       }
+}
+
+/*
+ * Arguments of a deferred SET_INTERFACE, handed from usbbk_set_interface()
+ * to usbbk_set_interface_work(), which also frees this structure.
+ */
+struct set_interface_request {
+       pending_req_t *pending_req;     /* request answered and freed by the work handler */
+       int interface;                  /* passed to usb_set_interface() */
+       int alternate;                  /* passed to usb_set_interface() */
+       struct work_struct work;        /* queued with schedule_work() */
+};
+
+/*
+ * Work handler for a deferred SET_INTERFACE.
+ *
+ * Selects the alternate setting with the device locked, drops the device
+ * reference taken when the work was queued, then answers the frontend and
+ * releases the pending request, the usbif reference and the work item.
+ */
+static void usbbk_set_interface_work(struct work_struct *arg)
+{
+       struct set_interface_request *sir =
+               container_of(arg, struct set_interface_request, work);
+       pending_req_t *pending_req = sir->pending_req;
+       struct usb_device *udev = pending_req->stub->udev;
+       int status;
+
+       usb_lock_device(udev);
+       status = usb_set_interface(udev, sir->interface, sir->alternate);
+       usb_unlock_device(udev);
+       usb_put_dev(udev);
+
+       usbbk_do_response(pending_req, status, 0, 0, 0);
+       usbif_put(pending_req->usbif);
+       free_req(pending_req);
+       kfree(sir);
+}
+
+/*
+ * Queue a SET_INTERFACE for execution from a work item.
+ *
+ * Returns 0 once the work is scheduled, or -ENOMEM if the work item cannot
+ * be allocated (nothing is queued in that case).  A reference on the USB
+ * device is taken here and dropped by usbbk_set_interface_work().
+ */
+static int usbbk_set_interface(pending_req_t *pending_req, int interface, int alternate)
+{
+       struct usb_device *udev = pending_req->stub->udev;
+       struct set_interface_request *sir = kmalloc(sizeof(*sir), GFP_KERNEL);
+
+       if (sir == NULL)
+               return -ENOMEM;
+
+       sir->alternate = alternate;
+       sir->interface = interface;
+       sir->pending_req = pending_req;
+       INIT_WORK(&sir->work, usbbk_set_interface_work);
+
+       usb_get_dev(udev);
+       schedule_work(&sir->work);
+
+       return 0;
+}
+
+/*
+ * Arguments of a deferred CLEAR_FEATURE(ENDPOINT_HALT), handed from
+ * usbbk_clear_halt() to usbbk_clear_halt_work(), which also frees this
+ * structure.
+ */
+struct clear_halt_request {
+       pending_req_t *pending_req;     /* request answered and freed by the work handler */
+       int pipe;                       /* passed to usb_clear_halt() */
+       struct work_struct work;        /* queued with schedule_work() */
+};
+
+/*
+ * Work handler for a deferred clear-halt.
+ *
+ * Clears the halt on the recorded pipe with the device locked, drops the
+ * device reference taken when the work was queued, then answers the frontend
+ * and releases the pending request, the usbif reference and the work item.
+ */
+static void usbbk_clear_halt_work(struct work_struct *arg)
+{
+       struct clear_halt_request *chr =
+               container_of(arg, struct clear_halt_request, work);
+       pending_req_t *pending_req = chr->pending_req;
+       struct usb_device *udev = pending_req->stub->udev;
+       int status;
+
+       usb_lock_device(udev);
+       status = usb_clear_halt(udev, chr->pipe);
+       usb_unlock_device(udev);
+       usb_put_dev(udev);
+
+       usbbk_do_response(pending_req, status, 0, 0, 0);
+       usbif_put(pending_req->usbif);
+       free_req(pending_req);
+       kfree(chr);
+}
+
+/*
+ * Queue a clear-halt on @pipe for execution from a work item.
+ *
+ * Returns 0 once the work is scheduled, or -ENOMEM if the work item cannot
+ * be allocated (nothing is queued in that case).  A reference on the USB
+ * device is taken here and dropped by usbbk_clear_halt_work().
+ */
+static int usbbk_clear_halt(pending_req_t *pending_req, int pipe)
+{
+       struct usb_device *udev = pending_req->stub->udev;
+       struct clear_halt_request *chr = kmalloc(sizeof(*chr), GFP_KERNEL);
+
+       if (chr == NULL)
+               return -ENOMEM;
+
+       chr->pipe = pipe;
+       chr->pending_req = pending_req;
+       INIT_WORK(&chr->work, usbbk_clear_halt_work);
+
+       usb_get_dev(udev);
+       schedule_work(&chr->work);
+
+       return 0;
+}
+
+/*
+ * Deferred port reset, compiled out with "#if 0".  Its only caller, the
+ * USB_REQ_SET_FEATURE case in check_and_submit_special_ctrlreq(), is
+ * compiled out as well and marked "not tested yet".
+ */
+#if 0
+struct port_reset_request {
+       pending_req_t *pending_req;
+       struct work_struct work;
+};
+
+static void usbbk_port_reset_work(struct work_struct *arg)
+{
+       struct port_reset_request *req
+               = container_of(arg, struct port_reset_request, work);
+       pending_req_t *pending_req = req->pending_req;
+       struct usb_device *udev = pending_req->stub->udev;
+       int ret, ret_lock;
+
+       ret = ret_lock = usb_lock_device_for_reset(udev, NULL);
+       if (ret_lock >= 0) {
+               ret = usb_reset_device(udev);
+               if (ret_lock)
+                       usb_unlock_device(udev);
+       }
+       usb_put_dev(udev);
+
+       usbbk_do_response(pending_req, ret, 0, 0, 0);
+       usbif_put(pending_req->usbif);
+       free_req(pending_req);
+       kfree(req);
+}
+
+static int usbbk_port_reset(pending_req_t *pending_req)
+{
+       struct port_reset_request *req;
+       struct usb_device *udev = pending_req->stub->udev;
+
+       req = kmalloc(sizeof(*req), GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
+
+       req->pending_req = pending_req;
+       INIT_WORK(&req->work, usbbk_port_reset_work);
+
+       usb_get_dev(udev);
+       schedule_work(&req->work);
+       return 0;
+}
+#endif
+
+/*
+ * Re-home @stub in usbif->addr_table: the slot for @cur_addr is cleared and
+ * the slot for @new_addr is pointed at @stub, where an address of 0 means
+ * "no slot" on either side.  @new_addr is also recorded in stub->addr.
+ * The whole update runs under usbif->addr_lock.  Neither address is range
+ * checked here.
+ */
+static void usbbk_set_address(usbif_t *usbif, struct usbstub *stub, int cur_addr, int new_addr)
+{
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&usbif->addr_lock, irqflags);
+
+       if (cur_addr != 0)
+               usbif->addr_table[cur_addr] = NULL;
+       if (new_addr != 0)
+               usbif->addr_table[new_addr] = stub;
+       stub->addr = new_addr;
+
+       spin_unlock_irqrestore(&usbif->addr_lock, irqflags);
+}
+
+/*
+ * Look up the stub attached to virtual port @portnum of @usbif.
+ *
+ * Walks usbif->stub_list under usbif->stub_lock and returns the first stub
+ * whose port id carries @portnum, or NULL if there is none.  The lock is
+ * released before returning.
+ */
+struct usbstub *find_attached_device(usbif_t *usbif, int portnum)
+{
+       struct usbstub *cursor;
+       struct usbstub *match = NULL;
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&usbif->stub_lock, irqflags);
+       list_for_each_entry(cursor, &usbif->stub_list, dev_list) {
+               if (cursor->portid->portnum != portnum)
+                       continue;
+               match = cursor;
+               break;
+       }
+       spin_unlock_irqrestore(&usbif->stub_lock, irqflags);
+
+       return match;
+}
+
+/*
+ * Service an unlink request from the frontend.
+ *
+ * The target stub is found by port number when the pipe's device number is
+ * 0, otherwise through usbif->addr_table.  The stub's submitting list is then
+ * searched for the request whose id matches req->u.unlink.unlink_id and, if
+ * found, its urb is unlinked.  In every case the frontend is answered and
+ * the pending request and the usbif reference are released here.
+ */
+static void process_unlink_req(usbif_t *usbif,
+               usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+       const int devnum = usb_pipedevice(req->pipe);
+       pending_req_t *cursor = NULL;
+       unsigned long irqflags;
+       int status = 0;
+
+       if (likely(devnum != 0)) {
+               if (unlikely(!usbif->addr_table[devnum])) {
+                       status = -ENODEV;
+                       goto respond;
+               }
+               pending_req->stub = usbif->addr_table[devnum];
+       } else {
+               pending_req->stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
+               if (unlikely(!pending_req->stub)) {
+                       status = -ENODEV;
+                       goto respond;
+               }
+       }
+
+       spin_lock_irqsave(&pending_req->stub->submitting_lock, irqflags);
+       list_for_each_entry(cursor, &pending_req->stub->submitting_list, urb_list) {
+               if (cursor->id != req->u.unlink.unlink_id)
+                       continue;
+               status = usb_unlink_urb(cursor->urb);
+               break;
+       }
+       spin_unlock_irqrestore(&pending_req->stub->submitting_lock, irqflags);
+
+respond:
+       usbbk_do_response(pending_req, status, 0, 0, 0);
+       usbif_put(usbif);
+       free_req(pending_req);
+}
+
+static int check_and_submit_special_ctrlreq(usbif_t *usbif,
+               usbif_urb_request_t *req, pending_req_t *pending_req)
+{
+       int devnum;
+       struct usbstub *stub = NULL;
+       struct usb_ctrlrequest *ctrl = (struct usb_ctrlrequest *) req->u.ctrl;
+       int ret;
+       int done = 0;
+
+       devnum = usb_pipedevice(req->pipe);
+
+       /*
+        * When the device is first connected or reseted, USB device has no address.
+        * In this initial state, following requests are send to device address (#0),
+        *
+        *  1. GET_DESCRIPTOR (with Descriptor Type is "DEVICE") is send,
+        *     and OS knows what device is connected to.
+        *
+        *  2. SET_ADDRESS is send, and then, device has its address.
+        *
+        * In the next step, SET_CONFIGURATION is send to addressed device, and then,
+        * the device is finally ready to use.
+        */
+       if (unlikely(devnum == 0)) {
+               stub = find_attached_device(usbif, usbif_pipeportnum(req->pipe));
+               if (unlikely(!stub)) {
+                       ret = -ENODEV;
+                       goto fail_response;
+               }
+
+               switch (ctrl->bRequest) {
+               case USB_REQ_GET_DESCRIPTOR:
+                       /*
+                        * GET_DESCRIPTOR request to device #0.
+                        * through to normal urb transfer.
+                        */
+                       pending_req->stub = stub;
+                       return 0;
+                       break;
+               case USB_REQ_SET_ADDRESS:
+                       /*
+                        * SET_ADDRESS request to device #0.
+                        * add attached device to addr_table.
+                        */
+                       {
+                               __u16 addr = le16_to_cpu(ctrl->wValue);
+                               usbbk_set_address(usbif, stub, 0, addr);
+                       }
+                       ret = 0;
+                       goto fail_response;
+                       break;
+               default:
+                       ret = -EINVAL;
+                       goto fail_response;
+               }
+       } else {
+               if (unlikely(!usbif->addr_table[devnum])) {
+                       ret = -ENODEV;
+                       goto fail_response;
+               }
+               pending_req->stub = usbif->addr_table[devnum];
+       }
+
+       /*
+        * Check special request
+        */
+       switch (ctrl->bRequest) {
+       case USB_REQ_SET_ADDRESS:
+               /*
+                * SET_ADDRESS request to addressed device.
+                * change addr or remove from addr_table.
+                */
+               {
+                       __u16 addr = le16_to_cpu(ctrl->wValue);
+                       usbbk_set_address(usbif, stub, devnum, addr);
+               }
+               ret = 0;
+               goto fail_response;
+               break;
+#if 0
+       case USB_REQ_SET_CONFIGURATION:
+               /*
+                * linux 2.6.27 or later version only!
+                */
+               if (ctrl->RequestType == USB_RECIP_DEVICE) {
+                       __u16 config = le16_to_cpu(ctrl->wValue);
+                       usb_driver_set_configuration(pending_req->stub->udev, config);
+                       done = 1;
+               }
+               break;
+#endif
+       case USB_REQ_SET_INTERFACE:
+               if (ctrl->bRequestType == USB_RECIP_INTERFACE) {
+                       __u16 alt = le16_to_cpu(ctrl->wValue);
+                       __u16 intf = le16_to_cpu(ctrl->wIndex);
+                       usbbk_set_interface(pending_req, intf, alt);
+                       done = 1;
+               }
+               break;
+       case USB_REQ_CLEAR_FEATURE:
+               if (ctrl->bRequestType == USB_RECIP_ENDPOINT
+                       && ctrl->wValue == USB_ENDPOINT_HALT) {
+                       int pipe;
+                       int ep = le16_to_cpu(ctrl->wIndex) & 0x0f;
+                       int dir = le16_to_cpu(ctrl->wIndex)
+                                       & USB_DIR_IN;
+                       if (dir)
+                               pipe = usb_rcvctrlpipe(pending_req->stub->udev, ep);
+                       else
+                               pipe = usb_sndctrlpipe(pending_req->stub->udev, ep);
+                       usbbk_clear_halt(pending_req, pipe);
+                       done = 1;
+               }
+               break;
+#if 0 /* not tested yet */
+       case USB_REQ_SET_FEATURE:
+               if (ctrl->bRequestType == USB_RT_PORT) {
+                       __u16 feat = le16_to_cpu(ctrl->wValue);
+                       if (feat == USB_PORT_FEAT_RESET) {
+                               usbbk_port_reset(pending_req);
+                               done = 1;
+                       }
+               }
+               break;
+#endif
+       default:
+               break;
+       }
+
+       return done;
+
+fail_response:
+       usbbk_do_response(pending_req, ret, 0, 0, 0);
+       usbif_put(usbif);
+       free_req(pending_req);
+       return 1;
+}
+
+/*
+ * Turn one frontend request into a submitted urb.
+ *
+ * A reference on @usbif is taken up front.  Unlink requests and the control
+ * requests consumed by check_and_submit_special_ctrlreq() are finished by
+ * those helpers.  Everything else goes through: urb allocation, insertion
+ * into the stub's submitting list, urb initialisation, grant mapping, copying
+ * of outgoing data (and of the isochronous frame descriptors), and finally
+ * usb_submit_urb().
+ *
+ * On any failure the steps already taken are undone, the frontend is
+ * answered with an error, and both the usbif reference and @pending_req are
+ * released.  On success this function returns without dropping the usbif
+ * reference or freeing @pending_req.
+ */
+static void dispatch_request_to_pending_reqs(usbif_t *usbif,
+               usbif_urb_request_t *req,
+               pending_req_t *pending_req)
+{
+       int ret;
+
+       pending_req->id = req->id;
+       pending_req->usbif = usbif;
+
+       barrier();
+
+       usbif_get(usbif);
+
+       /* unlink request */
+       if (unlikely(usbif_pipeunlink(req->pipe))) {
+               process_unlink_req(usbif, req, pending_req);
+               return;
+       }
+
+       if (usb_pipecontrol(req->pipe)) {
+               /* non-zero: the helper has consumed the request */
+               if (check_and_submit_special_ctrlreq(usbif, req, pending_req))
+                       return;
+       } else {
+               int devnum = usb_pipedevice(req->pipe);
+               if (unlikely(!usbif->addr_table[devnum])) {
+                       ret = -ENODEV;
+                       goto fail_response;
+               }
+               pending_req->stub = usbif->addr_table[devnum];
+       }
+
+       barrier();
+
+       ret = usbbk_alloc_urb(req, pending_req);
+       if (ret) {
+               /* every failure from here on is reported as -ESHUTDOWN */
+               ret = -ESHUTDOWN;
+               goto fail_response;
+       }
+
+       add_req_to_submitting_list(pending_req->stub, pending_req);
+
+       barrier();
+
+       usbbk_init_urb(req, pending_req);
+
+       barrier();
+
+       /* isochronous requests carry extra segments for the frame descriptors */
+       pending_req->nr_buffer_segs = req->nr_buffer_segs;
+       if (usb_pipeisoc(req->pipe))
+               pending_req->nr_extra_segs = req->u.isoc.nr_frame_desc_segs;
+       else
+               pending_req->nr_extra_segs = 0;
+
+       barrier();
+
+       ret = usbbk_gnttab_map(usbif, req, pending_req);
+       if (ret) {
+               pr_err("usbback: invalid buffer\n");
+               ret = -ESHUTDOWN;
+               /* skips fast_flush_area(): nothing is left mapped to flush */
+               goto fail_free_urb;
+       }
+
+       barrier();
+
+       /* OUT transfers: bring the payload from the granted pages into the buffer */
+       if (usb_pipeout(req->pipe) && req->buffer_length)
+               copy_pages_to_buff(pending_req->buffer,
+                                       pending_req,
+                                       0,
+                                       pending_req->nr_buffer_segs);
+       /* the frame descriptors live in the segments after the data segments */
+       if (usb_pipeisoc(req->pipe)) {
+               copy_pages_to_buff(&pending_req->urb->iso_frame_desc[0],
+                       pending_req,
+                       pending_req->nr_buffer_segs,
+                       pending_req->nr_extra_segs);
+       }
+
+       barrier();
+
+       ret = usb_submit_urb(pending_req->urb, GFP_KERNEL);
+       if (ret) {
+               pr_err("usbback: failed submitting urb, error %d\n", ret);
+               ret = -ESHUTDOWN;
+               goto fail_flush_area;
+       }
+       return;
+
+       /* unwind in reverse order of the setup above */
+fail_flush_area:
+       fast_flush_area(pending_req);
+fail_free_urb:
+       remove_req_from_submitting_list(pending_req->stub, pending_req);
+       barrier();
+       usbbk_free_urb(pending_req->urb);
+fail_response:
+       usbbk_do_response(pending_req, ret, 0, 0, 0);
+       usbif_put(usbif);
+       free_req(pending_req);
+}
+
+/*
+ * Drain the urb request ring of @usbif.
+ *
+ * Each request is copied out of the shared ring into a private buffer before
+ * it is looked at: the ring page is writable by the frontend at any time, so
+ * validating and using a request in place would let the frontend change it
+ * between the check and the use.
+ *
+ * Returns non-zero if requests are still waiting on the ring when this
+ * function stops (no free pending_req, ring overflow, or new requests that
+ * arrived meanwhile), 0 otherwise.
+ */
+static int usbbk_start_submit_urb(usbif_t *usbif)
+{
+       usbif_urb_back_ring_t *urb_ring = &usbif->urb_ring;
+       usbif_urb_request_t req;
+       pending_req_t *pending_req;
+       RING_IDX rc, rp;
+       int more_to_do = 0;
+
+       rc = urb_ring->req_cons;
+       rp = urb_ring->sring->req_prod;
+       /* read the producer index before reading any request it covers */
+       rmb();
+
+       while (rc != rp) {
+               if (RING_REQUEST_CONS_OVERFLOW(urb_ring, rc)) {
+                       pr_warning("RING_REQUEST_CONS_OVERFLOW\n");
+                       break;
+               }
+
+               pending_req = alloc_req();
+               if (NULL == pending_req) {
+                       more_to_do = 1;
+                       break;
+               }
+
+               /*
+                * Snapshot the request; everything below works on the copy.
+                * NOTE(review): the callees visible in this file only use
+                * the request during the call; usbbk_alloc_urb() and
+                * usbbk_gnttab_map() should be confirmed not to keep the
+                * pointer.
+                */
+               memcpy(&req, RING_GET_REQUEST(urb_ring, rc), sizeof(req));
+               urb_ring->req_cons = ++rc;
+               /* do not let the compiler go back to the shared slot */
+               barrier();
+
+               dispatch_request_to_pending_reqs(usbif, &req,
+                                                       pending_req);
+       }
+
+       RING_FINAL_CHECK_FOR_REQUESTS(&usbif->urb_ring, more_to_do);
+
+       cond_resched();
+
+       return more_to_do;
+}
+
+/*
+ * Report a port status change to the frontend over the conn ring.
+ *
+ * One request slot is consumed (only its id is used), a response carrying
+ * that id together with @portnum and @speed is produced and pushed, and the
+ * frontend is notified through the event channel when the ring macros say a
+ * notification is due.  The ring is manipulated under conn_ring_lock; the
+ * notification itself is sent after the lock has been dropped.
+ */
+void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed)
+{
+       usbif_conn_back_ring_t *ring = &usbif->conn_ring;
+       usbif_conn_response_t *rsp;
+       unsigned long irqflags;
+       int notify;
+       u16 id;
+
+       spin_lock_irqsave(&usbif->conn_ring_lock, irqflags);
+
+       id = RING_GET_REQUEST(ring, ring->req_cons)->id;
+       ring->req_cons++;
+       ring->sring->req_event = ring->req_cons + 1;
+
+       rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt);
+       rsp->id = id;
+       rsp->portnum = portnum;
+       rsp->speed = speed;
+       ring->rsp_prod_pvt++;
+       RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(ring, notify);
+
+       spin_unlock_irqrestore(&usbif->conn_ring_lock, irqflags);
+
+       if (notify != 0)
+               notify_remote_via_irq(usbif->irq);
+}
+
+/*
+ * Thread function of the per-usbif request thread.
+ *
+ * Holds a reference on the usbif for its whole lifetime.  Each iteration
+ * sleeps until requests are flagged as waiting and until a free pending_req
+ * is available (or the thread is asked to stop), clears the flag, drains the
+ * request ring, re-arms the flag if the ring was not emptied, and releases
+ * finished urbs.  On exit it releases finished urbs once more, clears
+ * usbif->xenusbd and drops its reference.  Always returns 0.
+ */
+int usbbk_schedule(void *arg)
+{
+       usbif_t *usbif = arg;
+
+       usbif_get(usbif);
+
+       for (;;) {
+               if (kthread_should_stop())
+                       break;
+
+               wait_event_interruptible(usbif->wq,
+                       usbif->waiting_reqs || kthread_should_stop());
+               wait_event_interruptible(pending_free_wq,
+                       !list_empty(&pending_free) || kthread_should_stop());
+
+               /* clear the flag before looking at the ring */
+               usbif->waiting_reqs = 0;
+               smp_mb();
+
+               if (usbbk_start_submit_urb(usbif) != 0)
+                       usbif->waiting_reqs = 1;
+
+               usbbk_free_urbs();
+       }
+
+       usbbk_free_urbs();
+       usbif->xenusbd = NULL;
+       usbif_put(usbif);
+
+       return 0;
+}
+
+/*
+ * attach usbstub device to usbif.
+ */
+void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub)
+{
+       unsigned long irqflags;
+
+       /* link the stub into the usbif's list of connected devices */
+       spin_lock_irqsave(&usbif->stub_lock, irqflags);
+       list_add(&stub->dev_list, &usbif->stub_list);
+       spin_unlock_irqrestore(&usbif->stub_lock, irqflags);
+
+       /* back pointer is set once the stub is on the list */
+       stub->usbif = usbif;
+}
+
+/*
+ * detach usbstub device from usbif.
+ */
+void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub)
+{
+       unsigned long irqflags;
+
+       /* an addressed stub still owns a slot in addr_table; give it up */
+       if (stub->addr != 0)
+               usbbk_set_address(usbif, stub, stub->addr, 0);
+
+       spin_lock_irqsave(&usbif->stub_lock, irqflags);
+       list_del(&stub->dev_list);
+       spin_unlock_irqrestore(&usbif->stub_lock, irqflags);
+
+       stub->usbif = NULL;
+}
+
+/*
+ * Same as usbbk_detach_device(), except that usbif->stub_lock is not taken
+ * around the list removal.
+ */
+void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub)
+{
+       /* an addressed stub still owns a slot in addr_table; give it up */
+       if (stub->addr != 0)
+               usbbk_set_address(usbif, stub, stub->addr, 0);
+
+       list_del(&stub->dev_list);
+       stub->usbif = NULL;
+}
+
+/*
+ * Module initialisation.
+ *
+ * Allocates the pool of pending requests, the grant handle table and the
+ * empty pages (USBIF_MAX_SEGMENTS_PER_REQUEST of each per request), puts
+ * every pending request on the free list, then brings up the stub driver
+ * and the xenbus backend.
+ *
+ * Returns 0 on success, -ENODEV when not running on Xen, -ENOMEM when an
+ * allocation fails, or the error from usbstub_init()/usbback_xenbus_init().
+ * Everything set up so far is torn down again on failure.
+ */
+static int __init usbback_init(void)
+{
+       int i, mmap_pages;
+       int err = 0;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       mmap_pages = usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST;
+       /* kcalloc() rejects element counts whose byte size would overflow */
+       pending_reqs = kcalloc(usbif_reqs, sizeof(pending_reqs[0]),
+                       GFP_KERNEL);
+       pending_grant_handles = kcalloc(mmap_pages,
+                       sizeof(pending_grant_handles[0]), GFP_KERNEL);
+       pending_pages = alloc_empty_pages_and_pagevec(mmap_pages);
+
+       if (!pending_reqs || !pending_grant_handles || !pending_pages) {
+               err = -ENOMEM;
+               goto out_mem;
+       }
+
+       for (i = 0; i < mmap_pages; i++)
+               pending_grant_handles[i] = USBBACK_INVALID_HANDLE;
+
+       INIT_LIST_HEAD(&pending_free);
+
+       for (i = 0; i < usbif_reqs; i++)
+               list_add_tail(&pending_reqs[i].free_list, &pending_free);
+
+       err = usbstub_init();
+       if (err)
+               goto out_mem;
+
+       err = usbback_xenbus_init();
+       if (err)
+               goto out_xenbus;
+
+       return 0;
+
+out_xenbus:
+       usbstub_exit();
+out_mem:
+       /* all three may be NULL here when an allocation above failed */
+       kfree(pending_reqs);
+       kfree(pending_grant_handles);
+       free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+       return err;
+}
+
+/*
+ * Module teardown: shut down the xenbus backend and the stub driver, then
+ * release the request pool, the grant handle table and the empty pages that
+ * usbback_init() allocated.
+ */
+static void __exit usbback_exit(void)
+{
+       int mmap_pages;
+
+       usbback_xenbus_exit();
+       usbstub_exit();
+
+       kfree(pending_reqs);
+       kfree(pending_grant_handles);
+
+       mmap_pages = usbif_reqs * USBIF_MAX_SEGMENTS_PER_REQUEST;
+       free_empty_pages_and_pagevec(pending_pages, mmap_pages);
+}
+
+module_init(usbback_init);
+module_exit(usbback_exit);
+
+MODULE_AUTHOR("");
+MODULE_DESCRIPTION("Xen USB backend driver (usbback)");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vusb");
diff --git a/drivers/xen/usbback/usbback.h b/drivers/xen/usbback/usbback.h

new file mode 100644 (file)

index 0000000..92d4f4d
--- /dev/null
+++ b/drivers/xen/usbback/usbback.h
@@ -0,0 +1,170 @@
+/*
+ * usbback.h
+ *
+ * This file is part of Xen USB backend driver.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_USBBACK_H__
+#define __XEN_USBBACK_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/usb.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/kref.h>
+#include <xen/xenbus.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/io/usbif.h>
+
+struct usbstub;
+
+#ifndef BUS_ID_SIZE
+#define USBBACK_BUS_ID_SIZE 20
+#else
+#define USBBACK_BUS_ID_SIZE BUS_ID_SIZE
+#endif
+
+#define USB_DEV_ADDR_SIZE 128
+
+/*
+ * Per-frontend state of the USB backend: the two shared rings, the table
+ * and list used to find the attached devices, and the bookkeeping of the
+ * thread that services the urb ring.
+ */
+typedef struct usbif_st {
+       domid_t domid;
+       unsigned int handle;
+       int num_ports;
+       enum usb_spec_version usb_ver;
+
+       struct xenbus_device *xbdev;
+       struct list_head usbif_list;
+
+       unsigned int      irq;          /* used with notify_remote_via_irq() */
+
+       usbif_urb_back_ring_t urb_ring;         /* urb requests, drained by the request thread */
+       usbif_conn_back_ring_t conn_ring;       /* hotplug notifications, see usbbk_hotplug_notify() */
+       struct vm_struct *urb_ring_area;
+       struct vm_struct *conn_ring_area;
+
+       spinlock_t urb_ring_lock;
+       spinlock_t conn_ring_lock;      /* held while usbbk_hotplug_notify() works on conn_ring */
+       atomic_t refcnt;                /* managed with usbif_get()/usbif_put() */
+
+       struct xenbus_watch backend_watch;
+
+       /* device address lookup table, indexed by USB device address */
+       struct usbstub *addr_table[USB_DEV_ADDR_SIZE];
+       spinlock_t addr_lock;           /* held by usbbk_set_address() while updating addr_table */
+
+       /* connected device list (struct usbstub, linked through dev_list) */
+       struct list_head stub_list;
+       spinlock_t stub_lock;           /* held while stub_list is walked or changed */
+
+       /* request schedule */
+       struct task_struct *xenusbd;    /* cleared by usbbk_schedule() when it exits */
+       unsigned int waiting_reqs;      /* non-zero: usbbk_schedule() has ring work to do */
+       wait_queue_head_t waiting_to_free;      /* woken by usbif_put() when refcnt drops to zero */
+       wait_queue_head_t wq;           /* usbbk_schedule() sleeps here until waiting_reqs is set */
+} usbif_t;
+
+/*
+ * Identifies one virtual port.  find_attached_device() matches a stub to a
+ * port through portnum; the lookups declared below (find_portid(),
+ * find_portid_by_busid()) take the other fields as keys.
+ */
+struct vusb_port_id {
+       struct list_head id_list;
+
+       char phys_bus[USBBACK_BUS_ID_SIZE];     /* presumably the physical bus id string -- TODO confirm */
+       domid_t domid;
+       unsigned int handle;
+       int portnum;                    /* virtual port number, compared by find_attached_device() */
+       unsigned is_connected:1;
+};
+
+/*
+ * One USB device handled by the backend, together with its link to the
+ * usbif it is attached to and the list of its requests in flight.
+ */
+struct usbstub {
+       struct kref kref;
+       struct list_head dev_list;      /* entry in usbif->stub_list */
+
+       struct vusb_port_id *portid;    /* virtual port this device sits on */
+       struct usb_device *udev;        /* device the urbs are submitted to */
+       usbif_t *usbif;                 /* set on attach, NULL after detach */
+       int addr;                       /* slot in usbif->addr_table, 0 when unaddressed */
+
+       struct list_head submitting_list;       /* pending requests, linked through urb_list */
+       spinlock_t submitting_lock;     /* held while submitting_list is walked */
+};
+
+usbif_t *usbif_alloc(domid_t domid, unsigned int handle);
+void usbif_disconnect(usbif_t *usbif);
+void usbif_free(usbif_t *usbif);
+int usbif_map(usbif_t *usbif, grant_ref_t urb_ring_ref,
+             grant_ref_t conn_ring_ref, evtchn_port_t);
+
+/*
+ * Reference counting on a usbif.  usbif_get() takes a reference;
+ * usbif_put() drops one and, when the count reaches zero, wakes up whoever
+ * sleeps on waiting_to_free.  Neither macro frees anything itself.
+ */
+#define usbif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define usbif_put(_b) \
+       do { \
+               if (atomic_dec_and_test(&(_b)->refcnt)) \
+                       wake_up(&(_b)->waiting_to_free); \
+       } while (0)
+
+usbif_t *find_usbif(domid_t domid, unsigned int handle);
+int usbback_xenbus_init(void);
+void usbback_xenbus_exit(void);
+struct vusb_port_id *find_portid_by_busid(const char *busid);
+struct vusb_port_id *find_portid(const domid_t domid,
+                                               const unsigned int handle,
+                                               const int portnum);
+int portid_add(const char *busid,
+                                       const domid_t domid,
+                                       const unsigned int handle,
+                                       const int portnum);
+int portid_remove(const domid_t domid,
+                                       const unsigned int handle,
+                                       const int portnum);
+irqreturn_t usbbk_be_int(int irq, void *dev_id);
+int usbbk_schedule(void *arg);
+struct usbstub *find_attached_device(usbif_t *usbif, int port);
+void usbbk_attach_device(usbif_t *usbif, struct usbstub *stub);
+void usbbk_detach_device(usbif_t *usbif, struct usbstub *stub);
+void usbbk_hotplug_notify(usbif_t *usbif, int portnum, int speed);
+void detach_device_without_lock(usbif_t *usbif, struct usbstub *stub);
+void usbbk_unlink_urbs(struct usbstub *stub);
+
+int usbstub_init(void);
+void usbstub_exit(void);
+
+#endif /* __XEN_USBBACK_H__ */
diff --git a/drivers/xen/usbback/usbstub.c b/drivers/xen/usbback/usbstub.c

new file mode 100644 (file)

index 0000000..037755a
--- /dev/null
+++ b/drivers/xen/usbback/usbstub.c
@@ -0,0 +1,324 @@
+/*
+ * usbstub.c
+ *
+ * USB stub driver - grabbing and managing USB devices.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbback.h"
+
+/*
+ * List of struct vusb_port_id entries (linked through id_list).  Every
+ * traversal and modification in this file is done under port_list_lock.
+ */
+static LIST_HEAD(port_list);
+static DEFINE_SPINLOCK(port_list_lock);
+
+/*
+ * Look up the port entry whose phys_bus equals @busid (compared over at
+ * most USBBACK_BUS_ID_SIZE bytes).
+ *
+ * Returns the first matching entry, or NULL if there is none.  The list
+ * lock is released before returning, so the lock does not protect the
+ * returned pointer.
+ */
+struct vusb_port_id *find_portid_by_busid(const char *busid)
+{
+       struct vusb_port_id *cur;
+       struct vusb_port_id *match = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&port_list_lock, flags);
+       list_for_each_entry(cur, &port_list, id_list) {
+               if (strncmp(cur->phys_bus, busid, USBBACK_BUS_ID_SIZE) == 0) {
+                       match = cur;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&port_list_lock, flags);
+
+       return match;
+}
+
+/*
+ * Look up the port entry identified by (@domid, @handle, @portnum).
+ *
+ * Returns the first matching entry, or NULL if there is none.  The list
+ * lock is released before returning, so the lock does not protect the
+ * returned pointer.
+ */
+struct vusb_port_id *find_portid(const domid_t domid,
+                                               const unsigned int handle,
+                                               const int portnum)
+{
+       struct vusb_port_id *cur;
+       struct vusb_port_id *match = NULL;
+       unsigned long flags;
+
+       spin_lock_irqsave(&port_list_lock, flags);
+       list_for_each_entry(cur, &port_list, id_list) {
+               if (cur->domid != domid)
+                       continue;
+               if (cur->handle != handle)
+                       continue;
+               if (cur->portnum != portnum)
+                       continue;
+               match = cur;
+               break;
+       }
+       spin_unlock_irqrestore(&port_list_lock, flags);
+
+       return match;
+}
+
+/*
+ * Allocate a port entry binding physical bus id @busid to the virtual
+ * port (@domid, @handle, @portnum) and add it to port_list.
+ *
+ * @busid is copied into the entry, truncated to USBBACK_BUS_ID_SIZE.
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int portid_add(const char *busid,
+                                       const domid_t domid,
+                                       const unsigned int handle,
+                                       const int portnum)
+{
+       struct vusb_port_id *entry;
+       unsigned long flags;
+
+       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+       if (entry == NULL)
+               return -ENOMEM;
+
+       strlcpy(entry->phys_bus, busid, USBBACK_BUS_ID_SIZE);
+       entry->portnum = portnum;
+       entry->handle = handle;
+       entry->domid = domid;
+
+       /* Publish the fully initialised entry. */
+       spin_lock_irqsave(&port_list_lock, flags);
+       list_add(&entry->id_list, &port_list);
+       spin_unlock_irqrestore(&port_list_lock, flags);
+
+       return 0;
+}
+
+/*
+ * Unlink and free every port entry matching (@domid, @handle, @portnum).
+ *
+ * Returns 0 if at least one entry was removed, -ENOENT otherwise.
+ */
+int portid_remove(const domid_t domid,
+                                       const unsigned int handle,
+                                       const int portnum)
+{
+       struct vusb_port_id *cur, *next;
+       unsigned long flags;
+       int ret = -ENOENT;
+
+       spin_lock_irqsave(&port_list_lock, flags);
+       /* _safe variant: the current entry is freed inside the loop. */
+       list_for_each_entry_safe(cur, next, &port_list, id_list) {
+               if (cur->domid != domid || cur->handle != handle
+                               || cur->portnum != portnum)
+                       continue;
+
+               list_del(&cur->id_list);
+               kfree(cur);
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&port_list_lock, flags);
+
+       return ret;
+}
+
+/*
+ * Allocate a stub for @udev bound to @portid.
+ *
+ * The stub starts with its kref initialised and holds a reference on
+ * @udev taken with usb_get_dev().  Returns NULL if allocation fails.
+ */
+static struct usbstub *usbstub_alloc(struct usb_device *udev,
+                                               struct vusb_port_id *portid)
+{
+       struct usbstub *new_stub = kzalloc(sizeof(*new_stub), GFP_KERNEL);
+
+       if (new_stub == NULL) {
+               pr_err("no memory for usbstub\n");
+               return NULL;
+       }
+
+       kref_init(&new_stub->kref);
+       spin_lock_init(&new_stub->submitting_lock);
+       INIT_LIST_HEAD(&new_stub->submitting_list);
+       new_stub->portid = portid;
+       new_stub->udev = usb_get_dev(udev);
+
+       return new_stub;
+}
+
+/*
+ * kref release callback for a stub: drop the usb_device reference held
+ * by the stub, clear its pointers and free it.
+ */
+static void usbstub_release(struct kref *kref)
+{
+       struct usbstub *dying = container_of(kref, struct usbstub, kref);
+
+       usb_put_dev(dying->udev);
+       dying->udev = NULL;
+       dying->portid = NULL;
+       kfree(dying);
+}
+
+/* Take an additional reference on @stub. */
+static inline void usbstub_get(struct usbstub *stub)
+{
+       kref_get(&stub->kref);
+}
+
+/* Drop a reference on @stub; usbstub_release() runs on the last put. */
+static inline void usbstub_put(struct usbstub *stub)
+{
+       kref_put(&stub->kref, usbstub_release);
+}
+
+/*
+ * USB driver probe callback.
+ *
+ * Claims @intf only when the bus id of its parent device has a port
+ * entry in port_list and the usbif named by that entry exists.  The first
+ * interface of a device allocates a stub, attaches it to the usbif and
+ * sends a hotplug notification; later interfaces of the same device
+ * reuse the stub already attached on that port.
+ *
+ * Returns 0 when the interface is claimed, -ENODEV when it is not ours
+ * (hub, unknown bus id, no usbif, unsupported speed, or the port is
+ * occupied by a different bus id), and -ENOMEM if the stub cannot be
+ * allocated.
+ */
+static int usbstub_probe(struct usb_interface *intf,
+               const struct usb_device_id *id)
+{
+       struct usb_device *udev = interface_to_usbdev(intf);
+       const char *busid = dev_name(intf->dev.parent);
+       struct vusb_port_id *portid = NULL;
+       struct usbstub *stub = NULL;
+       usbif_t *usbif = NULL;
+       int retval = -ENODEV;
+
+       /* hub currently not supported, so skip. */
+       if (udev->descriptor.bDeviceClass ==  USB_CLASS_HUB)
+               goto out;
+
+       portid = find_portid_by_busid(busid);
+       if (!portid)
+               goto out;
+
+       usbif = find_usbif(portid->domid, portid->handle);
+       if (!usbif)
+               goto out;
+
+       /*
+        * Low and full speed are always accepted; high speed only when the
+        * usbif was configured as USB 2.0 or later; anything else is
+        * rejected.
+        */
+       switch (udev->speed) {
+       case USB_SPEED_LOW:
+       case USB_SPEED_FULL:
+               break;
+       case USB_SPEED_HIGH:
+               if (usbif->usb_ver >= USB_VER_USB20)
+                       break;
+               /* fall through */
+       default:
+               goto out;
+       }
+
+       stub = find_attached_device(usbif, portid->portnum);
+       if (!stub) {
+               /* new connection */
+               stub = usbstub_alloc(udev, portid);
+               if (!stub)
+                       return -ENOMEM;
+               usbbk_attach_device(usbif, stub);
+               usbbk_hotplug_notify(usbif, portid->portnum, udev->speed);
+       } else {
+               /* maybe already called and connected by other intf */
+               if (strncmp(stub->portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))
+                       goto out; /* invalid call */
+       }
+
+       /* One stub reference per claimed interface, dropped in disconnect. */
+       usbstub_get(stub);
+       usb_set_intfdata(intf, stub);
+       retval = 0;
+
+out:
+       return retval;
+}
+
+/*
+ * USB driver disconnect callback.
+ *
+ * Clears the interface data and, if a stub was bound, notifies the usbif
+ * that the port is now empty (speed 0), detaches the stub, unlinks its
+ * URBs and drops one stub reference.
+ */
+static void usbstub_disconnect(struct usb_interface *intf)
+{
+       struct usbstub *stub = usb_get_intfdata(intf);
+
+       usb_set_intfdata(intf, NULL);
+       if (stub == NULL)
+               return;
+
+       if (stub->usbif != NULL) {
+               usbbk_hotplug_notify(stub->usbif, stub->portid->portnum, 0);
+               usbbk_detach_device(stub->usbif, stub);
+       }
+
+       usbbk_unlink_urbs(stub);
+       usbstub_put(stub);
+}
+
+/*
+ * sysfs show handler for the driver attribute "port_ids".
+ *
+ * Emits one "<phys_bus>:<domid>:<handle>:<portnum>" line per entry of
+ * port_list, stopping once PAGE_SIZE bytes have been produced.  Returns
+ * the number of bytes written to @buf.
+ */
+static ssize_t usbstub_show_portids(struct device_driver *driver,
+               char *buf)
+{
+       struct vusb_port_id *entry;
+       unsigned long flags;
+       size_t len = 0;
+
+       spin_lock_irqsave(&port_list_lock, flags);
+       list_for_each_entry(entry, &port_list, id_list) {
+               if (len >= PAGE_SIZE)
+                       break;
+               len += scnprintf(buf + len, PAGE_SIZE - len,
+                               "%s:%d:%d:%d\n",
+                               entry->phys_bus,
+                               entry->domid,
+                               entry->handle,
+                               entry->portnum);
+       }
+       spin_unlock_irqrestore(&port_list_lock, flags);
+
+       return len;
+}
+static DRIVER_ATTR(port_ids, S_IRUSR, usbstub_show_portids, NULL);
+
+/*
+ * Device table that matches any USB device; usbstub_probe() then decides
+ * which interfaces are actually claimed.
+ */
+static const struct usb_device_id usbstub_table[] = {
+               { .driver_info = 1 }, /* wildcard, see usb_match_id() */
+               { } /* Terminating entry */
+};
+MODULE_DEVICE_TABLE(usb, usbstub_table);
+
+/*
+ * USB interface driver registered by usbstub_init().  It uses the
+ * wildcard id table above, and no_dynamic_id is set.
+ */
+static struct usb_driver usbback_usb_driver = {
+               .name = "usbback",
+               .probe = usbstub_probe,
+               .disconnect = usbstub_disconnect,
+               .id_table = usbstub_table,
+               .no_dynamic_id = 1,
+};
+
+/*
+ * Register the stub USB driver and create its "port_ids" sysfs file.
+ *
+ * If the sysfs file cannot be created the driver is deregistered again.
+ * Returns 0 on success or the error from the failing step.
+ */
+int __init usbstub_init(void)
+{
+       int ret = usb_register(&usbback_usb_driver);
+
+       if (ret < 0) {
+               pr_err("usbback: usb_register failed (%d)\n", ret);
+               return ret;
+       }
+
+       ret = driver_create_file(&usbback_usb_driver.drvwrap.driver,
+                               &driver_attr_port_ids);
+       if (ret != 0)
+               usb_deregister(&usbback_usb_driver);
+
+       return ret;
+}
+
+/*
+ * Undo usbstub_init(): remove the "port_ids" sysfs file, then deregister
+ * the USB driver.
+ */
+void usbstub_exit(void)
+{
+       driver_remove_file(&usbback_usb_driver.drvwrap.driver,
+                               &driver_attr_port_ids);
+       usb_deregister(&usbback_usb_driver);
+}
diff --git a/drivers/xen/usbback/xenbus.c b/drivers/xen/usbback/xenbus.c

new file mode 100644 (file)

index 0000000..989b5c4
--- /dev/null
+++ b/drivers/xen/usbback/xenbus.c
@@ -0,0 +1,334 @@
+/*
+ * xenbus.c
+ *
+ * Xenbus interface for USB backend driver.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbback.h"
+
+/*
+ * Start the per-interface kernel thread running usbbk_schedule().
+ *
+ * The thread is named "usbback.<domid>.<handle>".  On failure
+ * usbif->xenusbd is reset to NULL, the error is reported through xenbus
+ * and the negative errno is returned; on success 0 is returned.
+ */
+static int start_xenusbd(usbif_t *usbif)
+{
+       int err = 0;
+
+       /*
+        * kthread_run() takes a printf-style name format, so give it the
+        * format and arguments directly instead of a pre-formatted buffer
+        * that would be interpreted as a format string.
+        */
+       usbif->xenusbd = kthread_run(usbbk_schedule, usbif, "usbback.%d.%d",
+                                    usbif->domid, usbif->handle);
+       if (IS_ERR(usbif->xenusbd)) {
+               err = PTR_ERR(usbif->xenusbd);
+               usbif->xenusbd = NULL;
+               xenbus_dev_error(usbif->xbdev, err, "start xenusbd");
+       }
+
+       return err;
+}
+
+/*
+ * Xenbus watch callback for the backend "port" directory.
+ *
+ * Inside one xenbus transaction, reads "port/<n>" for every virtual port
+ * of the interface and brings port_list into line with it:
+ *   - empty bus id: remove the entry for this port, unless it is still
+ *     connected;
+ *   - bus id set, port already configured: it must be the same bus id;
+ *   - bus id set, port not configured: add an entry unless the bus id is
+ *     already used by another port.
+ * The transaction is retried when it ends with -EAGAIN.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+                       const char **vec, unsigned int len)
+{
+       struct xenbus_transaction xbt;
+       int err;
+       int i;
+       char node[16];
+       char *busid;
+       struct vusb_port_id *portid = NULL;
+
+       usbif_t *usbif = container_of(watch, usbif_t, backend_watch);
+       struct xenbus_device *dev = usbif->xbdev;
+
+again:
+       err = xenbus_transaction_start(&xbt);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "starting transaction");
+               return;
+       }
+
+       for (i = 1; i <= usbif->num_ports; i++) {
+               snprintf(node, sizeof(node), "port/%d", i);
+               busid = xenbus_read(xbt, dev->nodename, node, NULL);
+               if (IS_ERR(busid)) {
+                       err = PTR_ERR(busid);
+                       xenbus_dev_fatal(dev, err, "reading port/%d", i);
+                       goto abort;
+               }
+
+               portid = find_portid(usbif->domid, usbif->handle, i);
+
+               if (strlen(busid) == 0) {
+                       /*
+                        * Port is not (or no longer) configured: drop its
+                        * entry unless a device is still connected.
+                        */
+                       if (portid) {
+                               if (portid->is_connected)
+                                       xenbus_dev_fatal(dev, -EBUSY,
+                                               "can't remove port/%d, unbind first", i);
+                               else
+                                       portid_remove(usbif->domid, usbif->handle, i);
+                       }
+               } else if (portid) {
+                       /* Already configured: only the same bus id is fine. */
+                       if (strncmp(portid->phys_bus, busid, USBBACK_BUS_ID_SIZE))
+                               xenbus_dev_fatal(dev, -EEXIST,
+                                       "can't add port/%d, remove first", i);
+               } else if (find_portid_by_busid(busid)) {
+                       /* Bus id is already bound to another virtual port. */
+                       xenbus_dev_fatal(dev, -EBUSY,
+                               "can't add port/%d, busid already used", i);
+               } else {
+                       err = portid_add(busid, usbif->domid, usbif->handle, i);
+                       if (err)
+                               xenbus_dev_error(dev, err,
+                                       "adding port/%d", i);
+               }
+
+               /* xenbus_read() returns an allocated string owned by us. */
+               kfree(busid);
+       }
+
+       err = xenbus_transaction_end(xbt, 0);
+       if (err == -EAGAIN)
+               goto again;
+       if (err)
+               xenbus_dev_fatal(dev, err, "completing transaction");
+
+       return;
+
+abort:
+       xenbus_transaction_end(xbt, 1);
+
+       return;
+}
+
+/*
+ * Xenbus remove callback; also used as the error path of usbback_probe().
+ *
+ * Unregisters the backend watch if one was set up, removes the port
+ * entries of every virtual port of the interface, then disconnects and
+ * frees the usbif and clears the driver data.  Always returns 0.
+ */
+static int usbback_remove(struct xenbus_device *dev)
+{
+       usbif_t *usbif = dev_get_drvdata(&dev->dev);
+       int i;
+
+       /* Nothing to tear down; check before touching any usbif field. */
+       if (!usbif)
+               return 0;
+
+       if (usbif->backend_watch.node) {
+               unregister_xenbus_watch(&usbif->backend_watch);
+               kfree(usbif->backend_watch.node);
+               usbif->backend_watch.node = NULL;
+       }
+
+       /* remove all ports */
+       for (i = 1; i <= usbif->num_ports; i++)
+               portid_remove(usbif->domid, usbif->handle, i);
+       usbif_disconnect(usbif);
+       usbif_free(usbif);
+
+       dev_set_drvdata(&dev->dev, NULL);
+
+       return 0;
+}
+
+/*
+ * Xenbus probe callback for a "vusb" backend device.
+ *
+ * Allocates the usbif for the frontend, reads and validates "num-ports"
+ * and "usb-ver" from the backend node, and switches the device to
+ * InitWait.
+ *
+ * Returns 0 on success, -ENODEV if USB is disabled, -ENOMEM if the usbif
+ * cannot be allocated, -EINVAL for an out-of-range num-ports or an
+ * unknown usb-ver, or the error of the failing xenbus call.  Every
+ * failure after allocation goes through usbback_remove().
+ */
+static int usbback_probe(struct xenbus_device *dev,
+                         const struct xenbus_device_id *id)
+{
+       usbif_t *usbif;
+       unsigned int handle;
+       int num_ports;
+       int usb_ver;
+       int err;
+
+       if (usb_disabled())
+               return -ENODEV;
+
+       /* The handle is the last path component of the frontend node. */
+       handle = simple_strtoul(strrchr(dev->otherend, '/') + 1, NULL, 0);
+       usbif = usbif_alloc(dev->otherend_id, handle);
+       if (!usbif) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating backend interface");
+               return -ENOMEM;
+       }
+       usbif->xbdev = dev;
+       dev_set_drvdata(&dev->dev, usbif);
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                               "num-ports", "%d", &num_ports);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading num-ports");
+               goto fail;
+       }
+       if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
+               /* err still holds 1 from xenbus_scanf(); use a real errno. */
+               err = -EINVAL;
+               xenbus_dev_fatal(dev, err, "invalid num-ports");
+               goto fail;
+       }
+       usbif->num_ports = num_ports;
+
+       err = xenbus_scanf(XBT_NIL, dev->nodename,
+                               "usb-ver", "%d", &usb_ver);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading usb-ver");
+               goto fail;
+       }
+       switch (usb_ver) {
+       case USB_VER_USB11:
+       case USB_VER_USB20:
+               usbif->usb_ver = usb_ver;
+               break;
+       default:
+               /* err still holds 1 from xenbus_scanf(); use a real errno. */
+               err = -EINVAL;
+               xenbus_dev_fatal(dev, err, "invalid usb-ver");
+               goto fail;
+       }
+
+       err = xenbus_switch_state(dev, XenbusStateInitWait);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       usbback_remove(dev);
+       return err;
+}
+
+/*
+ * Read the frontend's ring references and event channel from xenstore
+ * and map them with usbif_map().
+ *
+ * Returns 0 on success; on failure the error is reported as fatal on the
+ * device and returned.
+ */
+static int connect_rings(usbif_t *usbif)
+{
+       struct xenbus_device *xbdev = usbif->xbdev;
+       unsigned int urb_ref, conn_ref, port;
+       int ret;
+
+       ret = xenbus_gather(XBT_NIL, xbdev->otherend,
+                           "urb-ring-ref", "%u", &urb_ref,
+                           "conn-ring-ref", "%u", &conn_ref,
+                           "event-channel", "%u", &port, NULL);
+       if (ret != 0) {
+               xenbus_dev_fatal(xbdev, ret,
+                                "reading %s/ring-ref and event-channel",
+                                xbdev->otherend);
+               return ret;
+       }
+
+       pr_info("usbback: urb-ring-ref %u, conn-ring-ref %u,"
+               " event-channel %u\n",
+               urb_ref, conn_ref, port);
+
+       ret = usbif_map(usbif, urb_ref, conn_ref, port);
+       if (ret != 0)
+               xenbus_dev_fatal(xbdev, ret,
+                               "mapping urb-ring-ref %u conn-ring-ref %u port %u",
+                               urb_ref, conn_ref, port);
+
+       return ret;
+}
+
+/*
+ * Xenbus otherend_changed callback: react to a state change of the
+ * frontend.
+ *
+ * Connected brings the backend up (map rings, start the worker thread,
+ * watch the "port" node) before switching to Connected itself; Closing
+ * disconnects the usbif; Closed and Unknown lead to the device being
+ * unregistered unless it is still online.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+                                    enum xenbus_state frontend_state)
+{
+       usbif_t *usbif = dev_get_drvdata(&dev->dev);
+       int err;
+
+       switch (frontend_state) {
+       case XenbusStateInitialised:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+               /* Nothing to do for these states. */
+               break;
+
+       case XenbusStateInitialising:
+               /* Frontend restarted after we closed: go back to InitWait. */
+               if (dev->state == XenbusStateClosed) {
+                       pr_info("%s: %s: prepare for reconnect\n",
+                               __FUNCTION__, dev->nodename);
+                       xenbus_switch_state(dev, XenbusStateInitWait);
+               }
+               break;
+
+       case XenbusStateConnected:
+               /* Already connected: ignore the repeated notification. */
+               if (dev->state == XenbusStateConnected)
+                       break;
+               /*
+                * Any failing step leaves this case without switching to
+                * Connected; earlier steps are not undone here.
+                */
+               err = connect_rings(usbif);
+               if (err)
+                       break;
+               err = start_xenusbd(usbif);
+               if (err)
+                       break;
+               err = xenbus_watch_path2(dev, dev->nodename, "port",
+                                       &usbif->backend_watch, backend_changed);
+               if (err)
+                       break;
+               xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               usbif_disconnect(usbif);
+               xenbus_switch_state(dev, XenbusStateClosing);
+               break;
+
+       case XenbusStateClosed:
+               xenbus_switch_state(dev, XenbusStateClosed);
+               if (xenbus_dev_is_online(dev))
+                       break;
+               /* fall through if not online */
+       case XenbusStateUnknown:
+               device_unregister(&dev->dev);
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                frontend_state);
+               break;
+       }
+}
+
+/*
+ * Xenbus device types handled by this backend.  The empty-string entry
+ * is last; presumably it is the table terminator expected by xenbus.
+ */
+static const struct xenbus_device_id usbback_ids[] = {
+       { "vusb" },
+       { "" },
+};
+
+/*
+ * Xenbus driver wiring up the probe, remove and frontend-state callbacks.
+ * The macro is expected to define the usbback_driver object that
+ * usbback_xenbus_init() and usbback_xenbus_exit() refer to.
+ */
+static DEFINE_XENBUS_DRIVER(usbback, ,
+       .probe = usbback_probe,
+       .otherend_changed = frontend_changed,
+       .remove = usbback_remove,
+);
+
+/* Register usbback as a xenbus backend driver; returns the result. */
+int __init usbback_xenbus_init(void)
+{
+       return xenbus_register_backend(&usbback_driver);
+}
+
+/* Unregister the usbback xenbus driver. */
+void __exit usbback_xenbus_exit(void)
+{
+       xenbus_unregister_driver(&usbback_driver);
+}
diff --git a/drivers/xen/usbfront/Makefile b/drivers/xen/usbfront/Makefile

new file mode 100644 (file)

index 0000000..034ba96
--- /dev/null
+++ b/drivers/xen/usbfront/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_XEN_USB_FRONTEND) := xen-hcd.o
+
+xen-hcd-y   := usbfront-hcd.o xenbus.o
+
+ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_STATS),y)
+ccflags-y += -DXENHCD_STATS
+endif
+
+ifeq ($(CONFIG_XEN_USB_FRONTEND_HCD_PM),y)
+ccflags-y += -DXENHCD_PM
+endif
diff --git a/drivers/xen/usbfront/usbfront-dbg.c b/drivers/xen/usbfront/usbfront-dbg.c

new file mode 100644 (file)

index 0000000..647e3fe
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-dbg.c
@@ -0,0 +1,101 @@
+/*
+ * usbfront-dbg.c
+ *
+ * Xen USB Virtual Host Controller - debugging
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+static ssize_t show_statistics(struct device *dev,
+                              struct device_attribute *attr, char *buf)
+{
+       struct usb_hcd *hcd;
+       struct usbfront_info *info;
+       unsigned long flags;
+       unsigned temp, size;
+       char *next;
+
+       hcd = dev_get_drvdata(dev);
+       info = hcd_to_info(hcd);
+       next = buf;
+       size = PAGE_SIZE;
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       temp = scnprintf(next, size,
+                       "bus %s, device %s\n"
+                       "%s\n"
+                       "xenhcd, hcd state %d\n",
+                       hcd->self.controller->bus->name,
+                       dev_name(hcd->self.controller),
+                       hcd->product_desc,
+                       hcd->state);
+       size -= temp;
+       next += temp;
+
+#ifdef XENHCD_STATS
+       temp = scnprintf(next, size,
+               "complete %ld unlink %ld ring_full %ld\n",
+               info->stats.complete, info->stats.unlink,
+               info->stats.ring_full);
+       size -= temp;
+       next += temp;
+#endif
+
+       spin_unlock_irqrestore(&info->lock, flags);
+
+       return PAGE_SIZE - size;
+}
+
+static DEVICE_ATTR(statistics, S_IRUGO, show_statistics, NULL);
+
+/*
+ * Create the "statistics" sysfs file on the HCD's controller device.
+ * Failure only produces a warning.
+ */
+static inline void create_debug_file(struct usbfront_info *info)
+{
+       struct device *controller = info_to_hcd(info)->self.controller;
+       int failed = device_create_file(controller, &dev_attr_statistics);
+
+       if (failed)
+               pr_warning("statistics file not created for %s\n",
+                          info_to_hcd(info)->self.bus_name);
+}
+
+/* Remove the "statistics" sysfs file from the HCD's controller device. */
+static inline void remove_debug_file(struct usbfront_info *info)
+{
+       device_remove_file(info_to_hcd(info)->self.controller,
+                          &dev_attr_statistics);
+}
diff --git a/drivers/xen/usbfront/usbfront-hcd.c b/drivers/xen/usbfront/usbfront-hcd.c

new file mode 100644 (file)

index 0000000..83c469b
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-hcd.c
@@ -0,0 +1,232 @@
+/*
+ * usbfront-hcd.c
+ *
+ * Xen USB Virtual Host Controller driver
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbfront.h"
+#include "usbfront-dbg.c"
+#include "usbfront-hub.c"
+#include "usbfront-q.c"
+
+/*
+ * Watchdog timer callback; @param is the usbfront_info pointer.
+ *
+ * While the HC is running, marks the ring watchdog action as done, gives
+ * back unlinked URBs and kicks pending URBs, all under info->lock.
+ */
+static void xenhcd_watchdog(unsigned long param)
+{
+       struct usbfront_info *info = (struct usbfront_info *) param;
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->lock, flags);
+       if (unlikely(!HC_IS_RUNNING(info_to_hcd(info)->state)))
+               goto unlock;
+
+       timer_action_done(info, TIMER_RING_WATCHDOG);
+       xenhcd_giveback_unlinked_urbs(info);
+       xenhcd_kick_pending_urbs(info);
+
+unlock:
+       spin_unlock_irqrestore(&info->lock, flags);
+}
+
+/*
+ * One-time HC initialisation (the hc_driver .reset hook): set up the
+ * lock, the URB lists and the watchdog timer.  Always returns 0.
+ */
+static int xenhcd_setup(struct usb_hcd *hcd)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       struct timer_list *wd = &info->watchdog;
+
+       spin_lock_init(&info->lock);
+
+       INIT_LIST_HEAD(&info->pending_submit_list);
+       INIT_LIST_HEAD(&info->pending_unlink_list);
+       INIT_LIST_HEAD(&info->in_progress_list);
+       INIT_LIST_HEAD(&info->giveback_waiting_list);
+
+       init_timer(wd);
+       wd->function = xenhcd_watchdog;
+       wd->data = (unsigned long) info;
+
+       return 0;
+}
+
+/*
+ * Start the HC running (the hc_driver .start hook).
+ *
+ * Selects the new root-hub polling scheme with polling initially off,
+ * marks the HC as running and creates the debug sysfs file.  Always
+ * returns 0.
+ */
+static int xenhcd_run(struct usb_hcd *hcd)
+{
+       hcd->uses_new_polling = 1;
+       clear_bit(HCD_FLAG_POLL_RH, &hcd->flags);
+       hcd->state = HC_STATE_RUNNING;
+       create_debug_file(hcd_to_info(hcd));
+       return 0;
+}
+
+/*
+ * Stop the running HC (the hc_driver .stop hook).
+ *
+ * The watchdog timer is stopped and the debug file removed first; then,
+ * under info->lock, the HC is marked halted and
+ * xenhcd_cancel_all_enqueued_urbs() and xenhcd_giveback_unlinked_urbs()
+ * are called.
+ */
+static void xenhcd_stop(struct usb_hcd *hcd)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+
+       /* Done before taking info->lock, which the watchdog also takes. */
+       del_timer_sync(&info->watchdog);
+       remove_debug_file(info);
+       spin_lock_irq(&info->lock);
+       /* cancel all urbs */
+       hcd->state = HC_STATE_HALT;
+       xenhcd_cancel_all_enqueued_urbs(info);
+       xenhcd_giveback_unlinked_urbs(info);
+       spin_unlock_irq(&info->lock);
+}
+
+/*
+ * The hc_driver .urb_enqueue hook.
+ *
+ * Allocates the per-URB private data and submits the URB under
+ * info->lock.  A return of 0 is a promise to give the URB back later;
+ * on error the private data is freed and the error returned (-ENOMEM if
+ * the private data cannot be allocated).
+ */
+static int xenhcd_urb_enqueue(struct usb_hcd *hcd,
+                                   struct urb *urb,
+                                   gfp_t mem_flags)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       struct urb_priv *priv;
+       unsigned long flags;
+       int rc;
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       priv = alloc_urb_priv(urb);
+       if (priv != NULL) {
+               priv->status = 1;
+               rc = xenhcd_submit_urb(info, priv);
+               if (rc != 0)
+                       free_urb_priv(priv);
+       } else {
+               rc = -ENOMEM;
+       }
+
+       spin_unlock_irqrestore(&info->lock, flags);
+       return rc;
+}
+
+/*
+ * The hc_driver .urb_dequeue hook.
+ *
+ * Under info->lock, records @status in the URB's private data and asks
+ * xenhcd_unlink_urb() to unlink it.  Returns 0 if the URB has no private
+ * data, otherwise the result of the unlink.
+ */
+static int xenhcd_urb_dequeue(struct usb_hcd *hcd,
+                             struct urb *urb, int status)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       struct urb_priv *priv;
+       unsigned long flags;
+       int rc = 0;
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       priv = urb->hcpriv;
+       if (priv != NULL) {
+               priv->status = status;
+               rc = xenhcd_unlink_urb(info, priv);
+       }
+
+       spin_unlock_irqrestore(&info->lock, flags);
+       return rc;
+}
+
+/*
+ * Called from usb_get_current_frame_number(); almost no driver uses that
+ * function.  No frame counter is tracked here, so this always returns 0.
+ */
+static int xenhcd_get_frame(struct usb_hcd *hcd)
+{
+       /* 0 signals an error to the caller, but that is probably harmless */
+       return 0;
+}
+
+/* Driver description string shared by both hc_driver variants below. */
+static const char hcd_name[] = "xen_hcd";
+
+/*
+ * Host controller driver operations for the USB 2.0 virtual HC.  The
+ * bus suspend/resume hooks are only present when both XENHCD_PM and
+ * CONFIG_PM are defined.
+ */
+struct hc_driver xen_usb20_hc_driver = {
+       .description = hcd_name,
+       .product_desc = "Xen USB2.0 Virtual Host Controller",
+       .hcd_priv_size = sizeof(struct usbfront_info),
+       .flags = HCD_USB2,
+
+       /* basic HC lifecycle operations */
+       .reset = xenhcd_setup,
+       .start = xenhcd_run,
+       .stop = xenhcd_stop,
+
+       /* managing urb I/O */
+       .urb_enqueue = xenhcd_urb_enqueue,
+       .urb_dequeue = xenhcd_urb_dequeue,
+       .get_frame_number = xenhcd_get_frame,
+
+       /* root hub operations */
+       .hub_status_data = xenhcd_hub_status_data,
+       .hub_control = xenhcd_hub_control,
+#ifdef XENHCD_PM
+#ifdef CONFIG_PM
+       .bus_suspend = xenhcd_bus_suspend,
+       .bus_resume = xenhcd_bus_resume,
+#endif
+#endif
+};
+
+/*
+ * Host controller driver operations for the USB 1.1 virtual HC.  It uses
+ * the same callbacks as the USB 2.0 variant; only product_desc and flags
+ * differ.
+ */
+struct hc_driver xen_usb11_hc_driver = {
+       .description = hcd_name,
+       .product_desc = "Xen USB1.1 Virtual Host Controller",
+       .hcd_priv_size = sizeof(struct usbfront_info),
+       .flags = HCD_USB11,
+
+       /* basic HC lifecycle operations */
+       .reset = xenhcd_setup,
+       .start = xenhcd_run,
+       .stop = xenhcd_stop,
+
+       /* managing urb I/O */
+       .urb_enqueue = xenhcd_urb_enqueue,
+       .urb_dequeue = xenhcd_urb_dequeue,
+       .get_frame_number = xenhcd_get_frame,
+
+       /* root hub operations */
+       .hub_status_data = xenhcd_hub_status_data,
+       .hub_control = xenhcd_hub_control,
+#ifdef XENHCD_PM
+#ifdef CONFIG_PM
+       .bus_suspend = xenhcd_bus_suspend,
+       .bus_resume = xenhcd_bus_resume,
+#endif
+#endif
+};
diff --git a/drivers/xen/usbfront/usbfront-hub.c b/drivers/xen/usbfront/usbfront-hub.c

new file mode 100644 (file)

index 0000000..1a0bfa3
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-hub.c
@@ -0,0 +1,471 @@
+/*
+ * usbfront-hub.c
+ *
+ * Xen USB Virtual Host Controller - Root Hub Emulations
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * set virtual port connection status
+ */
+void set_connect_state(struct usbfront_info *info, int portnum)
+{
+       /* port numbers are 1-based, the arrays are 0-based */
+       const int idx = portnum - 1;
+
+       /* an unpowered port keeps its status word as it is */
+       if (!(info->ports[idx].status & USB_PORT_STAT_POWER))
+               return;
+
+       switch (info->devices[idx].speed) {
+       case USB_SPEED_UNKNOWN:
+               /* nothing attached: clear every connection-related bit */
+               info->ports[idx].status &= ~(USB_PORT_STAT_CONNECTION |
+                                            USB_PORT_STAT_ENABLE |
+                                            USB_PORT_STAT_LOW_SPEED |
+                                            USB_PORT_STAT_HIGH_SPEED |
+                                            USB_PORT_STAT_SUSPEND);
+               break;
+       case USB_SPEED_LOW:
+               info->ports[idx].status |= USB_PORT_STAT_CONNECTION |
+                                          USB_PORT_STAT_LOW_SPEED;
+               break;
+       case USB_SPEED_FULL:
+               info->ports[idx].status |= USB_PORT_STAT_CONNECTION;
+               break;
+       case USB_SPEED_HIGH:
+               info->ports[idx].status |= USB_PORT_STAT_CONNECTION |
+                                          USB_PORT_STAT_HIGH_SPEED;
+               break;
+       default:
+               /* unrecognised speed: leave without flagging a change */
+               return;
+       }
+
+       /* change bits are kept in the upper 16 bits of the status word */
+       info->ports[idx].status |= (USB_PORT_STAT_C_CONNECTION << 16);
+}
+
+/*
+ * set virtual device connection status
+ */
+void rhport_connect(struct usbfront_info *info,
+                               int portnum, enum usb_device_speed speed)
+{
+       int idx;
+
+       /* reject port numbers outside 1..rh_numports */
+       if (portnum < 1 || portnum > info->rh_numports)
+               return;
+
+       idx = portnum - 1;
+
+       /* same speed as already recorded: nothing changed */
+       if (info->devices[idx].speed == speed)
+               return;
+
+       switch (speed) {
+       case USB_SPEED_UNKNOWN:
+       case USB_SPEED_LOW:
+       case USB_SPEED_FULL:
+       case USB_SPEED_HIGH:
+               break;
+       default:
+               /* any other speed value is treated as an error */
+               return;
+       }
+
+       /* USB_SPEED_UNKNOWN stands for a disconnect */
+       if (speed == USB_SPEED_UNKNOWN)
+               info->devices[idx].status = USB_STATE_NOTATTACHED;
+       else
+               info->devices[idx].status = USB_STATE_ATTACHED;
+
+       info->devices[idx].speed = speed;
+       info->ports[idx].c_connection = 1;
+
+       set_connect_state(info, portnum);
+}
+
+/*
+ * SetPortFeature(PORT_SUSPENDED)
+ */
+void rhport_suspend(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* mark both the emulated device and its port as suspended */
+       info->devices[idx].status = USB_STATE_SUSPENDED;
+       info->ports[idx].status |= USB_PORT_STAT_SUSPEND;
+}
+
+/*
+ * ClearPortFeature(PORT_SUSPENDED)
+ */
+void rhport_resume(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* only a suspended port can start resuming */
+       if (!(info->ports[idx].status & USB_PORT_STAT_SUSPEND))
+               return;
+
+       /* the resume is considered complete 20 ms from now */
+       info->ports[idx].timeout = jiffies + msecs_to_jiffies(20);
+       info->ports[idx].resuming = 1;
+}
+
+/*
+ * SetPortFeature(PORT_POWER)
+ */
+void rhport_power_on(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* already powered: nothing to do */
+       if (info->ports[idx].status & USB_PORT_STAT_POWER)
+               return;
+
+       info->ports[idx].status |= USB_PORT_STAT_POWER;
+
+       if (info->devices[idx].status != USB_STATE_NOTATTACHED)
+               info->devices[idx].status = USB_STATE_POWERED;
+
+       /* a connection change recorded earlier is applied to the status now */
+       if (info->ports[idx].c_connection)
+               set_connect_state(info, portnum);
+}
+
+/*
+ * ClearPortFeature(PORT_POWER)
+ * SetConfiguration(non-zero)
+ * Power_Source_Off
+ * Over-current
+ */
+void rhport_power_off(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* a port that is not powered stays untouched */
+       if (!(info->ports[idx].status & USB_PORT_STAT_POWER))
+               return;
+
+       /* powering off wipes the whole status word, change bits included */
+       info->ports[idx].status = 0;
+
+       if (info->devices[idx].status != USB_STATE_NOTATTACHED)
+               info->devices[idx].status = USB_STATE_ATTACHED;
+}
+
+/*
+ * ClearPortFeature(PORT_ENABLE)
+ */
+void rhport_disable(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* a disabled port is neither enabled, suspended nor resuming */
+       info->ports[idx].status &= ~(USB_PORT_STAT_ENABLE |
+                                    USB_PORT_STAT_SUSPEND);
+       info->ports[idx].resuming = 0;
+
+       if (info->devices[idx].status != USB_STATE_NOTATTACHED)
+               info->devices[idx].status = USB_STATE_POWERED;
+}
+
+/*
+ * SetPortFeature(PORT_RESET)
+ */
+void rhport_reset(struct usbfront_info *info, int portnum)
+{
+       const int idx = portnum - 1;
+
+       /* reset signaling is considered finished 10 ms from now */
+       info->ports[idx].timeout = jiffies + msecs_to_jiffies(10);
+
+       if (info->devices[idx].status != USB_STATE_NOTATTACHED)
+               info->devices[idx].status = USB_STATE_ATTACHED;
+
+       /* enable and speed bits are dropped while the reset bit is raised */
+       info->ports[idx].status &= ~(USB_PORT_STAT_ENABLE |
+                                    USB_PORT_STAT_LOW_SPEED |
+                                    USB_PORT_STAT_HIGH_SPEED);
+       info->ports[idx].status |= USB_PORT_STAT_RESET;
+}
+
+#ifdef XENHCD_PM
+#ifdef CONFIG_PM
+static int xenhcd_bus_suspend(struct usb_hcd *hcd)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       const int nports = info->rh_numports;
+       int portnum;
+       int ret = 0;
+
+       spin_lock_irq(&info->lock);
+       if (test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+               /* suspend every root hub port (numbered from 1) */
+               for (portnum = 1; portnum <= nports; portnum++)
+                       rhport_suspend(info, portnum);
+       } else {
+               ret = -ESHUTDOWN;
+       }
+       spin_unlock_irq(&info->lock);
+
+       /* the watchdog timer is stopped on both paths */
+       del_timer_sync(&info->watchdog);
+
+       return ret;
+}
+
+static int xenhcd_bus_resume(struct usb_hcd *hcd)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       const int nports = info->rh_numports;
+       int portnum;
+       int ret = 0;
+
+       spin_lock_irq(&info->lock);
+       if (test_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags)) {
+               /* start resuming every root hub port (numbered from 1) */
+               for (portnum = 1; portnum <= nports; portnum++)
+                       rhport_resume(info, portnum);
+       } else {
+               ret = -ESHUTDOWN;
+       }
+       spin_unlock_irq(&info->lock);
+
+       return ret;
+}
+#endif
+#endif
+
+static void xenhcd_hub_descriptor(struct usbfront_info *info,
+                                 struct usb_hub_descriptor *desc)
+{
+       const int nports = info->rh_numports;
+       /* size of the DeviceRemovable and PortPwrCtrlMask fields, each */
+       const u16 bitmap_len = 1 + (nports / 8);
+
+       desc->bDescriptorType = 0x29;
+       desc->bDescLength = 7 + 2 * bitmap_len;
+       desc->bNbrPorts = nports;
+       desc->bPwrOn2PwrGood = 10; /* EHCI says 20ms max */
+       desc->bHubContrCurrent = 0;
+
+       /* per-port over current reporting and no power switching */
+       desc->wHubCharacteristics = cpu_to_le16(0x000a);
+
+       /* DeviceRemovable is zeroed, the mask bytes right behind it are 0xff */
+       memset(&desc->u.hs.DeviceRemovable[0], 0, bitmap_len);
+       memset(&desc->u.hs.DeviceRemovable[bitmap_len], 0xff, bitmap_len);
+}
+
+/*
+ * Port status change mask for hub_status_data.
+ * All five USB_PORT_STAT_C_* change flags, shifted into the upper 16 bits
+ * where the per-port status word keeps its change bits.
+ */
+#define PORT_C_MASK \
+       ((USB_PORT_STAT_C_CONNECTION \
+       | USB_PORT_STAT_C_ENABLE \
+       | USB_PORT_STAT_C_SUSPEND \
+       | USB_PORT_STAT_C_OVERCURRENT \
+       | USB_PORT_STAT_C_RESET) << 16)
+
+/*
+ * See USB 2.0 Spec, 11.12.4 Hub and Port Status Change Bitmap.
+ * If port status changed, writes the bitmap to buf and return
+ * that length(number of bytes).
+ * If Nothing changed, return 0.
+ */
+/*
+ * Build the hub/port status change bitmap in @buf.
+ *
+ * Port number N (1-based) is reported in bit N of the bitmap; bit 0 is
+ * never set here.  The first 1 + rh_numports / 8 bytes of @buf are always
+ * cleared when the controller is running.
+ *
+ * Returns the bitmap length in bytes when at least one port has a change
+ * bit pending, 0 when nothing changed or the controller is not running.
+ */
+static int xenhcd_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       unsigned long flags;
+       int ports;
+       int length;
+       int changed = 0;
+       int i;
+
+       if (!HC_IS_RUNNING(hcd->state))
+               return 0;
+
+       /* initialize the status to no-changes */
+       ports = info->rh_numports;
+       length = 1 + (ports / 8);
+       memset(buf, 0, length);
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       for (i = 0; i < ports; i++) {
+               /* bit position of port (i + 1) within the whole bitmap */
+               int bit = i + 1;
+
+               if (!(info->ports[i].status & PORT_C_MASK))
+                       continue;
+
+               /*
+                * Computed byte/bit split instead of a fixed four-byte
+                * ladder, so no port's bit is shifted out of its byte.
+                * bit / 8 never exceeds ports / 8, i.e. stays inside the
+                * area cleared above.
+                */
+               buf[bit / 8] |= 1 << (bit % 8);
+               changed = 1;
+       }
+
+       spin_unlock_irqrestore(&info->lock, flags);
+
+       return changed ? length : 0;
+}
+
+/*
+ * Handle a control request addressed to the emulated root hub.
+ *
+ * @typeReq selects the hub class request, @wValue carries the feature
+ * selector and @wIndex the 1-based port number for port requests.  Replies
+ * (hub descriptor, hub status, port status) are written to @buf.
+ *
+ * Returns 0 on success and -EPIPE for unsupported requests, out-of-range
+ * port numbers, and feature selectors that do not fit the status word.
+ * After any request, usb_hcd_poll_rh_status() is called if a port has a
+ * change bit pending.
+ */
+static int xenhcd_hub_control(struct usb_hcd *hcd,
+                              u16 typeReq,
+                              u16 wValue,
+                              u16 wIndex,
+                              char *buf,
+                              u16 wLength)
+{
+       struct usbfront_info *info = hcd_to_info(hcd);
+       int ports = info->rh_numports;
+       unsigned long flags;
+       int ret = 0;
+       int i;
+       int changed = 0;
+
+       spin_lock_irqsave(&info->lock, flags);
+       switch (typeReq) {
+       case ClearHubFeature:
+               /* ignore this request */
+               break;
+       case ClearPortFeature:
+               if (!wIndex || wIndex > ports)
+                       goto error;
+
+               switch (wValue) {
+               case USB_PORT_FEAT_SUSPEND:
+                       rhport_resume(info, wIndex);
+                       break;
+               case USB_PORT_FEAT_POWER:
+                       rhport_power_off(info, wIndex);
+                       break;
+               case USB_PORT_FEAT_ENABLE:
+                       rhport_disable(info, wIndex);
+                       break;
+               case USB_PORT_FEAT_C_CONNECTION:
+                       info->ports[wIndex-1].c_connection = 0;
+                       /* falling through */
+               default:
+                       /*
+                        * The selector is used as a bit number; one that
+                        * does not fit the status word would make the
+                        * shift undefined, so it is refused.
+                        */
+                       if (wValue >= 8 * sizeof(info->ports[wIndex-1].status))
+                               goto error;
+                       info->ports[wIndex-1].status &= ~(1UL << wValue);
+                       break;
+               }
+               break;
+       case GetHubDescriptor:
+               xenhcd_hub_descriptor(info,
+                                     (struct usb_hub_descriptor *) buf);
+               break;
+       case GetHubStatus:
+               /* always local power supply good and no over-current exists. */
+               *(__le32 *)buf = cpu_to_le32(0);
+               break;
+       case GetPortStatus:
+               if (!wIndex || wIndex > ports)
+                       goto error;
+
+               wIndex--;
+
+               /*
+                * resume completion
+                * NOTE(review): ->resuming is not cleared here once the
+                * resume has completed; confirm that this is intended.
+                */
+               if (info->ports[wIndex].resuming &&
+                       time_after_eq(jiffies, info->ports[wIndex].timeout)) {
+                       info->ports[wIndex].status |= (USB_PORT_STAT_C_SUSPEND << 16);
+                       info->ports[wIndex].status &= ~USB_PORT_STAT_SUSPEND;
+               }
+
+               /* reset completion */
+               if ((info->ports[wIndex].status & USB_PORT_STAT_RESET) != 0 &&
+                       time_after_eq(jiffies, info->ports[wIndex].timeout)) {
+                       info->ports[wIndex].status |= (USB_PORT_STAT_C_RESET << 16);
+                       info->ports[wIndex].status &= ~USB_PORT_STAT_RESET;
+
+                       if (info->devices[wIndex].status != USB_STATE_NOTATTACHED) {
+                               info->ports[wIndex].status |= USB_PORT_STAT_ENABLE;
+                               info->devices[wIndex].status = USB_STATE_DEFAULT;
+                       }
+
+                       /* speed bits were dropped by the reset; restore them */
+                       switch (info->devices[wIndex].speed) {
+                       case USB_SPEED_LOW:
+                               info->ports[wIndex].status |= USB_PORT_STAT_LOW_SPEED;
+                               break;
+                       case USB_SPEED_HIGH:
+                               info->ports[wIndex].status |= USB_PORT_STAT_HIGH_SPEED;
+                               break;
+                       default:
+                               break;
+                       }
+               }
+
+               /* low half: port status, high half: port change bits */
+               ((u16 *) buf)[0] = cpu_to_le16 (info->ports[wIndex].status);
+               ((u16 *) buf)[1] = cpu_to_le16 (info->ports[wIndex].status >> 16);
+               break;
+       case SetHubFeature:
+               /* not supported */
+               goto error;
+       case SetPortFeature:
+               if (!wIndex || wIndex > ports)
+                       goto error;
+
+               switch (wValue) {
+               case USB_PORT_FEAT_POWER:
+                       rhport_power_on(info, wIndex);
+                       break;
+               case USB_PORT_FEAT_RESET:
+                       rhport_reset(info, wIndex);
+                       break;
+               case USB_PORT_FEAT_SUSPEND:
+                       rhport_suspend(info, wIndex);
+                       break;
+               default:
+                       /* same bit-number bound as in ClearPortFeature */
+                       if (wValue >= 8 * sizeof(info->ports[wIndex-1].status))
+                               goto error;
+                       if ((info->ports[wIndex-1].status & USB_PORT_STAT_POWER) != 0)
+                               info->ports[wIndex-1].status |= (1UL << wValue);
+               }
+               break;
+
+       default:
+error:
+               ret = -EPIPE;
+       }
+
+       /* check status for each port while the lock is still held */
+       for (i = 0; i < ports; i++) {
+               if (info->ports[i].status & PORT_C_MASK)
+                       changed = 1;
+       }
+       spin_unlock_irqrestore(&info->lock, flags);
+
+       /* polled outside the lock: xenhcd_hub_status_data() takes it itself */
+       if (changed)
+               usb_hcd_poll_rh_status(hcd);
+
+       return ret;
+}
diff --git a/drivers/xen/usbfront/usbfront-q.c b/drivers/xen/usbfront/usbfront-q.c

new file mode 100644 (file)

index 0000000..90dd57f
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront-q.c
@@ -0,0 +1,542 @@
+/*
+ * usbfront-q.c
+ *
+ * Xen USB Virtual Host Controller - RING operations.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Slab cache from which alloc_urb_priv()/free_urb_priv() take and return struct urb_priv. */
+struct kmem_cache *xenhcd_urbp_cachep;
+
+/*
+ * Allocate a zeroed urb_priv from the cache (GFP_ATOMIC) and tie it to
+ * @urb in both directions.  Both request ids start out as ~0.
+ * Returns NULL when the allocation fails.
+ */
+static struct urb_priv *alloc_urb_priv(struct urb *urb)
+{
+       struct urb_priv *priv = kmem_cache_zalloc(xenhcd_urbp_cachep, GFP_ATOMIC);
+
+       if (priv) {
+               INIT_LIST_HEAD(&priv->list);
+               priv->req_id = ~0;
+               priv->unlink_req_id = ~0;
+               priv->urb = urb;
+               urb->hcpriv = priv;
+       }
+
+       return priv;
+}
+
+/* Unhook @urbp from its URB and return it to the cache. */
+static void free_urb_priv(struct urb_priv *urbp)
+{
+       struct urb *owner = urbp->urb;
+
+       owner->hcpriv = NULL;
+       kmem_cache_free(xenhcd_urbp_cachep, urbp);
+}
+
+/*
+ * Pop the head of the shadow free list and return its index.
+ * The free list is threaded through shadow[].req.id.
+ */
+static inline int get_id_from_freelist(
+       struct usbfront_info *info)
+{
+       unsigned long head = info->shadow_free;
+
+       BUG_ON(head >= USB_URB_RING_SIZE);
+
+       /* the popped entry names its successor */
+       info->shadow_free = info->shadow[head].req.id;
+       info->shadow[head].req.id = (unsigned int)0x0fff; /* debug */
+
+       return head;
+}
+
+/* Push shadow entry @id back onto the head of the free list. */
+static inline void add_id_to_freelist(
+       struct usbfront_info *info, unsigned long id)
+{
+       info->shadow[id].urb = NULL;
+       /* link to the previous head, then become the head */
+       info->shadow[id].req.id  = info->shadow_free;
+       info->shadow_free = id;
+}
+
+/*
+ * Difference between the page frame of @addr and the page frame of
+ * @addr + @length rounded up to the next page boundary.
+ */
+static inline int count_pages(void *addr, int length)
+{
+       unsigned long first_pfn = (unsigned long) addr >> PAGE_SHIFT;
+       unsigned long end_pfn =
+               (unsigned long) (addr + length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+       return end_pfn - first_pfn;
+}
+
+/*
+ * Grant the other end access to the buffer [@addr, @addr + @length), one
+ * grant reference per page, and describe each piece in @seg[0..nr_pages).
+ * References are claimed from @gref_head; running out of them, or running
+ * out of bytes before @nr_pages pieces are filled, triggers BUG.
+ */
+static inline void xenhcd_gnttab_map(struct usbfront_info *info,
+               void *addr, int length, grant_ref_t *gref_head,
+               struct usbif_request_segment *seg, int nr_pages, int flags)
+{
+       unsigned int remaining = length;
+       int idx;
+
+       for (idx = 0; idx < nr_pages; idx++) {
+               unsigned long buffer_pfn;
+               unsigned int offset;
+               unsigned int chunk;
+               grant_ref_t ref;
+
+               BUG_ON(!remaining);
+
+               buffer_pfn = page_to_phys(virt_to_page(addr)) >> PAGE_SHIFT;
+               offset = offset_in_page(addr);
+
+               /* a piece ends at the page boundary or at the buffer end */
+               chunk = PAGE_SIZE - offset;
+               if (chunk > remaining)
+                       chunk = remaining;
+
+               ref = gnttab_claim_grant_reference(gref_head);
+               BUG_ON(ref == -ENOSPC);
+               gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id,
+                                               buffer_pfn, flags);
+
+               seg[idx].gref = ref;
+               seg[idx].offset = (uint16_t)offset;
+               seg[idx].length = (uint16_t)chunk;
+
+               addr += chunk;
+               remaining -= chunk;
+       }
+}
+
+/*
+ * Fill ring request @req from @urb and grant the other end access to the
+ * URB's transfer buffer and, for isochronous pipes, its frame descriptors.
+ *
+ * Returns 0 on success, -E2BIG when the URB needs more segments than one
+ * request can carry, -ENOMEM when no grant references can be reserved and
+ * -EINVAL for an unknown pipe type.
+ *
+ * Grant references are reserved only for URBs with a non-zero
+ * transfer_buffer_length; the unused ones are released exactly once,
+ * before returning, and only if they were reserved.
+ */
+static int map_urb_for_request(struct usbfront_info *info, struct urb *urb,
+               usbif_urb_request_t *req)
+{
+       grant_ref_t gref_head;
+       int have_grefs = 0;     /* set once gref_head holds a reservation */
+       int nr_buff_pages = 0;
+       int nr_isodesc_pages = 0;
+       int ret = 0;
+
+       if (urb->transfer_buffer_length) {
+               nr_buff_pages = count_pages(urb->transfer_buffer, urb->transfer_buffer_length);
+
+               if (usb_pipeisoc(urb->pipe))
+                       nr_isodesc_pages = count_pages(&urb->iso_frame_desc[0],
+                                       sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets);
+
+               if (nr_buff_pages + nr_isodesc_pages > USBIF_MAX_SEGMENTS_PER_REQUEST)
+                       return -E2BIG;
+
+               ret = gnttab_alloc_grant_references(USBIF_MAX_SEGMENTS_PER_REQUEST, &gref_head);
+               if (ret) {
+                       pr_err("usbfront: gnttab_alloc_grant_references() error\n");
+                       return -ENOMEM;
+               }
+               have_grefs = 1;
+
+               /* IN transfers are written by the other end, OUT ones only read */
+               xenhcd_gnttab_map(info, urb->transfer_buffer,
+                               urb->transfer_buffer_length,
+                               &gref_head, &req->seg[0], nr_buff_pages,
+                               usb_pipein(urb->pipe) ? 0 : GTF_readonly);
+       }
+
+       req->pipe = usbif_setportnum_pipe(urb->pipe, urb->dev->portnum);
+       req->transfer_flags = urb->transfer_flags;
+       req->buffer_length = urb->transfer_buffer_length;
+       req->nr_buffer_segs = nr_buff_pages;
+
+       switch (usb_pipetype(urb->pipe)) {
+       case PIPE_ISOCHRONOUS:
+               req->u.isoc.interval = urb->interval;
+               req->u.isoc.start_frame = urb->start_frame;
+               req->u.isoc.number_of_packets = urb->number_of_packets;
+               req->u.isoc.nr_frame_desc_segs = nr_isodesc_pages;
+               /* urb->number_of_packets must be > 0 */
+               if (unlikely(urb->number_of_packets <= 0))
+                       BUG();
+               /*
+                * The descriptor segments follow the buffer segments.  With
+                * a zero-length buffer nr_isodesc_pages is 0 and nothing is
+                * claimed from the (then unreserved) gref_head.
+                */
+               xenhcd_gnttab_map(info, &urb->iso_frame_desc[0],
+                       sizeof(struct usb_iso_packet_descriptor) * urb->number_of_packets,
+                       &gref_head, &req->seg[nr_buff_pages], nr_isodesc_pages, 0);
+               break;
+       case PIPE_INTERRUPT:
+               req->u.intr.interval = urb->interval;
+               break;
+       case PIPE_CONTROL:
+               if (urb->setup_packet)
+                       memcpy(req->u.ctrl, urb->setup_packet, 8);
+               break;
+       case PIPE_BULK:
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       /* hand back whatever was reserved but not claimed */
+       if (have_grefs)
+               gnttab_free_grant_references(gref_head);
+
+       return ret;
+}
+
+/*
+ * End foreign access for every segment recorded in @shadow's request and
+ * reset both segment counters so that a second call ends nothing.
+ */
+static void xenhcd_gnttab_done(struct usb_shadow *shadow)
+{
+       int total = shadow->req.nr_buffer_segs;
+       int seg;
+
+       /* isochronous requests carry frame descriptor segments as well */
+       if (usb_pipeisoc(shadow->req.pipe))
+               total +=  shadow->req.u.isoc.nr_frame_desc_segs;
+
+       for (seg = 0; seg < total; seg++)
+               gnttab_end_foreign_access(shadow->req.seg[seg].gref, 0UL);
+
+       shadow->req.nr_buffer_segs = 0;
+       shadow->req.u.isoc.nr_frame_desc_segs = 0;
+}
+
+/*
+ * Remove @urb from the driver's lists, free its urb_priv and hand the URB
+ * back through usb_hcd_giveback_urb().
+ *
+ * @status is stored in urb->status only while that still reads
+ * -EINPROGRESS.  The status reported on giveback is the urb_priv status
+ * when that is zero or negative, otherwise urb->status.
+ *
+ * info->lock is dropped around the giveback call and re-taken afterwards.
+ */
+static void xenhcd_giveback_urb(struct usbfront_info *info, struct urb *urb, int status)
+__releases(info->lock)
+__acquires(info->lock)
+{
+       struct urb_priv *urbp = (struct urb_priv *) urb->hcpriv;
+       /* read before urbp is freed below; it must not be touched afterwards */
+       int priv_status = urbp->status;
+
+       list_del_init(&urbp->list);
+       free_urb_priv(urbp);
+       switch (urb->status) {
+       case -ECONNRESET:
+       case -ENOENT:
+               COUNT(info->stats.unlink);
+               break;
+       case -EINPROGRESS:
+               urb->status = status;
+               /* falling through */
+       default:
+               COUNT(info->stats.complete);
+       }
+       spin_unlock(&info->lock);
+       usb_hcd_giveback_urb(info_to_hcd(info), urb,
+                            priv_status <= 0 ? priv_status : urb->status);
+       spin_lock(&info->lock);
+}
+
+/*
+ * Put one request for @urbp on the URB ring: an unlink request when the
+ * URB has been unlinked, a submit request otherwise.  The request is
+ * mirrored into the shadow array and the other end is notified if needed.
+ * Returns a negative value when mapping the URB fails; the shadow id is
+ * then returned to the free list and the ring is left as it was.
+ */
+static inline int xenhcd_do_request(struct usbfront_info *info, struct urb_priv *urbp)
+{
+       struct urb *urb = urbp->urb;
+       usbif_urb_request_t *req;
+       uint16_t id;
+       int notify;
+       int ret = 0;
+
+       /* next free ring slot, plus a shadow id to track it */
+       req = RING_GET_REQUEST(&info->urb_ring, info->urb_ring.req_prod_pvt);
+       id = get_id_from_freelist(info);
+       req->id = id;
+
+       if (likely(!urbp->unlinked)) {
+               ret = map_urb_for_request(info, urb, req);
+               if (ret < 0) {
+                       add_id_to_freelist(info, id);
+                       return ret;
+               }
+               urbp->req_id = id;
+       } else {
+               /* the unlink request names the id of the original submit */
+               req->u.unlink.unlink_id = urbp->req_id;
+               req->pipe = usbif_setunlink_pipe(usbif_setportnum_pipe(
+                               urb->pipe, urb->dev->portnum));
+               urbp->unlink_req_id = id;
+       }
+
+       info->urb_ring.req_prod_pvt++;
+       info->shadow[id].urb = urb;
+       info->shadow[id].req = *req;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->urb_ring, notify);
+       if (notify)
+               notify_remote_via_irq(info->irq);
+
+       return ret;
+}
+
+/*
+ * Push URBs from pending_submit_list onto the ring until the list is
+ * empty or the ring is full.  A URB whose request cannot be built is
+ * given back with -ESHUTDOWN.
+ */
+static void xenhcd_kick_pending_urbs(struct usbfront_info *info)
+{
+       while (!list_empty(&info->pending_submit_list)) {
+               struct urb_priv *urbp;
+
+               if (RING_FULL(&info->urb_ring)) {
+                       /* retry later from the ring watchdog */
+                       COUNT(info->stats.ring_full);
+                       timer_action(info, TIMER_RING_WATCHDOG);
+                       return;
+               }
+
+               urbp = list_entry(info->pending_submit_list.next, struct urb_priv, list);
+               if (xenhcd_do_request(info, urbp) != 0)
+                       xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN);
+               else
+                       list_move_tail(&urbp->list, &info->in_progress_list);
+       }
+
+       /* list fully drained */
+       timer_action_done(info, TIMER_SCAN_PENDING_URBS);
+}
+
+/*
+ * caller must lock info->lock
+ */
+/*
+ * Give back every URB on in_progress_list that has not been unlinked and
+ * every URB on pending_submit_list.  URBs still showing -EINPROGRESS are
+ * completed with -ESHUTDOWN, dequeued ones keep the status they carry.
+ */
+static void xenhcd_cancel_all_enqueued_urbs(struct usbfront_info *info)
+{
+       struct urb_priv *urbp, *tmp;
+
+       list_for_each_entry_safe(urbp, tmp, &info->in_progress_list, list) {
+               /*
+                * xenhcd_giveback_urb() frees urbp, so the request id is
+                * read up front and urbp is not touched after the giveback.
+                */
+               int req_id = urbp->req_id;
+
+               if (!urbp->unlinked) {
+                       xenhcd_gnttab_done(&info->shadow[req_id]);
+                       barrier();
+                       if (urbp->urb->status == -EINPROGRESS)  /* not dequeued */
+                               xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN);
+                       else                                    /* dequeued */
+                               xenhcd_giveback_urb(info, urbp->urb, urbp->urb->status);
+               }
+               info->shadow[req_id].urb = NULL;
+       }
+
+       list_for_each_entry_safe(urbp, tmp, &info->pending_submit_list, list) {
+               xenhcd_giveback_urb(info, urbp->urb, -ESHUTDOWN);
+       }
+
+       return;
+}
+
+/*
+ * caller must lock info->lock
+ */
+static void xenhcd_giveback_unlinked_urbs(struct usbfront_info *info)
+{
+       struct urb_priv *cur, *next;
+
+       /* each URB on the waiting list goes back with the status it carries */
+       list_for_each_entry_safe(cur, next, &info->giveback_waiting_list, list)
+               xenhcd_giveback_urb(info, cur->urb, cur->urb->status);
+}
+
+/*
+ * Submit @urbp to the ring, or park it on pending_submit_list when the
+ * ring is full or older submissions are still waiting.
+ * Returns 0 when the URB was queued or parked, otherwise the error from
+ * xenhcd_do_request().
+ */
+static int xenhcd_submit_urb(struct usbfront_info *info, struct urb_priv *urbp)
+{
+       int err;
+
+       if (RING_FULL(&info->urb_ring)) {
+               list_add_tail(&urbp->list, &info->pending_submit_list);
+               COUNT(info->stats.ring_full);
+               timer_action(info, TIMER_RING_WATCHDOG);
+               return 0;
+       }
+
+       /* earlier URBs are still pending: queue up behind them */
+       if (!list_empty(&info->pending_submit_list)) {
+               list_add_tail(&urbp->list, &info->pending_submit_list);
+               timer_action(info, TIMER_SCAN_PENDING_URBS);
+               return 0;
+       }
+
+       err = xenhcd_do_request(info, urbp);
+       if (err == 0)
+               list_add_tail(&urbp->list, &info->in_progress_list);
+
+       return err;
+}
+
+/*
+ * Start unlinking @urbp.  A URB that never reached the ring is moved to
+ * giveback_waiting_list; otherwise an unlink request is sent, or parked on
+ * pending_unlink_list when the ring is full or older unlinks are waiting.
+ * Returns -EBUSY when the URB is already being unlinked, 0 when the unlink
+ * was queued or parked, otherwise the error from xenhcd_do_request().
+ */
+static int xenhcd_unlink_urb(struct usbfront_info *info, struct urb_priv *urbp)
+{
+       int err;
+
+       /* already unlinked? */
+       if (urbp->unlinked)
+               return -EBUSY;
+
+       urbp->unlinked = 1;
+
+       /* the urb is still in pending_submit queue */
+       if (urbp->req_id == ~0) {
+               list_move_tail(&urbp->list, &info->giveback_waiting_list);
+               timer_action(info, TIMER_SCAN_PENDING_URBS);
+               return 0;
+       }
+
+       /* no room for the unlink request right now */
+       if (RING_FULL(&info->urb_ring)) {
+               list_move_tail(&urbp->list, &info->pending_unlink_list);
+               COUNT(info->stats.ring_full);
+               timer_action(info, TIMER_RING_WATCHDOG);
+               return 0;
+       }
+
+       /* older unlinks are still waiting: queue up behind them */
+       if (!list_empty(&info->pending_unlink_list)) {
+               list_move_tail(&urbp->list, &info->pending_unlink_list);
+               timer_action(info, TIMER_SCAN_PENDING_URBS);
+               return 0;
+       }
+
+       /* send unlink request to backend */
+       err = xenhcd_do_request(info, urbp);
+       if (err == 0)
+               list_move_tail(&urbp->list, &info->in_progress_list);
+
+       return err;
+}
+
+/*
+ * Consume all responses currently queued on the URB ring.
+ *
+ * For a response to a submit request the grants are ended and, if a URB is
+ * still recorded in the shadow entry, the result fields are copied into it
+ * and it is given back.  The shadow id is returned to the free list for
+ * every response.  A response whose id lies outside the shadow array is
+ * logged and skipped.
+ *
+ * Returns non-zero when further responses are waiting.
+ */
+static int xenhcd_urb_request_done(struct usbfront_info *info)
+{
+       usbif_urb_response_t *res;
+       struct urb *urb;
+
+       RING_IDX i, rp;
+       uint16_t id;
+       int more_to_do = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       rp = info->urb_ring.sring->rsp_prod;
+       rmb(); /* ensure we see queued responses up to "rp" */
+
+       for (i = info->urb_ring.rsp_cons; i != rp; i++) {
+               res = RING_GET_RESPONSE(&info->urb_ring, i);
+               id = res->id;
+
+               /*
+                * The id comes from the shared ring and is used as an index
+                * into shadow[]; get_id_from_freelist() never hands out an
+                * id of USB_URB_RING_SIZE or above, so such a value cannot
+                * belong to a request of ours.
+                */
+               if (unlikely(id >= USB_URB_RING_SIZE)) {
+                       pr_err("usbfront: response with invalid id %u ignored\n",
+                              (unsigned int)id);
+                       continue;
+               }
+
+               if (likely(usbif_pipesubmit(info->shadow[id].req.pipe))) {
+                       xenhcd_gnttab_done(&info->shadow[id]);
+                       urb = info->shadow[id].urb;
+                       barrier();
+                       if (likely(urb)) {
+                               urb->actual_length = res->actual_length;
+                               urb->error_count = res->error_count;
+                               urb->start_frame = res->start_frame;
+                               barrier();
+                               xenhcd_giveback_urb(info, urb, res->status);
+                       }
+               }
+
+               add_id_to_freelist(info, id);
+       }
+       info->urb_ring.rsp_cons = i;
+
+       /* requests still outstanding: re-check; otherwise arm the next event */
+       if (i != info->urb_ring.req_prod_pvt)
+               RING_FINAL_CHECK_FOR_RESPONSES(&info->urb_ring, more_to_do);
+       else
+               info->urb_ring.sring->rsp_event = i + 1;
+
+       spin_unlock_irqrestore(&info->lock, flags);
+
+       cond_resched();
+
+       return more_to_do;
+}
+
+/*
+ * Consume all responses queued on the connection ring.
+ *
+ * Each response reports a port number and a speed, which are applied with
+ * rhport_connect().  A request carrying the same id is queued again for
+ * every consumed response.  When a port has a connection change pending,
+ * usb_hcd_poll_rh_status() is called after the lock has been released.
+ *
+ * Returns non-zero when further responses are waiting.
+ */
+static int xenhcd_conn_notify(struct usbfront_info *info)
+{
+       usbif_conn_response_t *res;
+       usbif_conn_request_t *req;
+       RING_IDX rc, rp;
+       uint16_t id;
+       uint8_t portnum, speed;
+       int more_to_do = 0;
+       int notify;
+       int port_changed = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&info->lock, flags);
+
+       rc = info->conn_ring.rsp_cons;
+       rp = info->conn_ring.sring->rsp_prod;
+       rmb(); /* ensure we see queued responses up to "rp" */
+
+       while (rc != rp) {
+               res = RING_GET_RESPONSE(&info->conn_ring, rc);
+               id = res->id;
+               portnum = res->portnum;
+               speed = res->speed;
+               info->conn_ring.rsp_cons = ++rc;
+
+               rhport_connect(info, portnum, speed);
+               /*
+                * rhport_connect() silently ignores port numbers outside
+                * 1..rh_numports; ports[] must not be indexed with one.
+                */
+               if (portnum >= 1 && portnum <= info->rh_numports &&
+                   info->ports[portnum-1].c_connection)
+                       port_changed = 1;
+
+               barrier();
+
+               /* give the slot back by re-queueing a request with this id */
+               req = RING_GET_REQUEST(&info->conn_ring, info->conn_ring.req_prod_pvt);
+               req->id = id;
+               info->conn_ring.req_prod_pvt++;
+       }
+
+       if (rc != info->conn_ring.req_prod_pvt)
+               RING_FINAL_CHECK_FOR_RESPONSES(&info->conn_ring, more_to_do);
+       else
+               info->conn_ring.sring->rsp_event = rc + 1;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify);
+       if (notify)
+               notify_remote_via_irq(info->irq);
+
+       spin_unlock_irqrestore(&info->lock, flags);
+
+       if (port_changed)
+               usb_hcd_poll_rh_status(info_to_hcd(info));
+
+       cond_resched();
+
+       return more_to_do;
+}
+
+/*
+ * Kernel thread body: processes URB and connection ring responses until
+ * kthread_should_stop() reports true.  @arg is the struct usbfront_info.
+ * Always returns 0.
+ */
+int xenhcd_schedule(void *arg)
+{
+       struct usbfront_info *info = (struct usbfront_info *) arg;
+
+       while (!kthread_should_stop()) {
+               /* sleep until work is flagged or the thread is asked to stop */
+               wait_event_interruptible(
+                               info->wq,
+                               info->waiting_resp || kthread_should_stop());
+               /* the flag is cleared before the rings are drained */
+               info->waiting_resp = 0;
+               smp_mb();
+
+               /* a non-zero return means more responses wait: loop again */
+               if (xenhcd_urb_request_done(info))
+                       info->waiting_resp = 1;
+
+               if (xenhcd_conn_notify(info))
+                       info->waiting_resp = 1;
+       }
+
+       return 0;
+}
+
+/* Flag that responses may be waiting and wake the sleeper on info->wq. */
+static void xenhcd_notify_work(struct usbfront_info *info)
+{
+       info->waiting_resp = 1;
+       wake_up(&info->wq);
+}
+
+/* Interrupt handler: @dev_id is the usbfront_info; all work is deferred. */
+irqreturn_t xenhcd_int(int irq, void *dev_id)
+{
+       struct usbfront_info *info = (struct usbfront_info *) dev_id;
+
+       xenhcd_notify_work(info);
+
+       return IRQ_HANDLED;
+}
diff --git a/drivers/xen/usbfront/usbfront.h b/drivers/xen/usbfront/usbfront.h

new file mode 100644 (file)

index 0000000..a260114
--- /dev/null
+++ b/drivers/xen/usbfront/usbfront.h
@@ -0,0 +1,197 @@
+/*
+ * usbfront.h
+ *
+ * This file is part of Xen USB Virtual Host Controller driver.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_USBFRONT_H__
+#define __XEN_USBFRONT_H__
+
+#include <linux/module.h>
+#include <linux/usb.h>
+#include <linux/list.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/usb/hcd.h>
+#include <asm/io.h>
+#include <xen/xenbus.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/io/usbif.h>
+
+/* Return the driver-private usbfront_info kept in the hcd_priv area of @hcd. */
+static inline struct usbfront_info *hcd_to_info(struct usb_hcd *hcd)
+{
+       void *priv = hcd->hcd_priv;
+
+       return priv;
+}
+
+/* Recover the usb_hcd whose hcd_priv member is located at @info. */
+static inline struct usb_hcd *info_to_hcd(struct usbfront_info *info)
+{
+       void *priv = info;
+
+       return container_of(priv, struct usb_hcd, hcd_priv);
+}
+
+/* Private per-URB data */
+struct urb_priv {
+       struct list_head list; /* list linkage; presumably for the usbfront_info urb queues -- confirm */
+       struct urb *urb; /* URB this private data is attached to */
+       int req_id;     /* RING_REQUEST id for submitting */
+       int unlink_req_id; /* RING_REQUEST id for unlinking */
+       int status; /* NOTE(review): presumably the URB completion status -- confirm in the users */
+       unsigned unlinked:1; /* dequeued marker */
+};
+
+/* virtual roothub port status */
+struct rhport_status {
+       u32 status; /* NOTE(review): presumably the hub port status bits -- confirm */
+       unsigned resuming:1; /* in resuming */
+       unsigned c_connection:1; /* connection changed */
+       unsigned long timeout; /* NOTE(review): presumably a jiffies deadline -- confirm */
+};
+
+/* status of attached device */
+struct vdevice_status {
+       int devnum; /* NOTE(review): presumably the USB device number -- confirm */
+       enum usb_device_state status; /* USB device state of the attached device */
+       enum usb_device_speed speed; /* USB speed of the attached device */
+};
+
+/* RING request shadow */
+struct usb_shadow {
+       usbif_urb_request_t req; /* request copy; create_hcd() also chains free entries through req.id */
+       struct urb *urb; /* NULL after create_hcd(); presumably the URB owning this slot -- confirm */
+};
+
+/* statistics for tuning, monitoring, ... */
+struct xenhcd_stats {
+       unsigned long ring_full; /* RING_FULL conditions */
+       unsigned long complete; /* URBs given back normally */
+       unsigned long unlink; /* unlinked urbs */
+};
+
+struct usbfront_info {
+       /* Virtual Host Controller has 4 urb queues */
+       struct list_head pending_submit_list;
+       struct list_head pending_unlink_list;
+       struct list_head in_progress_list;
+       struct list_head giveback_waiting_list;
+
+       spinlock_t lock; /* taken with irqsave by the ring handling code */
+
+       /* timer that kicks pending and giveback-waiting urbs */
+       struct timer_list watchdog;
+       unsigned long actions; /* bitmask indexed by enum xenhcd_timer_action */
+
+       /* virtual root hub */
+       int rh_numports; /* from the backend's "num-ports" key, 1..USB_MAXCHILDREN */
+       struct rhport_status ports[USB_MAXCHILDREN]; /* indexed by port number - 1 */
+       struct vdevice_status devices[USB_MAXCHILDREN];
+
+       /* Xen related stuff */
+       struct xenbus_device *xbdev;
+       int urb_ring_ref; /* grant reference, GRANT_INVALID_REF when not granted */
+       int conn_ring_ref; /* grant reference, GRANT_INVALID_REF when not granted */
+       usbif_urb_front_ring_t urb_ring;
+       usbif_conn_front_ring_t conn_ring;
+
+       unsigned int irq; /* irq bound to the event channel, 0 when unbound */
+       struct usb_shadow shadow[USB_URB_RING_SIZE];
+       unsigned long shadow_free; /* NOTE(review): presumably index of the first free shadow entry -- confirm */
+
+       /* RING_RESPONSE thread */
+       struct task_struct *kthread; /* runs xenhcd_schedule(); NULL when not running */
+       wait_queue_head_t wq; /* the thread sleeps here */
+       unsigned int waiting_resp; /* non-zero when the thread has work to do */
+
+       /* xmit statistics; COUNT(x) increments x only when XENHCD_STATS is defined, otherwise it is a no-op */
+#ifdef XENHCD_STATS
+       struct xenhcd_stats stats;
+#define COUNT(x) do { (x)++; } while (0)
+#else
+#define COUNT(x) do {} while (0)
+#endif
+};
+
+#define XENHCD_RING_JIFFIES (HZ/200) /* watchdog delay for TIMER_RING_WATCHDOG */
+#define XENHCD_SCAN_JIFFIES 1 /* watchdog delay for every other action */
+
+enum xenhcd_timer_action {
+       TIMER_RING_WATCHDOG, /* values are bit numbers in usbfront_info.actions */
+       TIMER_SCAN_PENDING_URBS, /* while set with the timer pending, timer_action() does nothing */
+};
+
+static inline void
+timer_action_done(struct usbfront_info *info, enum xenhcd_timer_action action)
+{
+       clear_bit(action, &info->actions); /* lets timer_action() arm the watchdog for this action again */
+}
+
+/*
+ * Request @action from the watchdog timer. Nothing happens when a pending
+ * timer already covers TIMER_SCAN_PENDING_URBS, or when @action was already
+ * requested. Otherwise the timer is (re)armed: XENHCD_RING_JIFFIES ahead for
+ * TIMER_RING_WATCHDOG, XENHCD_SCAN_JIFFIES ahead for anything else.
+ */
+static inline void
+timer_action(struct usbfront_info *info, enum xenhcd_timer_action action)
+{
+       unsigned long delay;
+
+       if (timer_pending(&info->watchdog) &&
+           test_bit(TIMER_SCAN_PENDING_URBS, &info->actions))
+               return;
+
+       /* bit already set: this action has been requested before */
+       if (test_and_set_bit(action, &info->actions))
+               return;
+
+       if (action == TIMER_RING_WATCHDOG)
+               delay = XENHCD_RING_JIFFIES;
+       else
+               delay = XENHCD_SCAN_JIFFIES;
+
+       mod_timer(&info->watchdog, delay + jiffies);
+}
+
+extern struct kmem_cache *xenhcd_urbp_cachep; /* slab cache of struct urb_priv */
+extern struct hc_driver xen_usb20_hc_driver; /* used when the backend reports USB_VER_USB20 */
+extern struct hc_driver xen_usb11_hc_driver; /* used when the backend reports USB_VER_USB11 */
+irqreturn_t xenhcd_int(int irq, void *dev_id); /* event channel interrupt handler */
+void xenhcd_rhport_state_change(struct usbfront_info *info,
+                               int port, enum usb_device_speed speed);
+int xenhcd_schedule(void *arg); /* kthread function, arg is the usbfront_info */
+
+#endif /* __XEN_USBFRONT_H__ */
diff --git a/drivers/xen/usbfront/xenbus.c b/drivers/xen/usbfront/xenbus.c

new file mode 100644 (file)

index 0000000..2ec7531
--- /dev/null
+++ b/drivers/xen/usbfront/xenbus.c
@@ -0,0 +1,412 @@
+/*
+ * xenbus.c
+ *
+ * Xenbus interface for Xen USB Virtual Host Controller
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * or, by your choice,
+ *
+ * When distributed separately from the Linux kernel or incorporated into
+ * other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "usbfront.h"
+
+/* End foreign access for one ring page if it is currently granted. */
+static void release_ring_grant(int *ref, void *sring)
+{
+       if (*ref == GRANT_INVALID_REF)
+               return;
+
+       gnttab_end_foreign_access(*ref, (unsigned long)sring);
+       *ref = GRANT_INVALID_REF;
+}
+
+/*
+ * Undo setup_rings(): unbind the irq handler first, then revoke the grants
+ * of the URB and connection rings. Each step is skipped when it was never
+ * done, so the function may be called on a partially set up or already
+ * torn down info.
+ */
+static void destroy_rings(struct usbfront_info *info)
+{
+       if (info->irq) {
+               unbind_from_irqhandler(info->irq, info);
+               info->irq = 0;
+       }
+
+       release_ring_grant(&info->urb_ring_ref, info->urb_ring.sring);
+       info->urb_ring.sring = NULL;
+
+       release_ring_grant(&info->conn_ring_ref, info->conn_ring.sring);
+       info->conn_ring.sring = NULL;
+}
+
+/*
+ * Allocate the URB and connection shared ring pages, grant them to the
+ * backend and bind the event channel to xenhcd_int().
+ *
+ * Returns 0 on success. On any failure everything set up so far is undone
+ * through destroy_rings() and the error is returned.
+ *
+ * NOTE(review): a 0 return from bind_listening_port_to_irqhandler() is
+ * treated as failure but would be returned to the caller as 0; confirm
+ * that it can only return a positive irq or a negative errno.
+ */
+static int setup_rings(struct xenbus_device *dev,
+                          struct usbfront_info *info)
+{
+       usbif_urb_sring_t *urb_sring;
+       usbif_conn_sring_t *conn_sring;
+       int err;
+
+       info->urb_ring_ref = GRANT_INVALID_REF;
+       info->conn_ring_ref = GRANT_INVALID_REF;
+
+       urb_sring = (usbif_urb_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
+       if (!urb_sring) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating urb ring");
+               return -ENOMEM;
+       }
+       SHARED_RING_INIT(urb_sring);
+       FRONT_RING_INIT(&info->urb_ring, urb_sring, PAGE_SIZE);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(info->urb_ring.sring));
+       if (err < 0) {
+               /* not granted yet, so the page is freed by hand */
+               free_page((unsigned long)urb_sring);
+               info->urb_ring.sring = NULL;
+               goto fail;
+       }
+       info->urb_ring_ref = err;
+
+       conn_sring = (usbif_conn_sring_t *)get_zeroed_page(GFP_NOIO|__GFP_HIGH);
+       if (!conn_sring) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating conn ring");
+               /* the urb ring is already granted: release it via fail */
+               err = -ENOMEM;
+               goto fail;
+       }
+       SHARED_RING_INIT(conn_sring);
+       FRONT_RING_INIT(&info->conn_ring, conn_sring, PAGE_SIZE);
+
+       err = xenbus_grant_ring(dev, virt_to_mfn(info->conn_ring.sring));
+       if (err < 0) {
+               /* not granted yet, so the page is freed by hand */
+               free_page((unsigned long)conn_sring);
+               info->conn_ring.sring = NULL;
+               goto fail;
+       }
+       info->conn_ring_ref = err;
+
+       err = bind_listening_port_to_irqhandler(
+               dev->otherend_id, xenhcd_int, IRQF_SAMPLE_RANDOM, "usbif", info);
+       if (err <= 0) {
+               xenbus_dev_fatal(dev, err,
+                                "bind_listening_port_to_irqhandler");
+               goto fail;
+       }
+       info->irq = err;
+
+       return 0;
+fail:
+       destroy_rings(info);
+       return err;
+}
+
+/*
+ * Write the two ring references and the event channel port under
+ * dev->nodename inside transaction @xbt. Returns 0, or the xenbus_printf()
+ * error with *what set to the description of the failed write.
+ */
+static int publish_ring_info(struct xenbus_transaction xbt,
+                            struct xenbus_device *dev,
+                            struct usbfront_info *info,
+                            const char **what)
+{
+       int rc;
+
+       rc = xenbus_printf(xbt, dev->nodename, "urb-ring-ref", "%u",
+                          info->urb_ring_ref);
+       if (rc) {
+               *what = "writing urb-ring-ref";
+               return rc;
+       }
+
+       rc = xenbus_printf(xbt, dev->nodename, "conn-ring-ref", "%u",
+                          info->conn_ring_ref);
+       if (rc) {
+               *what = "writing conn-ring-ref";
+               return rc;
+       }
+
+       rc = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+                          irq_to_evtchn_port(info->irq));
+       if (rc)
+               *what = "writing event-channel";
+
+       return rc;
+}
+
+/*
+ * Set up the rings and advertise them to the backend through xenstore.
+ * The transaction is restarted for as long as ending it reports -EAGAIN.
+ * On any failure after setup_rings() succeeded the rings are destroyed
+ * again. Returns 0 or the error of the failing step.
+ */
+static int talk_to_backend(struct xenbus_device *dev,
+                          struct usbfront_info *info)
+{
+       const char *message = NULL;
+       struct xenbus_transaction xbt;
+       int err;
+
+       err = setup_rings(dev, info);
+       if (err)
+               return err;
+
+       do {
+               err = xenbus_transaction_start(&xbt);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "starting transaction");
+                       goto destroy_ring;
+               }
+
+               err = publish_ring_info(xbt, dev, info, &message);
+               if (err) {
+                       /* abort the transaction before reporting */
+                       xenbus_transaction_end(xbt, 1);
+                       xenbus_dev_fatal(dev, err, "%s", message);
+                       goto destroy_ring;
+               }
+
+               err = xenbus_transaction_end(xbt, 0);
+       } while (err == -EAGAIN);
+
+       if (!err)
+               return 0;
+
+       xenbus_dev_fatal(dev, err, "completing transaction");
+
+destroy_ring:
+       destroy_rings(info);
+       return err;
+}
+
+/*
+ * Bring the frontend up: publish the rings (talk_to_backend), start the
+ * response thread and post one request per connection-ring slot for
+ * hotplug notification.
+ *
+ * Returns 0 on success or the error of the failing step. On failure the
+ * rings and event channel are torn down again and no thread is left
+ * running.
+ */
+static int connect(struct xenbus_device *dev)
+{
+       struct usbfront_info *info = dev_get_drvdata(&dev->dev);
+       struct usb_hcd *hcd = info_to_hcd(info);
+       usbif_conn_request_t *req;
+       char name[TASK_COMM_LEN];
+       int idx, err;
+       int notify;
+
+       snprintf(name, TASK_COMM_LEN, "xenhcd.%d", hcd->self.busnum);
+
+       err = talk_to_backend(dev, info);
+       if (err)
+               return err;
+
+       info->kthread = kthread_run(xenhcd_schedule, info, name);
+       if (IS_ERR(info->kthread)) {
+               err = PTR_ERR(info->kthread);
+               info->kthread = NULL;
+               xenbus_dev_fatal(dev, err, "Error creating thread");
+               /*
+                * Without the thread nothing consumes the rings; release
+                * them like the failure paths of talk_to_backend() do.
+                */
+               destroy_rings(info);
+               return err;
+       }
+
+       /* prepare ring for hotplug notification: slot N carries id N */
+       for (idx = 0; idx < USB_CONN_RING_SIZE; idx++) {
+               req = RING_GET_REQUEST(&info->conn_ring, idx);
+               req->id = idx;
+       }
+       info->conn_ring.req_prod_pvt = idx;
+
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->conn_ring, notify);
+       if (notify)
+               notify_remote_via_irq(info->irq);
+
+       return 0;
+}
+
+/*
+ * Read "num-ports" and "usb-ver" from the backend, allocate a usb_hcd with
+ * the matching hc_driver and initialise the embedded usbfront_info.
+ *
+ * Returns the new hcd, ERR_PTR(-EINVAL) when a key is missing or out of
+ * range, or ERR_PTR(-ENOMEM) when the hcd cannot be allocated. Every
+ * failure is also reported through xenbus_dev_fatal().
+ */
+static struct usb_hcd *create_hcd(struct xenbus_device *dev)
+{
+       int i;
+       int err = 0;
+       int num_ports;
+       int usb_ver;
+       struct usb_hcd *hcd = NULL;
+       struct usbfront_info *info = NULL;
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend,
+                                       "num-ports", "%d", &num_ports);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading num-ports");
+               return ERR_PTR(-EINVAL);
+       }
+       if (num_ports < 1 || num_ports > USB_MAXCHILDREN) {
+               /* err still holds the scanf count (1), report a real errno */
+               xenbus_dev_fatal(dev, -EINVAL, "invalid num-ports");
+               return ERR_PTR(-EINVAL);
+       }
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend,
+                                       "usb-ver", "%d", &usb_ver);
+       if (err != 1) {
+               xenbus_dev_fatal(dev, err, "reading usb-ver");
+               return ERR_PTR(-EINVAL);
+       }
+       switch (usb_ver) {
+       case USB_VER_USB11:
+               hcd = usb_create_hcd(&xen_usb11_hc_driver, &dev->dev, dev_name(&dev->dev));
+               break;
+       case USB_VER_USB20:
+               hcd = usb_create_hcd(&xen_usb20_hc_driver, &dev->dev, dev_name(&dev->dev));
+               break;
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "invalid usb-ver");
+               return ERR_PTR(-EINVAL);
+       }
+       if (!hcd) {
+               xenbus_dev_fatal(dev, -ENOMEM,
+                               "fail to allocate USB host controller");
+               return ERR_PTR(-ENOMEM);
+       }
+
+       info = hcd_to_info(hcd);
+       info->xbdev = dev;
+       info->rh_numports = num_ports;
+
+       /*
+        * Each shadow entry's req.id names the next entry and the last one
+        * gets 0x0fff. NOTE(review): this looks like a free list with
+        * 0x0fff as end marker -- confirm against the shadow allocator.
+        */
+       for (i = 0; i < USB_URB_RING_SIZE; i++) {
+               info->shadow[i].req.id = i + 1;
+               info->shadow[i].urb = NULL;
+       }
+       info->shadow[USB_URB_RING_SIZE-1].req.id = 0x0fff;
+
+       return hcd;
+}
+
+/*
+ * xenbus probe callback: create the usb_hcd for this device, store its
+ * usbfront_info as driver data and register the HCD with the USB core.
+ *
+ * Returns 0 on success, -ENODEV when USB is disabled, or the error from
+ * create_hcd()/usb_add_hcd().
+ */
+static int usbfront_probe(struct xenbus_device *dev,
+                         const struct xenbus_device_id *id)
+{
+       int err;
+       struct usb_hcd *hcd;
+       struct usbfront_info *info;
+
+       if (usb_disabled())
+               return -ENODEV;
+
+       hcd = create_hcd(dev);
+       if (IS_ERR(hcd)) {
+               err = PTR_ERR(hcd);
+               xenbus_dev_fatal(dev, err,
+                               "fail to create usb host controller");
+               /*
+                * hcd is an ERR_PTR here, not an object: there is nothing
+                * to put and no driver data has been set yet.
+                */
+               return err;
+       }
+
+       info = hcd_to_info(hcd);
+       dev_set_drvdata(&dev->dev, info);
+
+       /* make the wait queue usable before the HCD is registered */
+       init_waitqueue_head(&info->wq);
+
+       err = usb_add_hcd(hcd, 0, 0);
+       if (err != 0) {
+               xenbus_dev_fatal(dev, err,
+                               "fail to adding USB host controller");
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       usb_put_hcd(hcd);
+       dev_set_drvdata(&dev->dev, NULL);
+       return err;
+}
+
+/*
+ * Backend is closing: unregister the HCD from the USB core, stop the
+ * response thread if one was started, then report the frontend as closed.
+ */
+static void usbfront_disconnect(struct xenbus_device *dev)
+{
+       struct usbfront_info *info = dev_get_drvdata(&dev->dev);
+       struct task_struct *worker;
+
+       usb_remove_hcd(info_to_hcd(info));
+
+       worker = info->kthread;
+       if (worker != NULL) {
+               kthread_stop(worker);
+               info->kthread = NULL;
+       }
+
+       xenbus_frontend_closed(dev);
+}
+
+/*
+ * xenbus otherend_changed callback. Only two backend states trigger work:
+ * InitWait connects the frontend (when it is still Initialising) and
+ * Closing disconnects it. The other known states are ignored; an unknown
+ * state value is reported as fatal.
+ */
+static void backend_changed(struct xenbus_device *dev,
+                                    enum xenbus_state backend_state)
+{
+       switch (backend_state) {
+       case XenbusStateInitWait:
+               /* connect() runs only while we are still Initialising */
+               if (dev->state == XenbusStateInitialising &&
+                   connect(dev) == 0)
+                       xenbus_switch_state(dev, XenbusStateConnected);
+               break;
+
+       case XenbusStateClosing:
+               usbfront_disconnect(dev);
+               break;
+
+       case XenbusStateUnknown:
+       case XenbusStateInitialising:
+       case XenbusStateInitialised:
+       case XenbusStateConnected:
+       case XenbusStateClosed:
+       case XenbusStateReconfiguring:
+       case XenbusStateReconfigured:
+               /* nothing to do */
+               break;
+
+       default:
+               xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+                                backend_state);
+               break;
+       }
+}
+
+/*
+ * xenbus remove callback: tear down the rings and the event channel, then
+ * drop the reference on the usb_hcd that holds the usbfront_info.
+ *
+ * NOTE(review): nothing here stops info->kthread or calls usb_remove_hcd();
+ * that only happens in usbfront_disconnect(). Confirm remove cannot run
+ * before the backend went through Closing.
+ */
+static int usbfront_remove(struct xenbus_device *dev)
+{
+       struct usbfront_info *info = dev_get_drvdata(&dev->dev);
+
+       destroy_rings(info);
+       usb_put_hcd(info_to_hcd(info));
+
+       return 0;
+}
+
+static const struct xenbus_device_id usbfront_ids[] = {
+       { "vusb" }, /* xenbus device type this frontend binds to */
+       { "" }, /* empty last entry */
+};
+MODULE_ALIAS("xen:vusb");
+
+static DEFINE_XENBUS_DRIVER(usbfront, ,
+       .probe = usbfront_probe, /* creates and registers the HCD */
+       .otherend_changed = backend_changed, /* reacts to backend state changes */
+       .remove = usbfront_remove, /* destroys the rings and drops the HCD reference */
+);
+
+/*
+ * Module init: create the urb_priv slab cache and register the xenbus
+ * frontend driver. Returns -ENODEV when not running on Xen, -ENOMEM when
+ * the cache cannot be created, or the registration result.
+ */
+static int __init usbfront_init(void)
+{
+       int err;
+
+       if (!is_running_on_xen())
+               return -ENODEV;
+
+       xenhcd_urbp_cachep = kmem_cache_create("xenhcd_urb_priv",
+                       sizeof(struct urb_priv), 0, 0, NULL);
+       if (!xenhcd_urbp_cachep) {
+               pr_err("usbfront failed to create kmem cache\n");
+               return -ENOMEM;
+       }
+
+       err = xenbus_register_frontend(&usbfront_driver);
+       if (err) {
+               /* init failed, so usbfront_exit() will never free the cache */
+               kmem_cache_destroy(xenhcd_urbp_cachep);
+               xenhcd_urbp_cachep = NULL;
+       }
+
+       return err;
+}
+
+/*
+ * Module exit: unregister the driver first so that no device can still be
+ * using the urb_priv cache, then destroy the cache.
+ */
+static void __exit usbfront_exit(void)
+{
+       xenbus_unregister_driver(&usbfront_driver);
+       kmem_cache_destroy(xenhcd_urbp_cachep);
+}
+
+module_init(usbfront_init);
+module_exit(usbfront_exit);
+
+MODULE_AUTHOR("");
+MODULE_DESCRIPTION("Xen USB Virtual Host Controller driver (usbfront)");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/util.c b/drivers/xen/util.c

new file mode 100644 (file)

index 0000000..412f19a
--- /dev/null
+++ b/drivers/xen/util.c
@@ -0,0 +1,74 @@
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <xen/driver_util.h>
+
+/*
+ * Return the "xen" device class, creating it on first use. Creation is
+ * retried on a later call when the stored pointer is NULL or an error
+ * value. May return an ERR_PTR, in which case the failure is logged.
+ */
+static struct class *_get_xen_class(void)
+{
+       static DEFINE_MUTEX(xc_mutex);
+       static struct class *xen_class;
+
+       mutex_lock(&xc_mutex);
+       if (!xen_class || IS_ERR(xen_class))
+               xen_class = class_create(THIS_MODULE, "xen");
+       mutex_unlock(&xc_mutex);
+
+       if (!IS_ERR(xen_class))
+               return xen_class;
+
+       pr_err("failed to create xen sysfs class\n");
+       return xen_class;
+}
+
+/* Exported variant of _get_xen_class() that returns NULL instead of an ERR_PTR. */
+struct class *get_xen_class(void)
+{
+       struct class *cls = _get_xen_class();
+
+       if (IS_ERR(cls))
+               return NULL;
+
+       return cls;
+}
+EXPORT_SYMBOL_GPL(get_xen_class);
+
+static void xcdev_release(struct device *dev)
+{
+       kfree(dev); /* the device was kzalloc'ed by xen_class_device_create() */
+}
+
+/*
+ * Allocate and register a device in the "xen" class.
+ *
+ * @type, @parent, @devt and @drvdata are stored in the new device; @fmt and
+ * the following arguments form its name. Returns the registered device or
+ * an ERR_PTR. The device is freed by xcdev_release() when its last
+ * reference goes away.
+ */
+struct device *xen_class_device_create(struct device_type *type,
+                                      struct device *parent,
+                                      dev_t devt, void *drvdata,
+                                      const char *fmt, ...)
+{
+       struct class *cls;
+       struct device *dev;
+       va_list vargs;
+       int err;
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return ERR_PTR(-ENOMEM);
+
+       /*
+        * Look up the class before naming the device: a failure past the
+        * naming step would need more than kfree() to release the name.
+        */
+       cls = _get_xen_class();
+       if (IS_ERR(cls)) {
+               err = PTR_ERR(cls);
+               goto free_dev;
+       }
+
+       va_start(vargs, fmt);
+       err = kobject_set_name_vargs(&dev->kobj, fmt, vargs);
+       va_end(vargs);
+       if (err)
+               goto free_dev;
+
+       dev->devt = devt;
+       dev->class = cls;
+       dev->type = type;
+       dev->parent = parent;
+       dev_set_drvdata(dev, drvdata);
+       dev->release = xcdev_release;
+
+       err = device_register(dev);
+       if (err) {
+               /* after device_register() the device must be put, not freed */
+               put_device(dev);
+               return ERR_PTR(err);
+       }
+
+       return dev;
+
+free_dev:
+       kfree(dev);
+       return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(xen_class_device_create);
diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile

index ffe0ad3..34a40ce 100644 (file)
--- a/drivers/xen/xen-pciback/Makefile
+++ b/drivers/xen/xen-pciback/Makefile
@@ -1,7 +1,13 @@
-obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o
+pcibk-$(CONFIG_PARAVIRT_XEN) := xen-pciback
+pcibk-$(CONFIG_XEN) := pciback
  
-xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o
-xen-pciback-y += conf_space.o conf_space_header.o \
+obj-$(CONFIG_XEN_PCIDEV_BACKEND) := $(pcibk-y).o
+
+$(pcibk-y)-y := pci_stub.o pciback_ops.o xenbus.o
+$(pcibk-y)-y += conf_space.o conf_space_header.o \
                  conf_space_capability.o \
-                conf_space_quirks.o vpci.o \
-                passthrough.o
+                conf_space_quirks.o
+$(pcibk-y)-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o
+$(pcibk-y)-$(CONFIG_XEN_PCIDEV_BACKEND_PASSTHROUGH) += passthrough.o
+$(pcibk-y)-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o
+$(pcibk-y)-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c

index 7f83e90..89d6e90 100644 (file)
--- a/drivers/xen/xen-pciback/conf_space_capability.c
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
@@ -140,6 +140,21 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
                 goto out;
         }
  
+#ifdef CONFIG_XEN
+       /*
+        * Device may lose PCI config info on D3->D0 transition. This
+        * is a problem for some guests which will not reset BARs. Even
+        * those that have a go will be foiled by our BAR-write handler
+        * which will discard the write! Since Linux won't re-init
+        * the config space automatically in all cases, we do it here.
+        * Future: Should we re-initialise all first 64 bytes of config space?
+        */
+       if (new_state == PCI_D0 &&
+           (old_state == PCI_D3hot || old_state == PCI_D3cold) &&
+           !(old_value & PCI_PM_CTRL_NO_SOFT_RESET))
+               pci_restore_bars(dev);
+#endif
+
   out:
         return err;
  }
diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c

index 3daf862..cdf1dfb 100644 (file)
--- a/drivers/xen/xen-pciback/conf_space_header.c
+++ b/drivers/xen/xen-pciback/conf_space_header.c
@@ -39,10 +39,11 @@ static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data)
  
  static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
  {
-       struct xen_pcibk_dev_data *dev_data;
+#ifndef CONFIG_XEN
+       struct xen_pcibk_dev_data *dev_data = dev_data = pci_get_drvdata(dev);
+#endif
         int err;
  
-       dev_data = pci_get_drvdata(dev);
         if (!pci_is_enabled(dev) && is_enable_cmd(value)) {
                 if (unlikely(verbose_request))
                         printk(KERN_DEBUG DRV_NAME ": %s: enable\n",
@@ -50,15 +51,19 @@ static int command_write(struct pci_dev *dev, int offset, u16 value, void *data)
                 err = pci_enable_device(dev);
                 if (err)
                         return err;
+#ifndef CONFIG_XEN
                 if (dev_data)
                         dev_data->enable_intx = 1;
+#endif
         } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) {
                 if (unlikely(verbose_request))
                         printk(KERN_DEBUG DRV_NAME ": %s: disable\n",
                                pci_name(dev));
                 pci_disable_device(dev);
+#ifndef CONFIG_XEN
                 if (dev_data)
                         dev_data->enable_intx = 0;
+#endif
         }
  
         if (!dev->is_busmaster && is_master_cmd(value)) {
diff --git a/drivers/xen/xen-pciback/controller.c b/drivers/xen/xen-pciback/controller.c

new file mode 100644 (file)

index 0000000..550f5c2
--- /dev/null
+++ b/drivers/xen/xen-pciback/controller.c
@@ -0,0 +1,450 @@
+/*
+ * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
+ *      Alex Williamson <alex.williamson@hp.com>
+ *
+ * PCI "Controller" Backend - virtualize PCI bus topology based on PCI
+ * controllers.  Devices under the same PCI controller are exposed on the
+ * same virtual domain:bus.  Within a bus, device slots are virtualized
+ * to compact the bus.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/acpi.h>
+#include "pciback.h"
+
+#define PCI_MAX_BUSSES 255
+#define PCI_MAX_SLOTS  32
+
+/* One exported device and the devfn it was given on its virtual bus. */
+struct controller_dev_entry {
+       struct list_head list;  /* node in controller_list_entry.dev_list */
+       struct pci_dev *dev;    /* the exported device */
+       unsigned int devfn;     /* virtual devfn assigned when the device was added */
+};
+
+/* One host PCI controller together with the virtual domain:bus it maps to. */
+struct controller_list_entry {
+       struct list_head list;  /* node in controller_dev_data.list */
+       struct pci_controller *controller;      /* PCI_CONTROLLER() of the devices below */
+       unsigned int domain;    /* virtual domain number */
+       unsigned int bus;       /* virtual bus number */
+       unsigned int next_devfn;        /* devfn handed to the next device added */
+       struct list_head dev_list;      /* list of controller_dev_entry */
+};
+
+/* Per-frontend state of the controller backend, stored in pdev->pci_dev_data. */
+struct controller_dev_data {
+       struct list_head list;          /* list of controller_list_entry */
+       unsigned int next_domain;       /* virtual domain for the next new controller */
+       unsigned int next_bus;          /* virtual bus for the next new controller */
+       /*
+        * Protects the controller and device lists.  This has to be a
+        * mutex: every user in this file goes through mutex_init(),
+        * mutex_lock() and mutex_unlock().
+        */
+       struct mutex lock;
+};
+
+/* Context handed to write_xenbus_resource() through acpi_walk_resources(). */
+struct walk_info {
+       struct xen_pcibk_device *pdev;  /* backend device whose xenbus node is written */
+       int resource_count;     /* resources written so far for this root */
+       int root_num;           /* index N used in the "root-N-resource-M" key */
+};
+
+/*
+ * Map a virtual domain:bus:devfn back to the pci_dev exported at that
+ * position.  Returns NULL when nothing is exported there.
+ */
+static struct pci_dev *_xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+                                             unsigned int domain,
+                                             unsigned int bus,
+                                             unsigned int devfn)
+{
+       struct controller_dev_data *priv = pdev->pci_dev_data;
+       struct controller_list_entry *ctrl;
+       struct controller_dev_entry *ent;
+       struct pci_dev *match = NULL;
+
+       mutex_lock(&priv->lock);
+
+       list_for_each_entry(ctrl, &priv->list, list) {
+               /* Only look inside the controller mapped to domain:bus. */
+               if (ctrl->domain == domain && ctrl->bus == bus) {
+                       list_for_each_entry(ent, &ctrl->dev_list, list) {
+                               if (ent->devfn != devfn)
+                                       continue;
+                               match = ent->dev;
+                               goto unlock;
+                       }
+               }
+       }
+unlock:
+       mutex_unlock(&priv->lock);
+
+       return match;
+}
+
+/*
+ * Export @dev to the frontend behind @pdev.
+ *
+ * Devices that share a host PCI controller are placed on the same virtual
+ * domain:bus; a new virtual bus is allocated the first time a controller
+ * is seen.  Each device takes the next free slot (function 0) on that bus.
+ *
+ * @devid and @publish_cb are not used yet (see the TODO at the end).
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails, or -ENOSPC if the
+ * virtual bus has no free slot left.
+ */
+static int _xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+                                 struct pci_dev *dev, int devid,
+                                 publish_pci_dev_cb publish_cb)
+{
+       struct controller_dev_data *dev_data = pdev->pci_dev_data;
+       struct controller_dev_entry *dev_entry;
+       struct controller_list_entry *cntrl_entry;
+       struct pci_controller *dev_controller = PCI_CONTROLLER(dev);
+       int ret = 0, found = 0;
+
+       mutex_lock(&dev_data->lock);
+
+       /* Look to see if we already have a domain:bus for this controller */
+       list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+               if (cntrl_entry->controller == dev_controller) {
+                       found = 1;
+                       break;
+               }
+       }
+
+       if (!found) {
+               cntrl_entry = kmalloc(sizeof(*cntrl_entry), GFP_ATOMIC);
+               if (!cntrl_entry) {
+                       ret =  -ENOMEM;
+                       goto out;
+               }
+
+               cntrl_entry->controller = dev_controller;
+               cntrl_entry->next_devfn = PCI_DEVFN(0, 0);
+
+               /* Hand out the next virtual bus, rolling over into a new domain. */
+               cntrl_entry->domain = dev_data->next_domain;
+               cntrl_entry->bus = dev_data->next_bus++;
+               if (dev_data->next_bus > PCI_MAX_BUSSES) {
+                       dev_data->next_domain++;
+                       dev_data->next_bus = 0;
+               }
+
+               INIT_LIST_HEAD(&cntrl_entry->dev_list);
+
+               list_add_tail(&cntrl_entry->list, &dev_data->list);
+       }
+
+       /*
+        * next_devfn grows by one slot per exported device.  PCI_SLOT()
+        * masks its result to 0..31 and so can never exceed PCI_MAX_SLOTS;
+        * test the unmasked slot number instead, otherwise the 33rd device
+        * would silently wrap around to slot 0.
+        */
+       if ((cntrl_entry->next_devfn >> 3) >= PCI_MAX_SLOTS) {
+               /*
+                * While it seems unlikely, this can actually happen if
+                * a controller has P2P bridges under it.
+                */
+               xenbus_dev_fatal(pdev->xdev, -ENOSPC, "Virtual bus %04x:%02x "
+                                "is full, no room to export %04x:%02x:%02x.%x",
+                                cntrl_entry->domain, cntrl_entry->bus,
+                                pci_domain_nr(dev->bus), dev->bus->number,
+                                PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+               ret = -ENOSPC;
+               goto out;
+       }
+
+       dev_entry = kmalloc(sizeof(*dev_entry), GFP_ATOMIC);
+       if (!dev_entry) {
+               /* Do not leave behind a controller entry with no devices. */
+               if (list_empty(&cntrl_entry->dev_list)) {
+                       list_del(&cntrl_entry->list);
+                       kfree(cntrl_entry);
+               }
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       dev_entry->dev = dev;
+       dev_entry->devfn = cntrl_entry->next_devfn;
+
+       list_add_tail(&dev_entry->list, &cntrl_entry->dev_list);
+
+       /* Advance to function 0 of the following slot. */
+       cntrl_entry->next_devfn += PCI_DEVFN(1, 0);
+
+out:
+       mutex_unlock(&dev_data->lock);
+
+       /* TODO: Publish virtual domain:bus:slot.func here. */
+
+       return ret;
+}
+
+/*
+ * Stop exporting @dev: unlink and free its entry, drop the controller
+ * entry too once its last device is gone, then return the device with
+ * pcistub_put_pci_dev().  Does nothing if @dev is not on any list.
+ */
+static void _xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+                                      struct pci_dev *dev)
+{
+       struct controller_dev_data *dev_data = pdev->pci_dev_data;
+       struct controller_list_entry *cntrl_entry;
+       struct controller_dev_entry *dev_entry = NULL;
+       struct pci_dev *found_dev = NULL;
+
+       mutex_lock(&dev_data->lock);
+
+       list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+               if (cntrl_entry->controller != PCI_CONTROLLER(dev))
+                       continue;
+
+               list_for_each_entry(dev_entry, &cntrl_entry->dev_list, list) {
+                       if (dev_entry->dev == dev) {
+                               found_dev = dev_entry->dev;
+                               /*
+                                * Leave both loops at once: if the outer
+                                * loop kept iterating, cntrl_entry would no
+                                * longer refer to the controller that owns
+                                * dev_entry by the time it is used below.
+                                */
+                               goto found;
+                       }
+               }
+       }
+
+       /* Not exported through this backend: nothing to release. */
+       mutex_unlock(&dev_data->lock);
+       return;
+
+found:
+       list_del(&dev_entry->list);
+       kfree(dev_entry);
+
+       /* Last device on this virtual bus: retire the controller entry. */
+       if (list_empty(&cntrl_entry->dev_list)) {
+               list_del(&cntrl_entry->list);
+               kfree(cntrl_entry);
+       }
+
+       mutex_unlock(&dev_data->lock);
+       pcistub_put_pci_dev(found_dev);
+}
+
+/*
+ * Allocate the controller backend's bookkeeping and attach it to @pdev.
+ * Virtual numbering starts at domain 0, bus 0.
+ * Returns 0, or -ENOMEM if the allocation fails.
+ */
+static int _xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+       struct controller_dev_data *priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+
+       if (priv == NULL)
+               return -ENOMEM;
+
+       /* No controllers exported yet. */
+       INIT_LIST_HEAD(&priv->list);
+       mutex_init(&priv->lock);
+
+       /* The first controller added gets virtual bus 0000:00. */
+       priv->next_bus = 0;
+       priv->next_domain = 0;
+
+       pdev->pci_dev_data = priv;
+       return 0;
+}
+
+/*
+ * acpi_walk_resources() callback: publish one _CRS entry in xenbus.
+ *
+ * Only memory or I/O producer ranges of non-zero length that carry an
+ * address translation are written; everything else is skipped.  The raw
+ * struct acpi_resource is stored as a hex string under
+ * "root-<root_num>-resource-<resource_count>", and the count in @data
+ * (a struct walk_info) is bumped on success.
+ *
+ * Always returns AE_OK, including on allocation or xenbus failure.
+ */
+static acpi_status write_xenbus_resource(struct acpi_resource *res, void *data)
+{
+       struct walk_info *info = data;
+       struct acpi_resource_address64 addr;
+       acpi_status status;
+       int i, len, err;
+       char str[32];
+       const unsigned char *ptr;
+       char *buf;
+
+       status = acpi_resource_to_address64(res, &addr);
+
+       /* Do we care about this range?  Let's check. */
+       if (!ACPI_SUCCESS(status) ||
+           !(addr.resource_type == ACPI_MEMORY_RANGE ||
+             addr.resource_type == ACPI_IO_RANGE) ||
+           !addr.address_length || addr.producer_consumer != ACPI_PRODUCER)
+               return AE_OK;
+
+       /*
+        * Furthermore, we really only care to tell the guest about
+        * address ranges that require address translation of some sort.
+        */
+       if (!(addr.resource_type == ACPI_MEMORY_RANGE &&
+             addr.info.mem.translation) &&
+           !(addr.resource_type == ACPI_IO_RANGE &&
+             addr.info.io.translation))
+               return AE_OK;
+
+       /* Store the resource in xenbus for the guest */
+       len = snprintf(str, sizeof(str), "root-%d-resource-%d",
+                      info->root_num, info->resource_count);
+       if (unlikely(len >= (sizeof(str) - 1)))
+               return AE_OK;
+
+       /* Two hex digits per byte plus the terminating NUL. */
+       buf = kzalloc((sizeof(*res) * 2) + 1, GFP_KERNEL);
+       if (!buf)
+               return AE_OK;
+
+       /* Clean out resource_source */
+       res->data.address64.resource_source.index = 0xFF;
+       res->data.address64.resource_source.string_length = 0;
+       res->data.address64.resource_source.string_ptr = NULL;
+
+       ptr = (const unsigned char *)res;
+
+       /*
+        * Turn the acpi_resource into an ASCII byte stream.  Each byte is
+        * formatted straight into its final position, so the string does
+        * not have to be rescanned for every byte appended.
+        */
+       for (i = 0; i < sizeof(*res); i++)
+               snprintf(buf + (i * 2), 3, "%02x", ptr[i]);
+
+       err = xenbus_printf(XBT_NIL, info->pdev->xdev->nodename,
+                           str, "%s", buf);
+
+       if (!err)
+               info->resource_count++;
+
+       kfree(buf);
+
+       return AE_OK;
+}
+
+/*
+ * Publish every virtual root (domain:bus) through @publish_root_cb, then
+ * walk the matching controller's ACPI _CRS and store its translated
+ * resources under "root-N-resource-M" plus a "root-N-resources" count.
+ * Finishes by writing "root-resource-magic".
+ *
+ * The whole sequence runs with dev_data->lock held.
+ *
+ * Returns the result of the last operation performed.
+ * NOTE(review): on some early exits this is the positive match count of
+ * xenbus_scanf() rather than 0 or a negative errno -- confirm that
+ * callers cope with that.
+ */
+static int _xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+                                       publish_pci_root_cb publish_root_cb)
+{
+       struct controller_dev_data *dev_data = pdev->pci_dev_data;
+       struct controller_list_entry *cntrl_entry;
+       int i, root_num, len, err = 0;
+       unsigned int domain, bus;
+       char str[64];
+       struct walk_info info;
+
+       mutex_lock(&dev_data->lock);
+
+       list_for_each_entry(cntrl_entry, &dev_data->list, list) {
+               /* First publish all the domain:bus info */
+               err = publish_root_cb(pdev, cntrl_entry->domain,
+                                     cntrl_entry->bus);
+               if (err)
+                       goto out;
+
+               /*
+                * Now figure out which root-%d this belongs to
+                * so we can associate resources with it.
+                */
+               err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                                  "root_num", "%d", &root_num);
+
+               /* Exactly one value must have been parsed. */
+               if (err != 1)
+                       goto out;
+
+               for (i = 0; i < root_num; i++) {
+                       len = snprintf(str, sizeof(str), "root-%d", i);
+                       if (unlikely(len >= (sizeof(str) - 1))) {
+                               err = -ENOMEM;
+                               goto out;
+                       }
+
+                       err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename,
+                                          str, "%x:%x", &domain, &bus);
+                       /* Both domain and bus must have been parsed. */
+                       if (err != 2)
+                               goto out;
+
+                       /* Is this the one we just published? */
+                       if (domain == cntrl_entry->domain &&
+                           bus == cntrl_entry->bus)
+                               break;
+               }
+
+               /*
+                * No root-%d entry matched the root we just published.
+                * NOTE(review): err still holds a positive xenbus_scanf()
+                * count here, so that is what gets returned -- confirm.
+                */
+               if (i == root_num)
+                       goto out;
+
+               info.pdev = pdev;
+               info.resource_count = 0;
+               info.root_num = i;
+
+               /* Let ACPI do the heavy lifting on decoding resources */
+               acpi_walk_resources(cntrl_entry->controller->acpi_handle,
+                                   METHOD_NAME__CRS, write_xenbus_resource,
+                                   &info);
+
+               /* No resources.  OK.  On to the next one */
+               if (!info.resource_count)
+                       continue;
+
+               /* Store the number of resources we wrote for this root-%d */
+               len = snprintf(str, sizeof(str), "root-%d-resources", i);
+               if (unlikely(len >= (sizeof(str) - 1))) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+                                   "%d", info.resource_count);
+               if (err)
+                       goto out;
+       }
+
+       /* Finally, write some magic to synchronize with the guest. */
+       len = snprintf(str, sizeof(str), "root-resource-magic");
+       if (unlikely(len >= (sizeof(str) - 1))) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       /* The value is the size of the hex buffer used per resource. */
+       err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str,
+                           "%lx", (sizeof(struct acpi_resource) * 2) + 1);
+
+out:
+       mutex_unlock(&dev_data->lock);
+
+       return err;
+}
+
+/*
+ * Tear down everything the controller backend holds for @pdev: return
+ * each exported device via pcistub_put_pci_dev(), free all list entries
+ * and finally the bookkeeping structure itself.  The lock is not taken
+ * here.
+ */
+static void _xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+       struct controller_dev_data *priv = pdev->pci_dev_data;
+       struct controller_list_entry *ctrl, *next_ctrl;
+       struct controller_dev_entry *ent, *next_ent;
+
+       list_for_each_entry_safe(ctrl, next_ctrl, &priv->list, list) {
+               /* Empty this controller's device list first ... */
+               list_for_each_entry_safe(ent, next_ent, &ctrl->dev_list, list) {
+                       list_del(&ent->list);
+                       pcistub_put_pci_dev(ent->dev);
+                       kfree(ent);
+               }
+
+               /* ... then drop the controller entry itself. */
+               list_del(&ctrl->list);
+               kfree(ctrl);
+       }
+
+       kfree(priv);
+       pdev->pci_dev_data = NULL;
+}
+
+/*
+ * Reverse lookup: find the virtual domain:bus:devfn under which the host
+ * device @pcidev is exported.  On a match the three output parameters are
+ * filled in and 1 is returned; otherwise they are left untouched and 0 is
+ * returned.
+ */
+static int _xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+                                      struct xen_pcibk_device *pdev,
+                                      unsigned int *domain,
+                                      unsigned int *bus, unsigned int *devfn)
+{
+       struct controller_dev_data *priv = pdev->pci_dev_data;
+       struct controller_list_entry *ctrl;
+       struct controller_dev_entry *ent;
+       int hit = 0;
+
+       mutex_lock(&priv->lock);
+       list_for_each_entry(ctrl, &priv->list, list) {
+               list_for_each_entry(ent, &ctrl->dev_list, list) {
+                       struct pci_dev *cand = ent->dev;
+
+                       /* Compare host bus number, devfn and domain in turn. */
+                       if (cand->bus->number != pcidev->bus->number)
+                               continue;
+                       if (cand->devfn != pcidev->devfn)
+                               continue;
+                       if (pci_domain_nr(cand->bus) !=
+                           pci_domain_nr(pcidev->bus))
+                               continue;
+
+                       *domain = ctrl->domain;
+                       *bus = ctrl->bus;
+                       *devfn = ent->devfn;
+                       hit = 1;
+                       goto unlock;
+               }
+       }
+unlock:
+       mutex_unlock(&priv->lock);
+       return hit;
+}
+
+/* Operations table that exposes this file's functions as the "controller" backend. */
+const struct xen_pcibk_backend xen_pcibk_controller_backend = {
+       .name           = "controller",
+       .init           = _xen_pcibk_init_devices,
+       .free           = _xen_pcibk_release_devices,
+       .find           = _xen_pcibk_get_pcifront_dev,
+       .publish        = _xen_pcibk_publish_pci_roots,
+       .release        = _xen_pcibk_release_pci_dev,
+       .add            = _xen_pcibk_add_pci_dev,
+       .get            = _xen_pcibk_get_pci_dev,
+};
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c

index 097e536..846dd29 100644 (file)
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -14,9 +14,14 @@
  #include <linux/wait.h>
  #include <linux/sched.h>
  #include <linux/atomic.h>
+#ifndef CONFIG_XEN
  #include <xen/events.h>
  #include <asm/xen/pci.h>
  #include <asm/xen/hypervisor.h>
+#else
+#include <xen/evtchn.h>
+#endif
+#include <xen/xen.h>
  #include "pciback.h"
  #include "conf_space.h"
  #include "conf_space_quirks.h"
@@ -92,7 +97,9 @@ static void pcistub_device_release(struct kref *kref)
  
         dev_dbg(&psdev->dev->dev, "pcistub_device_release\n");
  
+#ifndef CONFIG_XEN
         xen_unregister_device_domain_owner(psdev->dev);
+#endif
  
         /* Call the reset function which does not take lock as this
          * is called from "unbind" which takes a device_lock mutex.
@@ -260,7 +267,9 @@ void pcistub_put_pci_dev(struct pci_dev *dev)
         xen_pcibk_config_free_dyn_fields(found_psdev->dev);
         xen_pcibk_config_reset_dev(found_psdev->dev);
  
+#ifndef CONFIG_XEN
         xen_unregister_device_domain_owner(found_psdev->dev);
+#endif
  
         spin_lock_irqsave(&found_psdev->lock, flags);
         found_psdev->pdev = NULL;
@@ -320,19 +329,25 @@ static int __devinit pcistub_init_device(struct pci_dev *dev)
          * would need to be called somewhere to free the memory allocated
          * here and then to call kfree(pci_get_drvdata(psdev->dev)).
          */
+#ifndef CONFIG_XEN
         dev_data = kzalloc(sizeof(*dev_data) +  strlen(DRV_NAME "[]")
                                 + strlen(pci_name(dev)) + 1, GFP_ATOMIC);
+#else
+       dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC);
+#endif
         if (!dev_data) {
                 err = -ENOMEM;
                 goto out;
         }
         pci_set_drvdata(dev, dev_data);
  
+#ifndef CONFIG_XEN
         /*
          * Setup name for fake IRQ handler. It will only be enabled
          * once the device is turned on by the guest.
          */
         sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev));
+#endif
  
         dev_dbg(&dev->dev, "initializing config\n");
  
@@ -480,6 +495,16 @@ static int __devinit pcistub_probe(struct pci_dev *dev,
  
                 dev_info(&dev->dev, "seizing device\n");
                 err = pcistub_seize(dev);
+#ifdef CONFIG_PCI_GUESTDEV
+       } else if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
+               if (!pci_is_guestdev(dev)) {
+                       err = -ENODEV;
+                       goto out;
+               }
+
+               dev_info(&dev->dev, "seizing device\n");
+               err = pcistub_seize(dev);
+#endif /* CONFIG_PCI_GUESTDEV */
         } else
                 /* Didn't find the device */
                 err = -ENODEV;
@@ -884,8 +909,10 @@ static struct pci_error_handlers xen_pcibk_error_handler = {
   */
  
  static struct pci_driver xen_pcibk_pci_driver = {
+#ifndef CONFIG_XEN
         /* The name should be xen_pciback, but until the tools are updated
          * we will keep it as pciback. */
+#endif
         .name = "pciback",
         .id_table = pcistub_ids,
         .probe = pcistub_probe,
@@ -1075,6 +1102,7 @@ static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf)
  }
  static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL);
  
+#ifndef CONFIG_XEN
  static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf)
  {
         struct pcistub_device *psdev;
@@ -1140,6 +1168,7 @@ out:
  }
  static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL,
                    pcistub_irq_handler_switch);
+#endif
  
  static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf,
                                  size_t count)
@@ -1272,6 +1301,21 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf)
  static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show,
                    permissive_add);
  
+#if defined(CONFIG_XEN) && defined(CONFIG_PCI_MSI)
+/*
+ * Callback handed to register_msi_get_owner(): report which domain owns
+ * @dev.  Returns the otherend_id of the xenbus device the stub device is
+ * attached to, or -1 if @dev is not a known stub device or is not
+ * currently attached to a backend.
+ *
+ * NOTE(review): if pcistub_device_find() returns a counted reference, it
+ * is never dropped here -- confirm against its definition.
+ */
+static int xen_pcibk_get_owner(struct pci_dev *dev)
+{
+       struct pcistub_device *psdev;
+
+       psdev = pcistub_device_find(pci_domain_nr(dev->bus), dev->bus->number,
+                       PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
+
+       if (!psdev || !psdev->pdev)
+               return -1;
+
+       return psdev->pdev->xdev->otherend_id;
+}
+#endif
+
  static void pcistub_exit(void)
  {
         driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot);
@@ -1281,10 +1325,14 @@ static void pcistub_exit(void)
         driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks);
         driver_remove_file(&xen_pcibk_pci_driver.driver,
                            &driver_attr_permissive);
+#if !defined(CONFIG_XEN)
         driver_remove_file(&xen_pcibk_pci_driver.driver,
                            &driver_attr_irq_handlers);
         driver_remove_file(&xen_pcibk_pci_driver.driver,
                            &driver_attr_irq_handler_state);
+#elif defined(CONFIG_PCI_MSI)
+       WARN_ON(unregister_msi_get_owner(xen_pcibk_get_owner));
+#endif
         pci_unregister_driver(&xen_pcibk_pci_driver);
  }
  
@@ -1343,12 +1391,17 @@ static int __init pcistub_init(void)
                 err = driver_create_file(&xen_pcibk_pci_driver.driver,
                                          &driver_attr_permissive);
  
+#if !defined(CONFIG_XEN)
         if (!err)
                 err = driver_create_file(&xen_pcibk_pci_driver.driver,
                                          &driver_attr_irq_handlers);
         if (!err)
                 err = driver_create_file(&xen_pcibk_pci_driver.driver,
                                         &driver_attr_irq_handler_state);
+#elif defined(CONFIG_PCI_MSI)
+       if (!err)
+               err = register_msi_get_owner(xen_pcibk_get_owner);
+#endif
         if (err)
                 pcistub_exit();
  
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h

index a7def01..f25f1d8 100644 (file)
--- a/drivers/xen/xen-pciback/pciback.h
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -10,12 +10,16 @@
  #include <linux/interrupt.h>
  #include <xen/xenbus.h>
  #include <linux/list.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
  #include <linux/workqueue.h>
  #include <linux/atomic.h>
  #include <xen/interface/io/pciif.h>
  
+#ifndef CONFIG_XEN
  #define DRV_NAME       "xen-pciback"
+#else
+#define DRV_NAME       "pciback"
+#endif
  
  struct pci_dev_entry {
         struct list_head list;
@@ -34,6 +38,9 @@ struct xen_pcibk_device {
         struct xenbus_watch be_watch;
         u8 be_watching;
         int evtchn_irq;
+#ifdef CONFIG_XEN
+       struct vm_struct *sh_area;
+#endif
         struct xen_pci_sharedinfo *sh_info;
         unsigned long flags;
         struct work_struct op_work;
@@ -44,12 +51,14 @@ struct xen_pcibk_dev_data {
         struct pci_saved_state *pci_saved_state;
         unsigned int permissive:1;
         unsigned int warned_on_write:1;
+#ifndef CONFIG_XEN
         unsigned int enable_intx:1;
         unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
         unsigned int ack_intr:1; /* .. and ACK-ing */
         unsigned long handled;
         unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
         char irq_name[0]; /* xen-pcibk[000:04:00.0] */
+#endif
  };
  
  /* Used by XenBus and xen_pcibk_ops.c */
@@ -87,9 +96,11 @@ typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev,
  typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev,
                                     unsigned int domain, unsigned int bus);
  
-/* Backend registration for the two types of BDF representation:
+/* Backend registration for the different types of BDF representation:
   *  vpci - BDFs start at 00
   *  passthrough - BDFs are exactly like in the host.
+ *  slot - like vpci, but each function becoming a separate slot
+ *  controller - devices on same host bus will also be on same virtual bus
   */
  struct xen_pcibk_backend {
         const char *name;
@@ -107,8 +118,10 @@ struct xen_pcibk_backend {
                                unsigned int devfn);
  };
  
-extern const struct xen_pcibk_backend xen_pcibk_vpci_backend;
-extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend;
+extern const struct xen_pcibk_backend __weak xen_pcibk_vpci_backend;
+extern const struct xen_pcibk_backend __weak xen_pcibk_passthrough_backend;
+extern const struct xen_pcibk_backend __weak xen_pcibk_slot_backend;
+extern const struct xen_pcibk_backend __weak xen_pcibk_controller_backend;
  extern const struct xen_pcibk_backend *xen_pcibk_backend;
  
  static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c

index 97f5d26..af4907d 100644 (file)
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -6,13 +6,18 @@
  #include <linux/module.h>
  #include <linux/wait.h>
  #include <linux/bitops.h>
+#ifndef CONFIG_XEN
  #include <xen/events.h>
+#else
+#include <xen/evtchn.h>
+#endif
  #include <linux/sched.h>
  #include "pciback.h"
  
  int verbose_request;
  module_param(verbose_request, int, 0644);
  
+#ifndef CONFIG_XEN
  static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id);
  
  /* Ensure a device is has the fake IRQ handler "turned on/off" and is
@@ -92,6 +97,7 @@ out:
                 enable ? (dev_data->isr_on ? "enabled" : "failed to enable") :
                         (dev_data->isr_on ? "failed to disable" : "disabled"));
  }
+#endif
  
  /* Ensure a device is "turned off" and ready to be exported.
   * (Also see xen_pcibk_config_reset to ensure virtual configuration space is
@@ -101,7 +107,9 @@ void xen_pcibk_reset_device(struct pci_dev *dev)
  {
         u16 cmd;
  
+#ifndef CONFIG_XEN
         xen_pcibk_control_isr(dev, 1 /* reset device */);
+#endif
  
         /* Disable devices (but not bridges) */
         if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) {
@@ -117,6 +125,9 @@ void xen_pcibk_reset_device(struct pci_dev *dev)
  
                 pci_write_config_word(dev, PCI_COMMAND, 0);
  
+#ifdef CONFIG_XEN
+               atomic_set(&dev->enable_cnt, 0);
+#endif
                 dev->is_busmaster = 0;
         } else {
                 pci_read_config_word(dev, PCI_COMMAND, &cmd);
@@ -134,7 +145,9 @@ static
  int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
                          struct pci_dev *dev, struct xen_pci_op *op)
  {
+#ifndef CONFIG_XEN
         struct xen_pcibk_dev_data *dev_data;
+#endif
         int otherend = pdev->xdev->otherend_id;
         int status;
  
@@ -153,14 +166,20 @@ int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev,
         /* The value the guest needs is actually the IDT vector, not the
          * the local domain's IRQ number. */
  
+#ifndef CONFIG_XEN
         op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+#else
+       op->value = dev->irq;
+#endif
         if (unlikely(verbose_request))
                 printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
                         op->value);
  
+#ifndef CONFIG_XEN
         dev_data = pci_get_drvdata(dev);
         if (dev_data)
                 dev_data->ack_intr = 0;
+#endif
  
         return 0;
  }
@@ -169,20 +188,28 @@ static
  int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev,
                           struct pci_dev *dev, struct xen_pci_op *op)
  {
+#ifndef CONFIG_XEN
         struct xen_pcibk_dev_data *dev_data;
+#endif
  
         if (unlikely(verbose_request))
                 printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n",
                        pci_name(dev));
         pci_disable_msi(dev);
  
+#ifndef CONFIG_XEN
         op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0;
+#else
+       op->value = dev->irq;
+#endif
         if (unlikely(verbose_request))
                 printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev),
                         op->value);
+#ifndef CONFIG_XEN
         dev_data = pci_get_drvdata(dev);
         if (dev_data)
                 dev_data->ack_intr = 1;
+#endif
         return 0;
  }
  
@@ -190,7 +217,9 @@ static
  int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
                           struct pci_dev *dev, struct xen_pci_op *op)
  {
+#ifndef CONFIG_XEN
         struct xen_pcibk_dev_data *dev_data;
+#endif
         int i, result;
         struct msix_entry *entries;
  
@@ -214,9 +243,13 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
         if (result == 0) {
                 for (i = 0; i < op->value; i++) {
                         op->msix_entries[i].entry = entries[i].entry;
+#ifndef CONFIG_XEN
                         if (entries[i].vector)
                                 op->msix_entries[i].vector =
                                         xen_pirq_from_irq(entries[i].vector);
+#else
+                       op->msix_entries[i].vector = entries[i].vector;
+#endif
                                 if (unlikely(verbose_request))
                                         printk(KERN_DEBUG DRV_NAME ": %s: " \
                                                 "MSI-X[%d]: %d\n",
@@ -230,9 +263,11 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev,
         kfree(entries);
  
         op->value = result;
+#ifndef CONFIG_XEN
         dev_data = pci_get_drvdata(dev);
         if (dev_data)
                 dev_data->ack_intr = 0;
+#endif
  
         return result > 0 ? 0 : result;
  }
@@ -241,12 +276,16 @@ static
  int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
                            struct pci_dev *dev, struct xen_pci_op *op)
  {
+#ifndef CONFIG_XEN
         struct xen_pcibk_dev_data *dev_data;
+#endif
+
         if (unlikely(verbose_request))
                 printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n",
                         pci_name(dev));
         pci_disable_msix(dev);
  
+#ifndef CONFIG_XEN
         /*
          * SR-IOV devices (which don't have any legacy IRQ) have
          * an undefined IRQ value of zero.
@@ -258,6 +297,9 @@ int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev,
         dev_data = pci_get_drvdata(dev);
         if (dev_data)
                 dev_data->ack_intr = 1;
+#else
+       op->value = dev->irq;
+#endif
         return 0;
  }
  #endif
@@ -302,9 +344,14 @@ void xen_pcibk_do_op(struct work_struct *data)
         if (dev == NULL)
                 op->err = XEN_PCI_ERR_dev_not_found;
         else {
+#ifndef CONFIG_XEN
                 dev_data = pci_get_drvdata(dev);
                 if (dev_data)
                         test_intx = dev_data->enable_intx;
+#else
+               (void)dev_data;
+               (void)test_intx;
+#endif
                 switch (op->cmd) {
                 case XEN_PCI_OP_conf_read:
                         op->err = xen_pcibk_config_read(dev,
@@ -333,11 +380,13 @@ void xen_pcibk_do_op(struct work_struct *data)
                         break;
                 }
         }
+#ifndef CONFIG_XEN
         if (!op->err && dev && dev_data) {
                 /* Transition detected */
                 if ((dev_data->enable_intx != test_intx))
                         xen_pcibk_control_isr(dev, 0 /* no reset */);
         }
+#endif
         /* Tell the driver domain that we're done. */
         wmb();
         clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags);
@@ -362,6 +411,8 @@ irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id)
  
         return IRQ_HANDLED;
  }
+
+#ifndef CONFIG_XEN
  static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id)
  {
         struct pci_dev *dev = (struct pci_dev *)dev_id;
@@ -381,3 +432,4 @@ static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id)
         }
         return IRQ_NONE;
  }
+#endif
diff --git a/drivers/xen/xen-pciback/slot.c b/drivers/xen/xen-pciback/slot.c

new file mode 100644 (file)

index 0000000..a9d876f
--- /dev/null
+++ b/drivers/xen/xen-pciback/slot.c
@@ -0,0 +1,200 @@
+/*
+ * PCI Backend - Provides a Virtual PCI bus (with real devices)
+ *               to the frontend
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c)
+ *   Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c
+ */
+
+#include <linux/spinlock.h>
+#include "pciback.h"
+
+/* There are at most 32 slots in a pci bus.  */
+#define PCI_SLOT_MAX 32
+
+#define PCI_BUS_NBR 2
+
+/*
+ * Per-backend-device state of the "slot" backend: a table of PCI devices
+ * indexed by virtual bus number and virtual slot number.  An entry is NULL
+ * while the corresponding virtual slot is free.
+ */
+struct slot_dev_data {
+       /* Access to slots[][] must be protected by lock */
+       struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX];
+       spinlock_t lock;
+};
+
+/*
+ * Look up the device occupying a virtual (domain, bus, devfn) position.
+ *
+ * Only domain 0, function 0 and in-range bus/slot numbers can match;
+ * anything else yields NULL, as does a slot that is currently empty.
+ */
+static struct pci_dev *_xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev,
+                                             unsigned int domain,
+                                             unsigned int bus,
+                                             unsigned int devfn)
+{
+       struct slot_dev_data *data = pdev->pci_dev_data;
+       struct pci_dev *occupant;
+       unsigned long irqflags;
+
+       if (domain != 0)
+               return NULL;
+       if (PCI_FUNC(devfn) != 0)
+               return NULL;
+       if (bus >= PCI_BUS_NBR)
+               return NULL;
+       if (PCI_SLOT(devfn) >= PCI_SLOT_MAX)
+               return NULL;
+
+       /* The table is only ever read or written with the lock held. */
+       spin_lock_irqsave(&data->lock, irqflags);
+       occupant = data->slots[bus][PCI_SLOT(devfn)];
+       spin_unlock_irqrestore(&data->lock, irqflags);
+
+       return occupant;
+}
+
+/*
+ * Assign @dev to the first free virtual slot and publish it to the frontend.
+ *
+ * Bridges are refused with -EFAULT and a full slot table yields -ENOMEM;
+ * both failures are also reported through xenbus_dev_fatal().  Otherwise
+ * the return value of @publish_cb is passed back to the caller.
+ */
+static int _xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev,
+                                 struct pci_dev *dev, int devid,
+                                 publish_pci_dev_cb publish_cb)
+{
+       int err = 0, slot, bus;
+       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+       unsigned long flags;
+
+       if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) {
+               err = -EFAULT;
+               xenbus_dev_fatal(pdev->xdev, err,
+                                "Can't export bridges on the virtual PCI bus");
+               goto out;
+       }
+
+       spin_lock_irqsave(&slot_dev->lock, flags);
+
+       /* Assign to a new slot on the virtual PCI bus */
+       for (bus = 0; bus < PCI_BUS_NBR; bus++)
+               for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+                       if (slot_dev->slots[bus][slot] == NULL) {
+                               pr_info("pciback: slot: %s: assign to"
+                                       " virtual slot %d, bus %d\n",
+                                       pci_name(dev), slot, bus);
+                               slot_dev->slots[bus][slot] = dev;
+                               goto unlock;
+                       }
+               }
+
+       err = -ENOMEM;
+       xenbus_dev_fatal(pdev->xdev, err,
+                        "No more space on root virtual PCI bus");
+
+      unlock:
+       spin_unlock_irqrestore(&slot_dev->lock, flags);
+
+       /*
+        * Publish this device at the virtual bus/slot it was just assigned.
+        * The bus number must be the one chosen above (not a constant 0) so
+        * that the published address agrees with the table the lookup
+        * callbacks index by [bus][slot].
+        */
+       if (!err)
+               err = publish_cb(pdev, 0, bus, PCI_DEVFN(slot, 0), devid);
+
+      out:
+       return err;
+}
+
+/*
+ * Remove @dev from the slot table and hand it back via
+ * pcistub_put_pci_dev().  Nothing is released when @dev is not in the table.
+ */
+static void _xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev,
+                                      struct pci_dev *dev)
+{
+       struct slot_dev_data *data = pdev->pci_dev_data;
+       unsigned long irqflags;
+       int b, s;
+       int cleared = 0;
+
+       spin_lock_irqsave(&data->lock, irqflags);
+
+       /* Only the first matching entry is cleared. */
+       for (b = 0; b < PCI_BUS_NBR && !cleared; b++) {
+               for (s = 0; s < PCI_SLOT_MAX; s++) {
+                       if (data->slots[b][s] != dev)
+                               continue;
+                       data->slots[b][s] = NULL;
+                       cleared = 1;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&data->lock, irqflags);
+
+       /* The put happens outside the lock, and never for a NULL device. */
+       if (cleared && dev)
+               pcistub_put_pci_dev(dev);
+}
+
+/*
+ * Allocate the slot table for @pdev with every slot empty and store it in
+ * pdev->pci_dev_data.  Returns 0 on success or -ENOMEM if the allocation
+ * fails.
+ */
+static int _xen_pcibk_init_devices(struct xen_pcibk_device *pdev)
+{
+       struct slot_dev_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+       int b, s;
+
+       if (data == NULL)
+               return -ENOMEM;
+
+       /* kmalloc() does not clear memory, so mark every slot free by hand. */
+       for (b = 0; b < PCI_BUS_NBR; b++) {
+               for (s = 0; s < PCI_SLOT_MAX; s++)
+                       data->slots[b][s] = NULL;
+       }
+       spin_lock_init(&data->lock);
+
+       pdev->pci_dev_data = data;
+       return 0;
+}
+
+/*
+ * Publish the roots of the virtual PCI topology through @publish_cb and
+ * return the callback's result.
+ */
+static int _xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev,
+                                       publish_pci_root_cb publish_cb)
+{
+       /* The Virtual PCI bus has only one root: domain 0, bus 0 */
+       const unsigned int root_domain = 0;
+       const unsigned int root_bus = 0;
+
+       return publish_cb(pdev, root_domain, root_bus);
+}
+
+/*
+ * Tear down the slot table of @pdev: every device still present is passed
+ * to pcistub_put_pci_dev(), then the table is freed and
+ * pdev->pci_dev_data is reset to NULL.
+ */
+static void _xen_pcibk_release_devices(struct xen_pcibk_device *pdev)
+{
+       struct slot_dev_data *data = pdev->pci_dev_data;
+       int b, s;
+
+       for (b = 0; b < PCI_BUS_NBR; b++) {
+               for (s = 0; s < PCI_SLOT_MAX; s++) {
+                       struct pci_dev *occupant = data->slots[b][s];
+
+                       if (occupant)
+                               pcistub_put_pci_dev(occupant);
+               }
+       }
+
+       kfree(data);
+       pdev->pci_dev_data = NULL;
+}
+
+/*
+ * Translate a physical device into its virtual (domain, bus, devfn) address.
+ *
+ * Scans the slot table for an entry with the same PCI domain, bus number and
+ * devfn as @pcidev.  Returns 1 and fills *domain, *bus and *devfn on a match;
+ * returns 0 and leaves the outputs untouched otherwise.
+ */
+static int _xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev,
+                                      struct xen_pcibk_device *pdev,
+                                      unsigned int *domain,
+                                      unsigned int *bus, unsigned int *devfn)
+{
+       int slot, busnr;
+       struct slot_dev_data *slot_dev = pdev->pci_dev_data;
+       struct pci_dev *dev;
+       int found = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&slot_dev->lock, flags);
+
+       /*
+        * Advance the loop counter busnr here, not the output pointer bus:
+        * incrementing the pointer would never terminate the loop and would
+        * make the result be stored through a bogus address.
+        */
+       for (busnr = 0; busnr < PCI_BUS_NBR; busnr++)
+               for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
+                       dev = slot_dev->slots[busnr][slot];
+                       if (dev && dev->bus->number == pcidev->bus->number
+                               && dev->devfn == pcidev->devfn
+                               && pci_domain_nr(dev->bus) == pci_domain_nr(pcidev->bus)) {
+                               found = 1;
+                               *domain = 0;
+                               *bus = busnr;
+                               *devfn = PCI_DEVFN(slot, 0);
+                               goto out;
+                       }
+               }
+out:
+       spin_unlock_irqrestore(&slot_dev->lock, flags);
+       return found;
+}
+
+/* Operations table of the "slot" backend, wired to the callbacks above. */
+const struct xen_pcibk_backend xen_pcibk_slot_backend = {
+       .name           = "slot",
+       /* table set-up and tear-down */
+       .init           = _xen_pcibk_init_devices,
+       .free           = _xen_pcibk_release_devices,
+       /* per-device add / remove */
+       .add            = _xen_pcibk_add_pci_dev,
+       .release        = _xen_pcibk_release_pci_dev,
+       /* address lookups and root publication */
+       .get            = _xen_pcibk_get_pci_dev,
+       .find           = _xen_pcibk_get_pcifront_dev,
+       .publish        = _xen_pcibk_publish_pci_roots,
+};
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c

index 64b11f9..a70a674 100644 (file)
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -6,30 +6,77 @@
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/list.h>
-#include <linux/vmalloc.h>
  #include <linux/workqueue.h>
  #include <xen/xenbus.h>
+#ifndef CONFIG_XEN
  #include <xen/events.h>
  #include <asm/xen/pci.h>
+#else
+#include <xen/evtchn.h>
+#endif
  #include "pciback.h"
  
  #define INVALID_EVTCHN_IRQ  (-1)
  struct workqueue_struct *xen_pcibk_wq;
  
-static bool __read_mostly passthrough;
-module_param(passthrough, bool, S_IRUGO);
-MODULE_PARM_DESC(passthrough,
-       "Option to specify how to export PCI topology to guest:\n"\
-       " 0 - (default) Hide the true PCI topology and makes the frontend\n"\
-       "   there is a single PCI bus with only the exported devices on it.\n"\
-       "   For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\
-       "   while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\
-       " 1 - Passthrough provides a real view of the PCI topology to the\n"\
-       "   frontend (for example, a device at 06:01.b will still appear at\n"\
-       "   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\
-       "   exposed PCI devices to its driver domains. This may be required\n"\
-       "   for drivers which depend on finding their hardward in certain\n"\
-       "   bus/slot locations.");
+static char __read_mostly mode[16] = CONFIG_XEN_PCIDEV_BACKEND_DEFAULT;
+module_param_string(mode, mode, sizeof(mode), S_IRUGO);
+/*
+ * Help text for the "mode" parameter.  Each configured backend contributes
+ * one entry laid out as: name, optional " (default)" marker, newline, then
+ * the indented description.  The newline must follow the marker so that
+ * "(default)" stays on the same line as the backend name.
+ */
+MODULE_PARM_DESC(mode,
+       "Option to specify how to export PCI topology to guest:\n"
+#ifdef CONFIG_XEN_PCIDEV_BACKEND_VPCI
+       " vpci"
+# ifdef CONFIG_XEN_PCIDEV_BACKEND_DEFAULT_VPCI
+       " (default)"
+# endif
+       "\n"
+       "   Hides the true PCI topology and makes the frontend think there\n"
+       "   is a single PCI bus with only the exported devices on it.\n"
+       "   For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"
+       "   while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"
+#endif
+#ifdef CONFIG_XEN_PCIDEV_BACKEND_PASSTHROUGH
+       " passthrough"
+# ifdef CONFIG_XEN_PCIDEV_BACKEND_DEFAULT_PASSTHROUGH
+       " (default)"
+# endif
+       "\n"
+       "   Passthrough provides a real view of the PCI topology to the\n"
+       "   frontend (for example, a device at 06:01.b will still appear at\n"
+       "   06:01.b to the frontend). This is similar to how Xen 2.0.x\n"
+       "   exposed PCI devices to its driver domains. This may be required\n"
+       "   for drivers which depend on finding their hardware in certain\n"
+       "   bus/slot locations.\n"
+#endif
+#ifdef CONFIG_XEN_PCIDEV_BACKEND_SLOT
+       " slot"
+# ifdef CONFIG_XEN_PCIDEV_BACKEND_DEFAULT_SLOT
+       " (default)"
+# endif
+       "\n"
+       "   Hides the true PCI topology and makes the frontend think there\n"
+       "   is a single PCI bus with only the exported devices on it.\n"
+       "   Contrary to the virtual PCI backend, each function becomes a\n"
+       "   new slot.\n"
+       "   For example, a device at 03:05.2 will be re-assigned to 00:00.0.\n"
+       "   A second device at 02:1a.1 will be re-assigned to 00:01.0.\n"
+#endif
+#ifdef CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER
+       " controller"
+# ifdef CONFIG_XEN_PCIDEV_BACKEND_DEFAULT_CONTROLLER
+       " (default)"
+# endif
+       "\n"
+       "   Virtualizes the PCI bus topology by providing a virtual bus\n"
+       "   per PCI root device.  Devices which are physically under\n"
+       "   the same root bus will appear on the same virtual bus.  For\n"
+       "   systems with complex I/O addressing, this is the only backend\n"
+       "   which supports extended I/O port spaces and MMIO translation\n"
+       "   offsets.  This backend also supports slot virtualization.\n"
+       "   For example, a device at 0000:01:02.1 will be re-assigned to\n"
+       "   0000:00:00.0.  A second device at 0000:02:05.0 (behind a P2P\n"
+       "   bridge on bus 0000:01) will be re-assigned to 0000:00:01.0.  A\n"
+       "   third device at 0000:16:05.0 (under a different PCI root bus)\n"
+       "   will be re-assigned to 0000:01:00.0.\n"
+#endif
+       );
  
  static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
  {
@@ -45,6 +92,9 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev)
  
         mutex_init(&pdev->dev_lock);
  
+#ifdef CONFIG_XEN
+       pdev->sh_area = NULL;
+#endif
         pdev->sh_info = NULL;
         pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
         pdev->be_watching = 0;
@@ -75,7 +125,11 @@ static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev)
         flush_workqueue(xen_pcibk_wq);
  
         if (pdev->sh_info != NULL) {
+#ifndef CONFIG_XEN
                 xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info);
+#else
+               xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area);
+#endif
                 pdev->sh_info = NULL;
         }
         mutex_unlock(&pdev->dev_lock);
@@ -102,20 +156,35 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
                              int remote_evtchn)
  {
         int err = 0;
+#ifndef CONFIG_XEN
         void *vaddr;
+#else
+       struct vm_struct *area;
+#endif
  
         dev_dbg(&pdev->xdev->dev,
                 "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
                 gnt_ref, remote_evtchn);
  
+#ifndef CONFIG_XEN
         err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
         if (err < 0) {
+#else
+       area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref);
+       if (IS_ERR(area)) {
+               err = PTR_ERR(area);
+#endif
                 xenbus_dev_fatal(pdev->xdev, err,
                                 "Error mapping other domain page in ours.");
                 goto out;
         }
  
+#ifndef CONFIG_XEN
         pdev->sh_info = vaddr;
+#else
+       pdev->sh_area = area;
+       pdev->sh_info = area->addr;
+#endif
  
         err = bind_interdomain_evtchn_to_irqhandler(
                 pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
@@ -241,6 +310,7 @@ static int xen_pcibk_export_device(struct xen_pcibk_device *pdev,
         if (err)
                 goto out;
  
+#ifndef CONFIG_XEN
         dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id);
         if (xen_register_device_domain_owner(dev,
                                              pdev->xdev->otherend_id) != 0) {
@@ -249,6 +319,7 @@ static int xen_pcibk_export_device(struct xen_pcibk_device *pdev,
                 xen_unregister_device_domain_owner(dev);
                 xen_register_device_domain_owner(dev, pdev->xdev->otherend_id);
         }
+#endif
  
         /* TODO: It'd be nice to export a bridge and have all of its children
          * get exported with it. This may be best done in xend (which will
@@ -280,8 +351,10 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev,
                 goto out;
         }
  
+#ifndef CONFIG_XEN
         dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id);
         xen_unregister_device_domain_owner(dev);
+#endif
  
         xen_pcibk_release_pci_dev(pdev, dev);
  
@@ -718,18 +791,31 @@ static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME,
  );
  
  const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
+static const struct xen_pcibk_backend *__initdata xen_pcibk_backends[] = {
+       &xen_pcibk_vpci_backend,
+       &xen_pcibk_passthrough_backend,
+       &xen_pcibk_slot_backend,
+       &xen_pcibk_controller_backend,
+};
  
  int __init xen_pcibk_xenbus_register(void)
  {
+       unsigned int i;
+
         xen_pcibk_wq = create_workqueue("xen_pciback_workqueue");
         if (!xen_pcibk_wq) {
                 printk(KERN_ERR "%s: create"
                         "xen_pciback_workqueue failed\n", __func__);
                 return -EFAULT;
         }
-       xen_pcibk_backend = &xen_pcibk_vpci_backend;
-       if (passthrough)
-               xen_pcibk_backend = &xen_pcibk_passthrough_backend;
+       for (i = 0; i < ARRAY_SIZE(xen_pcibk_backends); ++i) {
+               if (!xen_pcibk_backends[i])
+                       continue;
+               if (strcmp(xen_pcibk_backends[i]->name, mode) == 0) {
+                       xen_pcibk_backend = xen_pcibk_backends[i];
+                       break;
+               }
+       }
         pr_info(DRV_NAME ": backend is %s\n", xen_pcibk_backend->name);
         return xenbus_register_backend(&xen_pcibk_driver);
  }
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c

index 146c948..66724f9 100644 (file)
--- a/drivers/xen/xen-selfballoon.c
+++ b/drivers/xen/xen-selfballoon.c
@@ -79,6 +79,10 @@
  #include <xen/tmem.h>
  #include <xen/xen.h>
  
+#ifdef CONFIG_XEN
+#include "balloon/common.h"
+#endif
+
  /* Enable/disable with sysfs. */
  static int xen_selfballooning_enabled __read_mostly;
  
@@ -503,7 +507,6 @@ int register_xen_selfballooning(struct device *dev)
  #endif
         return error;
  }
-EXPORT_SYMBOL(register_xen_selfballooning);
  
  static int __init xen_selfballoon_init(void)
  {
diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile

index 31e2e90..a1b4dcc 100644 (file)
--- a/drivers/xen/xenbus/Makefile
+++ b/drivers/xen/xenbus/Makefile
@@ -1,14 +1,17 @@
-obj-y  += xenbus.o
-obj-y  += xenbus_dev_frontend.o
+obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o
+backend-standalone-$(CONFIG_XEN) += xenbus_be.o
+obj-$(CONFIG_PARAVIRT_XEN) += xenbus_dev_frontend.o
  
-xenbus-objs =
-xenbus-objs += xenbus_client.o
-xenbus-objs += xenbus_comms.o
-xenbus-objs += xenbus_xs.o
-xenbus-objs += xenbus_probe.o
+xenbus_be-objs =
+xenbus_be-objs += xenbus_backend_client.o
+xenbus_be-objs += xenbus_dev_backend.o
  
-xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
-xenbus-objs += $(xenbus-be-objs-y)
+xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o
+obj-y += $(xenbus-y) $(xenbus-m)
+obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o
  
-obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o
+obj-$(CONFIG_PARAVIRT_XEN_BACKEND) += xenbus_probe_backend.o
+backend-standalone-$(CONFIG_PARAVIRT_XEN) += xenbus_dev_backend.o
  obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o
+
+obj-$(CONFIG_XEN_BACKEND) += $(backend-standalone-y)
diff --git a/drivers/xen/xenbus/xenbus_backend_client.c b/drivers/xen/xenbus/xenbus_backend_client.c

new file mode 100644 (file)

index 0000000..40bbe19
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_backend_client.c
@@ -0,0 +1,106 @@
+/******************************************************************************
+ * Backend-client-facing interface for the Xenbus driver.  In other words, the
+ * interface between the Xenbus and the device-specific code in the backend
+ * driver.
+ *
+ * Copyright (C) 2005-2006 XenSource Ltd
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <xen/gnttab.h>
+#include <xen/xenbus.h>
+
+/*
+ * Map the page granted by the peer domain under @gnt_ref into a newly
+ * allocated one-page VM area.  Based on Rusty Russell's skeleton driver's
+ * map_page.
+ *
+ * Returns the VM area on success, ERR_PTR(-ENOMEM) if no area could be
+ * allocated, or ERR_PTR(-EINVAL) if the grant mapping failed; the latter
+ * is also reported through xenbus_dev_fatal().
+ */
+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t gnt_ref)
+{
+       struct vm_struct *area = alloc_vm_area(PAGE_SIZE, NULL);
+       struct gnttab_map_grant_ref op;
+
+       if (area == NULL)
+               return ERR_PTR(-ENOMEM);
+
+       gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
+                         gnt_ref, dev->otherend_id);
+       gnttab_check_GNTST_eagain_do_while(GNTTABOP_map_grant_ref, &op);
+
+       if (op.status == GNTST_okay) {
+               /* Stuff the handle in an unused field */
+               area->phys_addr = (unsigned long)op.handle;
+               return area;
+       }
+
+       /* Mapping failed: give the area back and report the grant status. */
+       free_vm_area(area);
+       xenbus_dev_fatal(dev, op.status,
+                        "mapping in shared page %d from domain %d",
+                        gnt_ref, dev->otherend_id);
+       BUG_ON(!IS_ERR(ERR_PTR(op.status)));
+       return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
+
+
+/*
+ * Undo xenbus_map_ring_valloc(): unmap the grant whose handle was stored in
+ * area->phys_addr and free @area.  Based on Rusty Russell's skeleton
+ * driver's unmap_page.
+ *
+ * Returns 0 on success.  If the unmap status is not GNTST_okay the area is
+ * left allocated, the error is reported through xenbus_dev_error() and
+ * -EINVAL is returned.  A failing hypercall is treated as a BUG().
+ */
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area)
+{
+       struct gnttab_unmap_grant_ref op;
+
+       gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map,
+                           (grant_handle_t)area->phys_addr);
+
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+               BUG();
+
+       if (op.status != GNTST_okay) {
+               xenbus_dev_error(dev, op.status,
+                                "unmapping page at handle %d error %d",
+                                (int16_t)area->phys_addr, op.status);
+               return -EINVAL;
+       }
+
+       free_vm_area(area);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
+
+
+/*
+ * Read the integer "online" node below the device's xenstore directory.
+ * Returns the value read, or 0 when the node cannot be read as an integer.
+ */
+int xenbus_dev_is_online(struct xenbus_device *dev)
+{
+       int online;
+
+       if (xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &online) != 1)
+               return 0; /* no online node present */
+
+       return online;
+}
+EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c

index b3e146e..9b59401 100644 (file)
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -31,6 +31,10 @@
   */
  
  #include <linux/slab.h>
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#else
  #include <linux/types.h>
  #include <linux/spinlock.h>
  #include <linux/vmalloc.h>
@@ -42,9 +46,11 @@
  #include <xen/balloon.h>
  #include <xen/events.h>
  #include <xen/grant_table.h>
+#endif
  #include <xen/xenbus.h>
  #include <xen/xen.h>
  
+#if defined(CONFIG_PARAVIRT_XEN)
  #include "xenbus_probe.h"
  
  struct xenbus_map_node {
@@ -60,11 +66,14 @@ static DEFINE_SPINLOCK(xenbus_valloc_lock);
  static LIST_HEAD(xenbus_valloc_pages);
  
  struct xenbus_ring_ops {
-       int (*map)(struct xenbus_device *dev, int gnt, void **vaddr);
+       int (*map)(struct xenbus_device *dev, grant_ref_t gnt, void **vaddr);
         int (*unmap)(struct xenbus_device *dev, void *vaddr);
  };
  
  static const struct xenbus_ring_ops *ring_ops __read_mostly;
+#elif defined(HAVE_XEN_PLATFORM_COMPAT_H)
+#include <xen/platform-compat.h>
+#endif
  
  const char *xenbus_strstate(enum xenbus_state state)
  {
@@ -75,9 +84,9 @@ const char *xenbus_strstate(enum xenbus_state state)
                 [ XenbusStateInitialised  ] = "Initialised",
                 [ XenbusStateConnected    ] = "Connected",
                 [ XenbusStateClosing      ] = "Closing",
-               [ XenbusStateClosed       ] = "Closed",
-               [XenbusStateReconfiguring] = "Reconfiguring",
-               [XenbusStateReconfigured] = "Reconfigured",
+               [ XenbusStateClosed       ] = "Closed",
+               [ XenbusStateReconfiguring ] = "Reconfiguring",
+               [ XenbusStateReconfigured ] = "Reconfigured",
         };
         return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
  }
@@ -120,6 +129,26 @@ int xenbus_watch_path(struct xenbus_device *dev, const char *path,
  EXPORT_SYMBOL_GPL(xenbus_watch_path);
  
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+/*
+ * Register a watch on the node "<path>/<path2>".
+ *
+ * The joined path is allocated here and passed to xenbus_watch_path().  It
+ * is freed again only when registration fails; on success it is not freed
+ * by this function (presumably the watch keeps referring to it -- confirm
+ * against xenbus_watch_path()).  Returns 0 on success, -ENOMEM if the path
+ * cannot be allocated (also reported via xenbus_dev_fatal()), or the error
+ * from xenbus_watch_path().
+ */
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+                      const char *path2, struct xenbus_watch *watch,
+                      void (*callback)(struct xenbus_watch *,
+                                       const char **, unsigned int))
+{
+       char *full_path;
+       int rc;
+
+       full_path = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", path, path2);
+       if (full_path == NULL) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
+               return -ENOMEM;
+       }
+
+       rc = xenbus_watch_path(dev, full_path, watch, callback);
+       if (rc != 0)
+               kfree(full_path);
+
+       return rc;
+}
+EXPORT_SYMBOL_GPL(xenbus_watch_path2);
+#else
  /**
   * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path
   * @dev: xenbus device
@@ -160,6 +189,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev,
         return err;
  }
  EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+#endif
  
  static void xenbus_switch_fatal(struct xenbus_device *, int, int,
                                 const char *, ...);
@@ -233,7 +263,6 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
  {
         return __xenbus_switch_state(dev, state, 0);
  }
-
  EXPORT_SYMBOL_GPL(xenbus_switch_state);
  
  int xenbus_frontend_closed(struct xenbus_device *dev)
@@ -254,41 +283,23 @@ static char *error_path(struct xenbus_device *dev)
  }
  
  
-static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
-                               const char *fmt, va_list ap)
+static void _dev_error(struct xenbus_device *dev, int err,
+                       const char *fmt, va_list *ap)
  {
-       int ret;
-       unsigned int len;
-       char *printf_buffer = NULL;
-       char *path_buffer = NULL;
-
-#define PRINTF_BUFFER_SIZE 4096
-       printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
-       if (printf_buffer == NULL)
-               goto fail;
-
-       len = sprintf(printf_buffer, "%i ", -err);
-       ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
+       char *printf_buffer, *path_buffer;
+       struct va_format vaf = { .fmt = fmt, .va = ap };
  
-       BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
-
-       dev_err(&dev->dev, "%s\n", printf_buffer);
+       printf_buffer = kasprintf(GFP_KERNEL, "%i %pV", -err, &vaf);
+       if (printf_buffer)
+               dev_err(&dev->dev, "%s\n", printf_buffer);
  
         path_buffer = error_path(dev);
+       if (!printf_buffer || !path_buffer
+           || xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer))
+               dev_err(&dev->dev,
+                       "xenbus: failed to write error node for %s (%s)\n",
+                       dev->nodename, printf_buffer);
  
-       if (path_buffer == NULL) {
-               dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-                      dev->nodename, printf_buffer);
-               goto fail;
-       }
-
-       if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
-               dev_err(&dev->dev, "failed to write error node for %s (%s)\n",
-                      dev->nodename, printf_buffer);
-               goto fail;
-       }
-
-fail:
         kfree(printf_buffer);
         kfree(path_buffer);
  }
@@ -308,11 +319,12 @@ void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...)
         va_list ap;
  
         va_start(ap, fmt);
-       xenbus_va_dev_error(dev, err, fmt, ap);
+       _dev_error(dev, err, fmt, &ap);
         va_end(ap);
  }
  EXPORT_SYMBOL_GPL(xenbus_dev_error);
  
+
  /**
   * xenbus_dev_fatal
   * @dev: xenbus device
@@ -323,13 +335,12 @@ EXPORT_SYMBOL_GPL(xenbus_dev_error);
   * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
   * closedown of this driver and its peer.
   */
-
  void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
  {
         va_list ap;
  
         va_start(ap, fmt);
-       xenbus_va_dev_error(dev, err, fmt, ap);
+       _dev_error(dev, err, fmt, &ap);
         va_end(ap);
  
         xenbus_switch_state(dev, XenbusStateClosing);
@@ -346,7 +357,7 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
         va_list ap;
  
         va_start(ap, fmt);
-       xenbus_va_dev_error(dev, err, fmt, ap);
+       _dev_error(dev, err, fmt, &ap);
         va_end(ap);
  
         if (!depth)
@@ -357,7 +368,7 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
   * xenbus_grant_ring
   * @dev: xenbus device
   * @ring_mfn: mfn of ring to grant
-
+ *
   * Grant access to the given @ring_mfn to the peer of the given device.  Return
   * 0 on success, or -errno on error.  On error, the device will switch to
   * XenbusStateClosing, and the error will be saved in the store.
@@ -383,7 +394,7 @@ int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
         struct evtchn_alloc_unbound alloc_unbound;
         int err;
  
-       alloc_unbound.dom = DOMID_SELF;
+       alloc_unbound.dom        = DOMID_SELF;
         alloc_unbound.remote_dom = dev->otherend_id;
  
         err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
@@ -398,6 +409,7 @@ int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
  EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
  
  
+#if 0 /* !defined(CONFIG_XEN) && !defined(MODULE) */
  /**
   * Bind to an existing interdomain event channel in another domain. Returns 0
   * on success and stores the local port in *port. On error, returns -errno,
@@ -423,6 +435,7 @@ int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port)
         return err;
  }
  EXPORT_SYMBOL_GPL(xenbus_bind_evtchn);
+#endif
  
  
  /**
@@ -444,6 +457,7 @@ int xenbus_free_evtchn(struct xenbus_device *dev, int port)
  EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
  
  
+#if !defined(CONFIG_XEN) && !defined(MODULE)
  /**
   * xenbus_map_ring_valloc
   * @dev: xenbus device
@@ -458,14 +472,14 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
   * or -ENOMEM on error. If an error is returned, device will switch to
   * XenbusStateClosing and the error message will be saved in XenStore.
   */
-int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
+int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t gnt_ref, void **vaddr)
  {
         return ring_ops->map(dev, gnt_ref, vaddr);
  }
  EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
  
  static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
-                                    int gnt_ref, void **vaddr)
+                                    grant_ref_t gnt_ref, void **vaddr)
  {
         struct gnttab_map_grant_ref op = {
                 .flags = GNTMAP_host_map | GNTMAP_contains_pte,
@@ -514,7 +528,7 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
  }
  
  static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
-                                     int gnt_ref, void **vaddr)
+                                     grant_ref_t gnt_ref, void **vaddr)
  {
         struct xenbus_map_node *node;
         int err;
@@ -564,7 +578,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
   * or -ENOMEM on error. If an error is returned, device will switch to
   * XenbusStateClosing and the error message will be saved in XenStore.
   */
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t gnt_ref,
                     grant_handle_t *handle, void *vaddr)
  {
         struct gnttab_map_grant_ref op;
@@ -711,6 +725,7 @@ int xenbus_unmap_ring(struct xenbus_device *dev,
         return op.status;
  }
  EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
+#endif
  
  
  /**
@@ -722,15 +737,16 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
   */
  enum xenbus_state xenbus_read_driver_state(const char *path)
  {
-       enum xenbus_state result;
-       int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
-       if (err)
+       int result;
+
+       if (xenbus_scanf(XBT_NIL, path, "state", "%d", &result) != 1)
                 result = XenbusStateUnknown;
  
         return result;
  }
  EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
  
+#if !defined(CONFIG_XEN) && !defined(MODULE)
  static const struct xenbus_ring_ops ring_ops_pv = {
         .map = xenbus_map_ring_valloc_pv,
         .unmap = xenbus_unmap_ring_vfree_pv,
@@ -748,3 +764,4 @@ void __init xenbus_ring_ops_init(void)
         else
                 ring_ops = &ring_ops_hvm;
  }
+#endif
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c

index 2eff7a6..721fd08 100644 (file)
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -35,24 +35,57 @@
  #include <linux/sched.h>
  #include <linux/err.h>
  #include <xen/xenbus.h>
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <xen/evtchn.h>
+#include <asm/hypervisor.h>
+#else
  #include <asm/xen/hypervisor.h>
  #include <xen/events.h>
  #include <xen/page.h>
+#endif
+
  #include "xenbus_comms.h"
  
-static int xenbus_irq;
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
  
-static DECLARE_WORK(probe_work, xenbus_probe);
+static int xenbus_irq;
  
  static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
  
  static irqreturn_t wake_waiting(int irq, void *unused)
  {
-       if (unlikely(xenstored_ready == 0)) {
-               xenstored_ready = 1;
-               schedule_work(&probe_work);
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       static DECLARE_WORK(probe_work, xenbus_probe);
+       int old, new;
+
+       old = atomic_read(&xenbus_xsd_state);
+       switch (old) {
+               case XENBUS_XSD_UNCOMMITTED:
+                       BUG();
+                       return IRQ_HANDLED;
+
+               case XENBUS_XSD_FOREIGN_INIT:
+                       new = XENBUS_XSD_FOREIGN_READY;
+                       break;
+
+               case XENBUS_XSD_LOCAL_INIT:
+                       new = XENBUS_XSD_LOCAL_READY;
+                       break;
+
+               case XENBUS_XSD_FOREIGN_READY:
+               case XENBUS_XSD_LOCAL_READY:
+               default:
+                       goto wake;
         }
  
+       old = atomic_cmpxchg(&xenbus_xsd_state, old, new);
+       if (old != new)
+               schedule_work(&probe_work);
+
+wake:
+#endif
         wake_up(&xb_waitq);
         return IRQ_HANDLED;
  }
@@ -203,34 +236,48 @@ int xb_read(void *data, unsigned len)
  int xb_init_comms(void)
  {
         struct xenstore_domain_interface *intf = xen_store_interface;
+       int err;
  
         if (intf->req_prod != intf->req_cons)
-               printk(KERN_ERR "XENBUS request ring is not quiescent "
+               pr_err("XENBUS request ring is not quiescent "
                        "(%08x:%08x)!\n", intf->req_cons, intf->req_prod);
  
         if (intf->rsp_prod != intf->rsp_cons) {
-               printk(KERN_WARNING "XENBUS response ring is not quiescent "
-                      "(%08x:%08x): fixing up\n",
-                      intf->rsp_cons, intf->rsp_prod);
+               pr_warning("XENBUS response ring is not quiescent"
+                          " (%08x:%08x): fixing up\n",
+                          intf->rsp_cons, intf->rsp_prod);
                 /* breaks kdump */
                 if (!reset_devices)
                         intf->rsp_cons = intf->rsp_prod;
         }
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+       if (xenbus_irq)
+               unbind_from_irqhandler(xenbus_irq, &xb_waitq);
+
+       err = bind_caller_port_to_irqhandler(
+               xen_store_evtchn, wake_waiting,
+               0, "xenbus", &xb_waitq);
+       if (err <= 0) {
+               pr_err("XENBUS request irq failed %i\n", err);
+               return err;
+       }
+
+       xenbus_irq = err;
+#else
         if (xenbus_irq) {
                 /* Already have an irq; assume we're resuming */
                 rebind_evtchn_irq(xen_store_evtchn, xenbus_irq);
         } else {
-               int err;
                 err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting,
                                                 0, "xenbus", &xb_waitq);
                 if (err <= 0) {
-                       printk(KERN_ERR "XENBUS request irq failed %i\n", err);
+                       pr_err("XENBUS request irq failed %i\n", err);
                         return err;
                 }
-
                 xenbus_irq = err;
         }
+#endif
  
         return 0;
  }
diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h

index 6e42800..10f775e 100644 (file)
--- a/drivers/xen/xenbus/xenbus_comms.h
+++ b/drivers/xen/xenbus/xenbus_comms.h
@@ -47,4 +47,27 @@ extern int xen_store_evtchn;
  
  extern const struct file_operations xen_xenbus_fops;
  
+/* For xenbus internal use: values held in xenbus_xsd_state. */
+enum {
+       XENBUS_XSD_UNCOMMITTED = 0, /* IOCTL_XENBUS_ALLOC only proceeds from this state */
+       XENBUS_XSD_FOREIGN_INIT,    /* set by IOCTL_XENBUS_ALLOC; the xenstore irq handler advances it to FOREIGN_READY */
+       XENBUS_XSD_FOREIGN_READY,   /* is_xenstored_ready() reports true */
+       XENBUS_XSD_LOCAL_INIT,      /* the xenstore irq handler advances it to LOCAL_READY */
+       XENBUS_XSD_LOCAL_READY,     /* is_xenstored_ready() reports true */
+};
+extern atomic_t xenbus_xsd_state;
+
+/*
+ * Report whether xenbus_xsd_state is currently one of the two *_READY
+ * states.  Returns 1 if so, 0 otherwise.
+ */
+static inline int is_xenstored_ready(void)
+{
+       switch (atomic_read(&xenbus_xsd_state)) {
+       case XENBUS_XSD_FOREIGN_READY:
+       case XENBUS_XSD_LOCAL_READY:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+#if defined(CONFIG_XEN_XENBUS_DEV) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
+
+int xenbus_conn(domid_t, grant_ref_t *, evtchn_port_t *);
+#endif
+
  #endif /* _XENBUS_COMMS_H */
diff --git a/drivers/xen/xenbus/xenbus_dev.c b/drivers/xen/xenbus/xenbus_dev.c

new file mode 100644 (file)

index 0000000..c183e3e
--- /dev/null
+++ b/drivers/xen/xenbus/xenbus_dev.c
@@ -0,0 +1,511 @@
+/*
+ * xenbus_dev.c
+ * 
+ * Driver giving user-space access to the kernel's xenbus connection
+ * to xenstore.
+ * 
+ * Copyright (c) 2005, Christian Limpach
+ * Copyright (c) 2005, Rusty Russell, IBM Corporation
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/uio.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+
+#include "xenbus_comms.h"
+
+#include <asm/uaccess.h>
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+#include <asm/hypervisor.h>
+
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#include <xen/public/xenbus.h>
+
+struct xenbus_dev_transaction {
+       struct list_head list;            /* link on xenbus_dev_data.transactions */
+       struct xenbus_transaction handle; /* handle.id is matched against the tx_id of XS_TRANSACTION_END requests */
+};
+
+struct read_buffer {
+       struct list_head list; /* link on a reply queue / xenbus_dev_data.read_buffers */
+       unsigned int cons;     /* index into msg[] of the next byte to hand to the reader */
+       unsigned int len;      /* number of bytes stored in msg[] */
+       char msg[];            /* copy of the queued data, allocated together with the header */
+};
+
+struct xenbus_dev_data { /* state allocated per open() and stored in filp->private_data */
+       /* In-progress transaction. */
+       struct list_head transactions;
+
+       /* Active watches. */
+       struct list_head watches;
+
+       /* Partial request. */
+       unsigned int len; /* bytes accumulated in u.buffer so far; reset to 0 once a request is handled or rejected */
+       union {
+               struct xsd_sockmsg msg; /* header view of the start of buffer */
+               char buffer[XENSTORE_PAYLOAD_MAX];
+       } u;
+
+       /* Response queue. */
+       struct list_head read_buffers;
+       wait_queue_head_t read_waitq; /* woken when replies are spliced onto read_buffers */
+
+       struct mutex reply_mutex; /* held while read_buffers is filled or drained */
+};
+
+static struct proc_dir_entry *xenbus_dev_intf;
+
+/*
+ * Copy queued reply / watch-event bytes to user space.
+ *
+ * Waits (unless O_NONBLOCK is set) until at least one read_buffer is
+ * queued, then drains the queue byte by byte until @len bytes have been
+ * copied or the queue runs empty.
+ *
+ * Returns the number of bytes copied, -ENODEV if xenstored is not ready,
+ * -EAGAIN if the queue is empty and O_NONBLOCK is set, the error from an
+ * interrupted wait, or -EFAULT if not even the first byte could be
+ * stored to @ubuf.
+ */
+static ssize_t xenbus_dev_read(struct file *filp,
+                              char __user *ubuf,
+                              size_t len, loff_t *ppos)
+{
+       struct xenbus_dev_data *u = filp->private_data;
+       struct read_buffer *rb;
+       size_t i;
+       int ret;
+
+       if (!is_xenstored_ready())
+               return -ENODEV;
+
+       mutex_lock(&u->reply_mutex);
+       while (list_empty(&u->read_buffers)) {
+               /* Never sleep with the mutex held: writers need it to queue. */
+               mutex_unlock(&u->reply_mutex);
+               if (filp->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               ret = wait_event_interruptible(u->read_waitq,
+                                              !list_empty(&u->read_buffers));
+               if (ret)
+                       return ret;
+               mutex_lock(&u->reply_mutex);
+       }
+
+       rb = list_entry(u->read_buffers.next, struct read_buffer, list);
+       for (i = 0; i < len;) {
+               /*
+                * A faulting user address must not silently eat queued
+                * data: cons is only advanced after a successful store,
+                * so the byte stays queued for the next read.
+                */
+               if (put_user(rb->msg[rb->cons], ubuf + i)) {
+                       mutex_unlock(&u->reply_mutex);
+                       return i ? (ssize_t)i : -EFAULT;
+               }
+               i++;
+               rb->cons++;
+               if (rb->cons == rb->len) {
+                       /* Buffer fully consumed: drop it and move on. */
+                       list_del(&rb->list);
+                       kfree(rb);
+                       if (list_empty(&u->read_buffers))
+                               break;
+                       rb = list_entry(u->read_buffers.next,
+                                       struct read_buffer, list);
+               }
+       }
+       mutex_unlock(&u->reply_mutex);
+
+       return i;
+}
+
+/*
+ * Append a copy of @data (@len bytes) to @queue as a new read_buffer.
+ * A zero-length request queues nothing.
+ *
+ * Returns 0 on success or -ENOMEM if the buffer cannot be allocated.
+ */
+static int queue_reply(struct list_head *queue,
+                      const void *data, unsigned int len)
+{
+       struct read_buffer *buf;
+
+       if (!len)
+               return 0;
+
+       buf = kmalloc(sizeof(*buf) + len, GFP_KERNEL);
+       if (buf == NULL)
+               return -ENOMEM;
+
+       memcpy(buf->msg, data, len);
+       buf->len = len;
+       buf->cons = 0;
+       list_add_tail(&buf->list, queue);
+
+       return 0;
+}
+
+/*
+ * Finish a batch of queue_reply() calls.  With @err == 0 the buffers on
+ * @queue are moved to the tail of @u's read queue and readers are woken;
+ * otherwise every buffer on @queue is freed.
+ */
+static void queue_flush(struct xenbus_dev_data *u, struct list_head *queue,
+                       int err)
+{
+       struct read_buffer *rb;
+
+       if (err == 0) {
+               list_splice_tail(queue, &u->read_buffers);
+               wake_up(&u->read_waitq);
+               return;
+       }
+
+       while (!list_empty(queue)) {
+               rb = list_entry(queue->next, struct read_buffer, list);
+               list_del(&rb->list);
+               kfree(rb);
+       }
+}
+
+struct watch_adapter /* ties a kernel xenbus_watch to the open file that requested it */
+{
+       struct list_head list;            /* link on xenbus_dev_data.watches */
+       struct xenbus_watch watch;        /* watch.node is a kstrdup()ed path; watch.callback is watch_fired */
+       struct xenbus_dev_data *dev_data; /* per-open state whose read queue receives the events */
+       char *token;                      /* kstrdup()ed token from the XS_WATCH request, sent back in each event */
+};
+
+/* Release a watch_adapter together with its duplicated token and node path. */
+static void free_watch_adapter(struct watch_adapter *watch)
+{
+       kfree(watch->token);
+       kfree(watch->watch.node);
+       kfree(watch);
+}
+
+/*
+ * Watch callback: turn a fired xenbus watch into an XS_WATCH_EVENT
+ * message (header, path, the token the user registered, and any extra
+ * payload) on the owning file's read queue.  If any part cannot be
+ * queued the whole event is dropped.
+ */
+static void watch_fired(struct xenbus_watch *watch,
+                       const char **vec,
+                       unsigned int len)
+{
+       struct watch_adapter *adap =
+            container_of(watch, struct watch_adapter, watch);
+       struct xsd_sockmsg hdr;
+       const char *path, *token;
+       int err, path_len, tok_len, body_len, data_len = 0;
+       LIST_HEAD(queue);
+
+       path = vec[XS_WATCH_PATH];
+       token = adap->token;
+
+       path_len = strlen(path) + 1;
+       tok_len = strlen(token) + 1;
+       /* NOTE(review): assumes the caller provides vec[len] as an end-of-payload marker -- confirm. */
+       if (len > 2)
+               data_len = vec[len] - vec[2] + 1;
+       body_len = path_len + tok_len + data_len;
+
+       /*
+        * The whole header is copied to user space, so clear it first;
+        * otherwise the fields not set below would carry stale kernel
+        * stack contents.
+        */
+       memset(&hdr, 0, sizeof(hdr));
+       hdr.type = XS_WATCH_EVENT;
+       hdr.len = body_len;
+
+       mutex_lock(&adap->dev_data->reply_mutex);
+       err = queue_reply(&queue, &hdr, sizeof(hdr));
+       if (!err)
+               err = queue_reply(&queue, path, path_len);
+       if (!err)
+               err = queue_reply(&queue, token, tok_len);
+       if (!err && len > 2)
+               err = queue_reply(&queue, vec[2], data_len);
+       queue_flush(adap->dev_data, &queue, err);
+       mutex_unlock(&adap->dev_data->reply_mutex);
+}
+
+static LIST_HEAD(watch_list);
+
+/*
+ * Accept (part of) a xenstore request from user space.
+ *
+ * Bytes are accumulated in u->u.buffer until a full header plus the
+ * payload length it announces has arrived.  XS_WATCH / XS_UNWATCH are
+ * handled locally through the kernel watch API and answered with "OK";
+ * everything else is forwarded with xenbus_dev_request_and_reply(),
+ * tracking transactions opened and closed on this file.  The reply is
+ * queued for xenbus_dev_read().
+ *
+ * Returns @len on success (including when the request is still
+ * incomplete), -ENODEV if xenstored is not ready, -EINVAL if the request
+ * cannot fit the buffer, -EFAULT on a bad user pointer, -EILSEQ for a
+ * malformed watch request, -ESRCH for an unknown transaction id,
+ * -ENOMEM, or the error reported by the xenbus layer.
+ */
+static ssize_t xenbus_dev_write(struct file *filp,
+                               const char __user *ubuf,
+                               size_t len, loff_t *ppos)
+{
+       struct xenbus_dev_data *u = filp->private_data;
+       struct xenbus_dev_transaction *trans = NULL;
+       uint32_t msg_type;
+       void *reply = NULL;
+       LIST_HEAD(queue);
+       char *path, *token;
+       struct watch_adapter *watch;
+       int err, rc = len;
+
+       if (!is_xenstored_ready())
+               return -ENODEV;
+
+       /* Written as a subtraction so the check itself cannot wrap. */
+       if (len > sizeof(u->u.buffer) - u->len) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       u->len += len;
+       if (u->len < sizeof(u->u.msg))
+               return rc;
+
+       /*
+        * msg.len comes straight from user space.  Reject a payload that
+        * could never fit before adding it to the header size: on 32-bit
+        * that sum can wrap and make an over-long request look complete,
+        * after which the parsing below would run past the buffer.
+        */
+       if (u->u.msg.len > sizeof(u->u.buffer) - sizeof(u->u.msg)) {
+               rc = -EINVAL;
+               goto out;
+       }
+
+       if (u->len < (sizeof(u->u.msg) + u->u.msg.len))
+               return rc;
+
+       msg_type = u->u.msg.type;
+
+       switch (msg_type) {
+       case XS_WATCH:
+       case XS_UNWATCH: {
+               static const char XS_RESP[] = "OK";
+               struct xsd_sockmsg hdr;
+
+               /* The payload must be two NUL-terminated strings: path, token. */
+               path = u->u.buffer + sizeof(u->u.msg);
+               token = memchr(path, 0, u->u.msg.len);
+               if (token == NULL) {
+                       rc = -EILSEQ;
+                       goto out;
+               }
+               token++;
+               if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) {
+                       rc = -EILSEQ;
+                       goto out;
+               }
+
+               if (msg_type == XS_WATCH) {
+                       watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+                       if (watch == NULL) {
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+                       watch->watch.node = kstrdup(path, GFP_KERNEL);
+                       watch->watch.callback = watch_fired;
+                       watch->token = kstrdup(token, GFP_KERNEL);
+                       watch->dev_data = u;
+
+                       err = watch->watch.node && watch->token
+                             ? register_xenbus_watch(&watch->watch) : -ENOMEM;
+                       if (err) {
+                               free_watch_adapter(watch);
+                               rc = err;
+                               goto out;
+                       }
+
+                       list_add(&watch->list, &u->watches);
+               } else {
+                       list_for_each_entry(watch, &u->watches, list) {
+                               if (!strcmp(watch->token, token) &&
+                                   !strcmp(watch->watch.node, path))
+                               {
+                                       unregister_xenbus_watch(&watch->watch);
+                                       list_del(&watch->list);
+                                       free_watch_adapter(watch);
+                                       break;
+                               }
+                       }
+               }
+
+               /*
+                * The whole header goes to user space; clear it so the
+                * fields not set here do not expose stale stack contents.
+                */
+               memset(&hdr, 0, sizeof(hdr));
+               hdr.type = msg_type;
+               hdr.len = sizeof(XS_RESP);
+               mutex_lock(&u->reply_mutex);
+               err = queue_reply(&queue, &hdr, sizeof(hdr))
+                     ?: queue_reply(&queue, XS_RESP, hdr.len);
+               break;
+       }
+
+       case XS_TRANSACTION_START:
+               trans = kmalloc(sizeof(*trans), GFP_KERNEL);
+               if (!trans) {
+                       rc = -ENOMEM;
+                       goto out;
+               }
+               goto common;
+
+       case XS_TRANSACTION_END:
+               list_for_each_entry(trans, &u->transactions, list)
+                       if (trans->handle.id == u->u.msg.tx_id)
+                               break;
+               /* The cursor ran back onto the list head: no such transaction. */
+               if (&trans->list == &u->transactions) {
+                       rc = -ESRCH;
+                       goto out;
+               }
+               /* fall through */
+       common:
+       default:
+               reply = xenbus_dev_request_and_reply(&u->u.msg);
+               if (IS_ERR(reply)) {
+                       if (msg_type == XS_TRANSACTION_START)
+                               kfree(trans);
+                       rc = PTR_ERR(reply);
+                       goto out;
+               }
+
+               if (msg_type == XS_TRANSACTION_START) {
+                       trans->handle.id = simple_strtoul(reply, NULL, 0);
+                       list_add(&trans->list, &u->transactions);
+               } else if (msg_type == XS_TRANSACTION_END) {
+                       list_del(&trans->list);
+                       kfree(trans);
+               }
+               mutex_lock(&u->reply_mutex);
+               err = queue_reply(&queue, &u->u.msg, sizeof(u->u.msg))
+                     ?: queue_reply(&queue, reply, u->u.msg.len);
+               break;
+       }
+
+       /* Both switch arms arrive here with reply_mutex held. */
+       queue_flush(u, &queue, err);
+       mutex_unlock(&u->reply_mutex);
+       kfree(reply);
+       if (err)
+               rc = err;
+
+ out:
+       u->len = 0;
+       return rc;
+}
+
+/*
+ * Open handler: allocate and initialise the per-open state and store it
+ * in filp->private_data.
+ *
+ * Returns 0 on success, -ENOENT while xen_store_evtchn is 0, or -ENOMEM.
+ */
+static int xenbus_dev_open(struct inode *inode, struct file *filp)
+{
+       struct xenbus_dev_data *data;
+
+       if (!xen_store_evtchn)
+               return -ENOENT;
+
+       nonseekable_open(inode, filp);
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+
+       mutex_init(&data->reply_mutex);
+       init_waitqueue_head(&data->read_waitq);
+       INIT_LIST_HEAD(&data->read_buffers);
+       INIT_LIST_HEAD(&data->watches);
+       INIT_LIST_HEAD(&data->transactions);
+
+       filp->private_data = data;
+
+       return 0;
+}
+
+/*
+ * Release handler: end every transaction still open on this file (passing
+ * 1 as the second argument to xenbus_transaction_end()), unregister its
+ * watches, discard unread replies and free the per-open state.
+ * Always returns 0.
+ */
+static int xenbus_dev_release(struct inode *inode, struct file *filp)
+{
+       struct xenbus_dev_data *u = filp->private_data;
+
+       while (!list_empty(&u->transactions)) {
+               struct xenbus_dev_transaction *trans =
+                       list_entry(u->transactions.next,
+                                  struct xenbus_dev_transaction, list);
+
+               xenbus_transaction_end(trans->handle, 1);
+               list_del(&trans->list);
+               kfree(trans);
+       }
+
+       while (!list_empty(&u->watches)) {
+               struct watch_adapter *watch =
+                       list_entry(u->watches.next,
+                                  struct watch_adapter, list);
+
+               unregister_xenbus_watch(&watch->watch);
+               list_del(&watch->list);
+               free_watch_adapter(watch);
+       }
+
+       while (!list_empty(&u->read_buffers)) {
+               struct read_buffer *rb =
+                       list_entry(u->read_buffers.next,
+                                  struct read_buffer, list);
+
+               list_del(&rb->list);
+               kfree(rb);
+       }
+
+       kfree(u);
+
+       return 0;
+}
+
+/*
+ * Poll handler.  Returns an event mask: POLLIN | POLLRDNORM when replies
+ * are queued, 0 when there is nothing to read yet, POLLERR while
+ * xenstored is not ready.
+ */
+static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait)
+{
+       struct xenbus_dev_data *u = file->private_data;
+
+       /*
+        * The return value is a mask, not an errno: -ENODEV converted to
+        * unsigned int would set nearly every event bit, so flag the
+        * condition with POLLERR instead.
+        */
+       if (!is_xenstored_ready())
+               return POLLERR;
+
+       poll_wait(file, &u->read_waitq, wait);
+       if (!list_empty(&u->read_buffers))
+               return POLLIN | POLLRDNORM;
+       return 0;
+}
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+/*
+ * ioctl handler; IOCTL_XENBUS_ALLOC is the only command handled.
+ *
+ * Moves xenbus_xsd_state from UNCOMMITTED to FOREIGN_INIT, reads a
+ * xenbus_alloc_t from user space, hands xa.dom plus pointers to
+ * xa.grant_ref and xa.port to xenbus_conn(), and copies the structure
+ * back.  Any failure after the state change puts the state back to
+ * UNCOMMITTED.
+ *
+ * Returns 0 on success, -ENODEV outside the initial domain, -ENOTTY for
+ * other commands, -EBUSY if the state was not UNCOMMITTED, -EFAULT on a
+ * bad user pointer, or the non-zero result of xenbus_conn().
+ */
+static long xenbus_dev_ioctl(struct file *file,
+                             unsigned int cmd, unsigned long data)
+{
+       void __user *udata = (void __user *) data;
+       xenbus_alloc_t xa;
+       int ret;
+
+       if (!is_initial_xendomain())
+               return -ENODEV;
+
+       if (cmd != IOCTL_XENBUS_ALLOC)
+               return -ENOTTY;
+
+       if (atomic_cmpxchg(&xenbus_xsd_state,
+                          XENBUS_XSD_UNCOMMITTED,
+                          XENBUS_XSD_FOREIGN_INIT) != XENBUS_XSD_UNCOMMITTED)
+               return -EBUSY;
+
+       if (copy_from_user(&xa, udata, sizeof(xa)))
+               ret = -EFAULT;
+       else
+               ret = xenbus_conn(xa.dom, &xa.grant_ref, &xa.port);
+
+       if (ret == 0 && copy_to_user(udata, &xa, sizeof(xa)))
+               ret = -EFAULT;
+
+       /* Undo the state claim taken above on every failure path. */
+       if (ret != 0)
+               atomic_set(&xenbus_xsd_state, XENBUS_XSD_UNCOMMITTED);
+
+       return ret;
+}
+#endif
+
+static const struct file_operations xenbus_dev_file_ops = { /* installed as proc_fops by xenbus_dev_init() */
+       .read = xenbus_dev_read,
+       .write = xenbus_dev_write,
+       .open = xenbus_dev_open,
+       .release = xenbus_dev_release,
+       .llseek = no_llseek, /* matches the nonseekable_open() call in xenbus_dev_open() */
+       .poll = xenbus_dev_poll,
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       .unlocked_ioctl = xenbus_dev_ioctl /* the handler is only compiled under this option */
+#endif
+};
+
+/*
+ * Create the "xenbus" Xen proc entry (mode 0400) and attach
+ * xenbus_dev_file_ops to it.  Always returns 0, even when the entry
+ * could not be created.
+ */
+int
+#ifndef MODULE
+__init
+#else
+__devinit
+#endif
+xenbus_dev_init(void)
+{
+       struct proc_dir_entry *entry = create_xen_proc_entry("xenbus", 0400);
+
+       xenbus_dev_intf = entry;
+       if (entry != NULL)
+               entry->proc_fops = &xenbus_dev_file_ops;
+
+       return 0;
+}
diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c

index 3d3be78..e5d2994 100644 (file)
--- a/drivers/xen/xenbus/xenbus_dev_backend.c
+++ b/drivers/xen/xenbus/xenbus_dev_backend.c
@@ -7,7 +7,9 @@
  #include <linux/capability.h>
  
  #include <xen/xen.h>
+#ifdef CONFIG_PARAVIRT_XEN
  #include <xen/page.h>
+#endif
  #include <xen/xenbus_dev.h>
  
  #include "xenbus_comms.h"
@@ -49,7 +51,7 @@ static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma)
                 return -EINVAL;
  
         if (remap_pfn_range(vma, vma->vm_start,
-                           virt_to_pfn(xen_store_interface),
+                           PFN_DOWN(__pa(xen_store_interface)),
                             size, vma->vm_page_prot))
                 return -EAGAIN;
  
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c

index b793723..ceeece9 100644 (file)
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -4,6 +4,7 @@
   * Copyright (C) 2005 Rusty Russell, IBM Corporation
   * Copyright (C) 2005 Mike Wray, Hewlett-Packard
   * Copyright (C) 2005, 2006 XenSource Ltd
+ * Copyright (C) 2007 Solarflare Communications, Inc.
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License version 2
@@ -32,17 +33,18 @@
  
  #define DPRINTK(fmt, args...)                          \
         pr_debug("xenbus_probe (%s:%d) " fmt ".\n",     \
-                __func__, __LINE__, ##args)
+                __FUNCTION__, __LINE__, ##args)
  
  #include <linux/kernel.h>
+#include <linux/version.h>
  #include <linux/err.h>
  #include <linux/string.h>
  #include <linux/ctype.h>
  #include <linux/fcntl.h>
  #include <linux/mm.h>
+#include <linux/sched.h>
  #include <linux/proc_fs.h>
  #include <linux/notifier.h>
-#include <linux/kthread.h>
  #include <linux/mutex.h>
  #include <linux/io.h>
  #include <linux/slab.h>
@@ -50,6 +52,16 @@
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
+#if defined(CONFIG_XEN) || defined(MODULE)
+#include <asm/hypervisor.h>
+#include <xen/xenbus.h>
+#include <xen/xen_proc.h>
+#include <xen/evtchn.h>
+#include <xen/features.h>
+#include <xen/gnttab.h>
+
+#define PARAVIRT_EXPORT_SYMBOL(sym) __typeof__(sym) sym
+#else
  #include <asm/xen/hypervisor.h>
  
  #include <xen/xen.h>
@@ -57,11 +69,19 @@
  #include <xen/events.h>
  #include <xen/page.h>
  
+#define PARAVIRT_EXPORT_SYMBOL EXPORT_SYMBOL_GPL
+#endif
+
+#ifndef CONFIG_XEN
  #include <xen/hvm.h>
+#endif
  
  #include "xenbus_comms.h"
  #include "xenbus_probe.h"
  
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
  
  int xen_store_evtchn;
  EXPORT_SYMBOL_GPL(xen_store_evtchn);
@@ -71,7 +91,17 @@ EXPORT_SYMBOL_GPL(xen_store_interface);
  
  static unsigned long xen_store_mfn;
  
-static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+extern struct mutex xenwatch_mutex;
+
+static
+#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST
+__initdata
+#endif
+BLOCKING_NOTIFIER_HEAD(xenstore_chain);
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+static void wait_for_devices(struct xenbus_driver *xendrv);
+#endif
  
  /* If something in array of ids matches this device, return it. */
  static const struct xenbus_device_id *
@@ -93,7 +123,7 @@ int xenbus_match(struct device *_dev, struct device_driver *_drv)
  
         return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
  }
-EXPORT_SYMBOL_GPL(xenbus_match);
+PARAVIRT_EXPORT_SYMBOL(xenbus_match);
  
  
  static void free_otherend_details(struct xenbus_device *dev)
@@ -113,29 +143,6 @@ static void free_otherend_watch(struct xenbus_device *dev)
  }
  
  
-static int talk_to_otherend(struct xenbus_device *dev)
-{
-       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
-
-       free_otherend_watch(dev);
-       free_otherend_details(dev);
-
-       return drv->read_otherend_details(dev);
-}
-
-
-
-static int watch_otherend(struct xenbus_device *dev)
-{
-       struct xen_bus_type *bus =
-               container_of(dev->dev.bus, struct xen_bus_type, bus);
-
-       return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
-                                   bus->otherend_changed,
-                                   "%s/%s", dev->otherend, "state");
-}
-
-
  int xenbus_read_otherend_details(struct xenbus_device *xendev,
                                  char *id_node, char *path_node)
  {
@@ -161,11 +168,22 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev,
  
         return 0;
  }
-EXPORT_SYMBOL_GPL(xenbus_read_otherend_details);
+PARAVIRT_EXPORT_SYMBOL(xenbus_read_otherend_details);
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+
+static int read_backend_details(struct xenbus_device *xendev)
+{
+       return xenbus_read_otherend_details(xendev, "backend-id", "backend");
+}
+
+static void otherend_changed(struct xenbus_watch *watch,
+                            const char **vec, unsigned int len)
+#else /* !CONFIG_XEN && !MODULE */
  void xenbus_otherend_changed(struct xenbus_watch *watch,
                              const char **vec, unsigned int len,
                              int ignore_on_shutdown)
+#endif /* CONFIG_XEN || MODULE */
  {
         struct xenbus_device *dev =
                 container_of(watch, struct xenbus_device, otherend_watch);
@@ -177,31 +195,69 @@ void xenbus_otherend_changed(struct xenbus_watch *watch,
         if (!dev->otherend ||
             strncmp(dev->otherend, vec[XS_WATCH_PATH],
                     strlen(dev->otherend))) {
-               dev_dbg(&dev->dev, "Ignoring watch at %s\n",
-                       vec[XS_WATCH_PATH]);
+               dev_dbg(&dev->dev, "Ignoring watch at %s", vec[XS_WATCH_PATH]);
                 return;
         }
  
         state = xenbus_read_driver_state(dev->otherend);
  
-       dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n",
+       dev_dbg(&dev->dev, "state is %d (%s), %s, %s",
                 state, xenbus_strstate(state), dev->otherend_watch.node,
                 vec[XS_WATCH_PATH]);
  
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
         /*
          * Ignore xenbus transitions during shutdown. This prevents us doing
          * work that can fail e.g., when the rootfs is gone.
          */
         if (system_state > SYSTEM_RUNNING) {
+               /* If we're frontend, drive the state machine to Closed. */
+               /* This should cause the backend to release our resources. */
+# if defined(CONFIG_XEN) || defined(MODULE)
+               const struct xen_bus_type *bus =
+                       container_of(dev->dev.bus, struct xen_bus_type, bus);
+               int ignore_on_shutdown = (bus->levels == 2);
+# endif
+
                 if (ignore_on_shutdown && (state == XenbusStateClosing))
                         xenbus_frontend_closed(dev);
                 return;
         }
+#endif
  
         if (drv->otherend_changed)
                 drv->otherend_changed(dev, state);
  }
-EXPORT_SYMBOL_GPL(xenbus_otherend_changed);
+PARAVIRT_EXPORT_SYMBOL(xenbus_otherend_changed);
+
+
+static int talk_to_otherend(struct xenbus_device *dev)
+{
+       struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver);
+
+       free_otherend_watch(dev);
+       free_otherend_details(dev);
+
+       return drv->read_otherend_details(dev);
+}
+
+
+
+static int watch_otherend(struct xenbus_device *dev)
+{
+#if defined(CONFIG_XEN) || defined(MODULE)
+       return xenbus_watch_path2(dev, dev->otherend, "state",
+                                 &dev->otherend_watch, otherend_changed);
+#else
+       struct xen_bus_type *bus =
+               container_of(dev->dev.bus, struct xen_bus_type, bus);
+
+       return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+                                   bus->otherend_changed,
+                                   "%s/%s", dev->otherend, "state");
+#endif
+}
+
  
  int xenbus_dev_probe(struct device *_dev)
  {
@@ -225,8 +281,9 @@ int xenbus_dev_probe(struct device *_dev)
  
         err = talk_to_otherend(dev);
         if (err) {
-               dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n",
-                        dev->nodename);
+               dev_warn(&dev->dev,
+                        "xenbus_probe: talk_to_otherend on %s failed.\n",
+                        dev->nodename);
                 return err;
         }
  
@@ -236,8 +293,9 @@ int xenbus_dev_probe(struct device *_dev)
  
         err = watch_otherend(dev);
         if (err) {
-               dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
-                      dev->nodename);
+               dev_warn(&dev->dev,
+                        "xenbus_probe: watch_otherend on %s failed.\n",
+                        dev->nodename);
                 return err;
         }
  
@@ -245,9 +303,13 @@ int xenbus_dev_probe(struct device *_dev)
  fail:
         xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename);
         xenbus_switch_state(dev, XenbusStateClosed);
+#if defined(CONFIG_XEN) || defined(MODULE)
+       return -ENODEV;
+#else
         return err;
+#endif
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_probe);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_probe);
  
  int xenbus_dev_remove(struct device *_dev)
  {
@@ -266,7 +328,7 @@ int xenbus_dev_remove(struct device *_dev)
         xenbus_switch_state(dev, XenbusStateClosed);
         return 0;
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_remove);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_remove);
  
  void xenbus_dev_shutdown(struct device *_dev)
  {
@@ -275,30 +337,55 @@ void xenbus_dev_shutdown(struct device *_dev)
  
         DPRINTK("%s", dev->nodename);
  
+/* Commented out since the xenstored stubdom is now Mini-OS based, not Linux based.
+#define XENSTORE_DOMAIN_SHARES_THIS_KERNEL
+*/
+#ifndef XENSTORE_DOMAIN_SHARES_THIS_KERNEL
+       if (is_initial_xendomain())
+#endif
+               return;
+
         get_device(&dev->dev);
         if (dev->state != XenbusStateConnected) {
-               printk(KERN_INFO "%s: %s: %s != Connected, skipping\n", __func__,
-                      dev->nodename, xenbus_strstate(dev->state));
+               dev_info(&dev->dev, "%s: %s: %s != Connected, skipping\n", __FUNCTION__,
+                        dev->nodename, xenbus_strstate(dev->state));
                 goto out;
         }
         xenbus_switch_state(dev, XenbusStateClosing);
+
+       if (!strcmp(dev->devicetype, "vfb"))
+               goto out;
+
         timeout = wait_for_completion_timeout(&dev->down, timeout);
         if (!timeout)
-               printk(KERN_INFO "%s: %s timeout closing device\n",
-                      __func__, dev->nodename);
+               dev_info(&dev->dev, "%s: %s timeout closing device\n",
+                        __FUNCTION__, dev->nodename);
   out:
         put_device(&dev->dev);
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_shutdown);
  
  int xenbus_register_driver_common(struct xenbus_driver *drv,
                                   struct xen_bus_type *bus)
  {
+       int ret;
+
+       if (bus->error)
+               return bus->error;
+
         drv->driver.bus = &bus->bus;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+       drv->driver.probe = xenbus_dev_probe;
+       drv->driver.remove = xenbus_dev_remove;
+       drv->driver.shutdown = xenbus_dev_shutdown;
+#endif
  
-       return driver_register(&drv->driver);
+       mutex_lock(&xenwatch_mutex);
+       ret = driver_register(&drv->driver);
+       mutex_unlock(&xenwatch_mutex);
+       return ret;
  }
-EXPORT_SYMBOL_GPL(xenbus_register_driver_common);
+PARAVIRT_EXPORT_SYMBOL(xenbus_register_driver_common);
  
  void xenbus_unregister_driver(struct xenbus_driver *drv)
  {
@@ -375,19 +462,28 @@ static void xenbus_dev_release(struct device *dev)
  }
  
  static ssize_t nodename_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+                            struct device_attribute *attr,
+#endif
+                            char *buf)
  {
         return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
  }
  
  static ssize_t devtype_show(struct device *dev,
-                           struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+                           struct device_attribute *attr,
+#endif
+                           char *buf)
  {
         return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
  }
  
  static ssize_t modalias_show(struct device *dev,
-                            struct device_attribute *attr, char *buf)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)
+                            struct device_attribute *attr,
+#endif
+                            char *buf)
  {
         return sprintf(buf, "%s:%s\n", dev->bus->name,
                        to_xenbus_device(dev)->devicetype);
@@ -399,13 +495,12 @@ struct device_attribute xenbus_dev_attrs[] = {
         __ATTR_RO(modalias),
         __ATTR_NULL
  };
-EXPORT_SYMBOL_GPL(xenbus_dev_attrs);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_attrs);
  
  int xenbus_probe_node(struct xen_bus_type *bus,
                       const char *type,
                       const char *nodename)
  {
-       char devname[XEN_BUS_ID_SIZE];
         int err;
         struct xenbus_device *xendev;
         size_t stringlen;
@@ -413,6 +508,9 @@ int xenbus_probe_node(struct xen_bus_type *bus,
  
         enum xenbus_state state = xenbus_read_driver_state(nodename);
  
+       if (bus->error)
+               return bus->error;
+
         if (state != XenbusStateInitialising) {
                 /* Device is not new, so ignore it.  This can happen if a
                    device is going away after switching to Closed.  */
@@ -437,15 +535,26 @@ int xenbus_probe_node(struct xen_bus_type *bus,
         xendev->devicetype = tmpstring;
         init_completion(&xendev->down);
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+       xendev->dev.parent = &bus->dev;
+#endif
         xendev->dev.bus = &bus->bus;
         xendev->dev.release = xenbus_dev_release;
  
-       err = bus->get_bus_id(devname, xendev->nodename);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
+       {
+               char devname[XEN_BUS_ID_SIZE];
+
+               err = bus->get_bus_id(devname, xendev->nodename);
+               if (!err)
+                       dev_set_name(&xendev->dev, devname);
+       }
+#else
+       err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
+#endif
         if (err)
                 goto fail;
  
-       dev_set_name(&xendev->dev, devname);
-
         /* Register with generic device framework. */
         err = device_register(&xendev->dev);
         if (err)
@@ -456,7 +565,113 @@ fail:
         kfree(xendev);
         return err;
  }
-EXPORT_SYMBOL_GPL(xenbus_probe_node);
+PARAVIRT_EXPORT_SYMBOL(xenbus_probe_node);
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+
+/* device/<type>/<id> => <type>-<id> */
+static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
+{
+       nodename = strchr(nodename, '/');
+       if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) {
+               pr_warning("XENBUS: bad frontend %s\n", nodename);
+               return -EINVAL;
+       }
+
+       strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE);
+       if (!strchr(bus_id, '/')) {
+               pr_warning("XENBUS: bus_id %s no slash\n", bus_id);
+               return -EINVAL;
+       }
+       *strchr(bus_id, '/') = '-';
+       return 0;
+}
+
+/* device/<typename>/<name> */
+static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type,
+                                const char *name)
+{
+       char *nodename;
+       int err;
+
+       if (!strcmp(type, "console"))
+               return 0;
+
+       nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name);
+       if (!nodename)
+               return -ENOMEM;
+
+       DPRINTK("%s", nodename);
+
+       err = xenbus_probe_node(bus, type, nodename);
+       kfree(nodename);
+       return err;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env)
+{
+       struct xenbus_device *xdev;
+
+       if (dev == NULL)
+               return -ENODEV;
+       xdev = to_xenbus_device(dev);
+       if (xdev == NULL)
+               return -ENODEV;
+
+       /* stuff we want to pass to /sbin/hotplug */
+       if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype) ||
+           add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename) ||
+           add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype))
+               return -ENOMEM;
+
+       return 0;
+}
+#endif
+
+/* Bus type for frontend drivers. */
+static struct xen_bus_type xenbus_frontend = {
+       .root = "device",
+       .levels = 2,            /* device/type/<id> */
+       .get_bus_id = frontend_bus_id,
+       .probe = xenbus_probe_frontend,
+       .error = -ENODEV,
+       .bus = {
+               .name      = "xen",
+               .match     = xenbus_match,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+               .probe     = xenbus_dev_probe,
+               .remove    = xenbus_dev_remove,
+               .shutdown  = xenbus_dev_shutdown,
+               .uevent    = xenbus_uevent_frontend,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
+               .dev_attrs = xenbus_dev_attrs,
+#endif
+       },
+       .dev = {
+               .init_name = "xen",
+       },
+};
+
+int xenbus_register_frontend(struct xenbus_driver *drv)
+{
+       int ret;
+
+       drv->read_otherend_details = read_backend_details;
+
+       ret = xenbus_register_driver_common(drv, &xenbus_frontend);
+       if (ret)
+               return ret;
+
+       /* If this driver is loaded as a module, wait for devices to attach. */
+       wait_for_devices(drv);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(xenbus_register_frontend);
+
+#endif
  
  static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
  {
@@ -485,6 +700,9 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
         char **dir;
         unsigned int i, dir_n;
  
+       if (bus->error)
+               return bus->error;
+
         dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
         if (IS_ERR(dir))
                 return PTR_ERR(dir);
@@ -498,7 +716,7 @@ int xenbus_probe_devices(struct xen_bus_type *bus)
         kfree(dir);
         return err;
  }
-EXPORT_SYMBOL_GPL(xenbus_probe_devices);
+PARAVIRT_EXPORT_SYMBOL(xenbus_probe_devices);
  
  static unsigned int char_count(const char *str, char c)
  {
@@ -530,7 +748,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
         char type[XEN_BUS_ID_SIZE];
         const char *p, *root;
  
-       if (char_count(node, '/') < 2)
+       if (bus->error || char_count(node, '/') < 2)
                 return;
  
         exists = xenbus_exists(XBT_NIL, node, "");
@@ -559,9 +777,27 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
  
         kfree(root);
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_changed);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_changed);
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+static void frontend_changed(struct xenbus_watch *watch,
+                            const char **vec, unsigned int len)
+{
+       DPRINTK("");
+
+       xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend);
+}
+
+/* We watch for devices appearing and vanishing. */
+static struct xenbus_watch fe_watch = {
+       .node = "device",
+       .callback = frontend_changed,
+};
+
+static int __maybe_unused suspend_dev(struct device *dev, void *data)
+#else
  int xenbus_dev_suspend(struct device *dev)
+#endif
  {
         int err = 0;
         struct xenbus_driver *drv;
@@ -576,13 +812,37 @@ int xenbus_dev_suspend(struct device *dev)
         if (drv->suspend)
                 err = drv->suspend(xdev);
         if (err)
-               printk(KERN_WARNING
-                      "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
+               pr_warning("xenbus: suspend %s failed: %i\n",
+                          dev_name(dev), err);
+       return 0;
+}
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_suspend);
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+static int __maybe_unused suspend_cancel_dev(struct device *dev, void *data)
+{
+       int err = 0;
+       struct xenbus_driver *drv;
+       struct xenbus_device *xdev;
+
+       DPRINTK("");
+
+       if (dev->driver == NULL)
+               return 0;
+       drv = to_xenbus_driver(dev->driver);
+       xdev = container_of(dev, struct xenbus_device, dev);
+       if (drv->suspend_cancel)
+               err = drv->suspend_cancel(xdev);
+       if (err)
+               pr_warning("xenbus: suspend_cancel %s failed: %i\n",
+                          dev_name(dev), err);
         return 0;
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_suspend);
  
+static int __maybe_unused resume_dev(struct device *dev, void *data)
+#else
  int xenbus_dev_resume(struct device *dev)
+#endif
  {
         int err;
         struct xenbus_driver *drv;
@@ -596,9 +856,8 @@ int xenbus_dev_resume(struct device *dev)
         drv = to_xenbus_driver(dev->driver);
         err = talk_to_otherend(xdev);
         if (err) {
-               printk(KERN_WARNING
-                      "xenbus: resume (talk_to_otherend) %s failed: %i\n",
-                      dev_name(dev), err);
+               pr_warning("xenbus: resume (talk_to_otherend) %s failed: %i\n",
+                          dev_name(dev), err);
                 return err;
         }
  
@@ -607,48 +866,80 @@ int xenbus_dev_resume(struct device *dev)
         if (drv->resume) {
                 err = drv->resume(xdev);
                 if (err) {
-                       printk(KERN_WARNING
-                              "xenbus: resume %s failed: %i\n",
-                              dev_name(dev), err);
+                       pr_warning("xenbus: resume %s failed: %i\n",
+                                  dev_name(dev), err);
                         return err;
                 }
         }
  
         err = watch_otherend(xdev);
         if (err) {
-               printk(KERN_WARNING
-                      "xenbus_probe: resume (watch_otherend) %s failed: "
-                      "%d.\n", dev_name(dev), err);
+               pr_warning("xenbus_probe: resume (watch_otherend) %s failed:"
+                          " %d\n", dev_name(dev), err);
                 return err;
         }
  
         return 0;
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_resume);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_resume);
  
+#if !defined(CONFIG_XEN) && !defined(MODULE)
  int xenbus_dev_cancel(struct device *dev)
  {
         /* Do nothing */
         DPRINTK("cancel");
         return 0;
  }
-EXPORT_SYMBOL_GPL(xenbus_dev_cancel);
+PARAVIRT_EXPORT_SYMBOL(xenbus_dev_cancel);
+#elif defined(CONFIG_PM_SLEEP) || defined(MODULE)
+void xenbus_suspend(void)
+{
+       DPRINTK("");
+
+       if (!xenbus_frontend.error)
+               bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
+       xenbus_backend_suspend(suspend_dev);
+       xs_suspend();
+}
+
+void xenbus_resume(void)
+{
+       xb_init_comms();
+       xs_resume();
+       if (!xenbus_frontend.error)
+               bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
+       xenbus_backend_resume(resume_dev);
+}
+
+void xenbus_suspend_cancel(void)
+{
+       xs_suspend_cancel();
+       if (!xenbus_frontend.error)
+               bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
+       xenbus_backend_resume(suspend_cancel_dev);
+}
+#endif /* CONFIG_PM_SLEEP || MODULE */
  
  /* A flag to determine if xenstored is 'ready' (i.e. has started) */
-int xenstored_ready;
+atomic_t xenbus_xsd_state = ATOMIC_INIT(XENBUS_XSD_UNCOMMITTED);
  
  
-int register_xenstore_notifier(struct notifier_block *nb)
+int
+#ifdef CONFIG_XEN
+__init
+#endif
+register_xenstore_notifier(struct notifier_block *nb)
  {
         int ret = 0;
  
-       if (xenstored_ready > 0)
+       if (is_xenstored_ready())
                 ret = nb->notifier_call(nb, 0, NULL);
         else
                 blocking_notifier_chain_register(&xenstore_chain, nb);
  
         return ret;
  }
+#ifndef CONFIG_XEN
  EXPORT_SYMBOL_GPL(register_xenstore_notifier);
  
  void unregister_xenstore_notifier(struct notifier_block *nb)
@@ -656,16 +947,157 @@ void unregister_xenstore_notifier(struct notifier_block *nb)
         blocking_notifier_chain_unregister(&xenstore_chain, nb);
  }
  EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
+#endif
+
+#ifndef CONFIG_XEN
+static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
+static int backend_state;
+
+static void xenbus_reset_backend_state_changed(struct xenbus_watch *w,
+                                       const char **v, unsigned int l)
+{
+       if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state) != 1)
+               backend_state = XenbusStateUnknown;
+       printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+                       v[XS_WATCH_PATH], xenbus_strstate(backend_state));
+       wake_up(&backend_state_wq);
+}
+
+static void xenbus_reset_wait_for_backend(char *be, int expected)
+{
+       long timeout;
+       timeout = wait_event_interruptible_timeout(backend_state_wq,
+                       backend_state == expected, 5 * HZ);
+       if (timeout <= 0)
+               pr_info("XENBUS: backend %s timed out.\n", be);
+}
+
+/*
+ * Reset frontend if it is in Connected or Closed state.
+ * Wait for backend to catch up.
+ * State Connected happens during kdump, Closed after kexec.
+ */
+static void xenbus_reset_frontend(char *fe, char *be, int be_state)
+{
+       struct xenbus_watch be_watch;
+
+       printk(KERN_DEBUG "XENBUS: backend %s %s\n",
+                       be, xenbus_strstate(be_state));
+
+       memset(&be_watch, 0, sizeof(be_watch));
+       be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be);
+       if (!be_watch.node)
+               return;
+
+       be_watch.callback = xenbus_reset_backend_state_changed;
+       backend_state = XenbusStateUnknown;
+
+       pr_info("XENBUS: triggering reconnect on %s\n", be);
+       register_xenbus_watch(&be_watch);
+
+       /* each case falls through, walking the frontend to Initialising */
+       switch (be_state) {
+       case XenbusStateConnected:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing);
+               xenbus_reset_wait_for_backend(be, XenbusStateClosing);
+
+       case XenbusStateClosing:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed);
+               xenbus_reset_wait_for_backend(be, XenbusStateClosed);
+
+       case XenbusStateClosed:
+               xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising);
+               xenbus_reset_wait_for_backend(be, XenbusStateInitWait);
+       }
+
+       unregister_xenbus_watch(&be_watch);
+       pr_info("XENBUS: reconnect done on %s\n", be);
+       kfree(be_watch.node);
+}
+
+static void xenbus_check_frontend(char *class, char *dev)
+{
+       int be_state, fe_state, err;
+       char *backend, *frontend;
  
-void xenbus_probe(struct work_struct *unused)
+       frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev);
+       if (!frontend)
+               return;
+
+       err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state);
+       if (err != 1)
+               goto out;
+
+       switch (fe_state) {
+       case XenbusStateConnected:
+       case XenbusStateClosed:
+               printk(KERN_DEBUG "XENBUS: frontend %s %s\n",
+                               frontend, xenbus_strstate(fe_state));
+               backend = xenbus_read(XBT_NIL, frontend, "backend", NULL);
+               if (!backend || IS_ERR(backend))
+                       goto out;
+               err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state);
+               if (err == 1)
+                       xenbus_reset_frontend(frontend, backend, be_state);
+               kfree(backend);
+               break;
+       default:
+               break;
+       }
+out:
+       kfree(frontend);
+}
+
+static void xenbus_reset_state(void)
  {
-       xenstored_ready = 1;
+       char **devclass, **dev;
+       int devclass_n, dev_n;
+       int i, j;
+
+       devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n);
+       if (IS_ERR(devclass))
+               return;
+
+       for (i = 0; i < devclass_n; i++) {
+               dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n);
+               if (IS_ERR(dev))
+                       continue;
+               for (j = 0; j < dev_n; j++)
+                       xenbus_check_frontend(devclass[i], dev[j]);
+               kfree(dev);
+       }
+       kfree(devclass);
+}
+#endif
+
+void
+#if defined(CONFIG_XEN_UNPRIVILEGED_GUEST)
+__init
+#elif defined(MODULE)
+__devinit
+#endif
+xenbus_probe(struct work_struct *unused)
+{
+       BUG_ON(!is_xenstored_ready());
+
+#ifndef CONFIG_XEN
+       /* reset devices in Connected or Closed state */
+       xenbus_reset_state();
+#endif
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+       /* Enumerate devices in xenstore and watch for changes. */
+       xenbus_probe_devices(&xenbus_frontend);
+       register_xenbus_watch(&fe_watch);
+       xenbus_backend_probe_and_watch();
+#endif
  
         /* Notify others that xenstore is up */
         blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
  }
-EXPORT_SYMBOL_GPL(xenbus_probe);
+PARAVIRT_EXPORT_SYMBOL(xenbus_probe);
  
+#if !defined(CONFIG_XEN) && !defined(MODULE)
  static int __init xenbus_probe_initcall(void)
  {
         if (!xen_domain())
@@ -679,6 +1111,120 @@ static int __init xenbus_probe_initcall(void)
  }
  
  device_initcall(xenbus_probe_initcall);
+#endif
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#ifdef CONFIG_PROC_FS
+static struct file_operations xsd_kva_fops;
+static struct proc_dir_entry *xsd_kva_intf;
+static struct proc_dir_entry *xsd_port_intf;
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       size_t size = vma->vm_end - vma->vm_start;
+       int old;
+       int rc;
+
+       old = atomic_cmpxchg(&xenbus_xsd_state,
+                          XENBUS_XSD_UNCOMMITTED,
+                          XENBUS_XSD_LOCAL_INIT);
+       switch (old) {
+               case XENBUS_XSD_UNCOMMITTED:
+                       rc = xb_init_comms();
+                       if (rc != 0)
+                               return rc;
+                       break;
+
+               case XENBUS_XSD_FOREIGN_INIT:
+               case XENBUS_XSD_FOREIGN_READY:
+                       return -EBUSY;
+
+               case XENBUS_XSD_LOCAL_INIT:
+               case XENBUS_XSD_LOCAL_READY:
+               default:
+                       break;
+       }
+
+       if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+               return -EINVAL;
+
+       if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn),
+                           size, vma->vm_page_prot))
+               return -EAGAIN;
+
+       return 0;
+}
+
+static int xsd_kva_read(char *page, char **start, off_t off,
+                       int count, int *eof, void *data)
+{
+       int len;
+
+       len  = sprintf(page, "0x%p", xen_store_interface);
+       *eof = 1;
+       return len;
+}
+
+static int xsd_port_read(char *page, char **start, off_t off,
+                        int count, int *eof, void *data)
+{
+       int len;
+
+       len  = sprintf(page, "%d", xen_store_evtchn);
+       *eof = 1;
+       return len;
+}
+#endif
+
+#ifdef CONFIG_XEN_XENBUS_DEV
+int xenbus_conn(domid_t remote_dom, grant_ref_t *grant_ref,
+               evtchn_port_t *local_port)
+{
+       struct evtchn_alloc_unbound alloc_unbound;
+       int rc, rc2;
+
+       BUG_ON(atomic_read(&xenbus_xsd_state) != XENBUS_XSD_FOREIGN_INIT);
+       BUG_ON(!is_initial_xendomain());
+
+       remove_xen_proc_entry("xsd_kva");
+       remove_xen_proc_entry("xsd_port");
+
+       rc = close_evtchn(xen_store_evtchn);
+       if (rc != 0)
+               goto fail0;
+
+       alloc_unbound.dom = DOMID_SELF;
+       alloc_unbound.remote_dom = remote_dom;
+       rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+                                        &alloc_unbound);
+       if (rc != 0)
+               goto fail0;
+       *local_port = xen_store_evtchn = alloc_unbound.port;
+
+       /* keep the old page (xen_store_mfn, xen_store_interface) */
+       rc = gnttab_grant_foreign_access(remote_dom, xen_store_mfn,
+                                        GTF_permit_access);
+       if (rc < 0)
+               goto fail1;
+       *grant_ref = rc;
+
+       rc = xb_init_comms();
+       if (rc != 0)
+               goto fail1;
+
+       return 0;
+
+fail1:
+       rc2 = close_evtchn(xen_store_evtchn);
+       if (rc2 != 0)
+               pr_warning("XENBUS: Error freeing xenstore event channel:"
+                          " %d\n", rc2);
+fail0:
+       xen_store_evtchn = -1;
+       return rc;
+}
+#endif
+#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
  
  /* Set up event channel for xenstored which is run as a local process
   * (this is normally used only in dom0)
@@ -719,13 +1265,81 @@ static int __init xenstored_local_init(void)
         return err;
  }
  
-static int __init xenbus_init(void)
+#ifndef MODULE
+static int __init
+#else
+int __devinit
+#endif
+xenbus_init(void)
  {
         int err = 0;
  
-       if (!xen_domain())
+       DPRINTK("");
+
+       if (!is_running_on_xen())
                 return -ENODEV;
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+       /* Register ourselves with the kernel bus subsystem */
+       xenbus_frontend.error = bus_register(&xenbus_frontend.bus);
+       if (xenbus_frontend.error)
+               pr_warning("XENBUS: Error registering frontend bus: %i\n",
+                          xenbus_frontend.error);
+       xenbus_backend_bus_register();
+
+       /*
+        * Domain0 doesn't have a store_evtchn or store_mfn yet.
+        */
+       if (is_initial_xendomain()) {
+               err = xenstored_local_init();
+               if (err)
+                       goto out_error;
+
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST)
+               /* And finally publish the above info in /proc/xen */
+               xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600);
+               if (xsd_kva_intf) {
+                       memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops,
+                              sizeof(xsd_kva_fops));
+                       xsd_kva_fops.mmap = xsd_kva_mmap;
+                       xsd_kva_intf->proc_fops = &xsd_kva_fops;
+                       xsd_kva_intf->read_proc = xsd_kva_read;
+               }
+               xsd_port_intf = create_xen_proc_entry("xsd_port", 0400);
+               if (xsd_port_intf)
+                       xsd_port_intf->read_proc = xsd_port_read;
+#endif
+               xen_store_interface = mfn_to_virt(xen_store_mfn);
+       } else {
+#ifndef CONFIG_XEN
+               uint64_t v = 0;
+
+               err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+               if (err)
+                       goto out_error;
+               xen_store_evtchn = (int)v;
+               err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+               if (err)
+                       goto out_error;
+               xen_store_mfn = (unsigned long)v;
+               xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT,
+                                             PAGE_SIZE);
+#endif
+#ifndef MODULE
+               xen_store_evtchn = xen_start_info->store_evtchn;
+               xen_store_mfn = xen_start_info->store_mfn;
+               xen_store_interface = mfn_to_virt(xen_store_mfn);
+#endif
+               atomic_set(&xenbus_xsd_state, XENBUS_XSD_FOREIGN_READY);
+
+               /* Initialize the shared memory rings to talk to xenstored */
+               err = xb_init_comms();
+               if (err)
+                       goto out_error;
+       }
+
+       xenbus_dev_init();
+#else /* !defined(CONFIG_XEN) && !defined(MODULE) */
         xenbus_ring_ops_init();
  
         if (xen_hvm_domain()) {
@@ -743,7 +1357,7 @@ static int __init xenbus_init(void)
                 xen_store_evtchn = xen_start_info->store_evtchn;
                 xen_store_mfn = xen_start_info->store_mfn;
                 if (xen_store_evtchn)
-                       xenstored_ready = 1;
+                       atomic_set(&xenbus_xsd_state, XENBUS_XSD_FOREIGN_READY);
                 else {
                         err = xenstored_local_init();
                         if (err)
@@ -751,16 +1365,33 @@ static int __init xenbus_init(void)
                 }
                 xen_store_interface = mfn_to_virt(xen_store_mfn);
         }
+#endif
  
         /* Initialize the interface to xenstore. */
         err = xs_init();
         if (err) {
-               printk(KERN_WARNING
-                      "XENBUS: Error initializing xenstore comms: %i\n", err);
+               pr_warning("XENBUS: Error initializing xenstore comms: %i\n",
+                          err);
                 goto out_error;
         }
  
-#ifdef CONFIG_XEN_COMPAT_XENFS
+#if defined(CONFIG_XEN) || defined(MODULE)
+       /* Register ourselves with the kernel device subsystem */
+       if (!xenbus_frontend.error) {
+               xenbus_frontend.error = device_register(&xenbus_frontend.dev);
+               if (xenbus_frontend.error) {
+                       bus_unregister(&xenbus_frontend.bus);
+                       pr_warning("XENBUS: Error registering frontend device:"
+                                  " %d\n", xenbus_frontend.error);
+               }
+       }
+       xenbus_backend_device_register();
+
+       if (!is_initial_xendomain())
+               xenbus_probe(NULL);
+#endif
+
+#if defined(CONFIG_XEN_COMPAT_XENFS) && !defined(MODULE)
         /*
          * Create xenfs mountpoint in /proc for compatibility with
          * utilities that expect to find "xenbus" under "/proc/xen".
@@ -768,10 +1399,162 @@ static int __init xenbus_init(void)
         proc_mkdir("xen", NULL);
  #endif
  
+       return 0;
+
  out_error:
+       /*
+        * Do not unregister the xenbus front/backend buses here. The buses
+        * must exist because front/backend drivers will use them when they are
+        * registered.
+        */
         return err;
  }
  
+#ifndef MODULE
  postcore_initcall(xenbus_init);
-
+#ifdef CONFIG_XEN
+MODULE_LICENSE("Dual BSD/GPL");
+#else
  MODULE_LICENSE("GPL");
+#endif
+#endif
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+
+static int is_device_connecting(struct device *dev, void *data)
+{
+       struct xenbus_device *xendev = to_xenbus_device(dev);
+       struct device_driver *drv = data;
+       struct xenbus_driver *xendrv;
+
+       /*
+        * A device with no driver will never connect. We care only about
+        * devices which should currently be in the process of connecting.
+        */
+       if (!dev->driver)
+               return 0;
+
+       /* Is this search limited to a particular driver? */
+       if (drv && (dev->driver != drv))
+               return 0;
+
+       xendrv = to_xenbus_driver(dev->driver);
+       return (xendev->state < XenbusStateConnected ||
+               (xendev->state == XenbusStateConnected &&
+                xendrv->is_ready && !xendrv->is_ready(xendev)));
+}
+
+static int exists_connecting_device(struct device_driver *drv)
+{
+       if (xenbus_frontend.error)
+               return xenbus_frontend.error;
+       return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+                               is_device_connecting);
+}
+
+static int print_device_status(struct device *dev, void *data)
+{
+       struct xenbus_device *xendev = to_xenbus_device(dev);
+       struct device_driver *drv = data;
+       struct xenbus_driver *xendrv;
+
+       /* Is this operation limited to a particular driver? */
+       if (drv && (dev->driver != drv))
+               return 0;
+
+       if (!dev->driver) {
+               /* Information only: is this too noisy? */
+               pr_info("XENBUS: Device with no driver: %s\n",
+                       xendev->nodename);
+               return 0;
+       }
+
+       if (xendev->state < XenbusStateConnected) {
+               enum xenbus_state rstate = XenbusStateUnknown;
+               if (xendev->otherend)
+                       rstate = xenbus_read_driver_state(xendev->otherend);
+               pr_warning("XENBUS: Timeout connecting to device: %s"
+                          " (local state %d, remote state %d)\n",
+                          xendev->nodename, xendev->state, rstate);
+       }
+
+       xendrv = to_xenbus_driver(dev->driver);
+       if (xendrv->is_ready && !xendrv->is_ready(xendev))
+               pr_warning("XENBUS: Device not ready: %s\n",
+                          xendev->nodename);
+
+       return 0;
+}
+
+/* We only wait for device setup after most initcalls have run. */
+static int ready_to_wait_for_devices;
+
+/*
+ * On a 5-minute timeout, wait for all devices currently configured.  We need
+ * to do this to guarantee that the filesystems and / or network devices
+ * needed for boot are available, before we can allow the boot to proceed.
+ *
+ * This needs to be on a late_initcall, to happen after the frontend device
+ * drivers have been initialised, but before the root fs is mounted.
+ *
+ * A possible improvement here would be to have the tools add a per-device
+ * flag to the store entry, indicating whether it is needed at boot time.
+ * This would allow people who knew what they were doing to accelerate their
+ * boot slightly, but of course needs tools or manual intervention to set up
+ * those flags correctly.
+ */
+static void wait_for_devices(struct xenbus_driver *xendrv)
+{
+       unsigned long start = jiffies;
+       struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+       unsigned int seconds_waited = 0;
+
+       if (!ready_to_wait_for_devices || !is_running_on_xen())
+               return;
+
+       while (exists_connecting_device(drv)) {
+               if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+                       if (!seconds_waited)
+                               pr_warning("XENBUS: Waiting for "
+                                          "devices to initialise: ");
+                       seconds_waited += 5;
+                       printk("%us...", 300 - seconds_waited);
+                       if (seconds_waited == 300)
+                               break;
+               }
+
+               schedule_timeout_interruptible(HZ/10);
+       }
+
+       if (seconds_waited)
+               printk("\n");
+
+       bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
+                        print_device_status);
+}
+
+#ifndef MODULE
+static int __init boot_wait_for_devices(void)
+{
+#if !defined(CONFIG_XEN) && !defined(MODULE)
+       if (xen_hvm_domain() && !xen_platform_pci_unplug)
+               return -ENODEV;
+#endif
+
+       if (!xenbus_frontend.error) {
+               ready_to_wait_for_devices = 1;
+               wait_for_devices(NULL);
+       }
+       return 0;
+}
+
+late_initcall(boot_wait_for_devices);
+#endif
+
+int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *))
+{
+       return bus_for_each_dev(&xenbus_frontend.bus, NULL, arg, fn);
+}
+EXPORT_SYMBOL_GPL(xenbus_for_each_frontend);
+
+#endif /* CONFIG_XEN || MODULE */
diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h

index bb4f92e..4b3c63f 100644 (file)
--- a/drivers/xen/xenbus/xenbus_probe.h
+++ b/drivers/xen/xenbus/xenbus_probe.h
@@ -34,16 +34,46 @@
  #ifndef _XENBUS_PROBE_H
  #define _XENBUS_PROBE_H
  
+#ifndef BUS_ID_SIZE
  #define XEN_BUS_ID_SIZE                        20
+#else
+#define XEN_BUS_ID_SIZE                        BUS_ID_SIZE
+#endif
+
+#ifdef CONFIG_PARAVIRT_XEN
+#define is_running_on_xen() xen_domain()
+#define is_initial_xendomain() xen_initial_domain()
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+#define dev_name(dev) ((dev)->bus_id)
+#endif
+
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+extern void xenbus_backend_suspend(int (*fn)(struct device *, void *));
+extern void xenbus_backend_resume(int (*fn)(struct device *, void *));
+extern void xenbus_backend_probe_and_watch(void);
+extern void xenbus_backend_bus_register(void);
+extern void xenbus_backend_device_register(void);
+#else
+static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {}
+static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {}
+static inline void xenbus_backend_probe_and_watch(void) {}
+static inline void xenbus_backend_bus_register(void) {}
+static inline void xenbus_backend_device_register(void) {}
+#endif
  
  struct xen_bus_type {
         char *root;
+       int error;
         unsigned int levels;
         int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
         int (*probe)(struct xen_bus_type *bus, const char *type,
                      const char *dir);
+#if !defined(CONFIG_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
         void (*otherend_changed)(struct xenbus_watch *watch, const char **vec,
                                  unsigned int len);
+#else
+       struct device dev;
+#endif
         struct bus_type bus;
  };
  
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c

index 257be37..e601e27 100644 (file)
--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -36,24 +36,36 @@
                  __func__, __LINE__, ##args)
  
  #include <linux/kernel.h>
+#include <linux/version.h>
  #include <linux/err.h>
  #include <linux/string.h>
  #include <linux/ctype.h>
  #include <linux/fcntl.h>
  #include <linux/mm.h>
+#include <linux/slab.h>
  #include <linux/notifier.h>
  #include <linux/export.h>
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
+#ifndef CONFIG_XEN
  #include <asm/xen/hypervisor.h>
+#endif
  #include <asm/hypervisor.h>
  #include <xen/xenbus.h>
+#ifdef CONFIG_XEN
+#include <xen/xen_proc.h>
+#include <xen/evtchn.h>
+#endif
  #include <xen/features.h>
  
  #include "xenbus_comms.h"
  #include "xenbus_probe.h"
  
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
  /* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */
  static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename)
  {
@@ -179,25 +191,36 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type,
         return err;
  }
  
+#ifndef CONFIG_XEN
  static void frontend_changed(struct xenbus_watch *watch,
                             const char **vec, unsigned int len)
  {
         xenbus_otherend_changed(watch, vec, len, 0);
  }
+#endif
  
  static struct xen_bus_type xenbus_backend = {
         .root = "backend",
         .levels = 3,            /* backend/type/<frontend>/<id> */
         .get_bus_id = backend_bus_id,
         .probe = xenbus_probe_backend,
+#ifndef CONFIG_XEN
         .otherend_changed = frontend_changed,
+#else
+       .dev = {
+               .init_name = "xen-backend",
+       },
+#endif
+       .error = -ENODEV,
         .bus = {
                 .name           = "xen-backend",
                 .match          = xenbus_match,
                 .uevent         = xenbus_uevent_backend,
                 .probe          = xenbus_dev_probe,
                 .remove         = xenbus_dev_remove,
+#ifdef CONFIG_XEN
                 .shutdown       = xenbus_dev_shutdown,
+#endif
                 .dev_attrs      = xenbus_dev_attrs,
         },
  };
@@ -220,6 +243,7 @@ static int read_frontend_details(struct xenbus_device *xendev)
         return xenbus_read_otherend_details(xendev, "frontend-id", "frontend");
  }
  
+#ifndef CONFIG_XEN
  int xenbus_dev_is_online(struct xenbus_device *dev)
  {
         int rc, val;
@@ -231,6 +255,7 @@ int xenbus_dev_is_online(struct xenbus_device *dev)
         return val;
  }
  EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
+#endif
  
  int xenbus_register_backend(struct xenbus_driver *drv)
  {
@@ -240,17 +265,41 @@ int xenbus_register_backend(struct xenbus_driver *drv)
  }
  EXPORT_SYMBOL_GPL(xenbus_register_backend);
  
+#if defined(CONFIG_XEN) && defined(CONFIG_PM_SLEEP)
+/*
+ * Apply @fn to every device on the backend bus; nothing is done when the
+ * bus recorded a registration error.
+ */
+void xenbus_backend_suspend(int (*fn)(struct device *, void *))
+{
+       DPRINTK("");
+       if (xenbus_backend.error)
+               return;
+
+       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
+}
+
+/*
+ * Apply @fn to every device on the backend bus; nothing is done when the
+ * bus recorded a registration error.
+ */
+void xenbus_backend_resume(int (*fn)(struct device *, void *))
+{
+       DPRINTK("");
+       if (xenbus_backend.error)
+               return;
+
+       bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn);
+}
+#endif
+
+#ifndef CONFIG_XEN
  static int backend_probe_and_watch(struct notifier_block *notifier,
                                    unsigned long event,
                                    void *data)
+#else
+void xenbus_backend_probe_and_watch(void)
+#endif
  {
         /* Enumerate devices in xenstore and watch for changes. */
         xenbus_probe_devices(&xenbus_backend);
         register_xenbus_watch(&be_watch);
  
+#ifndef CONFIG_XEN
         return NOTIFY_DONE;
+#endif
  }
  
+#ifndef CONFIG_XEN
+
  static int __init xenbus_probe_backend_init(void)
  {
         static struct notifier_block xenstore_notifier = {
@@ -270,3 +319,34 @@ static int __init xenbus_probe_backend_init(void)
         return 0;
  }
  subsys_initcall(xenbus_probe_backend_init);
+
+#else
+
+/*
+ * Register the backend bus type and remember the result in
+ * xenbus_backend.error; a failure is only logged.
+ */
+void __init xenbus_backend_bus_register(void)
+{
+       int err = bus_register(&xenbus_backend.bus);
+
+       xenbus_backend.error = err;
+       if (!err)
+               return;
+
+       pr_warning("XENBUS: Error registering backend bus: %i\n", err);
+}
+
+/*
+ * Register the backend bus device, unless an earlier error is already
+ * recorded.  On failure the bus is unregistered again and the error is
+ * stored in xenbus_backend.error and logged.
+ */
+void __init xenbus_backend_device_register(void)
+{
+       int err;
+
+       if (xenbus_backend.error)
+               return;
+
+       err = device_register(&xenbus_backend.dev);
+       xenbus_backend.error = err;
+       if (!err)
+               return;
+
+       bus_unregister(&xenbus_backend.bus);
+       pr_warning("XENBUS: Error registering backend device: %i\n", err);
+}
+
+/*
+ * Walk the xenbus backend bus, handing each device and @arg to @fn.
+ * The return value is the one produced by bus_for_each_dev().
+ */
+int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *))
+{
+       struct bus_type *bus = &xenbus_backend.bus;
+
+       return bus_for_each_dev(bus, NULL, arg, fn);
+}
+EXPORT_SYMBOL_GPL(xenbus_for_each_backend);
+
+#endif
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c

index d1c217b..a1f7b81 100644 (file)
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -48,6 +48,14 @@
  #include <xen/xen.h>
  #include "xenbus_comms.h"
  
+#ifdef HAVE_XEN_PLATFORM_COMPAT_H
+#include <xen/platform-compat.h>
+#endif
+
+#ifndef PF_NOFREEZE /* Old kernel (pre-2.6.6). */
+#define PF_NOFREEZE    0
+#endif
+
  struct xs_stored_msg {
         struct list_head list;
  
@@ -119,7 +127,7 @@ static DEFINE_SPINLOCK(watch_events_lock);
   * carrying out work.
   */
  static pid_t xenwatch_pid;
-static DEFINE_MUTEX(xenwatch_mutex);
+/* static */ DEFINE_MUTEX(xenwatch_mutex);
  static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq);
  
  static int get_error(const char *errorstring)
@@ -128,9 +136,8 @@ static int get_error(const char *errorstring)
  
         for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) {
                 if (i == ARRAY_SIZE(xsd_errors) - 1) {
-                       printk(KERN_WARNING
-                              "XENBUS xen store gave: unknown error %s",
-                              errorstring);
+                       pr_warning("XENBUS xen store gave: unknown error %s",
+                                  errorstring);
                         return EINVAL;
                 }
         }
@@ -181,6 +188,7 @@ static void transaction_end(void)
                 wake_up(&xs_state.transaction_wq);
  }
  
+#if !defined(CONFIG_XEN) || defined(CONFIG_PM_SLEEP)
  static void transaction_suspend(void)
  {
         mutex_lock(&xs_state.transaction_mutex);
@@ -192,14 +200,15 @@ static void transaction_resume(void)
  {
         mutex_unlock(&xs_state.transaction_mutex);
  }
+#endif
  
  void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
  {
         void *ret;
-       struct xsd_sockmsg req_msg = *msg;
+       enum xsd_sockmsg_type type = msg->type;
         int err;
  
-       if (req_msg.type == XS_TRANSACTION_START)
+       if (type == XS_TRANSACTION_START)
                 transaction_start();
  
         mutex_lock(&xs_state.request_mutex);
@@ -213,14 +222,15 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
  
         mutex_unlock(&xs_state.request_mutex);
  
-       if ((msg->type == XS_TRANSACTION_END) ||
-           ((req_msg.type == XS_TRANSACTION_START) &&
-            (msg->type == XS_ERROR)))
+       if ((type == XS_TRANSACTION_END) ||
+           ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR)))
                 transaction_end();
  
         return ret;
  }
+#if !defined(CONFIG_XEN) && !defined(MODULE)
  EXPORT_SYMBOL(xenbus_dev_request_and_reply);
+#endif
  
  /* Send message to xs, get kmalloc'ed reply.  ERR_PTR() on error. */
  static void *xs_talkv(struct xenbus_transaction t,
@@ -272,9 +282,9 @@ static void *xs_talkv(struct xenbus_transaction t,
  
         if (msg.type != type) {
                 if (printk_ratelimit())
-                       printk(KERN_WARNING
-                              "XENBUS unexpected type [%d], expected [%d]\n",
-                              msg.type, type);
+                       pr_warning("XENBUS unexpected type [%d],"
+                                  " expected [%d]\n",
+                                  msg.type, type);
                 kfree(ret);
                 return ERR_PTR(-EINVAL);
         }
@@ -331,7 +341,7 @@ static char **split(char *strings, unsigned int len, unsigned int *num)
         char *p, **ret;
  
         /* Count the strings. */
-       *num = count_strings(strings, len);
+       *num = count_strings(strings, len) + 1;
  
         /* Transfer to one big alloc for easy freeing. */
         ret = kmalloc(*num * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH);
@@ -345,6 +355,7 @@ static char **split(char *strings, unsigned int len, unsigned int *num)
         strings = (char *)&ret[*num];
         for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
                 ret[(*num)++] = p;
+       ret[*num] = strings + len;
  
         return ret;
  }
@@ -532,18 +543,18 @@ int xenbus_printf(struct xenbus_transaction t,
  {
         va_list ap;
         int ret;
-       char *buf;
+       char *printf_buffer;
  
         va_start(ap, fmt);
-       buf = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap);
+       printf_buffer = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap);
         va_end(ap);
  
-       if (!buf)
+       if (!printf_buffer)
                 return -ENOMEM;
  
-       ret = xenbus_write(t, dir, node, buf);
+       ret = xenbus_write(t, dir, node, printf_buffer);
  
-       kfree(buf);
+       kfree(printf_buffer);
  
         return ret;
  }
@@ -618,6 +629,23 @@ static struct xenbus_watch *find_watch(const char *token)
         return NULL;
  }
  
+/*
+ * Send an XS_RESET_WATCHES request, but only in MODULE builds and only when
+ * the "control/platform-feature-xs_reset_watches" node reads as non-zero.
+ * In non-MODULE builds this function is empty.
+ */
+static void xs_reset_watches(void)
+{
+#ifdef MODULE
+       int err, supported = 0;
+
+       err = xenbus_scanf(XBT_NIL, "control",
+                          "platform-feature-xs_reset_watches", "%d",
+                          &supported);
+       /* Node missing/unparsable (not exactly one value) or feature off. */
+       if (err != 1 || !supported)
+               return;
+
+       err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL));
+       /* -EEXIST is tolerated silently; any other error is only logged. */
+       if (err && err != -EEXIST)
+               pr_warning("xs_reset_watches failed: %d\n", err);
+#endif
+}
+
  /* Register callback to watch this node. */
  int register_xenbus_watch(struct xenbus_watch *watch)
  {
@@ -654,6 +682,10 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
         char token[sizeof(watch) * 2 + 1];
         int err;
  
+#if defined(CONFIG_XEN) || defined(MODULE)
+       BUG_ON(watch->flags & XBWF_new_thread);
+#endif
+
         sprintf(token, "%lX", (long)watch);
  
         down_read(&xs_state.watch_mutex);
@@ -665,9 +697,8 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
  
         err = xs_unwatch(watch->node, token);
         if (err)
-               printk(KERN_WARNING
-                      "XENBUS Failed to release watch %s: %i\n",
-                      watch->node, err);
+               pr_warning("XENBUS Failed to release watch %s: %i\n",
+                          watch->node, err);
  
         up_read(&xs_state.watch_mutex);
  
@@ -692,6 +723,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
  }
  EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
  
+#if !defined(CONFIG_XEN) || defined(CONFIG_PM_SLEEP)
  void xs_suspend(void)
  {
         transaction_suspend();
@@ -705,7 +737,9 @@ void xs_resume(void)
         struct xenbus_watch *watch;
         char token[sizeof(watch) * 2 + 1];
  
+#if !defined(CONFIG_XEN) && !defined(MODULE)
         xb_init_comms();
+#endif
  
         mutex_unlock(&xs_state.response_mutex);
         mutex_unlock(&xs_state.request_mutex);
@@ -727,12 +761,34 @@ void xs_suspend_cancel(void)
         up_write(&xs_state.watch_mutex);
         mutex_unlock(&xs_state.transaction_mutex);
  }
+#endif
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+/*
+ * Deliver one stored watch event: invoke the watch's callback with the
+ * event vector, then free both the vector and the message.  @data is the
+ * struct xs_stored_msg, which this function consumes.
+ */
+static int xenwatch_handle_callback(void *data)
+{
+       struct xs_stored_msg *msg = data;
+
+       msg->u.watch.handle->callback(msg->u.watch.handle,
+                                     (const char **)msg->u.watch.vec,
+                                     msg->u.watch.vec_size);
+
+       kfree(msg->u.watch.vec);
+       kfree(msg);
+
+       /* Kill this kthread if we were spawned just for this callback. */
+       if (current->pid != xenwatch_pid)
+               do_exit(0);
+
+       return 0;
+}
+#endif
  
  static int xenwatch_thread(void *unused)
  {
         struct list_head *ent;
         struct xs_stored_msg *msg;
  
+       current->flags |= PF_NOFREEZE;
         for (;;) {
                 wait_event_interruptible(watch_events_waitq,
                                          !list_empty(&watch_events));
@@ -748,17 +804,39 @@ static int xenwatch_thread(void *unused)
                         list_del(ent);
                 spin_unlock(&watch_events_lock);
  
-               if (ent != &watch_events) {
-                       msg = list_entry(ent, struct xs_stored_msg, list);
-                       msg->u.watch.handle->callback(
-                               msg->u.watch.handle,
-                               (const char **)msg->u.watch.vec,
-                               msg->u.watch.vec_size);
-                       kfree(msg->u.watch.vec);
-                       kfree(msg);
+               if (ent == &watch_events) {
+                       mutex_unlock(&xenwatch_mutex);
+                       continue;
                 }
  
+               msg = list_entry(ent, struct xs_stored_msg, list);
+
+#if defined(CONFIG_XEN) || defined(MODULE)
+               /*
+                * Unlock the mutex before running an XBWF_new_thread
+                * handler. kthread_run can block which can deadlock
+                * against unregister_xenbus_watch() if we need to
+                * unregister other watches in order to make
+                * progress. This can occur on resume before the swap
+                * device is attached.
+                */
+               if (msg->u.watch.handle->flags & XBWF_new_thread) {
+                       mutex_unlock(&xenwatch_mutex);
+                       kthread_run(xenwatch_handle_callback,
+                                   msg, "xenwatch_cb");
+               } else {
+                       xenwatch_handle_callback(msg);
+                       mutex_unlock(&xenwatch_mutex);
+               }
+#else
+               msg->u.watch.handle->callback(
+                       msg->u.watch.handle,
+                       (const char **)msg->u.watch.vec,
+                       msg->u.watch.vec_size);
                 mutex_unlock(&xenwatch_mutex);
+               kfree(msg->u.watch.vec);
+               kfree(msg);
+#endif
         }
  
         return 0;
@@ -858,11 +936,12 @@ static int xenbus_thread(void *unused)
  {
         int err;
  
+       current->flags |= PF_NOFREEZE;
         for (;;) {
                 err = process_msg();
                 if (err)
-                       printk(KERN_WARNING "XENBUS error %d while reading "
-                              "message\n", err);
+                       pr_warning("XENBUS error %d while reading "
+                                  "message\n", err);
                 if (kthread_should_stop())
                         break;
         }
@@ -870,9 +949,14 @@ static int xenbus_thread(void *unused)
         return 0;
  }
  
-int xs_init(void)
+int
+#ifndef MODULE
+__init
+#else
+__devinit
+#endif
+xs_init(void)
  {
-       int err;
         struct task_struct *task;
  
         INIT_LIST_HEAD(&xs_state.reply_list);
@@ -886,11 +970,6 @@ int xs_init(void)
         atomic_set(&xs_state.transaction_count, 0);
         init_waitqueue_head(&xs_state.transaction_wq);
  
-       /* Initialize the shared memory rings to talk to xenstored */
-       err = xb_init_comms();
-       if (err)
-               return err;
-
         task = kthread_run(xenwatch_thread, NULL, "xenwatch");
         if (IS_ERR(task))
                 return PTR_ERR(task);
@@ -900,5 +979,8 @@ int xs_init(void)
         if (IS_ERR(task))
                 return PTR_ERR(task);
  
+       /* shutdown watches for kexec boot */
+       xs_reset_watches();
+
         return 0;
  }
diff --git a/drivers/xen/xenoprof/xenoprofile.c b/drivers/xen/xenoprof/xenoprofile.c

new file mode 100644 (file)

index 0000000..80541d9
--- /dev/null
+++ b/drivers/xen/xenoprof/xenoprofile.c
@@ -0,0 +1,585 @@
+/**
+ * @file xenoprofile.c
+ *
+ * @remark Copyright 2002 OProfile authors
+ * @remark Read the file COPYING
+ *
+ * @author John Levon <levon@movementarian.org>
+ *
+ * Modified by Aravind Menon and Jose Renato Santos for Xen
+ * These modifications are:
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ *
+ * Separated out arch-generic part
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ */
+
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/smp.h>
+#include <linux/oprofile.h>
+#include <linux/syscore_ops.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable.h>
+#include <xen/evtchn.h>
+#include <xen/xenoprof.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xenoprof.h>
+#include "../../../drivers/oprofile/event_buffer.h"
+
+#define MAX_XENOPROF_SAMPLES 16
+
+/* sample buffers shared with Xen */
+static xenoprof_buf_t **__read_mostly xenoprof_buf;
+/* Shared buffer area */
+static struct xenoprof_shared_buffer shared_buffer;
+
+/* Passive sample buffers shared with Xen */
+static xenoprof_buf_t **__read_mostly p_xenoprof_buf[MAX_OPROF_DOMAINS];
+/* Passive shared buffer area */
+static struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS];
+
+static int xenoprof_start(void);
+static void xenoprof_stop(void);
+
+static int xenoprof_enabled = 0;
+static int xenoprof_is_primary = 0;
+static int active_defined;
+
+extern unsigned long oprofile_backtrace_depth;
+
+/* Number of buffers in shared area (one per VCPU) */
+static int nbuf;
+/* Mapping of VIRQ_XENOPROF to irq number */
+static int ovf_irq = -1;
+static cpumask_var_t ovf_irq_mapped;
+/* cpu model type string - copied from Xen on XENOPROF_init command */
+static char cpu_type[XENOPROF_CPU_TYPE_SIZE];
+
+#ifdef CONFIG_PM_SLEEP
+
+/* syscore suspend callback: stop profiling if it is enabled; returns 0. */
+static int xenoprof_suspend(void)
+{
+       if (xenoprof_enabled != 1)
+               return 0;
+
+       xenoprof_stop();
+       return 0;
+}
+
+
+/*
+ * syscore resume callback: restart profiling if it is enabled.  The result
+ * of xenoprof_start() is not checked.
+ */
+static void xenoprof_resume(void)
+{
+       if (xenoprof_enabled != 1)
+               return;
+
+       xenoprof_start();
+}
+
+
+/* Suspend/resume callbacks registered by init_driverfs(). */
+static struct syscore_ops oprofile_syscore_ops = {
+       .suspend        = xenoprof_suspend,
+       .resume         = xenoprof_resume,
+};
+
+
+/* Register oprofile_syscore_ops with the syscore framework; always returns 0. */
+static int __init init_driverfs(void)
+{
+       register_syscore_ops(&oprofile_syscore_ops);
+       return 0;
+}
+
+
+/* Undo init_driverfs(): unregister oprofile_syscore_ops. */
+static void exit_driverfs(void)
+{
+       unregister_syscore_ops(&oprofile_syscore_ops);
+}
+
+#else
+#define init_driverfs() do { } while (0)
+#define exit_driverfs() do { } while (0)
+#endif /* CONFIG_PM_SLEEP */
+
+static unsigned long long oprofile_samples;
+static unsigned long long p_oprofile_samples;
+
+static unsigned int pdomains;
+static struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS];
+
+/*
+ * Return non-zero when event_log[tail].eip equals XENOPROF_ESCAPE_CODE.
+ * NOTE(review): on 32-bit builds with CONFIG_XEN_COMPAT < 0x040200 the
+ * value truncated to unsigned long is accepted as well -- presumably what
+ * older hypervisors stored; confirm against the Xen interface headers.
+ */
+static int xenoprof_is_escape(xenoprof_buf_t * buf, int tail)
+{
+#if CONFIG_XEN_COMPAT < 0x040200 && !defined(CONFIG_64BIT)
+       if (buf->event_log[tail].eip == (unsigned long)XENOPROF_ESCAPE_CODE)
+               return 1;
+#endif
+       return (buf->event_log[tail].eip == XENOPROF_ESCAPE_CODE);
+}
+
+/* Fetch the event field of the log entry at index @tail. */
+static uint8_t xenoprof_get_event(xenoprof_buf_t *buf, int tail)
+{
+       return buf->event_log[tail].event;
+}
+
+/*
+ * Drain @buf from event_tail up to the event_head value read on entry,
+ * feeding every entry to oprofile, then store the new tail back.
+ *
+ * An escape entry carrying XENOPROF_TRACE_BEGIN switches the rest of this
+ * drain into "tracing" mode: it is reported through oprofile_add_mode() and
+ * counted, while subsequent entries are reported but no longer counted.
+ * Counting goes to p_oprofile_samples when @is_passive is set, otherwise
+ * to oprofile_samples.
+ */
+static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive)
+{
+       unsigned long long *counter;
+       int head = buf->event_head;
+       int tail = buf->event_tail;
+       int size = buf->event_size;
+       int tracing = 0;
+
+       counter = is_passive ? &p_oprofile_samples : &oprofile_samples;
+
+       while (tail != head) {
+               if (xenoprof_is_escape(buf, tail) &&
+                   xenoprof_get_event(buf, tail) == XENOPROF_TRACE_BEGIN) {
+                       tracing = 1;
+                       oprofile_add_mode(buf->event_log[tail].mode);
+                       ++*counter;
+               } else {
+                       oprofile_add_pc(buf->event_log[tail].eip,
+                                       buf->event_log[tail].mode,
+                                       buf->event_log[tail].event);
+                       if (!tracing)
+                               ++*counter;
+               }
+
+               /* Advance with wrap-around at the end of the ring. */
+               if (++tail == size)
+                       tail = 0;
+       }
+       buf->event_tail = tail;
+}
+
+/*
+ * Drain the sample buffers of all passive domains.
+ *
+ * Before the first non-empty buffer of a domain is drained, a domain switch
+ * to that domain is recorded; if recording it fails, processing stops.  If
+ * anything was drained, a final switch to COORDINATOR_DOMAIN is recorded.
+ */
+static void xenoprof_handle_passive(void)
+{
+       int dom, vcpu;
+       int drained_any = 0;
+
+       for (dom = 0; dom < pdomains; dom++) {
+               int announced = 0;
+
+               for (vcpu = 0; vcpu < passive_domains[dom].nbuf; vcpu++) {
+                       xenoprof_buf_t *buf = p_xenoprof_buf[dom][vcpu];
+
+                       /* Nothing pending in this buffer. */
+                       if (buf->event_head == buf->event_tail)
+                               continue;
+
+                       if (!announced) {
+                               if (!oprofile_add_domain_switch(
+                                       passive_domains[dom].domain_id))
+                                       goto done;
+                               announced = 1;
+                       }
+                       xenoprof_add_pc(buf, 1);
+                       drained_any = 1;
+               }
+       }
+done:
+       if (drained_any)
+               oprofile_add_domain_switch(COORDINATOR_DOMAIN);
+}
+
+/*
+ * Interrupt handler installed through ovf_action: drain the current CPU's
+ * sample buffer and, on the primary profiler, the passive-domain buffers.
+ * Always returns IRQ_HANDLED.
+ */
+static irqreturn_t xenoprof_ovf_interrupt(int irq, void *dev_id)
+{
+       struct xenoprof_buf * buf;
+       /* Bit 0 is a try-lock so only one caller at a time drains passives. */
+       static unsigned long flag;
+
+       buf = xenoprof_buf[smp_processor_id()];
+
+       xenoprof_add_pc(buf, 0);
+
+       /* If the bit is already set, passive handling is skipped, not awaited. */
+       if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) {
+               xenoprof_handle_passive();
+               smp_mb__before_clear_bit();
+               clear_bit(0, &flag);
+       }
+
+       return IRQ_HANDLED;
+}
+
+/* irqaction used by bind_virq()/unbind_virq() for the per-CPU VIRQ_XENOPROF. */
+static struct irqaction ovf_action = {
+       .handler = xenoprof_ovf_interrupt,
+       .flags   = IRQF_DISABLED,
+       .name    = "xenoprof"
+};
+
+/*
+ * Unbind ovf_action on every online CPU that is marked in ovf_irq_mapped
+ * (clearing the mark), then reset ovf_irq to -1.
+ */
+static void unbind_virq(void)
+{
+       unsigned int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (!cpumask_test_and_clear_cpu(cpu, ovf_irq_mapped))
+                       continue;
+               unbind_from_per_cpu_irq(ovf_irq, cpu, &ovf_action);
+       }
+       ovf_irq = -1;
+}
+
+
+/*
+ * Bind VIRQ_XENOPROF to ovf_action on every online CPU, marking each
+ * successful binding in ovf_irq_mapped.  All CPUs are expected to yield the
+ * same irq number, which is cached in ovf_irq.
+ *
+ * Returns 0 on success, the negative bind error, or -ESTALE when a CPU
+ * reports a different irq number.  On failure the bindings marked so far
+ * are undone via unbind_virq().
+ */
+static int bind_virq(void)
+{
+       unsigned int i;
+       int result;
+
+       for_each_online_cpu(i) {
+               result = bind_virq_to_irqaction(VIRQ_XENOPROF, i, &ovf_action);
+
+               if (result < 0) {
+                       unbind_virq();
+                       return result;
+               }
+
+               if (ovf_irq < 0)
+                       ovf_irq = result;
+               else if (result != ovf_irq) {
+                       /*
+                        * Report before unbinding: unbind_virq() resets
+                        * ovf_irq to -1, which would falsify the message.
+                        */
+                       pr_err("IRQ%d unexpected (should be %d)\n",
+                              result, ovf_irq);
+                       unbind_virq();
+                       return -ESTALE;
+               }
+               cpumask_set_cpu(i, ovf_irq_mapped);
+       }
+
+       return 0;
+}
+
+
+/*
+ * Size in bytes of an array of @nbuf buffer pointers.  The array stores
+ * pointers, so it is sized by the pointer type rather than by the buffer
+ * structure.  Shared by the allocator and the releaser below so that both
+ * always agree on which allocator was used.
+ */
+static size_t buffer_array_size(unsigned int nbuf)
+{
+       return nbuf * sizeof(xenoprof_buf_t *);
+}
+
+/*
+ * Allocate an uninitialised array of @nbuf buffer pointers: kmalloc() when
+ * it fits in one page, vmalloc() otherwise.  Returns NULL on failure.
+ */
+static xenoprof_buf_t **get_buffer_array(unsigned int nbuf)
+{
+       size_t size = buffer_array_size(nbuf);
+
+       if (size <= PAGE_SIZE)
+               return kmalloc(size, GFP_KERNEL);
+       return vmalloc(size);
+}
+
+/*
+ * Free an array obtained from get_buffer_array(); @nbuf must be the count
+ * it was allocated with, as it selects kfree() versus vfree().
+ */
+static void release_buffer_array(xenoprof_buf_t **buf, unsigned int nbuf)
+{
+       if (buffer_array_size(nbuf) <= PAGE_SIZE)
+               kfree(buf);
+       else
+               vfree(buf);
+}
+
+
+/*
+ * Drop every passive domain currently tracked: unmap its shared buffer,
+ * free its buffer-pointer array, and reset pdomains to 0.
+ */
+static void unmap_passive_list(void)
+{
+       int dom;
+
+       for (dom = 0; dom < pdomains; ++dom) {
+               xenoprof_buf_t **bufs = p_xenoprof_buf[dom];
+
+               xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[dom]);
+               release_buffer_array(bufs, passive_domains[dom].nbuf);
+       }
+       pdomains = 0;
+}
+
+
+/*
+ * Map the sample area shared with Xen and build xenoprof_buf[], the table
+ * of per-buffer pointers indexed by each buffer's vcpu_id.
+ *
+ * Returns 0 immediately if shared_buffer.buffer is already set.  Otherwise
+ * returns the arch mapping error, -ENOMEM if the pointer table cannot be
+ * allocated (the mapping is undone), or 0 on success.
+ */
+static int map_xenoprof_buffer(int max_samples)
+{
+       struct xenoprof_get_buffer get_buffer;
+       int idx;
+       int ret;
+
+       if (shared_buffer.buffer)
+               return 0;
+
+       get_buffer.max_samples = max_samples;
+       ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer);
+       if (ret)
+               return ret;
+       nbuf = get_buffer.nbuf;
+
+       xenoprof_buf = get_buffer_array(nbuf);
+       if (!xenoprof_buf) {
+               xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+               return -ENOMEM;
+       }
+
+       /* Buffers lie back to back, get_buffer.bufsize bytes apart. */
+       for (idx = 0; idx < nbuf; idx++) {
+               struct xenoprof_buf *buf = (struct xenoprof_buf *)
+                       &shared_buffer.buffer[idx * get_buffer.bufsize];
+
+               BUG_ON(buf->vcpu_id >= nbuf);
+               xenoprof_buf[buf->vcpu_id] = buf;
+       }
+
+       return 0;
+}
+
+
+/*
+ * oprofile ->setup hook: map the shared sample buffers, bind the overflow
+ * VIRQ, let the primary profiler configure domains/backtrace/counters/events
+ * in the hypervisor, and finally enable the VIRQ.
+ *
+ * Returns 0 on success or the first error encountered.  Every error path
+ * unmaps the shared buffer in addition to freeing xenoprof_buf: otherwise a
+ * later call would take the "already mapped" early return in
+ * map_xenoprof_buffer() and run with a freed xenoprof_buf.
+ */
+static int xenoprof_setup(void)
+{
+       int ret;
+
+       ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES);
+       if (ret)
+               return ret;
+
+       ret = bind_virq();
+       if (ret)
+               goto err_unmap;
+
+       if (xenoprof_is_primary) {
+               /* Define dom0 as an active domain if not done yet */
+               if (!active_defined) {
+                       domid_t domid;
+                       ret = HYPERVISOR_xenoprof_op(
+                               XENOPROF_reset_active_list, NULL);
+                       if (ret)
+                               goto err_unbind;
+                       domid = 0;
+                       ret = HYPERVISOR_xenoprof_op(
+                               XENOPROF_set_active, &domid);
+                       if (ret)
+                               goto err_unbind;
+                       active_defined = 1;
+               }
+
+               /* A rejected backtrace depth is not fatal: fall back to 0. */
+               if (oprofile_backtrace_depth > 0) {
+                       ret = HYPERVISOR_xenoprof_op(XENOPROF_set_backtrace,
+                                                    &oprofile_backtrace_depth);
+                       if (ret)
+                               oprofile_backtrace_depth = 0;
+               }
+
+               ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL);
+               if (ret)
+                       goto err_unbind;
+
+               xenoprof_arch_counter();
+               ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL);
+               if (ret)
+                       goto err_unbind;
+       }
+
+       ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL);
+       if (ret)
+               goto err_unbind;
+
+       xenoprof_enabled = 1;
+       return 0;
+
+err_unbind:
+       unbind_virq();
+err_unmap:
+       /* Same teardown order as xenoprof_shutdown(): unmap, then free. */
+       xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+       release_buffer_array(xenoprof_buf, nbuf);
+       return ret;
+}
+
+
+/*
+ * oprofile ->shutdown hook: mark profiling disabled, disable the VIRQ in
+ * the hypervisor, release counters (primary only), unbind the VIRQ and
+ * release the shared and passive buffers.  Hypercall failures only WARN.
+ */
+static void xenoprof_shutdown(void)
+{
+       xenoprof_enabled = 0;
+
+       WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL));
+
+       if (xenoprof_is_primary) {
+               WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_release_counters,
+                                              NULL));
+               /* Forces xenoprof_setup() to define the active list again. */
+               active_defined = 0;
+       }
+
+       unbind_virq();
+
+       xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+       if (xenoprof_is_primary)
+               unmap_passive_list();
+       release_buffer_array(xenoprof_buf, nbuf);
+}
+
+
+/*
+ * oprofile ->start hook.  The primary profiler first issues XENOPROF_start
+ * and returns its error on failure; otherwise the arch start hook runs and
+ * 0 is returned.
+ */
+static int xenoprof_start(void)
+{
+       if (xenoprof_is_primary) {
+               int ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL);
+
+               if (ret)
+                       return ret;
+       }
+
+       xenoprof_arch_start();
+       return 0;
+}
+
+
+/*
+ * oprofile ->stop hook: the primary profiler issues XENOPROF_stop (a
+ * failure only WARNs), then the arch stop hook runs in every case.
+ */
+static void xenoprof_stop(void)
+{
+       if (xenoprof_is_primary)
+               WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL));
+       xenoprof_arch_stop();
+}
+
+
+/*
+ * oprofile ->set_active hook: program the hypervisor's active-domain list
+ * from @active_domains (@adomains entries); dom0 is appended if absent.
+ *
+ * A non-primary profiler returns 0 without doing anything.  Returns -E2BIG
+ * for more than MAX_OPROF_DOMAINS entries, -EINVAL for an id that does not
+ * fit a domid_t, or a hypercall error.  On failure after the initial reset
+ * the list is reset again; active_defined reflects the final outcome.
+ */
+static int xenoprof_set_active(int *active_domains,
+                              unsigned int adomains)
+{
+       int have_dom0 = 0;
+       domid_t domid;
+       int idx;
+       int ret;
+
+       if (!xenoprof_is_primary)
+               return 0;
+
+       if (adomains > MAX_OPROF_DOMAINS)
+               return -E2BIG;
+
+       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
+       if (ret)
+               return ret;
+
+       for (idx = 0; idx < adomains; idx++) {
+               const int wanted = active_domains[idx];
+
+               /* Reject ids that change value when stored in a domid_t. */
+               domid = wanted;
+               if (domid != wanted) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+               if (ret)
+                       goto out;
+
+               if (wanted == 0)
+                       have_dom0 = 1;
+       }
+
+       /* dom0 must always be active but may not be in the list */
+       if (!have_dom0) {
+               domid = 0;
+               ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
+       }
+
+out:
+       if (ret)
+               WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list,
+                                              NULL));
+       active_defined = !ret;
+       return ret;
+}
+
+/*
+ * oprofile ->set_passive hook: replace the set of passive domains with the
+ * @pdoms ids in @p_domains, mapping each domain's shared sample area and
+ * building its vcpu_id-indexed buffer-pointer table.
+ *
+ * A non-primary profiler returns 0 without doing anything.  Returns -E2BIG
+ * for more than MAX_OPROF_DOMAINS entries, a hypercall/arch error, -ENOMEM
+ * if a pointer table cannot be allocated, or 0 on success.  On failure all
+ * entries set up by this call are torn down again.
+ */
+static int xenoprof_set_passive(int * p_domains,
+                                unsigned int pdoms)
+{
+       int ret;
+       unsigned int i, j;
+       struct xenoprof_buf *buf;
+
+       if (!xenoprof_is_primary)
+               return 0;
+
+       if (pdoms > MAX_OPROF_DOMAINS)
+               return -E2BIG;
+
+       ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
+       if (ret)
+               return ret;
+       unmap_passive_list();
+
+       for (i = 0; i < pdoms; i++) {
+               passive_domains[i].domain_id = p_domains[i];
+               passive_domains[i].max_samples = 2048;
+               ret = xenoprof_arch_set_passive(&passive_domains[i],
+                                               &p_shared_buffer[i]);
+               if (ret)
+                       goto out;
+
+               p_xenoprof_buf[i] = get_buffer_array(passive_domains[i].nbuf);
+               if (!p_xenoprof_buf[i]) {
+                       /* Entry i is mapped already: include it in the unwind. */
+                       ++i;
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               for (j = 0; j < passive_domains[i].nbuf; j++) {
+                       buf = (struct xenoprof_buf *)
+                               &p_shared_buffer[i].buffer[
+                               j * passive_domains[i].bufsize];
+                       BUG_ON(buf->vcpu_id >= passive_domains[i].nbuf);
+                       p_xenoprof_buf[i][buf->vcpu_id] = buf;
+               }
+       }
+
+       pdomains = pdoms;
+       return 0;
+
+out:
+       /*
+        * Unwind entries [0, i).  Each entry must be addressed through the
+        * loop variable j; indexing with i would hit the same (possibly
+        * out-of-range) slot on every iteration and leak all the others.
+        */
+       for (j = 0; j < i; j++) {
+               xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
+               release_buffer_array(p_xenoprof_buf[j],
+                                    passive_domains[j].nbuf);
+       }
+
+       return ret;
+}
+
+
+/*
+ * Placeholder ->backtrace hook so that oprofile has one to install; the
+ * real backtrace is done in Xen.  Reaching this function is a bug.
+ */
+static void xenoprof_dummy_backtrace(struct pt_regs * const regs,
+                                    unsigned int depth)
+{
+       /* this should never be called */
+       BUG();
+}
+
+
+/*
+ * Operations table that xenoprofile_init() copies into the caller's
+ * oprofile_operations; its .cpu_type member is filled in there as well.
+ */
+static struct oprofile_operations xenoprof_ops = {
+#ifdef HAVE_XENOPROF_CREATE_FILES
+       .create_files   = xenoprof_create_files,
+#endif
+       .set_active     = xenoprof_set_active,
+       .set_passive    = xenoprof_set_passive,
+       .setup          = xenoprof_setup,
+       .shutdown       = xenoprof_shutdown,
+       .start          = xenoprof_start,
+       .stop           = xenoprof_stop,
+       .backtrace      = xenoprof_dummy_backtrace
+};
+
+
+/* in order to get driverfs right */
+static int using_xenoprof;
+
+/*
+ * Probe the hypervisor with XENOPROF_init and, on success, publish
+ * xenoprof_ops through @ops, record whether this domain is the primary
+ * profiler, and register the suspend/resume hooks.
+ *
+ * Returns 0 on success, -ENOMEM if the CPU mask cannot be allocated, or the
+ * hypercall error (in which case the CPU mask is freed again and @ops is
+ * left untouched).
+ */
+int __init xenoprofile_init(struct oprofile_operations * ops)
+{
+       /* Zeroed so the summary below never prints stack garbage on failure. */
+       struct xenoprof_init init = { .num_events = 0 };
+       int ret;
+
+       if (!zalloc_cpumask_var(&ovf_irq_mapped, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
+       if (!ret) {
+               xenoprof_arch_init_counter(&init);
+               xenoprof_is_primary = init.is_primary;
+
+               /* cpu_type is detected by Xen */
+               strlcpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE);
+               xenoprof_ops.cpu_type = cpu_type;
+
+               init_driverfs();
+               using_xenoprof = 1;
+               *ops = xenoprof_ops;
+
+               active_defined = 0;
+       } else
+               free_cpumask_var(ovf_irq_mapped);
+
+       pr_info("%s: ret %d, events %d, xenoprof_is_primary %d\n",
+               __func__, ret, init.num_events, xenoprof_is_primary);
+       return ret;
+}
+
+
+/*
+ * Tear down what xenoprofile_init() set up: unregister the suspend/resume
+ * hooks, unmap the shared buffer, drop passive domains and shut the
+ * hypervisor side down (primary only), and free the CPU mask.
+ *
+ * Does nothing unless initialisation succeeded: a failed init has already
+ * freed ovf_irq_mapped, so freeing it again here would be a double free.
+ */
+void xenoprofile_exit(void)
+{
+       if (!using_xenoprof)
+               return;
+
+       exit_driverfs();
+
+       xenoprof_arch_unmap_shared_buffer(&shared_buffer);
+       if (xenoprof_is_primary) {
+               unmap_passive_list();
+               WARN_ON(HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL));
+       }
+
+       free_cpumask_var(ovf_irq_mapped);
+}
diff --git a/fs/Kconfig b/fs/Kconfig

index f95ae3a..1c2aa9e 100644 (file)
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -34,6 +34,9 @@ config FS_MBCACHE
  source "fs/reiserfs/Kconfig"
  source "fs/jfs/Kconfig"
  
+config FS_RICHACL
+       bool
+
  source "fs/xfs/Kconfig"
  source "fs/gfs2/Kconfig"
  source "fs/ocfs2/Kconfig"
@@ -166,6 +169,7 @@ config HUGETLBFS
         bool "HugeTLB file system support"
         depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
                    SYS_SUPPORTS_HUGETLBFS || BROKEN
+       depends on !XEN
         help
           hugetlbfs is a filesystem backing for HugeTLB pages, based on
           ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Makefile b/fs/Makefile

index 2fb9779..bdd6944 100644 (file)
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,9 @@ obj-$(CONFIG_FS_POSIX_ACL)    += posix_acl.o xattr_acl.o
  obj-$(CONFIG_NFS_COMMON)       += nfs_common/
  obj-$(CONFIG_GENERIC_ACL)      += generic_acl.o
  
+obj-$(CONFIG_FS_RICHACL)       += richacl.o
+richacl-y                      := richacl_base.o richacl_inode.o richacl_xattr.o
+
  obj-$(CONFIG_FHANDLE)          += fhandle.o
  
  obj-y                          += quota/
diff --git a/fs/aio.c b/fs/aio.c

index 67a6db3..c51a272 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,11 @@
  #include <asm/kmap_types.h>
  #include <asm/uaccess.h>
  
+#ifdef CONFIG_EPOLL
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#endif
+
  #if DEBUG > 1
  #define dprintk                printk
  #else
@@ -1053,6 +1058,11 @@ put_rq:
         if (waitqueue_active(&ctx->wait))
                 wake_up(&ctx->wait);
  
+#ifdef CONFIG_EPOLL
+       if (ctx->file && waitqueue_active(&ctx->poll_wait))
+               wake_up(&ctx->poll_wait);
+#endif
+
         spin_unlock_irqrestore(&ctx->ctx_lock, flags);
         return ret;
  }
@@ -1061,6 +1071,8 @@ EXPORT_SYMBOL(aio_complete);
  /* aio_read_evt
   *     Pull an event off of the ioctx's event ring.  Returns the number of 
   *     events fetched (0 or 1 ;-)
+ *     If ent parameter is 0, just returns the number of events that would
+ *     be fetched.
   *     FIXME: make this use cmpxchg.
   *     TODO: make the ringbuffer user mmap()able (requires FIXME).
   */
@@ -1083,13 +1095,17 @@ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
  
         head = ring->head % info->nr;
         if (head != ring->tail) {
-               struct io_event *evp = aio_ring_event(info, head);
-               *ent = *evp;
-               head = (head + 1) % info->nr;
-               smp_mb(); /* finish reading the event before updatng the head */
-               ring->head = head;
-               ret = 1;
-               put_aio_ring_event(evp);
+               if (ent) { /* event requested */
+                       struct io_event *evp = aio_ring_event(info, head);
+                       *ent = *evp;
+                       head = (head + 1) % info->nr;
+                       /* finish reading the event before updating the head */
+                       smp_mb();
+                       ring->head = head;
+                       ret = 1;
+                       put_aio_ring_event(evp);
+               } else /* only need to know availability */
+                       ret = 1;
         }
         spin_unlock(&info->ring_lock);
  
@@ -1274,6 +1290,15 @@ static void io_destroy(struct kioctx *ioctx)
  
         kill_ctx(ioctx);
  
+#ifdef CONFIG_EPOLL
+       /*
+        * Forget the poll file, but it's up to the user to close it.
+        * Detach the file from the context before dropping our reference:
+        * the fput() may be the final one, so the file must not be touched
+        * afterwards and its release hook must not find a stale ioctx in
+        * private_data.
+        */
+       if (ioctx->file) {
+               struct file *file = ioctx->file;
+
+               file->private_data = NULL;
+               ioctx->file = NULL;
+               fput(file);
+       }
+#endif
+
         /*
          * Wake up any waiters.  The setting of ctx->dead must be seen
          * by other CPUs at this point.  Right now, we rely on the
@@ -1282,6 +1307,70 @@ static void io_destroy(struct kioctx *ioctx)
         wake_up_all(&ioctx->wait);
  }
  
+#ifdef CONFIG_EPOLL
+
+/*
+ * ->release() for the aio poll file: detach the file from its kioctx if
+ * it is still attached.  Always returns 0.
+ */
+static int aio_queue_fd_close(struct inode *inode, struct file *file)
+{
+       struct kioctx *ctx = file->private_data;
+
+       /* already detached: nothing left to undo */
+       if (!ctx)
+               return 0;
+
+       file->private_data = NULL;
+
+       /* ctx->file is cleared under ctx_lock */
+       spin_lock_irq(&ctx->ctx_lock);
+       ctx->file = NULL;
+       spin_unlock_irq(&ctx->ctx_lock);
+
+       /* NOTE(review): presumably drops the reference taken in make_aio_fd() - confirm */
+       fput(file);
+       return 0;
+}
+
+/*
+ * ->poll() for the aio poll file.  Reports POLLIN | POLLRDNORM when
+ * aio_read_evt() says an event is available, 0 otherwise (including when
+ * the file is no longer attached to a context).
+ */
+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
+{
+       struct kioctx *ctx = file->private_data;
+       unsigned int mask = 0;
+
+       if (!ctx)
+               return 0;
+
+       spin_lock_irq(&ctx->ctx_lock);
+       /* register on the context's poll wait queue */
+       poll_wait(file, &ctx->poll_wait, wait);
+
+       /* a NULL event pointer only asks whether an event is pending */
+       if (aio_read_evt(ctx, 0))
+               mask = POLLIN | POLLRDNORM;
+       spin_unlock_irq(&ctx->ctx_lock);
+
+       return mask;
+}
+
+/* file operations of the anonymous "[aioq]" file created by make_aio_fd() */
+static const struct file_operations aioq_fops = {
+       .release        = aio_queue_fd_close,
+       .poll           = aio_queue_fd_poll
+};
+
+/* make_aio_fd:
+ *  Create a file descriptor that can be used to poll the event queue.
+ *  Based on the excellent epoll code.
+ *
+ *  Returns the new file descriptor, or a negative errno on failure.
+ */
+
+static int make_aio_fd(struct kioctx *ioctx)
+{
+       int fd;
+       struct file *file;
+
+       /*
+        * Initialise the wait queue before the fd is installed and before
+        * ioctx->file is published: both ->poll() on the new fd and the
+        * completion path use poll_wait as soon as they can see the file.
+        */
+       init_waitqueue_head(&ioctx->poll_wait);
+
+       fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
+       if (fd < 0)
+               return fd;
+
+       /* associate the file with the IO context */
+       file = fget(fd);
+       if (!file)
+               return -EBADF;
+       file->private_data = ioctx;
+       ioctx->file = file;
+       return fd;
+}
+#endif
+
  /* sys_io_setup:
   *     Create an aio_context capable of receiving at least nr_events.
   *     ctxp must not point to an aio_context that already exists, and
@@ -1294,18 +1383,30 @@ static void io_destroy(struct kioctx *ioctx)
   *     resources are available.  May fail with -EFAULT if an invalid
   *     pointer is passed for ctxp.  Will fail with -ENOSYS if not
   *     implemented.
+ *
+ *     To request a selectable fd, the user context has to be initialized
+ *     to 1, instead of 0, and the return value is the fd.
+ *     This keeps the system call compatible, since a non-zero value
+ *     was not allowed so far.
   */
  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
  {
         struct kioctx *ioctx = NULL;
         unsigned long ctx;
         long ret;
+       int make_fd = 0;
  
         ret = get_user(ctx, ctxp);
         if (unlikely(ret))
                 goto out;
  
         ret = -EINVAL;
+#ifdef CONFIG_EPOLL
+       if (ctx == 1) {
+               make_fd = 1;
+               ctx = 0;
+       }
+#endif
         if (unlikely(ctx || nr_events == 0)) {
                 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
                          ctx, nr_events);
@@ -1316,7 +1417,11 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
         ret = PTR_ERR(ioctx);
         if (!IS_ERR(ioctx)) {
                 ret = put_user(ioctx->user_id, ctxp);
-               if (ret)
+#ifdef CONFIG_EPOLL
+               if (make_fd && !ret)
+                       ret = make_aio_fd(ioctx);
+#endif
+               if (ret < 0)
                         io_destroy(ioctx);
                 put_ioctx(ioctx);
         }
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c

index debdfe0..b9bb0d7 100644 (file)
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -116,6 +116,13 @@
  #include <asm/fbio.h>
  #endif
  
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+#include <xen/public/evtchn.h>
+#include <xen/public/privcmd.h>
+#include <xen/compat_ioctl.h>
+#endif
+
  static int w_long(unsigned int fd, unsigned int cmd,
                 compat_ulong_t __user *argp)
  {
@@ -1414,6 +1421,16 @@ IGNORE_IOCTL(FBIOGETCMAP32)
  IGNORE_IOCTL(FBIOSCURSOR32)
  IGNORE_IOCTL(FBIOGCURSOR32)
  #endif
+
+#ifdef CONFIG_XEN
+COMPATIBLE_IOCTL(IOCTL_PRIVCMD_HYPERCALL)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_VIRQ)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_INTERDOMAIN)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_BIND_UNBOUND_PORT)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_UNBIND)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_NOTIFY)
+COMPATIBLE_IOCTL(IOCTL_EVTCHN_RESET)
+#endif
  };
  
  /*
@@ -1470,6 +1487,12 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
                 return do_video_stillpicture(fd, cmd, argp);
         case VIDEO_SET_SPU_PALETTE:
                 return do_video_set_spu_palette(fd, cmd, argp);
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       case IOCTL_PRIVCMD_MMAP_32:
+       case IOCTL_PRIVCMD_MMAPBATCH_32:
+       case IOCTL_PRIVCMD_MMAPBATCH_V2_32:
+               return privcmd_ioctl_32(fd, cmd, argp);
+#endif
         }
  
         /*
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig

index 9ed1bb1..da7a332 100644 (file)
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -83,3 +83,13 @@ config EXT4_DEBUG
  
           If you select Y here, then you will be able to turn on debugging
           with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
+
+config EXT4_FS_RICHACL
+      bool "Ext4 Rich Access Control Lists (EXPERIMENTAL)"
+      depends on EXT4_FS_XATTR && EXPERIMENTAL
+      select FS_RICHACL
+      help
+       Rich ACLs are an implementation of NFSv4 ACLs, extended by file masks
+       to fit into the standard POSIX file permission model.  They are
+       designed to work seamlessly locally as well as across the NFSv4 and
+       CIFS/SMB2 network file system protocols.
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile

index 56fd8f8..b757ccc 100644 (file)
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -12,3 +12,4 @@ ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
  ext4-$(CONFIG_EXT4_FS_XATTR)           += xattr.o xattr_user.o xattr_trusted.o
  ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
  ext4-$(CONFIG_EXT4_FS_SECURITY)                += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_RICHACL)         += richacl.o
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index 0e01e90..2586f93 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -908,6 +908,10 @@ struct ext4_inode_info {
          */
         tid_t i_sync_tid;
         tid_t i_datasync_tid;
+#ifdef CONFIG_EXT4_FS_RICHACL
+       struct richacl   *i_richacl;
+#endif
+
  };
  
  /*
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index cb70f18..8aac615 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
  #include "ext4_jbd2.h"
  #include "xattr.h"
  #include "acl.h"
+#include "richacl.h"
  
  /*
   * Called when an inode is released. Note that this is different
@@ -258,5 +259,8 @@ const struct inode_operations ext4_file_inode_operations = {
  #endif
         .get_acl        = ext4_get_acl,
         .fiemap         = ext4_fiemap,
+       .permission     = ext4_permission,
+       .may_create     = ext4_may_create,
+       .may_delete     = ext4_may_delete,
  };
  
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

index 409c2ee..251be03 100644 (file)
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -28,6 +28,7 @@
  #include "ext4_jbd2.h"
  #include "xattr.h"
  #include "acl.h"
+#include "richacl.h"
  
  #include <trace/events/ext4.h>
  
@@ -861,7 +862,11 @@ got:
         if (err)
                 goto fail_drop;
  
-       err = ext4_init_acl(handle, inode, dir);
+       if (EXT4_IS_RICHACL(dir))
+               err = ext4_init_richacl(handle, inode, dir);
+       else
+               err = ext4_init_acl(handle, inode, dir);
+
         if (err)
                 goto fail_free_drop;
  
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index c77b0bd..0bd1795 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,6 +42,7 @@
  #include "xattr.h"
  #include "acl.h"
  #include "truncate.h"
+#include "richacl.h"
  
  #include <trace/events/ext4.h>
  
@@ -3654,6 +3655,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
  
         ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
+#ifdef CONFIG_EXT4_FS_RICHACL
+       ei->i_richacl = EXT4_RICHACL_NOT_CACHED;
+#endif
         ei->i_dir_start_lookup = 0;
         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
         /* We now have enough fields to check if the inode was active or not.
@@ -4078,7 +4082,11 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         int orphan = 0;
         const unsigned int ia_valid = attr->ia_valid;
  
-       error = inode_change_ok(inode, attr);
+       if (EXT4_IS_RICHACL(inode))
+               error = richacl_inode_change_ok(inode, attr,
+                                               ext4_richacl_permission);
+       else
+               error = inode_change_ok(inode, attr);
         if (error)
                 return error;
  
@@ -4178,9 +4186,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
         if (orphan && inode->i_nlink)
                 ext4_orphan_del(NULL, inode);
  
-       if (!rc && (ia_valid & ATTR_MODE))
-               rc = ext4_acl_chmod(inode);
-
+       if (!rc && (ia_valid & ATTR_MODE)) {
+               if (EXT4_IS_RICHACL(inode))
+                       rc = ext4_richacl_chmod(inode);
+               else
+                       rc = ext4_acl_chmod(inode);
+       }
  err_out:
         ext4_std_error(inode->i_sb, error);
         if (!error)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c

index 349d7b3..3265d4a 100644 (file)
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -39,6 +39,7 @@
  
  #include "xattr.h"
  #include "acl.h"
+#include "richacl.h"
  
  #include <trace/events/ext4.h>
  /*
@@ -2587,6 +2588,9 @@ const struct inode_operations ext4_dir_inode_operations = {
  #endif
         .get_acl        = ext4_get_acl,
         .fiemap         = ext4_fiemap,
+       .permission     = ext4_permission,
+       .may_create     = ext4_may_create,
+       .may_delete     = ext4_may_delete,
  };
  
  const struct inode_operations ext4_special_inode_operations = {
@@ -2598,4 +2602,7 @@ const struct inode_operations ext4_special_inode_operations = {
         .removexattr    = generic_removexattr,
  #endif
         .get_acl        = ext4_get_acl,
+       .permission     = ext4_permission,
+       .may_create     = ext4_may_create,
+       .may_delete     = ext4_may_delete,
  };
diff --git a/fs/ext4/richacl.c b/fs/ext4/richacl.c

new file mode 100644 (file)

index 0000000..0cc7d12
--- /dev/null
+++ b/fs/ext4/richacl.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/richacl_xattr.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+#include "xattr.h"
+#include "acl.h"
+#include "richacl.h"
+
+/*
+ * Look up the richacl cached in the in-core inode.  Returns the cached
+ * value passed through richacl_get(), or EXT4_RICHACL_NOT_CACHED if
+ * nothing has been cached yet.
+ */
+static inline struct richacl *
+ext4_iget_richacl(struct inode *inode)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct richacl *cached;
+
+       spin_lock(&inode->i_lock);
+       cached = ei->i_richacl;
+       if (cached != EXT4_RICHACL_NOT_CACHED)
+               cached = richacl_get(cached);
+       spin_unlock(&inode->i_lock);
+
+       return cached;
+}
+
+/*
+ * Replace the richacl cached in the in-core inode with @acl, putting
+ * whatever was cached before.
+ */
+static inline void
+ext4_iset_richacl(struct inode *inode, struct richacl *acl)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct richacl *old;
+
+       spin_lock(&inode->i_lock);
+       old = ei->i_richacl;
+       /* the "not cached" marker is not a real acl and must not be put */
+       if (old != EXT4_RICHACL_NOT_CACHED)
+               richacl_put(old);
+       ei->i_richacl = richacl_get(acl);
+       spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Return the richacl of @inode, from the in-core cache if possible and
+ * otherwise by reading the richacl xattr (caching a successful read).
+ *
+ * Returns NULL when the xattr read reports -ENODATA or -ENOSYS, and an
+ * ERR_PTR() on failure: -EOPNOTSUPP for non-richacl inodes, -ENOMEM,
+ * -EIO for an xattr that richacl_from_xattr() rejects with -EINVAL, or
+ * the error from the xattr read.
+ */
+static struct richacl *
+ext4_get_richacl(struct inode *inode)
+{
+       struct richacl *acl;
+       void *buf = NULL;
+       int len;
+
+       if (!IS_RICHACL(inode))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       acl = ext4_iget_richacl(inode);
+       if (acl != EXT4_RICHACL_NOT_CACHED)
+               return acl;
+
+       /* first call only asks for the size, second one fetches the value */
+       len = ext4_xattr_get(inode, EXT4_XATTR_INDEX_RICHACL, "", NULL, 0);
+       if (len > 0) {
+               buf = kmalloc(len, GFP_KERNEL);
+               if (!buf)
+                       return ERR_PTR(-ENOMEM);
+               len = ext4_xattr_get(inode, EXT4_XATTR_INDEX_RICHACL, "",
+                                    buf, len);
+       }
+
+       if (len > 0) {
+               acl = richacl_from_xattr(buf, len);
+               if (acl == ERR_PTR(-EINVAL))
+                       acl = ERR_PTR(-EIO);
+       } else {
+               switch (len) {
+               case -ENODATA:
+               case -ENOSYS:
+                       acl = NULL;
+                       break;
+               default:
+                       acl = ERR_PTR(len);
+                       break;
+               }
+       }
+       kfree(buf);
+
+       /* only a successfully parsed acl is cached */
+       if (IS_ERR_OR_NULL(acl))
+               return acl;
+
+       ext4_iset_richacl(inode, acl);
+       return acl;
+}
+
+/*
+ * Store @acl as the richacl xattr of @inode and update the in-core cache.
+ *
+ * If richacl_equiv_mode() returns 0 for @acl, the resulting mode is
+ * written to the inode and a NULL acl is stored instead.  With a
+ * non-NULL @handle the xattr is set within that handle, otherwise
+ * ext4_xattr_set() is used.
+ *
+ * NOTE(review): @handle can be NULL when ext4_mark_inode_dirty() is
+ * called below (ext4_richacl_chmod() passes NULL) - confirm that is
+ * tolerated.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from setting the xattr.
+ */
+static int
+ext4_set_richacl(handle_t *handle, struct inode *inode, struct richacl *acl)
+{
+       void *buf = NULL;
+       size_t len = 0;
+       int err;
+
+       if (acl) {
+               mode_t mode = inode->i_mode;
+
+               if (!richacl_equiv_mode(acl, &mode)) {
+                       inode->i_mode = mode;
+                       ext4_mark_inode_dirty(handle, inode);
+                       acl = NULL;
+               }
+       }
+
+       if (acl) {
+               len = richacl_xattr_size(acl);
+               buf = kmalloc(len, GFP_KERNEL);
+               if (!buf)
+                       return -ENOMEM;
+               richacl_to_xattr(acl, buf);
+       }
+
+       err = handle ?
+               ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_RICHACL,
+                                     "", buf, len, 0) :
+               ext4_xattr_set(inode, EXT4_XATTR_INDEX_RICHACL, "", buf,
+                              len, 0);
+       kfree(buf);
+       if (err)
+               return err;
+
+       ext4_iset_richacl(inode, acl);
+       return 0;
+}
+
+/*
+ * Check @mask against the richacl of @inode via
+ * richacl_inode_permission().  Must only be called on richacl inodes.
+ *
+ * Returns the result of the check, or the error from fetching the acl.
+ */
+int
+ext4_richacl_permission(struct inode *inode, unsigned int mask)
+{
+       struct richacl *acl;
+       int err;
+
+       BUG_ON(!IS_RICHACL(inode));
+
+       acl = ext4_get_richacl(inode);
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+
+       /* acl may be NULL here; it is passed on unchanged */
+       err = richacl_inode_permission(inode, acl, mask);
+       richacl_put(acl);
+
+       return err;
+}
+
+/*
+ * ->permission hook: richacl inodes are checked against their richacl
+ * (after converting @mask with richacl_want_to_mask()), all others go
+ * through generic_permission().
+ */
+int ext4_permission(struct inode *inode, int mask)
+{
+       if (!IS_RICHACL(inode))
+               return generic_permission(inode, mask);
+
+       return ext4_richacl_permission(inode, richacl_want_to_mask(mask));
+}
+
+/*
+ * ->may_create hook: defers to richacl_may_create() with
+ * ext4_richacl_permission() as the permission callback.
+ */
+int ext4_may_create(struct inode *dir, int isdir)
+{
+       return richacl_may_create(dir, isdir, ext4_richacl_permission);
+}
+
+/*
+ * ->may_delete hook: defers to richacl_may_delete() with
+ * ext4_richacl_permission() as the permission callback.
+ */
+int ext4_may_delete(struct inode *dir, struct inode *inode, int replace)
+{
+       return richacl_may_delete(dir, inode, replace, ext4_richacl_permission);
+}
+
+/*
+ * Set up the richacl of the new @inode created in @dir.
+ *
+ * Unless @inode is a symlink, the acl of @dir is fetched and
+ * richacl_inherit() derives the new inode's acl from it.  When there is
+ * no directory acl to inherit from, the umask is applied to the mode
+ * instead.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int
+ext4_init_richacl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+       struct richacl *dir_acl = NULL;
+       struct richacl *acl;
+       int err;
+
+       if (!S_ISLNK(inode->i_mode)) {
+               dir_acl = ext4_get_richacl(dir);
+               if (IS_ERR(dir_acl))
+                       return PTR_ERR(dir_acl);
+       }
+
+       if (!dir_acl) {
+               inode->i_mode &= ~current_umask();
+               return 0;
+       }
+
+       acl = richacl_inherit(dir_acl, inode);
+       richacl_put(dir_acl);
+
+       /* a NULL result yields 0, an ERR_PTR its error code */
+       if (IS_ERR_OR_NULL(acl))
+               return PTR_ERR(acl);
+
+       err = ext4_set_richacl(handle, inode, acl);
+       richacl_put(acl);
+
+       return err;
+}
+
+/*
+ * Bring the richacl of @inode in line with its (already updated) i_mode
+ * by running it through richacl_chmod() and storing the result.
+ *
+ * Returns 0 on success or when the inode has no acl, -EOPNOTSUPP for
+ * symlinks, or a negative errno.
+ */
+int
+ext4_richacl_chmod(struct inode *inode)
+{
+       struct richacl *acl;
+       int err;
+
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       acl = ext4_get_richacl(inode);
+       if (!acl)
+               return 0;
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+
+       acl = richacl_chmod(acl, inode->i_mode);
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+
+       /* no journal handle is passed here */
+       err = ext4_set_richacl(NULL, inode, acl);
+       richacl_put(acl);
+
+       return err;
+}
+
+/*
+ * xattr ->list handler: emit the RICHACL_XATTR name (including its
+ * terminating NUL) for richacl inodes.
+ *
+ * Returns the number of bytes the name needs, or 0 for non-richacl
+ * inodes.  The name is only copied when @list is set and large enough.
+ */
+static size_t
+ext4_xattr_list_richacl(struct dentry *dentry, char *list, size_t list_len,
+                       const char *name, size_t name_len, int type)
+{
+       size_t len;
+
+       if (!IS_RICHACL(dentry->d_inode))
+               return 0;
+
+       len = sizeof(RICHACL_XATTR);
+       if (list != NULL && list_len >= len)
+               memcpy(list, RICHACL_XATTR, len);
+
+       return len;
+}
+
+/*
+ * xattr ->get handler: serialise the richacl of @dentry's inode into
+ * @buffer.
+ *
+ * @name:        xattr name suffix, must be empty
+ * @buffer:      destination, or NULL to only query the required size
+ * @buffer_size: size of @buffer in bytes
+ *
+ * Returns the size of the xattr representation on success, -EINVAL for a
+ * non-empty @name, -ENODATA when the inode has no acl, -ERANGE when
+ * @buffer is too small, or the error from fetching the acl.
+ */
+static int
+ext4_xattr_get_richacl(struct dentry *dentry, const char *name, void *buffer,
+               size_t buffer_size, int type)
+{
+       struct richacl *acl;
+       size_t size;
+       int retval;
+
+       if (strcmp(name, "") != 0)
+               return -EINVAL;
+       acl = ext4_get_richacl(dentry->d_inode);
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+       if (acl == NULL)
+               return -ENODATA;
+       size = richacl_xattr_size(acl);
+       retval = size;
+       if (buffer) {
+               if (size > buffer_size)
+                       retval = -ERANGE;
+               else
+                       richacl_to_xattr(acl, buffer);
+       }
+       /* drop the acl on every path, including -ERANGE */
+       richacl_put(acl);
+
+       return retval;
+}
+
+/*
+ * xattr ->set handler: parse @value into a richacl and store it on
+ * @dentry's inode.  A NULL @value passes a NULL acl down to
+ * ext4_set_richacl().
+ *
+ * @name:  xattr name suffix, must be empty
+ * @value: xattr representation of the acl, or NULL
+ * @size:  length of @value in bytes
+ *
+ * Returns 0 on success or a negative errno: -EOPNOTSUPP for non-richacl
+ * inodes and symlinks, -EINVAL for a non-empty @name, -EPERM when the
+ * caller may not change the acl, or the error from parsing @value,
+ * starting the journal handle or storing the xattr.
+ */
+static int
+ext4_xattr_set_richacl(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags, int type)
+{
+       handle_t *handle;
+       struct richacl *acl = NULL;
+       int retval, retries = 0;
+       struct inode *inode = dentry->d_inode;
+
+       if (!IS_RICHACL(dentry->d_inode))
+               return -EOPNOTSUPP;
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+       if (strcmp(name, "") != 0)
+               return -EINVAL;
+       /* allowed for the owner, holders of ACE4_WRITE_ACL and CAP_FOWNER */
+       if (current_fsuid() != inode->i_uid &&
+           ext4_richacl_permission(inode, ACE4_WRITE_ACL) &&
+           !capable(CAP_FOWNER))
+               return -EPERM;
+       if (value) {
+               acl = richacl_from_xattr(value, size);
+               if (IS_ERR(acl))
+                       return PTR_ERR(acl);
+
+               inode->i_mode &= ~S_IRWXUGO;
+               inode->i_mode |= richacl_masks_to_mode(acl);
+       }
+
+retry:
+       handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+       if (IS_ERR(handle)) {
+               /* the parsed acl must still be released on this path */
+               retval = PTR_ERR(handle);
+               goto out;
+       }
+       ext4_mark_inode_dirty(handle, inode);
+       retval = ext4_set_richacl(handle, inode, acl);
+       ext4_journal_stop(handle);
+       /* errors are negative errnos, so the retry must test -ENOSPC */
+       if (retval == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+out:
+       richacl_put(acl);
+       return retval;
+}
+
+/* xattr handler for the RICHACL_XATTR name, backed by the functions above */
+const struct xattr_handler ext4_richacl_xattr_handler = {
+       .prefix = RICHACL_XATTR,
+       .list   = ext4_xattr_list_richacl,
+       .get    = ext4_xattr_get_richacl,
+       .set    = ext4_xattr_set_richacl,
+};
diff --git a/fs/ext4/richacl.h b/fs/ext4/richacl.h

new file mode 100644 (file)

index 0000000..00d89f2
--- /dev/null
+++ b/fs/ext4/richacl.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright IBM Corporation, 2010
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#ifndef __FS_EXT4_RICHACL_H
+#define __FS_EXT4_RICHACL_H
+
+#include <linux/richacl.h>
+
+#ifdef CONFIG_EXT4_FS_RICHACL
+
+# define EXT4_IS_RICHACL(inode) IS_RICHACL(inode)
+
+/* Value for i_richacl if RICHACL has not been cached */
+# define EXT4_RICHACL_NOT_CACHED ((void *)-1)
+
+extern int ext4_permission(struct inode *, int);
+extern int ext4_richacl_permission(struct inode *, unsigned int);
+extern int ext4_may_create(struct inode *, int);
+extern int ext4_may_delete(struct inode *, struct inode *, int);
+extern int ext4_init_richacl(handle_t *, struct inode *, struct inode *);
+extern int ext4_richacl_chmod(struct inode *);
+
+#else  /* !CONFIG_EXT4_FS_RICHACL */
+
+# define EXT4_IS_RICHACL(inode) (0)
+
+# define ext4_permission NULL
+# define ext4_may_create NULL
+# define ext4_may_delete NULL
+# define ext4_richacl_permission NULL
+
+/* Stub used when CONFIG_EXT4_FS_RICHACL is not set: does nothing, returns 0. */
+static inline int
+ext4_init_richacl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+       return 0;
+}
+
+/* Stub used when CONFIG_EXT4_FS_RICHACL is not set: does nothing, returns 0. */
+static inline int
+ext4_richacl_chmod(struct inode *inode)
+{
+       return 0;
+}
+
+#endif  /* CONFIG_EXT4_FS_RICHACL */
+#endif  /* __FS_EXT4_RICHACL_H */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index e1fb1d5..3266db3 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,6 +50,7 @@
  #include "xattr.h"
  #include "acl.h"
  #include "mballoc.h"
+#include "richacl.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/ext4.h>
@@ -921,7 +922,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
         ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
         if (!ei)
                 return NULL;
-
+#ifdef CONFIG_EXT4_FS_RICHACL
+       ei->i_richacl = EXT4_RICHACL_NOT_CACHED;
+#endif
         ei->vfs_inode.i_version = 1;
         ei->vfs_inode.i_data.writeback_index = 0;
         memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
@@ -1009,6 +1012,13 @@ void ext4_clear_inode(struct inode *inode)
         invalidate_inode_buffers(inode);
         end_writeback(inode);
         dquot_drop(inode);
+#ifdef CONFIG_EXT4_FS_RICHACL
+       if (EXT4_I(inode)->i_richacl &&
+               EXT4_I(inode)->i_richacl != EXT4_RICHACL_NOT_CACHED) {
+               richacl_put(EXT4_I(inode)->i_richacl);
+               EXT4_I(inode)->i_richacl = EXT4_RICHACL_NOT_CACHED;
+       }
+#endif
         ext4_discard_preallocations(inode);
         if (EXT4_I(inode)->jinode) {
                 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
@@ -1171,7 +1181,7 @@ enum {
         Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
         Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
         Opt_nouid32, Opt_debug, Opt_removed,
-       Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+       Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_richacl, Opt_noacl,
         Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
         Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
         Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
@@ -1208,6 +1218,7 @@ static const match_table_t tokens = {
         {Opt_user_xattr, "user_xattr"},
         {Opt_nouser_xattr, "nouser_xattr"},
         {Opt_acl, "acl"},
+       {Opt_richacl, "richacl"},
         {Opt_noacl, "noacl"},
         {Opt_noload, "norecovery"},
         {Opt_noload, "noload"},
@@ -1467,6 +1478,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
         case Opt_nouser_xattr:
                 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
                 break;
+       case Opt_richacl:
+               sb->s_flags |= MS_RICHACL;
+               return 1;
         case Opt_sb:
                 return 1;       /* handled by get_sb_block() */
         case Opt_removed:
@@ -1648,6 +1662,10 @@ static int parse_options(char *options, struct super_block *sb,
                 }
         }
  #endif
+#if defined(CONFIG_EXT4_FS_RICHACL) && defined(CONFIG_EXT4_FS_POSIX_ACL)
+       if (test_opt(sb, POSIX_ACL) && (sb->s_flags & MS_RICHACL))
+               clear_opt(sb, POSIX_ACL);
+#endif
         return 1;
  }
  
@@ -1772,6 +1790,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
                        (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
                 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
  
+       if (sb->s_flags & MS_RICHACL)
+               SEQ_OPTS_PUTS("richacl");
+
         ext4_show_quota_options(seq, sb);
         return 0;
  }
@@ -2967,6 +2988,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         int err;
         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
         ext4_group_t first_not_zeroed;
+       unsigned long acl_flags = 0;
  
         sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
         if (!sbi)
@@ -3038,7 +3060,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
  #ifdef CONFIG_EXT4_FS_XATTR
         set_opt(sb, XATTR_USER);
  #endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
+#if defined(CONFIG_EXT4_FS_POSIX_ACL)
         set_opt(sb, POSIX_ACL);
  #endif
         set_opt(sb, MBLK_IO_SUBMIT);
@@ -3121,8 +3143,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 }
         }
  
-       sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-               (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+       if (sb->s_flags & MS_RICHACL)
+               acl_flags = MS_RICHACL;
+       else if (test_opt(sb, POSIX_ACL))
+               acl_flags = MS_POSIXACL;
+
+       sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_RICHACL)) | acl_flags;
  
         if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
             (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
@@ -4233,6 +4259,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         ext4_group_t g;
         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
         int err = 0;
+       unsigned long acl_flags = 0;
  #ifdef CONFIG_QUOTA
         int i;
  #endif
@@ -4267,8 +4294,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
         if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
                 ext4_abort(sb, "Abort forced by user");
  
-       sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
-               (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
+       if (sb->s_flags & MS_RICHACL)
+               acl_flags = MS_RICHACL;
+       else if (test_opt(sb, POSIX_ACL))
+               acl_flags = MS_POSIXACL;
+
+       sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_RICHACL)) | acl_flags;
  
         es = sbi->s_es;
  
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c

index e88748e..fb5817c 100644 (file)
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -107,6 +107,9 @@ static const struct xattr_handler *ext4_xattr_handler_map[] = {
  #ifdef CONFIG_EXT4_FS_SECURITY
         [EXT4_XATTR_INDEX_SECURITY]          = &ext4_xattr_security_handler,
  #endif
+#ifdef CONFIG_EXT4_FS_RICHACL
+       [EXT4_XATTR_INDEX_RICHACL]           = &ext4_richacl_xattr_handler,
+#endif
  };
  
  const struct xattr_handler *ext4_xattr_handlers[] = {
@@ -119,6 +122,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
  #ifdef CONFIG_EXT4_FS_SECURITY
         &ext4_xattr_security_handler,
  #endif
+#ifdef CONFIG_EXT4_FS_RICHACL
+       &ext4_richacl_xattr_handler,
+#endif
         NULL
  };
  
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h

index 25b7387..f73563f 100644 (file)
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
  #define EXT4_XATTR_INDEX_TRUSTED               4
  #define        EXT4_XATTR_INDEX_LUSTRE                 5
  #define EXT4_XATTR_INDEX_SECURITY              6
+#define EXT4_XATTR_INDEX_RICHACL               7
  
  struct ext4_xattr_header {
         __le32  h_magic;        /* magic number for identification */
@@ -70,6 +71,7 @@ extern const struct xattr_handler ext4_xattr_trusted_handler;
  extern const struct xattr_handler ext4_xattr_acl_access_handler;
  extern const struct xattr_handler ext4_xattr_acl_default_handler;
  extern const struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_richacl_xattr_handler;
  
  extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
  
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c

index cdb41a1..f7c8ce0 100644 (file)
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -414,7 +414,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
         spin_lock(&tree->hash_lock);
         node = hfs_bnode_findhash(tree, num);
         spin_unlock(&tree->hash_lock);
-       BUG_ON(node);
+       if (node) {
+               printk(KERN_CRIT "new node %u already hashed?\n", num);
+               WARN_ON(1);
+               return node;
+       }
         node = __hfs_bnode_create(tree, num);
         if (!node)
                 return ERR_PTR(-ENOMEM);
diff --git a/fs/namei.c b/fs/namei.c

index c427919..d1ebe96 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1947,6 +1947,26 @@ other_userns:
  }
  
  /*
+ * Do the directory specific tests of inode_permission() and call the
+ * may_delete inode operation.  The may_delete inode operation must do the
+ * sticky check when needed.
+ */
+static int may_delete_iop(struct inode *dir, struct inode *inode, int replace)
+{
+       int rc;
+
+       /* Read-only and immutable directories are refused up front. */
+       if (IS_RDONLY(dir))
+               return -EROFS;
+       if (IS_IMMUTABLE(dir))
+               return -EACCES;
+       rc = dir->i_op->may_delete(dir, inode, replace);
+       if (rc)
+               return rc;
+       return security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
   *     Check whether we can remove a link victim from directory dir, check
   *  whether the type of victim is right.
   *  1. We can't do it if dir is read-only (done in permission())
@@ -1965,7 +1985,8 @@ other_userns:
   * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
   *     nfs_async_unlink().
   */
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir, struct dentry *victim,
+                     int isdir, int replace)
  {
         int error;
  
@@ -1974,14 +1995,19 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
  
         BUG_ON(victim->d_parent->d_inode != dir);
         audit_inode_child(victim, dir);
-
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       if (dir->i_op->may_delete)
+               error = may_delete_iop(dir, victim->d_inode, replace);
+       else {
+               error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+               if (!error && check_sticky(dir, victim->d_inode))
+                       error = -EPERM;
+       }
         if (error)
                 return error;
         if (IS_APPEND(dir))
                 return -EPERM;
-       if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-           IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+       if (IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) ||
+               IS_SWAPFILE(victim->d_inode))
                 return -EPERM;
         if (isdir) {
                 if (!S_ISDIR(victim->d_inode->i_mode))
@@ -1997,6 +2023,25 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
         return 0;
  }
  
+/*
+ * Creation check for directories that provide a may_create inode operation:
+ * refuse read-only and immutable directories, ask the filesystem, then call
+ * security_inode_permission().
+ */
+static int may_create_iop(struct inode *dir, int isdir)
+{
+       int rc;
+
+       if (IS_RDONLY(dir))
+               return -EROFS;
+       if (IS_IMMUTABLE(dir))
+               return -EACCES;
+       rc = dir->i_op->may_create(dir, isdir);
+       if (rc)
+               return rc;
+       return security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
  /*     Check whether we can create an object with dentry child in directory
   *  dir.
   *  1. We can't do it if child already exists (open has special treatment for
@@ -2005,13 +2050,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
   *  3. We should have write and exec permissions on dir
   *  4. We can't do it if dir is immutable (done in permission())
   */
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct inode *dir, struct dentry *child, int isdir)
  {
         if (child->d_inode)
                 return -EEXIST;
         if (IS_DEADDIR(dir))
                 return -ENOENT;
-       return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       if (dir->i_op->may_create)
+               return may_create_iop(dir, isdir);
+       else
+               return inode_permission(dir, MAY_WRITE | MAY_EXEC);
  }
  
  /*
@@ -2059,7 +2107,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
  int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                 struct nameidata *nd)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2258,7 +2306,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         /* Negative dentry, just create the file */
         if (!dentry->d_inode) {
                 umode_t mode = op->mode;
-               if (!IS_POSIXACL(dir->d_inode))
+               if (!IS_ACL(dir->d_inode))
                         mode &= ~current_umask();
                 /*
                  * This write is needed to ensure that a
@@ -2526,7 +2574,7 @@ EXPORT_SYMBOL(user_path_create);
  
  int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2583,7 +2631,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
  
-       if (!IS_POSIXACL(path.dentry->d_inode))
+       if (!IS_ACL(path.dentry->d_inode))
                 mode &= ~current_umask();
         error = may_mknod(mode);
         if (error)
@@ -2623,7 +2671,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
  
  int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 1);
         unsigned max_links = dir->i_sb->s_max_links;
  
         if (error)
@@ -2656,7 +2704,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
  
-       if (!IS_POSIXACL(path.dentry->d_inode))
+       if (!IS_ACL(path.dentry->d_inode))
                 mode &= ~current_umask();
         error = mnt_want_write(path.mnt);
         if (error)
@@ -2705,7 +2753,7 @@ void dentry_unhash(struct dentry *dentry)
  
  int vfs_rmdir(struct inode *dir, struct dentry *dentry)
  {
-       int error = may_delete(dir, dentry, 1);
+       int error = may_delete(dir, dentry, 1, 0);
  
         if (error)
                 return error;
@@ -2800,7 +2848,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  
  int vfs_unlink(struct inode *dir, struct dentry *dentry)
  {
-       int error = may_delete(dir, dentry, 0);
+       int error = may_delete(dir, dentry, 0, 0);
  
         if (error)
                 return error;
@@ -2909,7 +2957,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
  
  int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2976,7 +3024,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
         if (!inode)
                 return -ENOENT;
  
-       error = may_create(dir, new_dentry);
+       error = may_create(dir, new_dentry, S_ISDIR(inode->i_mode));
         if (error)
                 return error;
  
@@ -3204,14 +3252,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (old_dentry->d_inode == new_dentry->d_inode)
                 return 0;
   
-       error = may_delete(old_dir, old_dentry, is_dir);
+       error = may_delete(old_dir, old_dentry, is_dir, 0);
         if (error)
                 return error;
  
         if (!new_dentry->d_inode)
-               error = may_create(new_dir, new_dentry);
+               error = may_create(new_dir, new_dentry, is_dir);
         else
-               error = may_delete(new_dir, new_dentry, is_dir);
+               error = may_delete(new_dir, new_dentry, is_dir, 1);
         if (error)
                 return error;
  
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c

index 8789210..0b3e910 100644 (file)
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -875,6 +875,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
         desc->dir_cookie = &dir_ctx->dir_cookie;
         desc->decode = NFS_PROTO(inode)->decode_dirent;
         desc->plus = NFS_USE_READDIRPLUS(inode);
+       if (filp->f_pos > 0 && !test_bit(NFS_INO_SEEN_GETATTR, &NFS_I(inode)->flags))
+               desc->plus = 0;
+       clear_bit(NFS_INO_SEEN_GETATTR, &NFS_I(inode)->flags);
  
         nfs_block_sillyrename(dentry);
         res = nfs_revalidate_mapping(inode, filp->f_mapping);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c

index e8bbfa5..d8a6a18 100644 (file)
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -511,6 +511,15 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
         struct inode *inode = dentry->d_inode;
         int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
         int err;
+       struct dentry *p;
+       struct inode *pi;
+
+       rcu_read_lock();
+       p = dentry->d_parent;
+       pi = rcu_dereference(p)->d_inode;
+       if (pi && !test_bit(NFS_INO_SEEN_GETATTR, &NFS_I(pi)->flags))
+               set_bit(NFS_INO_SEEN_GETATTR, &NFS_I(pi)->flags);
+       rcu_read_unlock();
  
         /* Flush out writes to the server in order to update c/mtime.  */
         if (S_ISREG(inode->i_mode)) {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c

index 86c67ee..2b84627 100644 (file)
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -130,7 +130,7 @@ static void __kcore_update_ram(struct list_head *list)
  }
  
  
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_XEN)
  /*
   * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
   * because memory hole is not as big as !HIGHMEM case.
@@ -146,7 +146,11 @@ static int kcore_update_ram(void)
         if (!ent)
                 return -ENOMEM;
         ent->addr = (unsigned long)__va(0);
+#ifdef CONFIG_HIGHMEM
         ent->size = max_low_pfn << PAGE_SHIFT;
+#else
+       ent->size = max_pfn << PAGE_SHIFT;
+#endif
         ent->type = KCORE_RAM;
         list_add(&ent->list, &head);
         __kcore_update_ram(&head);
diff --git a/fs/richacl_base.c b/fs/richacl_base.c

new file mode 100644 (file)

index 0000000..143f0f8
--- /dev/null
+++ b/fs/richacl_base.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2006, 2010  Novell, Inc.
+ * Written by Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/richacl.h>
+
+MODULE_LICENSE("GPL");
+
+/*
+ * Special e_who identifiers:  ACEs which have ACE4_SPECIAL_WHO set in
+ * ace->e_flags use these constants in ace->u.e_who.
+ *
+ * For efficiency, we compare pointers instead of comparing strings.
+ * richace_set_who() maps a matching string onto one of these arrays, so
+ * ace->u.e_who always points at one of them and pointer equality is enough.
+ */
+const char richace_owner_who[]   = "OWNER@";
+EXPORT_SYMBOL_GPL(richace_owner_who);
+const char richace_group_who[]   = "GROUP@";
+EXPORT_SYMBOL_GPL(richace_group_who);
+const char richace_everyone_who[] = "EVERYONE@";
+EXPORT_SYMBOL_GPL(richace_everyone_who);
+
+/**
+ * richacl_alloc  -  allocate a richacl
+ * @count:     number of entries
+ *
+ * Returns a zeroed acl with room for @count entries, a reference count of
+ * one and a_count set to @count.  Returns %NULL if @count is negative, if the
+ * size computation would overflow, or if the allocation fails.
+ */
+struct richacl *
+richacl_alloc(int count)
+{
+       size_t size;
+       struct richacl *acl;
+
+       /*
+        * A negative or huge @count would wrap the size computation below and
+        * produce a buffer smaller than a_count claims.
+        */
+       if (count < 0 ||
+           (size_t)count > ((size_t)-1 - sizeof(struct richacl)) /
+                           sizeof(struct richace))
+               return NULL;
+
+       size = sizeof(struct richacl) + count * sizeof(struct richace);
+       acl = kzalloc(size, GFP_KERNEL);
+       if (acl) {
+               atomic_set(&acl->a_refcount, 1);
+               acl->a_count = count;
+       }
+       return acl;
+}
+EXPORT_SYMBOL_GPL(richacl_alloc);
+
+/**
+ * richacl_clone  -  duplicate a richacl
+ * @acl:       acl to copy
+ *
+ * Returns a newly allocated byte-for-byte copy of @acl, entries included,
+ * whose reference count is reset to one, or %NULL if the allocation fails.
+ */
+static struct richacl *
+richacl_clone(const struct richacl *acl)
+{
+       int nr = acl->a_count;
+       size_t bytes = sizeof(struct richacl) + nr * sizeof(struct richace);
+       struct richacl *copy;
+
+       copy = kmalloc(bytes, GFP_KERNEL);
+       if (!copy)
+               return NULL;
+
+       memcpy(copy, acl, bytes);
+       atomic_set(&copy->a_refcount, 1);
+       return copy;
+}
+
+/**
+ * richacl_mask_to_mode  -  map one file mask to rwx permission bits
+ * @mask:      %ACE4_* permission mask
+ *
+ * Each of MAY_READ, MAY_WRITE and MAY_EXEC is set in the result when @mask
+ * has any flag of the corresponding ACE4_POSIX_MODE_* group set.
+ *
+ * See richacl_masks_to_mode().
+ */
+static int
+richacl_mask_to_mode(unsigned int mask)
+{
+       return ((mask & ACE4_POSIX_MODE_READ) ? MAY_READ : 0) |
+              ((mask & ACE4_POSIX_MODE_WRITE) ? MAY_WRITE : 0) |
+              ((mask & ACE4_POSIX_MODE_EXEC) ? MAY_EXEC : 0);
+}
+
+/**
+ * richacl_masks_to_mode  -  compute the file permission bits from the file masks
+ * @acl:       acl whose owner, group and other masks are converted
+ *
+ * The file permission bits indicate maximum permissions: the Write bit, for
+ * example, is set when a mask contains ACE4_APPEND_DATA even if it does not
+ * also contain ACE4_WRITE_DATA.
+ *
+ * Permissions outside ACE4_POSIX_MODE_READ, ACE4_POSIX_MODE_WRITE and
+ * ACE4_POSIX_MODE_EXEC cannot be represented in the file permission bits.
+ * Such permissions can still be effective, but not for new files or after a
+ * chmod(), and only if they were set explicitly, for example, by setting a
+ * richacl.
+ */
+int
+richacl_masks_to_mode(const struct richacl *acl)
+{
+       int owner = richacl_mask_to_mode(acl->a_owner_mask);
+       int group = richacl_mask_to_mode(acl->a_group_mask);
+       int other = richacl_mask_to_mode(acl->a_other_mask);
+
+       return (owner << 6) | (group << 3) | other;
+}
+EXPORT_SYMBOL_GPL(richacl_masks_to_mode);
+
+/**
+ * richacl_mode_to_mask  - compute a file mask from the lowest three mode bits
+ * @mode:      permission bits; only MAY_READ, MAY_WRITE and MAY_EXEC are used
+ *
+ * chmod() specifies the maximum permissions that processes will get.  All
+ * permissions beyond that are removed from the file masks and become
+ * ineffective.
+ *
+ * The result always contains ACE4_POSIX_ALWAYS_ALLOWED, the permissions which
+ * are allowed no matter what the acl says.
+ */
+unsigned int
+richacl_mode_to_mask(mode_t mode)
+{
+       unsigned int result = ACE4_POSIX_ALWAYS_ALLOWED;
+
+       result |= (mode & MAY_READ) ? ACE4_POSIX_MODE_READ : 0;
+       result |= (mode & MAY_WRITE) ? ACE4_POSIX_MODE_WRITE : 0;
+       result |= (mode & MAY_EXEC) ? ACE4_POSIX_MODE_EXEC : 0;
+
+       return result;
+}
+
+/**
+ * richacl_want_to_mask  - convert the iop->permission want argument to a mask
+ * @want:      @want argument of the permission inode operation
+ *
+ * An append check arrives as (MAY_WRITE | MAY_APPEND); in that case only
+ * ACE4_APPEND_DATA is requested, not ACE4_WRITE_DATA.
+ *
+ * Creating and deleting files is checked through the iop->may_create and
+ * iop->may_delete hooks, which do not use this function.  Mapping MAY_WRITE
+ * to ACE4_ADD_FILE, ACE4_ADD_SUBDIRECTORY and ACE4_DELETE_CHILD is therefore
+ * not needed here.
+ */
+unsigned int
+richacl_want_to_mask(int want)
+{
+       unsigned int mask;
+
+       /* MAY_APPEND takes precedence over MAY_WRITE. */
+       if (want & MAY_APPEND)
+               mask = ACE4_APPEND_DATA;
+       else if (want & MAY_WRITE)
+               mask = ACE4_WRITE_DATA;
+       else
+               mask = 0;
+
+       if (want & MAY_READ)
+               mask |= ACE4_READ_DATA;
+       if (want & MAY_EXEC)
+               mask |= ACE4_EXECUTE;
+
+       return mask;
+}
+EXPORT_SYMBOL_GPL(richacl_want_to_mask);
+
+/**
+ * richace_is_same_identifier  -  are both identifiers the same?
+ * @a:         first acl entry
+ * @b:         second acl entry
+ *
+ * Two entries match when their ACE4_SPECIAL_WHO and ACE4_IDENTIFIER_GROUP
+ * flags agree and, depending on ACE4_SPECIAL_WHO, either their e_who pointers
+ * or their e_id values are equal.  Returns non-zero on a match.
+ */
+int
+richace_is_same_identifier(const struct richace *a, const struct richace *b)
+{
+       const unsigned int who_flags =
+               ACE4_SPECIAL_WHO | ACE4_IDENTIFIER_GROUP;
+
+       /* Any difference in the who-related flags means a different who. */
+       if ((a->e_flags ^ b->e_flags) & who_flags)
+               return 0;
+
+       if (a->e_flags & ACE4_SPECIAL_WHO)
+               return a->u.e_who == b->u.e_who;
+       return a->u.e_id == b->u.e_id;
+}
+
+/**
+ * richace_set_who  -  set a special who value
+ * @ace:       acl entry
+ * @who:       who value to use
+ *
+ * @who must compare equal to one of the special who strings.  The entry is
+ * then pointed at the matching constant, ACE4_SPECIAL_WHO is set and
+ * ACE4_IDENTIFIER_GROUP is cleared.  Returns 0 on success, or -EINVAL (leaving
+ * @ace untouched) when @who is not a special who string.
+ */
+int
+richace_set_who(struct richace *ace, const char *who)
+{
+       const char *special;
+
+       if (strcmp(who, richace_owner_who) == 0)
+               special = richace_owner_who;
+       else if (strcmp(who, richace_group_who) == 0)
+               special = richace_group_who;
+       else if (strcmp(who, richace_everyone_who) == 0)
+               special = richace_everyone_who;
+       else
+               return -EINVAL;
+
+       ace->u.e_who = special;
+       ace->e_flags = (ace->e_flags | ACE4_SPECIAL_WHO) &
+                      ~ACE4_IDENTIFIER_GROUP;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(richace_set_who);
+
+/**
+ * richacl_allowed_to_who  -  mask flags allowed to a specific who value
+ * @acl:       acl to evaluate
+ * @who:       entry whose identifier is looked up
+ *
+ * Walks @acl in reverse order and accumulates the mask flags of all entries
+ * that are not inherit-only and that either carry the same identifier as @who
+ * or are EVERYONE@ entries: allow entries add flags, deny entries remove them.
+ */
+static unsigned int richacl_allowed_to_who(struct richacl *acl,
+                                          struct richace *who)
+{
+       struct richace *ace;
+       unsigned int allowed = 0;
+
+       richacl_for_each_entry_reverse(ace, acl) {
+               if (richace_is_inherit_only(ace))
+                       continue;
+               if (!richace_is_same_identifier(ace, who) &&
+                   !richace_is_everyone(ace))
+                       continue;
+
+               if (richace_is_allow(ace))
+                       allowed |= ace->e_mask;
+               else if (richace_is_deny(ace))
+                       allowed &= ~ace->e_mask;
+       }
+       return allowed;
+}
+
+/**
+ * richacl_group_class_allowed  -  maximum permissions the group class is allowed
+ * @acl:       acl to examine
+ *
+ * Walks the entries in reverse order, skipping inherit-only entries and
+ * entries for which richace_is_owner() is true.  EVERYONE@ allow and deny
+ * entries are accumulated separately; for every other entry, the permissions
+ * richacl_allowed_to_who() computes for that entry's identifier are added to
+ * the result.  The accumulated EVERYONE@ permissions are only added when no
+ * visited entry satisfied richace_is_group().
+ *
+ * See richacl_compute_max_masks().
+ */
+static unsigned int richacl_group_class_allowed(struct richacl *acl)
+{
+       struct richace *ace;
+       unsigned int everyone_allowed = 0, group_class_allowed = 0;
+       int had_group_ace = 0;
+
+       richacl_for_each_entry_reverse(ace, acl) {
+               if (richace_is_inherit_only(ace) ||
+                   richace_is_owner(ace))
+                       continue;
+
+               if (richace_is_everyone(ace)) {
+                       if (richace_is_allow(ace))
+                               everyone_allowed |= ace->e_mask;
+                       else if (richace_is_deny(ace))
+                               everyone_allowed &= ~ace->e_mask;
+               } else {
+                       /* Already includes the EVERYONE@ entries for this who. */
+                       group_class_allowed |=
+                               richacl_allowed_to_who(acl, ace);
+
+                       if (richace_is_group(ace))
+                               had_group_ace = 1;
+               }
+       }
+       if (!had_group_ace)
+               group_class_allowed |= everyone_allowed;
+       return group_class_allowed;
+}
+
+/**
+ * richacl_compute_max_masks  -  compute upper bound masks
+ * @acl:       acl whose a_owner_mask, a_group_mask and a_other_mask are
+ *             recomputed in place
+ *
+ * Computes upper bound owner, group, and other masks so that none of
+ * the mask flags allowed by the acl are disabled (for any choice of the
+ * file owner or group membership).
+ */
+void richacl_compute_max_masks(struct richacl *acl)
+{
+       unsigned int gmask = ~0;
+       struct richace *ace;
+
+       /*
+        * @gmask contains all permissions which the group class is ever
+        * allowed.  We use it to avoid adding permissions to the group mask
+        * from everyone@ allow aces which the group class is always denied
+        * through other aces.  For example, the following acl would otherwise
+        * result in a group mask of rw:
+        *
+        *      group@:w::deny
+        *      everyone@:rw::allow
+        *
+        * Avoid computing @gmask for acls which do not include any group class
+        * deny aces: in such acls, the group class is never denied any
+        * permissions from everyone@ allow aces.
+        */
+
+restart:
+       /* Start over from empty masks on the first pass and on every restart. */
+       acl->a_owner_mask = 0;
+       acl->a_group_mask = 0;
+       acl->a_other_mask = 0;
+
+       richacl_for_each_entry_reverse(ace, acl) {
+               if (richace_is_inherit_only(ace))
+                       continue;
+
+               if (richace_is_owner(ace)) {
+                       if (richace_is_allow(ace))
+                               acl->a_owner_mask |= ace->e_mask;
+                       else if (richace_is_deny(ace))
+                               acl->a_owner_mask &= ~ace->e_mask;
+               } else if (richace_is_everyone(ace)) {
+                       if (richace_is_allow(ace)) {
+                               acl->a_owner_mask |= ace->e_mask;
+                               acl->a_group_mask |= ace->e_mask & gmask;
+                               acl->a_other_mask |= ace->e_mask;
+                       } else if (richace_is_deny(ace)) {
+                               acl->a_owner_mask &= ~ace->e_mask;
+                               acl->a_group_mask &= ~ace->e_mask;
+                               acl->a_other_mask &= ~ace->e_mask;
+                       }
+               } else {
+                       if (richace_is_allow(ace)) {
+                               acl->a_owner_mask |= ace->e_mask & gmask;
+                               acl->a_group_mask |= ace->e_mask & gmask;
+                       } else if (richace_is_deny(ace) && gmask == ~0) {
+                               /*
+                                * First group class deny ace seen while
+                                * @gmask is still unrestricted: compute the
+                                * real @gmask and redo the whole walk with it.
+                                */
+                               gmask = richacl_group_class_allowed(acl);
+                               if (likely(gmask != ~0))  /* should always be true */
+                                       goto restart;
+                       }
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(richacl_compute_max_masks);
+
+/**
+ * richacl_chmod  -  update the file masks to reflect the new mode
+ * @acl:       acl to update; the caller's reference is consumed
+ * @mode:      new file permission bits
+ *
+ * Return a copy of @acl where the file masks have been replaced by the file
+ * masks corresponding to the file permission bits in @mode, or returns @acl
+ * itself if the file masks are already up to date.  Takes over a reference
+ * to @acl.
+ *
+ * When a copy is needed, the reference to @acl is dropped even if the copy
+ * cannot be allocated; ERR_PTR(-ENOMEM) is returned in that case.
+ */
+struct richacl *
+richacl_chmod(struct richacl *acl, mode_t mode)
+{
+       unsigned int owner_mask, group_mask, other_mask;
+       struct richacl *clone;
+
+       owner_mask = richacl_mode_to_mask(mode >> 6);
+       group_mask = richacl_mode_to_mask(mode >> 3);
+       other_mask = richacl_mode_to_mask(mode);
+
+       /*
+        * Nothing to do if the masks already match and there is no need to
+        * set ACL4_PROTECTED on an auto-inherit acl.
+        */
+       if (acl->a_owner_mask == owner_mask &&
+           acl->a_group_mask == group_mask &&
+           acl->a_other_mask == other_mask &&
+           (!richacl_is_auto_inherit(acl) || richacl_is_protected(acl)))
+               return acl;
+
+       clone = richacl_clone(acl);
+       richacl_put(acl);
+       if (!clone)
+               return ERR_PTR(-ENOMEM);
+
+       clone->a_owner_mask = owner_mask;
+       clone->a_group_mask = group_mask;
+       clone->a_other_mask = other_mask;
+       if (richacl_is_auto_inherit(clone))
+               clone->a_flags |= ACL4_PROTECTED;
+
+       return clone;
+}
+EXPORT_SYMBOL_GPL(richacl_chmod);
+
+/**
+ * richacl_permission  -  richacl permission check algorithm
+ * @inode:     inode to check
+ * @acl:       rich acl of the inode
+ * @mask:      requested access (ACE4_* bitmask)
+ *
+ * Checks if the current process is granted @mask flags in @acl.
+ *
+ * Returns 0 if every requested flag is granted by the entries that apply to
+ * the process and by the file mask of the process's file class, and -EACCES
+ * otherwise.
+ */
+int
+richacl_permission(struct inode *inode, const struct richacl *acl,
+                  unsigned int mask)
+{
+       const struct richace *ace;
+       unsigned int file_mask, requested = mask, denied = 0;
+       int in_owning_group = in_group_p(inode->i_gid);
+       int in_owner_or_group_class = in_owning_group;
+
+       /*
+        * A process is
+        *   - in the owner file class if it owns the file,
+        *   - in the group file class if it is in the file's owning group or
+        *     it matches any of the user or group entries, and
+        *   - in the other file class otherwise.
+        */
+
+       /*
+        * Check if the acl grants the requested access and determine which
+        * file class the process is in.
+        */
+       richacl_for_each_entry(ace, acl) {
+               unsigned int ace_mask = ace->e_mask;
+
+               if (richace_is_inherit_only(ace))
+                       continue;
+               if (richace_is_owner(ace)) {
+                       if (current_fsuid() != inode->i_uid)
+                               continue;
+                       goto is_owner;
+               } else if (richace_is_group(ace)) {
+                       if (!in_owning_group)
+                               continue;
+               } else if (richace_is_unix_id(ace)) {
+                       if (ace->e_flags & ACE4_IDENTIFIER_GROUP) {
+                               if (!in_group_p(ace->u.e_id))
+                                       continue;
+                       } else {
+                               if (current_fsuid() != ace->u.e_id)
+                                       continue;
+                       }
+               } else
+                       goto is_everyone;
+
+               /*
+                * Apply the group file mask to entries other than OWNER@ and
+                * EVERYONE@. This is not required for correct access checking
+                * but ensures that we grant the same permissions as the acl
+                * computed by richacl_apply_masks() would grant.  See
+                * richacl_apply_masks() for a more detailed explanation.
+                */
+               if (richace_is_allow(ace))
+                       ace_mask &= acl->a_group_mask;
+
+is_owner:
+               /* The process is in the owner or group file class. */
+               in_owner_or_group_class = 1;
+
+is_everyone:
+               /*
+                * Check which mask flags the ACE allows or denies.  Flags the
+                * entry covers are removed from @mask either way, so only the
+                * first applicable entry decides each flag.
+                */
+               if (richace_is_deny(ace))
+                       denied |= ace_mask & mask;
+               mask &= ~ace_mask;
+
+               /*
+                * Keep going until we know which file class
+                * the process is in.
+                */
+               if (!mask && in_owner_or_group_class)
+                       break;
+       }
+       /* Flags that no applicable entry covered are denied. */
+       denied |= mask;
+
+       /*
+        * The file class a process is in determines which file mask applies.
+        * Check if that file mask also grants the requested access.
+        */
+       if (current_fsuid() == inode->i_uid)
+               file_mask = acl->a_owner_mask;
+       else if (in_owner_or_group_class)
+               file_mask = acl->a_group_mask;
+       else
+               file_mask = acl->a_other_mask;
+       denied |= requested & ~file_mask;
+
+       return denied ? -EACCES : 0;
+}
+EXPORT_SYMBOL_GPL(richacl_permission);
+
+/**
+ * richacl_inherit  -  compute the inherited acl of a new file
+ * @dir_acl:   acl of the containing directory
+ * @inode:     inode of the new file (create mode in i_mode)
+ *
+ * A directory can have acl entries which files and/or directories created
+ * inside the directory will inherit.  This function computes the acl for such
+ * a new file.  If there is no inheritable acl, it will return %NULL.
+ *
+ * The file permission bits in inode->i_mode must be set to the create mode.
+ * If there is an inheritable acl, the maximum permissions that the acl grants
+ * will be computed and permissions not granted by the acl will be removed from
+ * inode->i_mode.  If there is no inheritable acl, the umask will be applied
+ * instead.
+ *
+ * Returns the new acl, %NULL if nothing is inherited, or ERR_PTR(-ENOMEM) if
+ * the acl cannot be allocated (inode->i_mode is left unchanged in that case).
+ */
+struct richacl *
+richacl_inherit(const struct richacl *dir_acl, struct inode *inode)
+{
+       const struct richace *dir_ace;
+       struct richacl *acl = NULL;
+       struct richace *ace;
+       int count = 0;
+       mode_t mask = ~current_umask();
+
+       if (S_ISDIR(inode->i_mode)) {
+               /* First pass: count the entries a subdirectory inherits. */
+               richacl_for_each_entry(dir_ace, dir_acl) {
+                       if (!richace_is_inheritable(dir_ace))
+                               continue;
+                       count++;
+               }
+               if (!count)
+                       goto mask;
+               acl = richacl_alloc(count);
+               if (!acl)
+                       return ERR_PTR(-ENOMEM);
+               ace = acl->a_entries;
+               /* Second pass: copy them and adjust their inheritance flags. */
+               richacl_for_each_entry(dir_ace, dir_acl) {
+                       if (!richace_is_inheritable(dir_ace))
+                               continue;
+                       memcpy(ace, dir_ace, sizeof(struct richace));
+                       if (dir_ace->e_flags & ACE4_NO_PROPAGATE_INHERIT_ACE)
+                               richace_clear_inheritance_flags(ace);
+                       /*
+                        * An entry meant for files only is kept in the
+                        * subdirectory as inherit-only.
+                        * NOTE(review): this tests @dir_ace, so it also fires
+                        * after the flags were cleared just above -- confirm
+                        * that combination is intended.
+                        */
+                       if ((dir_ace->e_flags & ACE4_FILE_INHERIT_ACE) &&
+                           !(dir_ace->e_flags & ACE4_DIRECTORY_INHERIT_ACE))
+                               ace->e_flags |= ACE4_INHERIT_ONLY_ACE;
+                       ace++;
+               }
+       } else {
+               /* Non-directories only inherit file-inherit entries. */
+               richacl_for_each_entry(dir_ace, dir_acl) {
+                       if (!(dir_ace->e_flags & ACE4_FILE_INHERIT_ACE))
+                               continue;
+                       count++;
+               }
+               if (!count)
+                       goto mask;
+               acl = richacl_alloc(count);
+               if (!acl)
+                       return ERR_PTR(-ENOMEM);
+               ace = acl->a_entries;
+               richacl_for_each_entry(dir_ace, dir_acl) {
+                       if (!(dir_ace->e_flags & ACE4_FILE_INHERIT_ACE))
+                               continue;
+                       memcpy(ace, dir_ace, sizeof(struct richace));
+                       richace_clear_inheritance_flags(ace);
+                       /*
+                        * ACE4_DELETE_CHILD is meaningless for
+                        * non-directories, so clear it.
+                        */
+                       ace->e_mask &= ~ACE4_DELETE_CHILD;
+                       ace++;
+               }
+       }
+
+       richacl_compute_max_masks(acl);
+
+       /*
+        * Ensure that the acl will not grant any permissions beyond the create
+        * mode.
+        */
+       acl->a_owner_mask &= richacl_mode_to_mask(inode->i_mode >> 6);
+       acl->a_group_mask &= richacl_mode_to_mask(inode->i_mode >> 3);
+       acl->a_other_mask &= richacl_mode_to_mask(inode->i_mode);
+       /* With an inherited acl, the masks replace the umask. */
+       mask = ~S_IRWXUGO | richacl_masks_to_mode(acl);
+
+       if (richacl_is_auto_inherit(dir_acl)) {
+               /*
+                * We need to set ACL4_PROTECTED because we are
+                * doing an implicit chmod
+                */
+               acl->a_flags = ACL4_AUTO_INHERIT | ACL4_PROTECTED;
+               richacl_for_each_entry(ace, acl)
+                       ace->e_flags |= ACE4_INHERITED_ACE;
+       }
+
+mask:
+       inode->i_mode &= mask;
+       return acl;
+}
+EXPORT_SYMBOL_GPL(richacl_inherit);
+
+/**
+ * richacl_equiv_mode  -  check if @acl is equivalent to file permission bits
+ * @acl:       acl to check
+ * @mode_p:    the file mode (including the file type)
+ *
+ * If @acl can be fully represented by file permission bits, this function
+ * returns 0, and the file permission bits in @mode_p are set to the equivalent
+ * of @acl.  Otherwise it returns -1 and leaves @mode_p unchanged.
+ *
+ * This function is used to avoid storing richacls on disk if the acl can be
+ * computed from the file permission bits.  It allows user-space to make sure
+ * that a file has no explicit richacl set.
+ */
+int
+richacl_equiv_mode(const struct richacl *acl, mode_t *mode_p)
+{
+       const struct richace *ace = acl->a_entries;
+       unsigned int x;
+       mode_t mode;
+
+       /*
+        * Only an acl consisting of a single EVERYONE@ allow entry, with no
+        * acl flags and no entry flags other than ACE4_SPECIAL_WHO, qualifies.
+        */
+       if (acl->a_count != 1 ||
+           acl->a_flags ||
+           !richace_is_everyone(ace) ||
+           !richace_is_allow(ace) ||
+           ace->e_flags & ~ACE4_SPECIAL_WHO)
+               return -1;
+
+       /*
+        * Figure out the permissions we care about: ACE4_DELETE_CHILD is
+        * meaningless for non-directories, so we ignore it.
+        */
+       x = ~ACE4_POSIX_ALWAYS_ALLOWED;
+       if (!S_ISDIR(*mode_p))
+               x &= ~ACE4_DELETE_CHILD;
+
+       if ((ace->e_mask & x) != (ACE4_POSIX_MODE_ALL & x))
+               return -1;
+
+       /*
+        * The file masks must survive a round trip through the permission
+        * bits unchanged; otherwise they hold more than a mode can express.
+        */
+       mode = richacl_masks_to_mode(acl);
+       if ((acl->a_owner_mask & x) != (richacl_mode_to_mask(mode >> 6) & x) ||
+           (acl->a_group_mask & x) != (richacl_mode_to_mask(mode >> 3) & x) ||
+           (acl->a_other_mask & x) != (richacl_mode_to_mask(mode) & x))
+               return -1;
+
+       *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(richacl_equiv_mode);
diff --git a/fs/richacl_inode.c b/fs/richacl_inode.c

new file mode 100644 (file)

index 0000000..1953a22
--- /dev/null
+++ b/fs/richacl_inode.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (C) 2010  Novell, Inc.
+ * Written by Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/richacl.h>
+
+/**
+ * richacl_may_create  -  helper for implementing iop->may_create
+ * @dir:       directory the permission check is performed on
+ * @isdir:     nonzero if the object to create is a directory
+ * @richacl_permission:        permission function taking an inode and ACE4_* flags
+ *
+ * For a richacl-enabled @dir this requests ACE4_EXECUTE plus either
+ * ACE4_ADD_SUBDIRECTORY or ACE4_ADD_FILE; otherwise it falls back to the
+ * generic write + execute check.  The result of the chosen permission
+ * function is returned unchanged.
+ */
+int
+richacl_may_create(struct inode *dir, int isdir,
+               int (*richacl_permission)(struct inode *, unsigned int))
+{
+       unsigned int want = ACE4_EXECUTE;
+
+       if (!IS_RICHACL(dir))
+               return generic_permission(dir, MAY_WRITE | MAY_EXEC);
+
+       if (isdir)
+               want |= ACE4_ADD_SUBDIRECTORY;
+       else
+               want |= ACE4_ADD_FILE;
+
+       return richacl_permission(dir, want);
+}
+EXPORT_SYMBOL(richacl_may_create);
+
+/*
+ * Sticky-directory restriction: returns nonzero when @dir has S_ISVTX set,
+ * the caller's fsuid owns neither @inode nor @dir, and the caller lacks
+ * CAP_FOWNER.  Returns 0 in every other case.
+ */
+static int
+check_sticky(struct inode *dir, struct inode *inode)
+{
+       if (!(dir->i_mode & S_ISVTX))
+               return 0;
+       if (inode->i_uid == current_fsuid() ||
+           dir->i_uid == current_fsuid())
+               return 0;
+
+       return capable(CAP_FOWNER) ? 0 : 1;
+}
+
+/**
+ * richacl_may_delete  -  helper for implementing iop->may_delete
+ * @dir:       directory on which delete-child / write access is checked
+ * @inode:     inode that is to be deleted
+ * @replace:   when nonzero, additionally require the right to add an entry
+ *             of @inode's kind (file or subdirectory) to @dir
+ * @richacl_permission:        permission function taking an inode and ACE4_* flags
+ *
+ * Returns 0 if the operation is permitted.  Otherwise returns -EPERM from
+ * the sticky-directory check, or the error reported by the permission
+ * function that was consulted.
+ */
+int
+richacl_may_delete(struct inode *dir, struct inode *inode, int replace,
+                  int (*richacl_permission)(struct inode *, unsigned int))
+{
+       int error;
+
+       if (IS_RICHACL(inode)) {
+               /* Search + delete-child rights on the directory ... */
+               error = richacl_permission(dir,
+                               ACE4_EXECUTE | ACE4_DELETE_CHILD);
+               /* ... which the sticky bit may still veto. */
+               if (!error && check_sticky(dir, inode))
+                       error = -EPERM;
+               /*
+                * ACE4_DELETE on the inode itself overrides any failure so
+                * far, including the sticky-bit -EPERM set just above.
+                */
+               if (error && !richacl_permission(inode, ACE4_DELETE))
+                       error = 0;
+               /* A replace must also be allowed to add the new entry. */
+               if (!error && replace)
+                       error = richacl_permission(dir,
+                                       ACE4_EXECUTE | (S_ISDIR(inode->i_mode) ?
+                                       ACE4_ADD_SUBDIRECTORY : ACE4_ADD_FILE));
+       } else {
+               /* No richacls: generic write + exec check plus sticky bit. */
+               error = generic_permission(dir, MAY_WRITE | MAY_EXEC);
+               if (!error && check_sticky(dir, inode))
+                       error = -EPERM;
+       }
+
+       return error;
+}
+EXPORT_SYMBOL(richacl_may_delete);
+
+/**
+ * richacl_inode_permission  -  helper for implementing iop->permission
+ * @inode:     inode to check
+ * @acl:       rich acl of the inode (may be NULL)
+ * @mask:      requested access (ACE4_* bitmask)
+ *
+ * This function is supposed to be used by file systems for implementing the
+ * permission inode operation.
+ *
+ * Returns 0 if the access is granted by @acl (or, without an acl, by the
+ * file mode bits) or by one of the capability overrides below; returns
+ * -EACCES otherwise.
+ */
+int
+richacl_inode_permission(struct inode *inode, const struct richacl *acl,
+                        unsigned int mask)
+{
+       if (acl) {
+               if (!richacl_permission(inode, acl, mask))
+                       return 0;
+       } else {
+               /*
+                * No acl: select the owner, group or other part of i_mode
+                * and compare @mask against the mask derived from it.
+                * NOTE(review): relies on richacl_mode_to_mask() looking
+                * only at the low rwx bits of its argument -- confirm.
+                */
+               int mode = inode->i_mode;
+
+               if (current_fsuid() == inode->i_uid)
+                       mode >>= 6;
+               else if (in_group_p(inode->i_gid))
+                       mode >>= 3;
+               if (!(mask & ~richacl_mode_to_mask(mode)))
+                       return 0;
+       }
+
+       /*
+        * Keep in sync with the capability checks in generic_permission().
+        */
+       /* Only requests within ACE4_POSIX_MODE_ALL may use CAP_DAC_OVERRIDE. */
+       if (!(mask & ~ACE4_POSIX_MODE_ALL)) {
+               /*
+                * Read/write DACs are always overridable.
+                * Executable DACs are overridable if at
+                * least one exec bit is set.
+                */
+               if (!(mask & ACE4_POSIX_MODE_EXEC) || execute_ok(inode))
+                       if (capable(CAP_DAC_OVERRIDE))
+                               return 0;
+       }
+       /*
+        * Searching includes executable on directories, else just read.
+        */
+       if (!(mask & ~(ACE4_READ_DATA | ACE4_LIST_DIRECTORY | ACE4_EXECUTE)) &&
+           (S_ISDIR(inode->i_mode) || !(mask & ACE4_EXECUTE)))
+               if (capable(CAP_DAC_READ_SEARCH))
+                       return 0;
+
+       return -EACCES;
+}
+EXPORT_SYMBOL_GPL(richacl_inode_permission);
+
+/**
+ * richacl_inode_change_ok  -  helper for implementing iop->setattr
+ * @inode:     inode to check
+ * @attr:      requested inode attribute changes
+ * @richacl_permission:        permission function taking an inode and ACE4_* flags
+ *
+ * Keep in sync with inode_change_ok().
+ *
+ * Returns 0 if the requested changes are allowed and -EPERM otherwise.
+ * As a side effect, S_ISGID may be cleared from @attr->ia_mode when the
+ * caller is not in the file's (new) group and lacks CAP_FSETID.
+ */
+int
+richacl_inode_change_ok(struct inode *inode, struct iattr *attr,
+                       int (*richacl_permission)(struct inode *, unsigned int))
+{
+       unsigned int ia_valid = attr->ia_valid;
+
+       /* If force is set do it anyway. */
+       if (ia_valid & ATTR_FORCE)
+               return 0;
+
+       /*
+        * Make sure a caller can chown.  Allowed when the owner "changes"
+        * the uid to its current value, when the caller takes ownership
+        * for itself and holds ACE4_WRITE_OWNER, or with CAP_CHOWN.
+        */
+       if ((ia_valid & ATTR_UID) &&
+           (current_fsuid() != inode->i_uid ||
+            attr->ia_uid != inode->i_uid) &&
+           (current_fsuid() != attr->ia_uid ||
+            richacl_permission(inode, ACE4_WRITE_OWNER)) &&
+           !capable(CAP_CHOWN))
+               goto error;
+
+       /*
+        * Make sure caller can chgrp.  Allowed when the owner picks a group
+        * it belongs to (or keeps the current one), when the caller is in
+        * the new group and holds ACE4_WRITE_OWNER, or with CAP_CHOWN.
+        */
+       if ((ia_valid & ATTR_GID)) {
+               int in_group = in_group_p(attr->ia_gid);
+               if ((current_fsuid() != inode->i_uid ||
+                   (!in_group && attr->ia_gid != inode->i_gid)) &&
+                   (!in_group ||
+                    richacl_permission(inode, ACE4_WRITE_OWNER)) &&
+                   !capable(CAP_CHOWN))
+                       goto error;
+       }
+
+       /* Make sure a caller can chmod: owner, ACE4_WRITE_ACL or CAP_FOWNER. */
+       if (ia_valid & ATTR_MODE) {
+               if (current_fsuid() != inode->i_uid &&
+                   richacl_permission(inode, ACE4_WRITE_ACL) &&
+                   !capable(CAP_FOWNER))
+                       goto error;
+               /* Also check the setgid bit! */
+               if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
+                               inode->i_gid) && !capable(CAP_FSETID))
+                       attr->ia_mode &= ~S_ISGID;
+       }
+
+       /*
+        * Check for setting the inode time: owner, ACE4_WRITE_ATTRIBUTES
+        * or CAP_FOWNER.
+        */
+       if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+               if (current_fsuid() != inode->i_uid &&
+                   richacl_permission(inode, ACE4_WRITE_ATTRIBUTES) &&
+                   !capable(CAP_FOWNER))
+                       goto error;
+       }
+       return 0;
+error:
+       return -EPERM;
+}
+EXPORT_SYMBOL_GPL(richacl_inode_change_ok);
diff --git a/fs/richacl_xattr.c b/fs/richacl_xattr.c

new file mode 100644 (file)

index 0000000..1f6e3f2
--- /dev/null
+++ b/fs/richacl_xattr.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2006, 2010  Novell, Inc.
+ * Written by Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/richacl_xattr.h>
+
+MODULE_LICENSE("GPL");
+
+/**
+ * richacl_from_xattr  -  convert a richacl xattr into the in-memory representation
+ * @value:     start of the xattr payload
+ * @size:      number of bytes available at @value
+ *
+ * The payload is a struct richacl_xattr header followed by a_count entries.
+ * Each entry is a struct richace_xattr immediately followed by a
+ * NUL-terminated "who" string, padded so the next entry starts on a 4-byte
+ * boundary relative to the string.  An empty string means the entry is
+ * identified by the numeric e_id instead.
+ *
+ * Returns the newly allocated acl, ERR_PTR(-EINVAL) if the payload is
+ * malformed, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct richacl *
+richacl_from_xattr(const void *value, size_t size)
+{
+       const struct richacl_xattr *xattr_acl = value;
+       const struct richace_xattr *xattr_ace = (void *)(xattr_acl + 1);
+       struct richacl *acl;
+       struct richace *ace;
+       int count;
+
+       /* The header must fit and carry a known version and valid flags. */
+       if (size < sizeof(struct richacl_xattr) ||
+           xattr_acl->a_version != ACL4_XATTR_VERSION ||
+           (xattr_acl->a_flags & ~ACL4_VALID_FLAGS))
+               return ERR_PTR(-EINVAL);
+
+       count = le16_to_cpu(xattr_acl->a_count);
+       if (count > ACL4_XATTR_MAX_COUNT)
+               return ERR_PTR(-EINVAL);
+
+       acl = richacl_alloc(count);
+       if (!acl)
+               return ERR_PTR(-ENOMEM);
+
+       acl->a_flags = xattr_acl->a_flags;
+       acl->a_owner_mask = le32_to_cpu(xattr_acl->a_owner_mask);
+       if (acl->a_owner_mask & ~ACE4_VALID_MASK)
+               goto fail_einval;
+       acl->a_group_mask = le32_to_cpu(xattr_acl->a_group_mask);
+       if (acl->a_group_mask & ~ACE4_VALID_MASK)
+               goto fail_einval;
+       acl->a_other_mask = le32_to_cpu(xattr_acl->a_other_mask);
+       if (acl->a_other_mask & ~ACE4_VALID_MASK)
+               goto fail_einval;
+
+       richacl_for_each_entry(ace, acl) {
+               const char *who = (void *)(xattr_ace + 1), *end;
+               ssize_t used = (void *)who - value;
+
+               /*
+                * Bounds check before touching the entry: the fixed part
+                * must lie inside the buffer and the who string must be
+                * terminated within the remaining bytes.
+                */
+               if (used > size)
+                       goto fail_einval;
+               end = memchr(who, 0, size - used);
+               if (!end)
+                       goto fail_einval;
+
+               ace->e_type = le16_to_cpu(xattr_ace->e_type);
+               ace->e_flags = le16_to_cpu(xattr_ace->e_flags);
+               ace->e_mask = le32_to_cpu(xattr_ace->e_mask);
+               ace->u.e_id = le32_to_cpu(xattr_ace->e_id);
+
+               if (ace->e_flags & ~ACE4_VALID_FLAGS)
+                       goto fail_einval;
+               if (ace->e_type > ACE4_ACCESS_DENIED_ACE_TYPE ||
+                   (ace->e_mask & ~ACE4_VALID_MASK))
+                       goto fail_einval;
+
+               /* Empty who string: the numeric id must be a real one. */
+               if (who == end) {
+                       if (ace->u.e_id == -1)
+                               goto fail_einval;  /* uid/gid needed */
+               } else if (richace_set_who(ace, who))
+                       goto fail_einval;
+
+               /* Skip the string, its terminator and the padding. */
+               xattr_ace = (void *)who + ALIGN(end - who + 1, 4);
+       }
+
+       return acl;
+
+fail_einval:
+       richacl_put(acl);
+       return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(richacl_from_xattr);
+
+/**
+ * richacl_xattr_size  -  compute the size of the xattr representation of @acl
+ * @acl:       the richacl to measure
+ *
+ * Returns the size of the struct richacl_xattr header plus, for every entry,
+ * a struct richace_xattr followed by either 4 bytes (unix id entries) or the
+ * who string including its terminator, rounded up to a multiple of 4.
+ */
+size_t
+richacl_xattr_size(const struct richacl *acl)
+{
+       const struct richace *entry;
+       size_t total = sizeof(struct richacl_xattr);
+
+       richacl_for_each_entry(entry, acl) {
+               size_t who_len;
+
+               if (richace_is_unix_id(entry))
+                       who_len = 4;
+               else
+                       who_len = ALIGN(strlen(entry->u.e_who) + 1, 4);
+
+               total += sizeof(struct richace_xattr) + who_len;
+       }
+
+       return total;
+}
+EXPORT_SYMBOL_GPL(richacl_xattr_size);
+
+/**
+ * richacl_to_xattr  -  convert @acl into its xattr representation
+ * @acl:       the richacl to convert
+ * @buffer:    buffer of size richacl_xattr_size(@acl) for the result
+ *
+ * Writes the struct richacl_xattr header followed by one struct
+ * richace_xattr plus who field per entry.  Multi-byte fields are stored
+ * little-endian; the who field is zero-padded to a multiple of 4 bytes.
+ */
+void
+richacl_to_xattr(const struct richacl *acl, void *buffer)
+{
+       struct richacl_xattr *hdr = buffer;
+       struct richace_xattr *out = (void *)(hdr + 1);
+       const struct richace *entry;
+
+       hdr->a_version = ACL4_XATTR_VERSION;
+       hdr->a_flags = acl->a_flags;
+       hdr->a_count = cpu_to_le16(acl->a_count);
+
+       hdr->a_owner_mask = cpu_to_le32(acl->a_owner_mask);
+       hdr->a_group_mask = cpu_to_le32(acl->a_group_mask);
+       hdr->a_other_mask = cpu_to_le32(acl->a_other_mask);
+
+       richacl_for_each_entry(entry, acl) {
+               int who_len = 4;
+
+               out->e_type = cpu_to_le16(entry->e_type);
+               out->e_flags = cpu_to_le16(entry->e_flags &
+                       ACE4_VALID_FLAGS);
+               out->e_mask = cpu_to_le32(entry->e_mask);
+
+               if (!richace_is_unix_id(entry)) {
+                       who_len = ALIGN(strlen(entry->u.e_who) + 1, 4);
+
+                       out->e_id = cpu_to_le32(-1);
+                       /* Zero the final word first so the padding is clean. */
+                       memset(out->e_who + who_len - 4, 0, 4);
+                       strcpy(out->e_who, entry->u.e_who);
+               } else {
+                       out->e_id = cpu_to_le32(entry->u.e_id);
+                       memset(out->e_who, 0, 4);
+               }
+
+               /* The next entry starts right after the padded who field. */
+               out = (void *)out->e_who + who_len;
+       }
+}
+EXPORT_SYMBOL_GPL(richacl_to_xattr);
diff --git a/fs/super.c b/fs/super.c

index cf00177..3131940 100644 (file)
--- a/fs/super.c
+++ b/fs/super.c
@@ -714,16 +714,10 @@ rescan:
         return NULL;
  }
  
-/**
- *     do_remount_sb - asks filesystem to change mount options.
- *     @sb:    superblock in question
- *     @flags: numeric part of options
- *     @data:  the rest of options
- *      @force: whether or not to force the change
- *
- *     Alters the mount options of a mounted file system.
- */
-int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+#define REMOUNT_FORCE          1
+#define REMOUNT_SHRINK_DCACHE  2
+
+static int __do_remount_sb(struct super_block *sb, int flags, void *data, int rflags)
  {
         int retval;
         int remount_ro;
@@ -738,7 +732,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
  
         if (flags & MS_RDONLY)
                 acct_auto_close(sb);
-       shrink_dcache_sb(sb);
+       if (rflags & REMOUNT_SHRINK_DCACHE)
+               shrink_dcache_sb(sb);
         sync_filesystem(sb);
  
         remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
@@ -746,7 +741,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
         /* If we are remounting RDONLY and current sb is read/write,
            make sure there are no rw files opened */
         if (remount_ro) {
-               if (force) {
+               if (rflags & REMOUNT_FORCE) {
                         mark_files_ro(sb);
                 } else {
                         retval = sb_prepare_remount_readonly(sb);
@@ -758,7 +753,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
         if (sb->s_op->remount_fs) {
                 retval = sb->s_op->remount_fs(sb, &flags, data);
                 if (retval) {
-                       if (!force)
+                       if (!(rflags & REMOUNT_FORCE))
                                 goto cancel_readonly;
                         /* If forced remount, go ahead despite any errors */
                         WARN(1, "forced remount of a %s fs returned %i\n",
@@ -787,6 +782,21 @@ cancel_readonly:
         return retval;
  }
  
+/**
+ *     do_remount_sb - asks filesystem to change mount options.
+ *     @sb:    superblock in question
+ *     @flags: numeric part of options
+ *     @data:  the rest of options
+ *     @force: whether or not to force the change
+ *
+ *     Alters the mount options of a mounted file system.  This entry point
+ *     always passes REMOUNT_SHRINK_DCACHE to __do_remount_sb() and adds
+ *     REMOUNT_FORCE when @force is nonzero.
+ */
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+       int rflags = REMOUNT_SHRINK_DCACHE;
+
+       if (force)
+               rflags |= REMOUNT_FORCE;
+
+       return __do_remount_sb(sb, flags, data, rflags);
+}
+
  static void do_emergency_remount(struct work_struct *work)
  {
         struct super_block *sb, *p = NULL;
@@ -1103,7 +1113,7 @@ struct dentry *mount_single(struct file_system_type *fs_type,
                 }
                 s->s_flags |= MS_ACTIVE;
         } else {
-               do_remount_sb(s, flags, data, 0);
+               __do_remount_sb(s, flags, data, 0);
         }
         return dget(s->s_root);
  }
diff --git a/include/acpi/processor.h b/include/acpi/processor.h

index 9d65047..ab7efd2 100644 (file)
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -42,6 +42,17 @@
  
  struct acpi_processor_cx;
  
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+/*
+ * Packed record of six acpi_integer fields, referenced from struct
+ * acpi_processor_cx (domain_info) when external control is configured.
+ * NOTE(review): presumably mirrors one ACPI _CSD (C-state dependency)
+ * package as returned by firmware -- confirm against the code that
+ * fills it in.
+ */
+struct acpi_csd_package {
+       acpi_integer num_entries;
+       acpi_integer revision;
+       acpi_integer domain;
+       acpi_integer coord_type;
+       acpi_integer num_processors;
+       acpi_integer index;
+} __attribute__ ((packed));
+#endif
+
  struct acpi_power_register {
         u8 descriptor;
         u16 length;
@@ -63,18 +74,36 @@ struct acpi_processor_cx {
         u32 power;
         u32 usage;
         u64 time;
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         u8 bm_sts_skip;
+#else
+       /* Require raw information for external control logic */
+       struct acpi_power_register reg;
+       u32 csd_count;
+       struct acpi_csd_package *domain_info;
+#endif
         char desc[ACPI_CX_DESC_LEN];
  };
  
  struct acpi_processor_power {
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+       union { /* 'dev' is actually only used for taking its address. */
+#endif
         struct cpuidle_device dev;
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         struct acpi_processor_cx *state;
         unsigned long bm_check_timestamp;
         u32 default_state;
+#else
+       struct {
+#endif
         int count;
         struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL
         int timer_broadcast_on_state;
+#else
+       }; };
+#endif
  };
  
  /* Performance Management */
@@ -290,6 +319,9 @@ static inline void acpi_processor_ppc_exit(void)
  {
         return;
  }
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+int acpi_processor_ppc_has_changed(struct acpi_processor *, int event_flag);
+#else
  static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr,
                                                                 int event_flag)
  {
@@ -307,6 +339,7 @@ static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
  {
         return -ENODEV;
  }
+#endif                         /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
  
  #endif                         /* CONFIG_CPU_FREQ */
  
@@ -355,4 +388,119 @@ static inline void acpi_thermal_cpufreq_exit(void)
  }
  #endif
  
+/*
+ * Following are interfaces geared to external processor PM control
+ * logic like a VMM
+ */
+/* Events notified to external control logic */
+#define PROCESSOR_PM_INIT      1
+#define PROCESSOR_PM_CHANGE    2
+#define PROCESSOR_HOTPLUG      3
+
+/* Objects for the PM events */
+#define PM_TYPE_IDLE           0
+#define PM_TYPE_PERF           1
+#define PM_TYPE_THR            2
+#define PM_TYPE_MAX            3
+
+/* Processor hotplug events */
+#define HOTPLUG_TYPE_ADD       0
+#define HOTPLUG_TYPE_REMOVE    1
+
+#ifdef CONFIG_PROCESSOR_EXTERNAL_CONTROL
+/*
+ * Callback table registered by the external control logic.  pm_ops[] is
+ * indexed by the PM_TYPE_* constants; a NULL slot means that kind of power
+ * management is not handled externally (see the helpers below).
+ */
+struct processor_extcntl_ops {
+       /* Transfer processor PM events to external control logic */
+       int (*pm_ops[PM_TYPE_MAX])(struct acpi_processor *pr, int event);
+       /* Notify physical processor status to external control logic */
+       int (*hotplug)(struct acpi_processor *pr, int type);
+};
+extern const struct processor_extcntl_ops *processor_extcntl_ops;
+
+/* Nonzero if an external control ops table is present. */
+static inline int processor_cntl_external(void)
+{
+       return (processor_extcntl_ops != NULL);
+}
+
+/* Nonzero if idle (PM_TYPE_IDLE) events have an external handler. */
+static inline int processor_pm_external(void)
+{
+       return processor_cntl_external() &&
+               (processor_extcntl_ops->pm_ops[PM_TYPE_IDLE] != NULL);
+}
+
+/* Nonzero if performance (PM_TYPE_PERF) events have an external handler. */
+static inline int processor_pmperf_external(void)
+{
+       return processor_cntl_external() &&
+               (processor_extcntl_ops->pm_ops[PM_TYPE_PERF] != NULL);
+}
+
+/* Nonzero if throttling (PM_TYPE_THR) events have an external handler. */
+static inline int processor_pmthr_external(void)
+{
+       return processor_cntl_external() &&
+               (processor_extcntl_ops->pm_ops[PM_TYPE_THR] != NULL);
+}
+
+extern int processor_notify_external(struct acpi_processor *pr,
+                       int event, int type);
+extern int processor_extcntl_prepare(struct acpi_processor *pr);
+extern int acpi_processor_get_performance_info(struct acpi_processor *pr);
+extern int acpi_processor_get_psd(struct acpi_processor *pr);
+#else
+/* Without external control every query reports "not external" (0). */
+static inline int processor_cntl_external(void) {return 0;}
+static inline int processor_pm_external(void) {return 0;}
+static inline int processor_pmperf_external(void) {return 0;}
+static inline int processor_pmthr_external(void) {return 0;}
+/* Stub: nothing to notify, always returns 0. */
+static inline int processor_notify_external(struct acpi_processor *pr,
+                       int event, int type)
+{
+       return 0;
+}
+/* Stub: nothing to prepare, always returns 0. */
+static inline int processor_extcntl_prepare(struct acpi_processor *pr)
+{
+       return 0;
+}
+#endif /* CONFIG_PROCESSOR_EXTERNAL_CONTROL */
+
+#ifdef CONFIG_XEN
+/*
+ * Copy every field of the ACPI register descriptor @apct into the Xen
+ * counterpart @xpct, member by member (the two are distinct struct types).
+ */
+static inline void xen_convert_pct_reg(struct xen_pct_register *xpct,
+       struct acpi_pct_register *apct)
+{
+       xpct->descriptor = apct->descriptor;
+       xpct->length     = apct->length;
+       xpct->space_id   = apct->space_id;
+       xpct->bit_width  = apct->bit_width;
+       xpct->bit_offset = apct->bit_offset;
+       xpct->reserved   = apct->reserved;
+       xpct->address    = apct->address;
+}
+
+/*
+ * Convert @state_count entries of the ACPI performance state array @apss
+ * into the Xen array @xpss, copying each entry field by field.  Nothing is
+ * copied when @state_count is zero or negative.
+ */
+static inline void xen_convert_pss_states(struct xen_processor_px *xpss,
+       struct acpi_processor_px *apss, int state_count)
+{
+       int idx;
+
+       for (idx = 0; idx < state_count; idx++) {
+               struct xen_processor_px *dst = &xpss[idx];
+               struct acpi_processor_px *src = &apss[idx];
+
+               dst->core_frequency     = src->core_frequency;
+               dst->power              = src->power;
+               dst->transition_latency = src->transition_latency;
+               dst->bus_master_latency = src->bus_master_latency;
+               dst->control            = src->control;
+               dst->status             = src->status;
+       }
+}
+
+/*
+ * Copy the five fields of the ACPI package @apsd (num_entries, revision,
+ * domain, coord_type, num_processors) into the Xen counterpart @xpsd.
+ */
+static inline void xen_convert_psd_pack(struct xen_psd_package *xpsd,
+       struct acpi_psd_package *apsd)
+{
+       xpsd->num_entries    = apsd->num_entries;
+       xpsd->revision       = apsd->revision;
+       xpsd->domain         = apsd->domain;
+       xpsd->coord_type     = apsd->coord_type;
+       xpsd->num_processors = apsd->num_processors;
+}
+
+extern int xen_pcpu_hotplug(int type);
+extern int xen_pcpu_index(uint32_t id, bool is_acpiid);
+#endif /* CONFIG_XEN */
+
  #endif
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h

index 8aeadf6..dff4842 100644 (file)
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -359,6 +359,8 @@
                 MEM_KEEP(exit.rodata)                                   \
         }                                                               \
                                                                         \
+       EH_FRAME                                                        \
+                                                                       \
         /* Built-in module parameters. */                               \
         __param : AT(ADDR(__param) - LOAD_OFFSET) {                     \
                 VMLINUX_SYMBOL(__start___param) = .;                    \
@@ -803,3 +805,23 @@
         BSS(bss_align)                                                  \
         . = ALIGN(stop_align);                                          \
         VMLINUX_SYMBOL(__bss_stop) = .;
+
+/*
+ * EH_FRAME - output sections for the unwind tables.
+ *
+ * With CONFIG_STACK_UNWIND this emits .eh_frame_hdr and .eh_frame, each
+ * 8-byte aligned and bracketed by __start_unwind_hdr/__end_unwind_hdr and
+ * __start_unwind/__end_unwind respectively.  Without it the macro expands
+ * to nothing.
+ */
+#ifdef CONFIG_STACK_UNWIND
+#define EH_FRAME                                                       \
+               /* Unwind data binary search table */                   \
+               . = ALIGN(8);                                           \
+               .eh_frame_hdr : AT(ADDR(.eh_frame_hdr) - LOAD_OFFSET) { \
+                       VMLINUX_SYMBOL(__start_unwind_hdr) = .;         \
+                       *(.eh_frame_hdr)                                \
+                       VMLINUX_SYMBOL(__end_unwind_hdr) = .;           \
+               }                                                       \
+               /* Unwind data */                                       \
+               . = ALIGN(8);                                           \
+               .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {         \
+                       VMLINUX_SYMBOL(__start_unwind) = .;             \
+                       *(.eh_frame)                                    \
+                       VMLINUX_SYMBOL(__end_unwind) = .;               \
+               }
+#else
+#define EH_FRAME
+#endif
diff --git a/include/linux/acpi.h b/include/linux/acpi.h

index f421dd8..5f4b780 100644 (file)
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -247,6 +247,8 @@ int acpi_check_region(resource_size_t start, resource_size_t n,
  
  int acpi_resources_are_enforced(void);
  
+int acpi_pci_get_root_seg_bbn(char *hid, char *uid, int *seg, int *bbn);
+
  #ifdef CONFIG_PM_SLEEP
  void __init acpi_no_s4_hw_signature(void);
  void __init acpi_old_suspend_ordering(void);
diff --git a/include/linux/aio.h b/include/linux/aio.h

index 2314ad8..4b2724f 100644 (file)
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -200,6 +200,12 @@ struct kioctx {
  
         struct delayed_work     wq;
  
+#ifdef CONFIG_EPOLL
+       /* poll integration */
+       wait_queue_head_t       poll_wait;
+       struct file             *file;
+#endif
+
         struct rcu_head         rcu_head;
  };
  
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h

index 4d4ac24..fbb94f4 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -976,7 +976,11 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
  enum blk_default_limits {
         BLK_MAX_SEGMENTS        = 128,
         BLK_SAFE_MAX_SECTORS    = 255,
+#ifndef CONFIG_KERNEL_DESKTOP
+       BLK_DEF_MAX_SECTORS     = 2048,
+#else
         BLK_DEF_MAX_SECTORS     = 1024,
+#endif
         BLK_MAX_SEGMENT_SIZE    = 65536,
         BLK_SEG_BOUNDARY_MASK   = 0xFFFFFFFFUL,
  };
diff --git a/include/linux/bootsplash.h b/include/linux/bootsplash.h

new file mode 100644 (file)

index 0000000..b850a96
--- /dev/null
+++ b/include/linux/bootsplash.h
@@ -0,0 +1,87 @@
+/*
+ *    linux/drivers/video/bootsplash/bootsplash.h - splash screen definition.
+ *
+ *     (w) 2001-2003 by Volker Poplawski, <volker@poplawski.de>
+ *             Stefan Reinauer, <stepan@suse.de>
+ *
+ *
+ *     idea and SuSE screen work by Ken Wimer, <wimer@suse.de>
+ */
+
+#ifndef __BOOTSPLASH_H
+#define __BOOTSPLASH_H
+
+# ifdef CONFIG_BOOTSPLASH
+
+struct fb_info;
+union pt {
+       u32 *ul;
+       u16 *us;
+       u8  *ub;
+};
+
+/*
+ * Colour formats handled by the splash code.  The numeric values are what
+ * splash_octpp() turns into a byte count.
+ * NOTE(review): the values look like bits per pixel, with SPLASH_DEPTH_24
+ * being 24-bit colour stored in 32-bit pixels -- confirm.
+ */
+enum splash_color_format {
+       SPLASH_DEPTH_UNKNOWN = 0,
+       SPLASH_DEPTH_15 = 15,
+       SPLASH_DEPTH_16 = 16,
+       SPLASH_DEPTH_24_PACKED = 24,
+       SPLASH_DEPTH_24 = 32
+};
+
+/*
+ * Octets per pixel for colour format @cf: yields 2, 2, 3 and 4 for the
+ * values 15, 16, 24 and 32 above.  @cf is parenthesised so that an
+ * expression argument (e.g. "a ? b : c" or "a | b") is converted as a
+ * whole instead of having the cast and the "+ 1" bind to only part of it.
+ */
+#define splash_octpp(cf) (((int)(cf) + 1) >> 3)
+
+struct vc_data;
+struct fb_info;
+struct fb_cursor;
+struct splash_data;
+
+/* splash.c */
+extern int splash_prepare(struct vc_data *, struct fb_info *);
+extern void splash_init(void);
+extern int splash_verbose(void);
+
+/* splash_render.c */
+extern void splash_putcs(struct vc_data *vc, struct fb_info *info,
+                       const unsigned short *s, int count,
+                        int ypos, int xpos);
+extern void splash_sync_region(struct fb_info *info, int x, int y,
+                              int width, int height);
+extern void splashcopy(u8 *dst, u8 *src, int height, int width,
+                      int dstbytes, int srcbytes, int octpp);
+extern void splash_clear(struct vc_data *vc, struct fb_info *info, int sy,
+                       int sx, int height, int width);
+extern void splash_bmove(struct vc_data *vc, struct fb_info *info, int sy,
+                       int sx, int dy, int dx, int height, int width);
+extern void splash_clear_margins(struct vc_data *vc, struct fb_info *info,
+                       int bottom_only);
+extern int splash_cursor(struct fb_info *info, struct fb_cursor *cursor);
+extern void splash_bmove_redraw(struct vc_data *vc, struct fb_info *info,
+                       int y, int sx, int dx, int width);
+extern void splash_blank(struct vc_data *vc, struct fb_info *info,
+                       int blank);
+
+/*
+ * Accessors for the splash state hanging off @x (any pointer to a structure
+ * with a splash_data member).  @x is parenthesised so that pointer
+ * expressions such as "p + 1" can be passed safely.
+ */
+#  define SPLASH_VERBOSE() splash_verbose()
+#  define SPLASH_DATA(x) ((x)->splash_data)
+#  define TEXT_WIDTH_FROM_SPLASH_DATA(x) ((x)->splash_data->splash_vc_text_wi)
+#  define TEXT_HIGHT_FROM_SPLASH_DATA(x) ((x)->splash_data->splash_vc_text_he)
+/* vt.c */
+extern void con_remap_def_color(struct vc_data *vc, int new_color);
+
+# else
+#  define splash_init()
+#  define splash_verbose() 0
+#  define SPLASH_VERBOSE()
+#  define splash_blank(vc, info, blank)
+#  define splash_bmove(vc, info, sy, sx, dy, dx, height, width)
+#  define splash_bmove_redraw(vc, info, sy, sx, dx, width)
+#  define splash_cursor(info, cursor)
+#  define splash_clear(vc, info, sy, sx, height, width)
+#  define splash_clear_margins(vc, info, bottom_only)
+#  define splash_putcs(vc, info, s, count, ypos, xpos)
+
+#  define SPLASH_DATA(x) 0
+#  define TEXT_WIDTH_FROM_SPLASH_DATA(x) 0
+#  define TEXT_HIGHT_FROM_SPLASH_DATA(x) 0
+# endif
+
+#endif
diff --git a/include/linux/console.h b/include/linux/console.h

index 7201ce4..670afb7 100644 (file)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -73,6 +73,7 @@ extern const struct consw dummy_con;  /* dummy console buffer */
  extern const struct consw vga_con;     /* VGA text console */
  extern const struct consw newport_con; /* SGI Newport console  */
  extern const struct consw prom_con;    /* SPARC PROM console */
+extern bool console_use_vt;
  
  int con_is_bound(const struct consw *csw);
  int register_con_driver(const struct consw *csw, int first, int last);
diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h

index 7f0c329..8e41552 100644 (file)
--- a/include/linux/console_struct.h
+++ b/include/linux/console_struct.h
@@ -107,6 +107,9 @@ struct vc_data {
         unsigned long   vc_uni_pagedir;
         unsigned long   *vc_uni_pagedir_loc;  /* [!] Location of uni_pagedir variable for this console */
         bool vc_panic_force_write; /* when oops/panic this VC can accept forced output/blanking */
+#ifdef CONFIG_BOOTSPLASH
+       struct splash_data *vc_splash_data;
+#endif
         /* additional information is in vt_kern.h */
  };
  
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h

index b60f6ba..e89fcb7 100644 (file)
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -323,7 +323,7 @@ static inline unsigned int cpufreq_get(unsigned int cpu)
  #endif
  
  /* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */
-#ifdef CONFIG_CPU_FREQ
+#if defined(CONFIG_CPU_FREQ) || defined(CONFIG_PROCESSOR_EXTERNAL_CONTROL)
  unsigned int cpufreq_quick_get(unsigned int cpu);
  unsigned int cpufreq_quick_get_max(unsigned int cpu);
  #else
diff --git a/include/linux/device.h b/include/linux/device.h

index 5ad17cc..a03b0e8 100644 (file)
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -879,6 +879,41 @@ extern const char *dev_driver_string(const struct device *dev);
  extern int __dev_printk(const char *level, const struct device *dev,
                         struct va_format *vaf);
  extern __printf(3, 4)
+
+#if defined(KMSG_COMPONENT) && (defined(CONFIG_KMSG_IDS) || defined(__KMSG_CHECKER))
+/* dev_printk_hash for message documentation */
+#if defined(__KMSG_CHECKER) && defined(KMSG_COMPONENT)
+
+/* generate magic string for scripts/kmsg-doc to parse */
+#define dev_printk_hash(level, dev, format, arg...) \
+       __KMSG_DEV(level _FMT_ format _ARGS_ dev, ## arg _END_)
+
+#elif defined(CONFIG_KMSG_IDS) && defined(KMSG_COMPONENT)
+
+int printk_dev_hash(const char *, const char *, const char *, ...);
+#define dev_printk_hash(level, dev, format, arg...) \
+       printk_dev_hash(level "%s.%06x: ", dev_driver_string(dev), \
+                       "%s: " format, dev_name(dev), ## arg)
+
+#endif
+
+#define dev_printk(level, dev, format, arg...)         \
+       dev_printk_hash(level , dev, format, ## arg)
+#define dev_emerg(dev, format, arg...)         \
+       dev_printk_hash(KERN_EMERG , dev , format , ## arg)
+#define dev_alert(dev, format, arg...)         \
+       dev_printk_hash(KERN_ALERT , dev , format , ## arg)
+#define dev_crit(dev, format, arg...)          \
+       dev_printk_hash(KERN_CRIT , dev , format , ## arg)
+#define dev_err(dev, format, arg...)           \
+       dev_printk_hash(KERN_ERR , dev , format , ## arg)
+#define dev_warn(dev, format, arg...)          \
+       dev_printk_hash(KERN_WARNING , dev , format , ## arg)
+#define dev_notice(dev, format, arg...)                \
+       dev_printk_hash(KERN_NOTICE , dev , format , ## arg)
+#define _dev_info(dev, format, arg...)         \
+       dev_printk_hash(KERN_INFO , dev , format , ## arg)
+#else
  int dev_printk(const char *level, const struct device *dev,
                const char *fmt, ...)
         ;
@@ -896,7 +931,7 @@ extern __printf(2, 3)
  int dev_notice(const struct device *dev, const char *fmt, ...);
  extern __printf(2, 3)
  int _dev_info(const struct device *dev, const char *fmt, ...);
-
+#endif
  #else
  
  static inline int __dev_printk(const char *level, const struct device *dev,
diff --git a/include/linux/dm-region-hash.h b/include/linux/dm-region-hash.h

index 9e2a7a4..7e7c83d 100644 (file)
--- a/include/linux/dm-region-hash.h
+++ b/include/linux/dm-region-hash.h
@@ -49,6 +49,7 @@ struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh);
   */
  region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio);
  sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region);
+region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector);
  void *dm_rh_region_context(struct dm_region *reg);
  
  /*
@@ -72,11 +73,14 @@ void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled);
  int dm_rh_flush(struct dm_region_hash *rh);
  
  /* Inc/dec pending count on regions. */
+void dm_rh_inc(struct dm_region_hash *rh, region_t region);
  void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios);
  void dm_rh_dec(struct dm_region_hash *rh, region_t region);
  
  /* Delay bios on regions. */
  void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
+void dm_rh_delay_by_region(struct dm_region_hash *rh, struct bio *bio,
+                          region_t region);
  
  void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio);
  
diff --git a/include/linux/efi.h b/include/linux/efi.h

index ec45ccd..49ff228 100644 (file)
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -451,7 +451,9 @@ typedef struct {
   * All runtime access to EFI goes through this structure:
   */
  extern struct efi {
+#ifndef CONFIG_XEN
         efi_system_table_t *systab;     /* EFI system table */
+#endif
         unsigned int runtime_version;   /* Runtime services version */
         unsigned long mps;              /* MPS table */
         unsigned long acpi;             /* ACPI table  (IA64 ext 0.71) */
@@ -473,8 +475,10 @@ extern struct efi {
         efi_update_capsule_t *update_capsule;
         efi_query_capsule_caps_t *query_capsule_caps;
         efi_get_next_high_mono_count_t *get_next_high_mono_count;
+#ifndef CONFIG_XEN
         efi_reset_system_t *reset_system;
         efi_set_virtual_address_map_t *set_virtual_address_map;
+#endif
  } efi;
  
  static inline int
diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h

index 278e3ef..9f9816a 100644 (file)
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -52,7 +52,7 @@
  4484:.balign 4                         ;       \
  .popsection                            ;
  
-#define ELFNOTE(name, type, desc)              \
+#define ELFNOTE(name, type, desc...)           \
         ELFNOTE_START(name, type, "")           \
                 desc                    ;       \
         ELFNOTE_END
diff --git a/include/linux/fb.h b/include/linux/fb.h

index d31cb68..707b36b 100644 (file)
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -883,6 +883,10 @@ struct fb_info {
         void *fbcon_par;                /* fbcon use-only private area */
         /* From here on everything is device dependent */
         void *par;
+#ifdef CONFIG_BOOTSPLASH
+       struct splash_data *splash_data;
+       char fb_cursordata[64];
+#endif
         /* we need the PCI or similar aperture base/size not
            smem_start/size as smem_start may just be an object
            allocated inside the aperture so may not actually overlap */
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h

new file mode 100644 (file)

index 0000000..3e46c31
--- /dev/null
+++ b/include/linux/frontswap.h
@@ -0,0 +1,126 @@
+#ifndef _LINUX_FRONTSWAP_H
+#define _LINUX_FRONTSWAP_H
+
+#include <linux/swap.h>
+#include <linux/mm.h>
+#include <linux/bitops.h>
+
+struct frontswap_ops {
+       void (*init)(unsigned);
+       int (*put_page)(unsigned, pgoff_t, struct page *);
+       int (*get_page)(unsigned, pgoff_t, struct page *);
+       void (*invalidate_page)(unsigned, pgoff_t);
+       void (*invalidate_area)(unsigned);
+};
+
+extern int frontswap_enabled;
+extern struct frontswap_ops
+       frontswap_register_ops(struct frontswap_ops *ops);
+extern void frontswap_shrink(unsigned long);
+extern unsigned long frontswap_curr_pages(void);
+
+extern void __frontswap_init(unsigned type);
+extern int __frontswap_put_page(struct page *page);
+extern int __frontswap_get_page(struct page *page);
+extern void __frontswap_invalidate_page(unsigned, pgoff_t);
+extern void __frontswap_invalidate_area(unsigned);
+
+#ifdef CONFIG_FRONTSWAP
+
+/* Nonzero iff frontswap is enabled, @sis has a map and bit @offset is set. */
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+       if (!frontswap_enabled || !sis->frontswap_map)
+               return 0;
+
+       return test_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+       if (frontswap_enabled && sis->frontswap_map)    /* else a no-op */
+               set_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+       if (frontswap_enabled && sis->frontswap_map)    /* else a no-op */
+               clear_bit(offset, sis->frontswap_map);
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+                                    unsigned long *map)
+{
+       p->frontswap_map = map;
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+       return p->frontswap_map;
+}
+#else
+/* all inline routines become no-ops and all externs are ignored */
+
+#define frontswap_enabled (0)
+
+static inline int frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
+{
+       return 0;
+}
+
+static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+}
+
+static inline void frontswap_map_set(struct swap_info_struct *p,
+                                    unsigned long *map)
+{
+}
+
+static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
+{
+       return NULL;
+}
+#endif
+
+/* Hand @page to __frontswap_put_page(); -1 if frontswap is not enabled. */
+static inline int frontswap_put_page(struct page *page)
+{
+       if (!frontswap_enabled)
+               return -1;
+
+       return __frontswap_put_page(page);
+}
+
+/* Hand @page to __frontswap_get_page(); -1 if frontswap is not enabled. */
+static inline int frontswap_get_page(struct page *page)
+{
+       if (!frontswap_enabled)
+               return -1;
+
+       return __frontswap_get_page(page);
+}
+
+static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+       if (frontswap_enabled)  /* constant (0) without CONFIG_FRONTSWAP */
+               __frontswap_invalidate_page(type, offset);
+}
+
+static inline void frontswap_invalidate_area(unsigned type)
+{
+       if (frontswap_enabled)  /* constant (0) without CONFIG_FRONTSWAP */
+               __frontswap_invalidate_area(type);
+}
+
+static inline void frontswap_init(unsigned type)
+{
+       if (frontswap_enabled)  /* constant (0) without CONFIG_FRONTSWAP */
+               __frontswap_init(type);
+}
+
+#endif /* _LINUX_FRONTSWAP_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 25c40b9..9a66809 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -205,7 +205,7 @@ struct inodes_stat_t {
  #define MS_VERBOSE     32768   /* War is peace. Verbosity is silence.
                                    MS_VERBOSE is deprecated. */
  #define MS_SILENT      32768
-#define MS_POSIXACL    (1<<16) /* VFS does not apply the umask */
+#define MS_POSIXACL    (1<<16) /* Supports POSIX ACLs */
  #define MS_UNBINDABLE  (1<<17) /* change to unbindable */
  #define MS_PRIVATE     (1<<18) /* change to private */
  #define MS_SLAVE       (1<<19) /* change to slave */
@@ -214,6 +214,7 @@ struct inodes_stat_t {
  #define MS_KERNMOUNT   (1<<22) /* this is a kern_mount call */
  #define MS_I_VERSION   (1<<23) /* Update inode I_version field */
  #define MS_STRICTATIME (1<<24) /* Always perform atime updates */
+#define MS_RICHACL     (1<<25) /* Supports richacls */
  #define MS_NOSEC       (1<<28)
  #define MS_BORN                (1<<29)
  #define MS_ACTIVE      (1<<30)
@@ -274,6 +275,7 @@ struct inodes_stat_t {
  #define IS_APPEND(inode)       ((inode)->i_flags & S_APPEND)
  #define IS_IMMUTABLE(inode)    ((inode)->i_flags & S_IMMUTABLE)
  #define IS_POSIXACL(inode)     __IS_FLG(inode, MS_POSIXACL)
+#define IS_RICHACL(inode)      __IS_FLG(inode, MS_RICHACL)
  
  #define IS_DEADDIR(inode)      ((inode)->i_flags & S_DEAD)
  #define IS_NOCMTIME(inode)     ((inode)->i_flags & S_NOCMTIME)
@@ -283,6 +285,12 @@ struct inodes_stat_t {
  #define IS_AUTOMOUNT(inode)    ((inode)->i_flags & S_AUTOMOUNT)
  #define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
  
+/*
+ * IS_ACL() tells the VFS not to apply the umask and, when it is
+ * defined, to use iop->check_acl for ACL permission checks.
+ */
+#define IS_ACL(inode)          __IS_FLG(inode, MS_POSIXACL | MS_RICHACL)
+
  /* the read-only stuff doesn't really belong here, but any other place is
     probably as bad and I don't want to create yet another include file. */
  
@@ -1664,6 +1672,10 @@ struct inode_operations {
         void (*truncate_range)(struct inode *, loff_t, loff_t);
         int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                       u64 len);
+       int (*may_create) (struct inode *, int);
+       int (*may_delete) (struct inode *, struct inode *, int);
+
+
  } ____cacheline_aligned;
  
  struct seq_file;
diff --git a/include/linux/highmem.h b/include/linux/highmem.h

index d3999b4..bf37041 100644 (file)
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -214,12 +214,14 @@ alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
         return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
  }
  
+#ifndef __HAVE_ARCH_CLEAR_HIGHPAGE
  static inline void clear_highpage(struct page *page)
  {
         void *kaddr = kmap_atomic(page);
         clear_page(kaddr);
         kunmap_atomic(kaddr);
  }
+#endif
  
  static inline void zero_user_segments(struct page *page,
         unsigned start1, unsigned end1,
@@ -273,6 +275,8 @@ static inline void copy_user_highpage(struct page *to, struct page *from,
  
  #endif
  
+#ifndef __HAVE_ARCH_COPY_HIGHPAGE
+
  static inline void copy_highpage(struct page *to, struct page *from)
  {
         char *vfrom, *vto;
@@ -284,4 +288,6 @@ static inline void copy_highpage(struct page *to, struct page *from)
         kunmap_atomic(vfrom);
  }
  
+#endif
+
  #endif /* _LINUX_HIGHMEM_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index 2aea5d2..8a888d6 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -392,6 +392,11 @@ static inline int disable_irq_wake(unsigned int irq)
  }
  #endif /* CONFIG_GENERIC_HARDIRQS */
  
+#ifdef CONFIG_HAVE_IRQ_IGNORE_UNHANDLED
+int irq_ignore_unhandled(unsigned int irq);
+#else
+#define irq_ignore_unhandled(irq) 0
+#endif
  
  #ifdef CONFIG_IRQ_FORCED_THREADING
  extern bool force_irqthreads;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h

index 645231c..5d6ebde 100644 (file)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -358,9 +358,11 @@ extern int panic_timeout;
  extern int panic_on_oops;
  extern int panic_on_unrecovered_nmi;
  extern int panic_on_io_nmi;
+extern int unsupported;
  extern int sysctl_panic_on_stackoverflow;
  extern const char *print_tainted(void);
  extern void add_taint(unsigned flag);
+extern void add_nonfatal_taint(unsigned flag);
  extern int test_taint(unsigned flag);
  extern unsigned long get_taint(void);
  extern int root_mountflags;
@@ -391,6 +393,15 @@ extern enum system_states {
  #define TAINT_FIRMWARE_WORKAROUND      11
  #define TAINT_OOT_MODULE               12
  
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+/*
+ * Take the upper bits to hopefully allow them
+ * to stay the same for more than one release.
+ */
+#define TAINT_NO_SUPPORT               30
+#define TAINT_EXTERNAL_SUPPORT         31
+#endif
+
  extern const char hex_asc[];
  #define hex_asc_lo(x)  hex_asc[((x) & 0x0f)]
  #define hex_asc_hi(x)  hex_asc[((x) & 0xf0) >> 4]
diff --git a/include/linux/kexec.h b/include/linux/kexec.h

index 0d7d6a1..a296b88 100644 (file)
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -56,6 +56,13 @@
                             KEXEC_CORE_NOTE_DESC_BYTES )
  #endif
  
+#ifndef KEXEC_ARCH_HAS_PAGE_MACROS
+#define kexec_page_to_pfn(page)  page_to_pfn(page)
+#define kexec_pfn_to_page(pfn)   pfn_to_page(pfn)
+#define kexec_virt_to_phys(addr) virt_to_phys(addr)
+#define kexec_phys_to_virt(addr) phys_to_virt(addr)
+#endif
+
  /*
   * This structure is used to hold the arguments that are used when loading
   * kernel binaries.
@@ -122,6 +129,12 @@ struct kimage {
  extern void machine_kexec(struct kimage *image);
  extern int machine_kexec_prepare(struct kimage *image);
  extern void machine_kexec_cleanup(struct kimage *image);
+#ifdef CONFIG_XEN
+extern int xen_machine_kexec_load(struct kimage *image);
+extern void xen_machine_kexec_unload(struct kimage *image);
+extern void xen_machine_kexec_setup_resources(void);
+extern void xen_machine_kexec_register_resources(struct resource *res);
+#endif
  extern asmlinkage long sys_kexec_load(unsigned long entry,
                                         unsigned long nr_segments,
                                         struct kexec_segment __user *segments,
@@ -204,8 +217,15 @@ extern struct kimage *kexec_crash_image;
  #define VMCOREINFO_BYTES           (4096)
  #define VMCOREINFO_NOTE_NAME       "VMCOREINFO"
  #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4)
+#if !defined(CONFIG_XEN) || !defined(CONFIG_X86)
  #define VMCOREINFO_NOTE_SIZE       (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \
                                     + VMCOREINFO_NOTE_NAME_BYTES)
+#else
+#define VMCOREINFO_NOTE_SIZE       ALIGN(KEXEC_NOTE_HEAD_BYTES*2 \
+                                        + VMCOREINFO_BYTES \
+                                        + VMCOREINFO_NOTE_NAME_BYTES, \
+                                        PAGE_SIZE)
+#endif
  
  /* Location of a reserved region to hold the crash kernel.
   */
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 74aa71b..194aaf8 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -116,7 +116,12 @@ extern unsigned int kobjsize(const void *objp);
  
  #define VM_CAN_NONLINEAR 0x08000000    /* Has ->fault & does nonlinear pages */
  #define VM_MIXEDMAP    0x10000000      /* Can contain "struct page" and pure PFN pages */
+#ifndef CONFIG_XEN
  #define VM_SAO         0x20000000      /* Strong Access Ordering (powerpc) */
+#else
+#define VM_SAO         0
+#define VM_FOREIGN     0x20000000      /* Has pages belonging to another VM */
+#endif
  #define VM_PFN_AT_MMAP 0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
  #define VM_MERGEABLE   0x80000000      /* KSM may merge identical pages */
  
@@ -145,6 +150,12 @@ extern unsigned int kobjsize(const void *objp);
   */
  #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
  
+#ifdef CONFIG_XEN
+struct vm_foreign_map {
+       struct page **map;
+};
+#endif
+
  /*
   * mapping from the currently active vm_flags protection bits (the
   * low four bits) to a page protection mask..
@@ -216,6 +227,17 @@ struct vm_operations_struct {
          */
         int (*access)(struct vm_area_struct *vma, unsigned long addr,
                       void *buf, int len, int write);
+
+#ifdef CONFIG_XEN
+       /* Area-specific function for clearing the PTE at @ptep. Returns the
+        * original value of @ptep. */
+       pte_t (*zap_pte)(struct vm_area_struct *vma,
+                        unsigned long addr, pte_t *ptep, int is_fullmm);
+
+       /* called before close() to indicate no more pages should be mapped */
+       void (*unmap)(struct vm_area_struct *area);
+#endif
+
  #ifdef CONFIG_NUMA
         /*
          * set_policy() op must add a reference to any non-NULL @new mempolicy
@@ -1421,7 +1443,11 @@ int write_one_page(struct page *page, int wait);
  void task_dirty_inc(struct task_struct *tsk);
  
  /* readahead.c */
+#ifndef CONFIG_KERNEL_DESKTOP
+#define VM_MAX_READAHEAD       512     /* kbytes */
+#else
  #define VM_MAX_READAHEAD       128     /* kbytes */
+#endif
  #define VM_MIN_READAHEAD       16      /* kbytes (includes current page) */
  
  int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
diff --git a/include/linux/module.h b/include/linux/module.h

index fbcafe2..aa2295e 100644 (file)
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -287,6 +287,9 @@ struct module
         /* Size of RO sections of the module (text+rodata) */
         unsigned int init_ro_size, core_ro_size;
  
+       /* The handle returned from unwind_add_table. */
+       void *unwind_info;
+
         /* Arch-specific module values */
         struct mod_arch_specific arch;
  
@@ -387,6 +390,7 @@ struct module *__module_address(unsigned long addr);
  bool is_module_address(unsigned long addr);
  bool is_module_percpu_address(unsigned long addr);
  bool is_module_text_address(unsigned long addr);
+const char *supported_printable(int taint);
  
  static inline int within_module_core(unsigned long addr, struct module *mod)
  {
diff --git a/include/linux/msi.h b/include/linux/msi.h

index ce93a34..e693ef7 100644 (file)
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -4,6 +4,8 @@
  #include <linux/kobject.h>
  #include <linux/list.h>
  
+#ifndef CONFIG_XEN
+
  struct msi_msg {
         u32     address_lo;     /* low 32 bits of msi message address */
         u32     address_hi;     /* high 32 bits of msi message address */
@@ -49,6 +51,11 @@ struct msi_desc {
         struct kobject kobj;
  };
  
+#else /* CONFIG_XEN */
+struct pci_dev;
+struct msi_desc;
+#endif /* CONFIG_XEN */
+
  /*
   * The arch hook for setup up msi irqs
   */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h

index 52a1bdb..5c82ac0 100644 (file)
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -233,6 +233,10 @@ struct nfs_inode {
  #define NFS_INO_PNFS_COMMIT    (8)             /* use pnfs code for commit */
  #define NFS_INO_LAYOUTCOMMIT   (9)             /* layoutcommit required */
  #define NFS_INO_LAYOUTCOMMITTING (10)          /* layoutcommit inflight */
+#define NFS_INO_SEEN_GETATTR   (11)            /* flag to track if app is calling
+                                                * getattr in a directory during
+                                                * readdir
+                                                */
  
  static inline struct nfs_inode *NFS_I(const struct inode *inode)
  {
diff --git a/include/linux/nmi.h b/include/linux/nmi.h

index db50840..1be4486 100644 (file)
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -18,6 +18,9 @@
  #include <asm/nmi.h>
  extern void touch_nmi_watchdog(void);
  #else
+#ifdef CONFIG_XEN
+#include <asm/nmi.h>
+#endif
  static inline void touch_nmi_watchdog(void)
  {
         touch_softlockup_watchdog();
diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h

index a4c5624..7a33436 100644 (file)
--- a/include/linux/oprofile.h
+++ b/include/linux/oprofile.h
@@ -19,6 +19,9 @@
  #include <linux/errno.h>
  #include <linux/printk.h>
  #include <linux/atomic.h>
+#ifdef CONFIG_XEN
+#include <xen/interface/xenoprof.h>
+#endif
   
  /* Each escaped entry is prefixed by ESCAPE_CODE
   * then one of the following codes, then the
@@ -31,14 +34,18 @@
  #define CPU_SWITCH_CODE                        2
  #define COOKIE_SWITCH_CODE             3
  #define KERNEL_ENTER_SWITCH_CODE       4
-#define KERNEL_EXIT_SWITCH_CODE                5
+#define USER_ENTER_SWITCH_CODE         5
  #define MODULE_LOADED_CODE             6
  #define CTX_TGID_CODE                  7
  #define TRACE_BEGIN_CODE               8
  #define TRACE_END_CODE                 9
  #define XEN_ENTER_SWITCH_CODE          10
+#ifndef CONFIG_XEN
  #define SPU_PROFILING_CODE             11
  #define SPU_CTX_SWITCH_CODE            12
+#else
+#define DOMAIN_SWITCH_CODE             11
+#endif
  #define IBS_FETCH_CODE                 13
  #define IBS_OP_CODE                    14
  
@@ -52,6 +59,12 @@ struct oprofile_operations {
         /* create any necessary configuration files in the oprofile fs.
          * Optional. */
         int (*create_files)(struct super_block * sb, struct dentry * root);
+#ifdef CONFIG_XEN
+       /* setup active domains with Xen */
+       int (*set_active)(int *active_domains, unsigned int adomains);
+       /* setup passive domains with Xen */
+       int (*set_passive)(int *passive_domains, unsigned int pdomains);
+#endif
         /* Do any necessary interrupt setup. Optional. */
         int (*setup)(void);
         /* Do any necessary interrupt shutdown. Optional. */
@@ -117,9 +130,14 @@ void oprofile_add_ext_hw_sample(unsigned long pc, struct pt_regs * const regs,
   * backtrace. */
  void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event);
  
+void oprofile_add_mode(int cpu_mode);
+
  /* add a backtrace entry, to be called from the ->backtrace callback */
  void oprofile_add_trace(unsigned long eip);
  
+/* add a domain switch entry */
+int oprofile_add_domain_switch(int32_t domain_id);
+
  
  /**
   * Create a file of the given name as a child of the given root, with
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h

index c88d2a9..03b50ad 100644 (file)
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -108,6 +108,11 @@ enum pageflags {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         PG_compound_lock,
  #endif
+#ifdef CONFIG_XEN
+       PG_foreign,             /* Page is owned by foreign allocator. */
+       /* PG_netback,             Page is owned by netback */
+       PG_blkback,             /* Page is owned by blkback */
+#endif
         __NR_PAGEFLAGS,
  
         /* Filesystems */
@@ -120,8 +125,15 @@ enum pageflags {
         PG_fscache = PG_private_2,      /* page backed by cache */
  
         /* XEN */
+#if defined(CONFIG_XEN)
+       PG_pinned = PG_locked,  /* Cannot alias with PG_owner_priv_1 since
+                                * bad_page() checks should include this bit.
+                                * Should not use PG_arch_1 as that may have
+                                * a different purpose elsewhere. */
+#elif defined(CONFIG_PARAVIRT_XEN)
         PG_pinned = PG_owner_priv_1,
         PG_savepinned = PG_dirty,
+#endif
  
         /* SLOB */
         PG_slob_free = PG_private,
@@ -203,8 +215,12 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
         TESTCLEARFLAG(Active, active)
  __PAGEFLAG(Slab, slab)
  PAGEFLAG(Checked, checked)             /* Used by some filesystems */
+#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN)
  PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)    /* Xen */
+#endif
+#ifdef CONFIG_PARAVIRT_XEN
  PAGEFLAG(SavePinned, savepinned);                      /* Xen */
+#endif
  PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
  PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
  
@@ -321,6 +337,28 @@ static inline void SetPageUptodate(struct page *page)
  
  CLEARPAGEFLAG(Uptodate, uptodate)
  
+#ifdef CONFIG_XEN
+TESTPAGEFLAG(Foreign, foreign)
+static inline void SetPageForeign(struct page *page,
+                                 void (*dtor)(struct page *, unsigned int))
+{
+       BUG_ON(!dtor);          /* a destructor is mandatory */
+       set_bit(PG_foreign, &page->flags);
+       page->index = (long)dtor;  /* read back by PageForeignDestructor() */
+}
+static inline void ClearPageForeign(struct page *page)
+{
+       clear_bit(PG_foreign, &page->flags);
+       page->index = 0;        /* forget the stored destructor */
+}
+static inline void PageForeignDestructor(struct page *page, unsigned int order)
+{
+       ((void (*)(struct page *, unsigned int))page->index)(page, order);
+}
+/*PAGEFLAG(Netback, netback)*/
+PAGEFLAG(Blkback, blkback)
+#endif
+
  extern void cancel_dirty_page(struct page *page, unsigned int account_size);
  
  int test_clear_page_writeback(struct page *page);
@@ -465,6 +503,12 @@ static inline int PageTransTail(struct page *page)
  #define __PG_COMPOUND_LOCK             0
  #endif
  
+#ifndef CONFIG_XEN
+# define __PG_XEN              0
+#else
+# define __PG_XEN              (1 << PG_foreign)
+#endif
+
  /*
   * Flags checked when a page is freed.  Pages being freed should not have
   * these flags set.  It they are, there is a problem.
@@ -475,7 +519,7 @@ static inline int PageTransTail(struct page *page)
          1 << PG_writeback | 1 << PG_reserved | \
          1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
          1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \
-        __PG_COMPOUND_LOCK)
+        __PG_COMPOUND_LOCK | __PG_XEN)
  
  /*
   * Flags checked when a page is prepped for return by the page allocator.
diff --git a/include/linux/pci.h b/include/linux/pci.h

index e444f5b..7dadc57 100644 (file)
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -820,6 +820,9 @@ void pci_update_resource(struct pci_dev *dev, int resno);
  int __must_check pci_assign_resource(struct pci_dev *dev, int i);
  int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
  int pci_select_bars(struct pci_dev *dev, unsigned long flags);
+#ifdef CONFIG_XEN
+void pci_restore_bars(struct pci_dev *);
+#endif
  
  /* ROM control related routines */
  int pci_enable_rom(struct pci_dev *pdev);
@@ -1058,6 +1061,10 @@ extern void pci_disable_msix(struct pci_dev *dev);
  extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
  extern void pci_restore_msi_state(struct pci_dev *dev);
  extern int pci_msi_enabled(void);
+#ifdef CONFIG_XEN
+extern int register_msi_get_owner(int (*func)(struct pci_dev *dev));
+extern int unregister_msi_get_owner(int (*func)(struct pci_dev *dev));
+#endif
  #endif
  
  #ifdef CONFIG_PCIEPORTBUS
@@ -1720,5 +1727,11 @@ static inline struct eeh_dev *pci_dev_to_eeh_dev(struct pci_dev *pdev)
   */
  struct pci_dev *pci_find_upstream_pcie_bridge(struct pci_dev *pdev);
  
+#ifdef CONFIG_PCI_GUESTDEV
+int pci_is_guestdev(struct pci_dev *dev);
+#else
+#define pci_is_guestdev(dev)   0
+#endif
+
  #endif /* __KERNEL__ */
  #endif /* LINUX_PCI_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h

index 0525927..6d3776d 100644 (file)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -162,23 +162,42 @@ extern void dump_stack(void) __cold;
  #define pr_fmt(fmt) fmt
  #endif
  
+#if defined(__KMSG_CHECKER) && defined(KMSG_COMPONENT)
+
+/* generate magic string for scripts/kmsg-doc to parse */
+#define pr_printk_hash(level, format, ...) \
+       __KMSG_PRINT(level _FMT_ format _ARGS_ #__VA_ARGS__ _END_)
+
+#elif defined(CONFIG_KMSG_IDS) && defined(KMSG_COMPONENT)
+
+int printk_hash(const char *, const char *, ...);
+#define pr_printk_hash(level, format, ...) \
+       printk_hash(level KMSG_COMPONENT ".%06x" ": ", format, ##__VA_ARGS__)
+
+#else /* !defined(CONFIG_KMSG_IDS) */
+
+#define pr_printk_hash(level, format, ...) \
+       printk(level pr_fmt(format), ##__VA_ARGS__)
+
+#endif
+
  #define pr_emerg(fmt, ...) \
-       printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_EMERG, fmt, ##__VA_ARGS__)
  #define pr_alert(fmt, ...) \
-       printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_ALERT, fmt, ##__VA_ARGS__)
  #define pr_crit(fmt, ...) \
-       printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_CRIT, fmt, ##__VA_ARGS__)
  #define pr_err(fmt, ...) \
-       printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_ERR, fmt, ##__VA_ARGS__)
  #define pr_warning(fmt, ...) \
-       printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_WARNING, fmt, ##__VA_ARGS__)
  #define pr_warn pr_warning
  #define pr_notice(fmt, ...) \
-       printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_NOTICE, fmt, ##__VA_ARGS__)
  #define pr_info(fmt, ...) \
-       printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+       pr_printk_hash(KERN_INFO, fmt, ##__VA_ARGS__)
  #define pr_cont(fmt, ...) \
-       printk(KERN_CONT fmt, ##__VA_ARGS__)
+       pr_printk_hash(KERN_CONT, fmt, ##__VA_ARGS__)
  
  /* pr_devel() should produce zero code unless DEBUG is defined */
  #ifdef DEBUG
diff --git a/include/linux/richacl.h b/include/linux/richacl.h

new file mode 100644 (file)

index 0000000..3da00a2
--- /dev/null
+++ b/include/linux/richacl.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright (C) 2006, 2010  Novell, Inc.
+ * Written by Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __RICHACL_H
+#define __RICHACL_H
+#include <linux/slab.h>
+
+struct richace {
+       unsigned short  e_type;         /* ACE4_*_ACE_TYPE value */
+       unsigned short  e_flags;        /* e_flags bitflags, incl. in-memory ACE4_SPECIAL_WHO */
+       unsigned int    e_mask;         /* e_mask bitflags (ACE4_* permissions) */
+       union {
+               unsigned int    e_id;
+               const char      *e_who; /* compared by pointer when ACE4_SPECIAL_WHO is set */
+       } u;
+};
+
+struct richacl {
+       atomic_t        a_refcount;     /* changed by richacl_get() / richacl_put() */
+       unsigned int    a_owner_mask;
+       unsigned int    a_group_mask;
+       unsigned int    a_other_mask;
+       unsigned short  a_count;        /* number of entries in a_entries[] */
+       unsigned short  a_flags;        /* ACL4_* a_flags values */
+       struct richace  a_entries[0];   /* trailing array, walked by richacl_for_each_entry() */
+};
+
+#define richacl_for_each_entry(_ace, _acl) /* first to last; _acl is evaluated more than once */ \
+       for ((_ace) = (_acl)->a_entries; \
+            (_ace) != (_acl)->a_entries + (_acl)->a_count; \
+            (_ace)++)
+
+#define richacl_for_each_entry_reverse(_ace, _acl) /* last to first; _acl is evaluated more than once */ \
+       for ((_ace) = (_acl)->a_entries + (_acl)->a_count - 1; \
+            (_ace) != (_acl)->a_entries - 1; \
+            (_ace)--)
+
+/* a_flags values */
+#define ACL4_AUTO_INHERIT              0x01
+#define ACL4_PROTECTED                 0x02
+/*#define ACL4_DEFAULTED                       0x04*/
+
+#define ACL4_VALID_FLAGS (     \
+       ACL4_AUTO_INHERIT |     \
+       ACL4_PROTECTED)
+
+/* e_type values */
+#define ACE4_ACCESS_ALLOWED_ACE_TYPE   0x0000
+#define ACE4_ACCESS_DENIED_ACE_TYPE    0x0001
+/*#define ACE4_SYSTEM_AUDIT_ACE_TYPE   0x0002*/
+/*#define ACE4_SYSTEM_ALARM_ACE_TYPE   0x0003*/
+
+/* e_flags bitflags */
+#define ACE4_FILE_INHERIT_ACE          0x0001
+#define ACE4_DIRECTORY_INHERIT_ACE     0x0002
+#define ACE4_NO_PROPAGATE_INHERIT_ACE  0x0004
+#define ACE4_INHERIT_ONLY_ACE          0x0008
+/*#define ACE4_SUCCESSFUL_ACCESS_ACE_FLAG      0x0010*/
+/*#define ACE4_FAILED_ACCESS_ACE_FLAG  0x0020*/
+#define ACE4_IDENTIFIER_GROUP          0x0040
+#define ACE4_INHERITED_ACE             0x0080
+/* in-memory representation only */
+#define ACE4_SPECIAL_WHO               0x4000
+
+#define ACE4_VALID_FLAGS (                     \
+       ACE4_FILE_INHERIT_ACE |                 \
+       ACE4_DIRECTORY_INHERIT_ACE |            \
+       ACE4_NO_PROPAGATE_INHERIT_ACE |         \
+       ACE4_INHERIT_ONLY_ACE |                 \
+       ACE4_IDENTIFIER_GROUP |                 \
+       ACE4_INHERITED_ACE)
+
+/* e_mask bitflags */
+#define ACE4_READ_DATA                 0x00000001
+#define ACE4_LIST_DIRECTORY            0x00000001
+#define ACE4_WRITE_DATA                        0x00000002
+#define ACE4_ADD_FILE                  0x00000002
+#define ACE4_APPEND_DATA               0x00000004
+#define ACE4_ADD_SUBDIRECTORY          0x00000004
+#define ACE4_READ_NAMED_ATTRS          0x00000008
+#define ACE4_WRITE_NAMED_ATTRS         0x00000010
+#define ACE4_EXECUTE                   0x00000020
+#define ACE4_DELETE_CHILD              0x00000040
+#define ACE4_READ_ATTRIBUTES           0x00000080
+#define ACE4_WRITE_ATTRIBUTES          0x00000100
+#define ACE4_WRITE_RETENTION           0x00000200
+#define ACE4_WRITE_RETENTION_HOLD      0x00000400
+#define ACE4_DELETE                    0x00010000
+#define ACE4_READ_ACL                  0x00020000
+#define ACE4_WRITE_ACL                 0x00040000
+#define ACE4_WRITE_OWNER               0x00080000
+#define ACE4_SYNCHRONIZE               0x00100000
+
+/* Valid ACE4_* flags for directories and non-directories */
+#define ACE4_VALID_MASK (                              \
+       ACE4_READ_DATA | ACE4_LIST_DIRECTORY |          \
+       ACE4_WRITE_DATA | ACE4_ADD_FILE |               \
+       ACE4_APPEND_DATA | ACE4_ADD_SUBDIRECTORY |      \
+       ACE4_READ_NAMED_ATTRS |                         \
+       ACE4_WRITE_NAMED_ATTRS |                        \
+       ACE4_EXECUTE |                                  \
+       ACE4_DELETE_CHILD |                             \
+       ACE4_READ_ATTRIBUTES |                          \
+       ACE4_WRITE_ATTRIBUTES |                         \
+       ACE4_WRITE_RETENTION |                          \
+       ACE4_WRITE_RETENTION_HOLD |                     \
+       ACE4_DELETE |                                   \
+       ACE4_READ_ACL |                                 \
+       ACE4_WRITE_ACL |                                \
+       ACE4_WRITE_OWNER |                              \
+       ACE4_SYNCHRONIZE)
+
+/*
+ * The POSIX permissions are supersets of the following NFSv4 permissions:
+ *
+ *  - MAY_READ maps to ACE4_READ_DATA or ACE4_LIST_DIRECTORY, depending on
+ *    the type of the file system object.
+ *
+ *  - MAY_WRITE maps to ACE4_WRITE_DATA or ACE4_APPEND_DATA for files, and to
+ *    ACE4_ADD_FILE, ACE4_ADD_SUBDIRECTORY, or ACE4_DELETE_CHILD for directories.
+ *
+ *  - MAY_EXEC maps to ACE4_EXECUTE.
+ *
+ *  (Some of these NFSv4 permissions have the same bit values.)
+ */
+#define ACE4_POSIX_MODE_READ ( \
+       ACE4_READ_DATA | ACE4_LIST_DIRECTORY)
+#define ACE4_POSIX_MODE_WRITE ( \
+       ACE4_WRITE_DATA | ACE4_ADD_FILE | \
+       ACE4_APPEND_DATA | ACE4_ADD_SUBDIRECTORY | \
+       ACE4_DELETE_CHILD)
+#define ACE4_POSIX_MODE_EXEC ( \
+       ACE4_EXECUTE)
+#define ACE4_POSIX_MODE_ALL (ACE4_POSIX_MODE_READ | ACE4_POSIX_MODE_WRITE | \
+                            ACE4_POSIX_MODE_EXEC)
+
+/* These permissions are always allowed no matter what the acl says. */
+#define ACE4_POSIX_ALWAYS_ALLOWED (    \
+       ACE4_SYNCHRONIZE |              \
+       ACE4_READ_ATTRIBUTES |          \
+       ACE4_READ_ACL)
+
+/**
+ * richacl_get  -  grab another reference to a richacl handle
+ */
+static inline struct richacl *
+richacl_get(struct richacl *acl)
+{
+       if (acl)
+               atomic_inc(&acl->a_refcount);
+       return acl;
+}
+
+/**
+ * richacl_put  -  drop a reference to @acl (NULL is ignored); kfree() it on the last put
+ */
+static inline void
+richacl_put(struct richacl *acl)
+{
+       if (acl && atomic_dec_and_test(&acl->a_refcount))
+               kfree(acl);
+}
+
+static inline int
+richacl_is_auto_inherit(const struct richacl *acl)
+{
+       return acl->a_flags & ACL4_AUTO_INHERIT; /* non-zero if set; not normalized to 1 */
+}
+
+static inline int
+richacl_is_protected(const struct richacl *acl)
+{
+       return acl->a_flags & ACL4_PROTECTED; /* non-zero if set; not normalized to 1 */
+}
+
+/*
+ * Special e_who identifiers: we use these pointer values in comparisons
+ * instead of doing a strcmp.
+ */
+extern const char richace_owner_who[];
+extern const char richace_group_who[];
+extern const char richace_everyone_who[];
+
+/**
+ * richace_is_owner  -  check if @ace is an OWNER@ entry
+ */
+static inline int
+richace_is_owner(const struct richace *ace)
+{
+       return (ace->e_flags & ACE4_SPECIAL_WHO) &&
+              ace->u.e_who == richace_owner_who;
+}
+
+/**
+ * richace_is_group  -  check if @ace is a GROUP@ entry
+ */
+static inline int
+richace_is_group(const struct richace *ace)
+{
+       return (ace->e_flags & ACE4_SPECIAL_WHO) &&
+              ace->u.e_who == richace_group_who;
+}
+
+/**
+ * richace_is_everyone  -  check if @ace is an EVERYONE@ entry
+ */
+static inline int
+richace_is_everyone(const struct richace *ace)
+{
+       return (ace->e_flags & ACE4_SPECIAL_WHO) &&
+              ace->u.e_who == richace_everyone_who;
+}
+
+/**
+ * richace_is_unix_id  -  check if @ace applies to a specific uid or gid
+ */
+static inline int
+richace_is_unix_id(const struct richace *ace)
+{
+       return !(ace->e_flags & ACE4_SPECIAL_WHO);
+}
+
+/**
+ * richace_is_inherit_only  -  check if @ace is for inheritance only
+ *
+ * ACEs with the %ACE4_INHERIT_ONLY_ACE flag set have no effect during
+ * permission checking.
+ */
+static inline int
+richace_is_inherit_only(const struct richace *ace)
+{
+       return ace->e_flags & ACE4_INHERIT_ONLY_ACE;
+}
+
+/**
+ * richace_is_inheritable  -  check if @ace is inheritable
+ */
+static inline int
+richace_is_inheritable(const struct richace *ace)
+{
+       return ace->e_flags & (ACE4_FILE_INHERIT_ACE |
+                              ACE4_DIRECTORY_INHERIT_ACE);
+}
+
+/**
+ * richace_clear_inheritance_flags  - clear all inheritance flags in @ace
+ */
+static inline void
+richace_clear_inheritance_flags(struct richace *ace)
+{
+       ace->e_flags &= ~(ACE4_FILE_INHERIT_ACE |
+                         ACE4_DIRECTORY_INHERIT_ACE |
+                         ACE4_NO_PROPAGATE_INHERIT_ACE |
+                         ACE4_INHERIT_ONLY_ACE);
+}
+
+/**
+ * richace_is_allow  -  check if @ace is an %ALLOW type entry
+ */
+static inline int
+richace_is_allow(const struct richace *ace)
+{
+       return ace->e_type == ACE4_ACCESS_ALLOWED_ACE_TYPE;
+}
+
+/**
+ * richace_is_deny  -  check if @ace is a %DENY type entry
+ */
+static inline int
+richace_is_deny(const struct richace *ace)
+{
+       return ace->e_type == ACE4_ACCESS_DENIED_ACE_TYPE;
+}
+
+extern struct richacl *richacl_alloc(int);
+extern int richace_is_same_identifier(const struct richace *,
+                                     const struct richace *);
+extern int richace_set_who(struct richace *, const char *);
+extern int richacl_masks_to_mode(const struct richacl *);
+extern unsigned int richacl_mode_to_mask(mode_t);
+extern unsigned int richacl_want_to_mask(int);
+extern void richacl_compute_max_masks(struct richacl *);
+extern struct richacl *richacl_chmod(struct richacl *, mode_t);
+extern int richacl_permission(struct inode *, const struct richacl *,
+                             unsigned int);
+extern struct richacl *richacl_inherit(const struct richacl *, struct inode *);
+extern int richacl_equiv_mode(const struct richacl *, mode_t *);
+
+/* richacl_inode.c */
+
+#ifdef CONFIG_FS_RICHACL
+extern int richacl_may_create(struct inode *, int,
+                             int (*)(struct inode *, unsigned int));
+extern int richacl_may_delete(struct inode *, struct inode *, int,
+                             int (*)(struct inode *, unsigned int));
+extern int richacl_inode_permission(struct inode *, const struct richacl *,
+                                   unsigned int);
+extern int richacl_inode_change_ok(struct inode *, struct iattr *,
+                                  int (*)(struct inode *, unsigned int));
+#else
+static inline int
+richacl_inode_change_ok(struct inode *inode, struct iattr *attr,
+                       int (*richacl_permission)(struct inode *inode,
+                                                 unsigned int mask))
+{
+       return -EPERM;
+}
+#endif
+
+#endif /* __RICHACL_H */
diff --git a/include/linux/richacl_xattr.h b/include/linux/richacl_xattr.h

new file mode 100644 (file)

index 0000000..e038a7c
--- /dev/null
+++ b/include/linux/richacl_xattr.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2006, 2010  Novell, Inc.
+ * Written by Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __RICHACL_XATTR_H
+#define __RICHACL_XATTR_H
+
+#include <linux/richacl.h>
+
+#define RICHACL_XATTR "system.richacl"
+
+struct richace_xattr {
+       __le16          e_type;
+       __le16          e_flags;
+       __le32          e_mask;
+       __le32          e_id;
+       char            e_who[0];
+};
+
+struct richacl_xattr {
+       unsigned char   a_version;
+       unsigned char   a_flags;
+       __le16          a_count;
+       __le32          a_owner_mask;
+       __le32          a_group_mask;
+       __le32          a_other_mask;
+};
+
+#define ACL4_XATTR_VERSION     0
+#define ACL4_XATTR_MAX_COUNT   1024
+
+extern struct richacl *richacl_from_xattr(const void *, size_t);
+extern size_t richacl_xattr_size(const struct richacl *acl);
+extern void richacl_to_xattr(const struct richacl *, void *);
+
+#endif /* __RICHACL_XATTR_H */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h

index dc0c3cc..4c99cae 100644 (file)
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -240,6 +240,7 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
                                         bool (*)(struct rpc_task *, void *),
                                         void *);
  void           rpc_wake_up_status(struct rpc_wait_queue *, int);
+void           rpc_wake_up_softconn_status(struct rpc_wait_queue *, int);
  int            rpc_queue_empty(struct rpc_wait_queue *);
  void           rpc_delay(struct rpc_task *, unsigned long);
  void *         rpc_malloc(struct rpc_task *, size_t);
diff --git a/include/linux/swap.h b/include/linux/swap.h

index b1fd5c7..50a55e2 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,10 @@ struct swap_info_struct {
         struct block_device *bdev;      /* swap device or bdev of swap file */
         struct file *swap_file;         /* seldom referenced */
         unsigned int old_block_size;    /* seldom referenced */
+#ifdef CONFIG_FRONTSWAP
+       unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
+       atomic_t frontswap_pages;       /* frontswap pages in-use counter */
+#endif
  };
  
  struct swap_list_t {
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h

new file mode 100644 (file)

index 0000000..e282624
--- /dev/null
+++ b/include/linux/swapfile.h
@@ -0,0 +1,13 @@
+#ifndef _LINUX_SWAPFILE_H
+#define _LINUX_SWAPFILE_H
+
+/*
+ * these were static in swapfile.c but frontswap.c needs them and we don't
+ * want to expose them to the dozens of source files that include swap.h
+ */
+extern spinlock_t swap_lock;
+extern struct swap_list_t swap_list;
+extern struct swap_info_struct *swap_info[];
+extern int try_to_unuse(unsigned int, bool, unsigned long);
+
+#endif /* _LINUX_SWAPFILE_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index c34b4c8..b12e7bd 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -59,6 +59,7 @@ enum
         CTL_BUS=8,              /* Busses */
         CTL_ABI=9,              /* Binary emulation */
         CTL_CPU=10,             /* CPU stuff (speed scaling, etc) */
+       CTL_XEN=123,            /* Xen info and control */
         CTL_ARLAN=254,          /* arlan wireless driver */
         CTL_S390DBF=5677,       /* s390 debug */
         CTL_SUNRPC=7249,        /* sunrpc debug */
diff --git a/include/linux/timex.h b/include/linux/timex.h

index 99bc88b..d338962 100644 (file)
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -236,6 +236,9 @@ extern unsigned long tick_nsec;             /* ACTHZ          period (nsec) */
  
  extern void ntp_init(void);
  extern void ntp_clear(void);
+#ifdef CONFIG_XEN
+extern int ntp_synced(void);
+#endif
  
  /* Required to safely shift negative values */
  #define shift_right(x, s) ({   \
diff --git a/include/linux/unwind.h b/include/linux/unwind.h

new file mode 100644 (file)

index 0000000..64cf103
--- /dev/null
+++ b/include/linux/unwind.h
@@ -0,0 +1,135 @@
+#ifndef _LINUX_UNWIND_H
+#define _LINUX_UNWIND_H
+
+/*
+ * Copyright (C) 2002-2009 Novell, Inc.
+ *     Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/linkage.h>
+
+struct module;
+struct stacktrace_ops;
+struct unwind_frame_info;
+
+typedef asmlinkage int (*unwind_callback_fn)(struct unwind_frame_info *,
+                                            const struct stacktrace_ops *,
+                                            void *);
+
+#ifdef CONFIG_STACK_UNWIND
+
+#include <asm/unwind.h>
+#include <asm/stacktrace.h>
+
+#ifndef ARCH_UNWIND_SECTION_NAME
+#define ARCH_UNWIND_SECTION_NAME ".eh_frame"
+#endif
+
+/*
+ * Initialize unwind support.
+ */
+extern void unwind_init(void);
+extern void unwind_setup(void);
+
+#ifdef CONFIG_MODULES
+
+extern void *unwind_add_table(struct module *,
+                              const void *table_start,
+                              unsigned long table_size);
+
+extern void unwind_remove_table(void *handle, int init_only);
+
+#endif
+
+extern int unwind_init_frame_info(struct unwind_frame_info *,
+                                  struct task_struct *,
+                                  /*const*/ struct pt_regs *);
+
+/*
+ * Prepare to unwind a blocked task.
+ */
+extern int unwind_init_blocked(struct unwind_frame_info *,
+                               struct task_struct *);
+
+/*
+ * Prepare to unwind the currently running thread.
+ */
+extern int unwind_init_running(struct unwind_frame_info *,
+                              unwind_callback_fn,
+                              const struct stacktrace_ops *,
+                               void *data);
+
+/*
+ * Unwind to the previous frame.  Returns 0 if successful, a negative
+ * number in case of an error.
+ */
+extern int unwind(struct unwind_frame_info *);
+
+/*
+ * Unwind until the return pointer is in user-land (or until an error
+ * occurs).  Returns 0 if successful, negative number in case of
+ * error.
+ */
+extern int unwind_to_user(struct unwind_frame_info *);
+
+#else /* CONFIG_STACK_UNWIND */
+
+struct unwind_frame_info {};
+
+static inline void unwind_init(void) {}
+static inline void unwind_setup(void) {}
+
+#ifdef CONFIG_MODULES
+
+static inline void *unwind_add_table(struct module *mod,
+                                     const void *table_start,
+                                     unsigned long table_size)
+{
+       return NULL;
+}
+
+#endif
+
+static inline void unwind_remove_table(void *handle, int init_only)
+{
+}
+
+static inline int unwind_init_frame_info(struct unwind_frame_info *info,
+                                         struct task_struct *tsk,
+                                         const struct pt_regs *regs)
+{
+       return -ENOSYS;
+}
+
+static inline int unwind_init_blocked(struct unwind_frame_info *info,
+                                      struct task_struct *tsk)
+{
+       return -ENOSYS;
+}
+
+static inline int unwind_init_running(struct unwind_frame_info *info,
+                              unwind_callback_fn cb,
+                              const struct stacktrace_ops *ops,
+                                      void *data)
+{
+       return -ENOSYS;
+}
+
+static inline int unwind(struct unwind_frame_info *info)
+{
+       return -ENOSYS;
+}
+
+static inline int unwind_to_user(struct unwind_frame_info *info)
+{
+       return -ENOSYS;
+}
+
+#endif /* CONFIG_STACK_UNWIND */
+#endif /* _LINUX_UNWIND_H */
diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h

index 6f8fbcf..faacdbc 100644 (file)
--- a/include/linux/vermagic.h
+++ b/include/linux/vermagic.h
@@ -21,6 +21,11 @@
  #else
  #define MODULE_VERMAGIC_MODVERSIONS ""
  #endif
+#ifdef CONFIG_XEN
+#define MODULE_VERMAGIC_XEN "Xen "      /* extra vermagic token for CONFIG_XEN builds */
+#else
+#define MODULE_VERMAGIC_XEN ""          /* empty string, like MODULE_VERMAGIC_MODVERSIONS */
+#endif
  #ifndef MODULE_ARCH_VERMAGIC
  #define MODULE_ARCH_VERMAGIC ""
  #endif
@@ -29,5 +34,5 @@
         UTS_RELEASE " "                                                 \
         MODULE_VERMAGIC_SMP MODULE_VERMAGIC_PREEMPT                     \
         MODULE_VERMAGIC_MODULE_UNLOAD MODULE_VERMAGIC_MODVERSIONS       \
-       MODULE_ARCH_VERMAGIC
+       MODULE_VERMAGIC_XEN MODULE_ARCH_VERMAGIC
  
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h

index 6efb2e1..f421efc 100644 (file)
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -304,6 +304,7 @@ extern void starget_for_each_device(struct scsi_target *, void *,
  extern void __starget_for_each_device(struct scsi_target *, void *,
                                       void (*fn)(struct scsi_device *,
                                                  void *));
+extern struct scsi_device *scsi_device_from_queue(struct request_queue *);
  
  /* only exposed to implement shost_for_each_device */
  extern struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *,
diff --git a/include/scsi/scsi_netlink.h b/include/scsi/scsi_netlink.h

index 5cb20cc..1291017 100644 (file)
--- a/include/scsi/scsi_netlink.h
+++ b/include/scsi/scsi_netlink.h
@@ -35,7 +35,8 @@
  /* SCSI Transport Broadcast Groups */
         /* leaving groups 0 and 1 unassigned */
  #define SCSI_NL_GRP_FC_EVENTS          (1<<2)          /* Group 2 */
-#define SCSI_NL_GRP_CNT                        3
+#define SCSI_NL_GRP_ML_EVENTS          (1<<3)          /* Group 3 */
+#define SCSI_NL_GRP_CNT                        4
  
  
  /* SCSI_TRANSPORT_MSG event message header */
@@ -56,7 +57,8 @@ struct scsi_nl_hdr {
  /* scsi_nl_hdr->transport value */
  #define SCSI_NL_TRANSPORT                      0
  #define SCSI_NL_TRANSPORT_FC                   1
-#define SCSI_NL_MAX_TRANSPORTS                 2
+#define SCSI_NL_TRANSPORT_ML                   2
+#define SCSI_NL_MAX_TRANSPORTS                 3
  
  /* Transport-based scsi_nl_hdr->msgtype values are defined in each transport */
  
diff --git a/include/scsi/scsi_netlink_ml.h b/include/scsi/scsi_netlink_ml.h

new file mode 100644 (file)

index 0000000..c988458
--- /dev/null
+++ b/include/scsi/scsi_netlink_ml.h
@@ -0,0 +1,64 @@
+/*
+ *  SCSI Midlayer Netlink Interface
+ *
+ *  Copyright (C) 2008 Hannes Reinecke, SuSE Linux Products GmbH
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#ifndef SCSI_NETLINK_ML_H
+#define SCSI_NETLINK_ML_H
+
+#include <scsi/scsi_netlink.h>
+
+/*
+ * This file is intended to be included by both kernel and user space
+ */
+
+/*
+ * SCSI Midlayer (ML) Message Types
+ */
+       /* kernel -> user */
+#define ML_NL_SCSI_SENSE                       0x0100
+       /* user -> kernel */
+/* none */
+
+
+/*
+ * Message Structures :
+ */
+
+/* macro to round up message lengths to 8byte boundary */
+#define SCSI_NL_MSGALIGN(len)          (((len) + 7) & ~7)
+
+
+/*
+ * SCSI Midlayer SCSI Sense messages :
+ *   ML_NL_SCSI_SENSE
+ * Only <stdint.h>-style fixed-width types, so user space can include this too.
+ */
+struct scsi_nl_sense_msg {
+       struct scsi_nl_hdr snlh;                /* must be 1st element ! */
+       uint64_t seconds;
+       uint64_t id;
+       uint64_t lun;
+       uint16_t host_no;
+       uint16_t channel;
+       uint32_t sense;
+} __attribute__((aligned(sizeof(uint64_t))));
+
+
+#endif /* SCSI_NETLINK_ML_H */
+
diff --git a/include/xen/Kbuild b/include/xen/Kbuild

index 84ad8f0..e4a826b 100644 (file)
--- a/include/xen/Kbuild
+++ b/include/xen/Kbuild
@@ -1,2 +1 @@
-header-y += evtchn.h
-header-y += privcmd.h
+header-y += public/
diff --git a/include/xen/balloon.h b/include/xen/balloon.h

index cc2e1a7..9958b0c 100644 (file)
--- a/include/xen/balloon.h
+++ b/include/xen/balloon.h
@@ -1,7 +1,68 @@
  /******************************************************************************
- * Xen balloon functionality
+ * balloon.h
+ *
+ * Xen balloon driver - enables returning/claiming memory to/from Xen.
+ *
+ * Copyright (c) 2003, B Dragovic
+ * Copyright (c) 2003-2004, M Williamson, K Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
   */
  
+#ifndef __XEN_BALLOON_H__
+#define __XEN_BALLOON_H__
+
+#include <linux/spinlock.h>
+
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/*
+ * Inform the balloon driver that it should allow some slop for device-driver
+ * memory activities.
+ */
+void balloon_update_driver_allowance(long delta);
+
+/* Allocate/free a set of empty pages in low memory (i.e., no RAM mapped). */
+struct page **alloc_empty_pages_and_pagevec(int nr_pages);
+void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages);
+
+/* Free an empty page range (not allocated through
+   alloc_empty_pages_and_pagevec), adding to the balloon. */
+void free_empty_pages(struct page **pagevec, int nr_pages);
+
+void balloon_release_driver_page(struct page *page);
+
+/*
+ * Prevent the balloon driver from changing the memory reservation during
+ * a driver critical region.
+ */
+extern spinlock_t balloon_lock;
+#define balloon_lock(__flags)   spin_lock_irqsave(&balloon_lock, __flags)
+#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags)
+
+#else /* CONFIG_PARAVIRT_XEN */
+
  #define RETRY_UNLIMITED        0
  
  struct balloon_stats {
@@ -29,6 +90,8 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages,
                 bool highmem);
  void free_xenballooned_pages(int nr_pages, struct page **pages);
  
+#endif /* CONFIG_PARAVIRT_XEN */
+
  struct device;
  #ifdef CONFIG_XEN_SELFBALLOONING
  extern int register_xen_selfballooning(struct device *dev);
@@ -38,3 +101,5 @@ static inline int register_xen_selfballooning(struct device *dev)
         return -ENOSYS;
  }
  #endif
+
+#endif /* __XEN_BALLOON_H__ */
diff --git a/include/xen/blkif.h b/include/xen/blkif.h

new file mode 100644 (file)

index 0000000..af6055d
--- /dev/null
+++ b/include/xen/blkif.h
@@ -0,0 +1,160 @@
+/* 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/* Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.  */
+struct blkif_common_request {
+       char dummy;
+};
+struct blkif_common_response {
+       char dummy;
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {
+       uint8_t        operation;    /* BLKIF_OP_???                         */
+       uint8_t        nr_segments;  /* number of segments                   */
+       blkif_vdev_t   handle;       /* only for read/write requests         */
+       uint64_t       id;           /* private guest value, echoed in resp  */
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_32_discard {
+       uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
+       uint8_t        flag;         /* BLKIF_DISCARD_*                      */
+       blkif_vdev_t   handle;       /* same as for read/write requests      */
+       uint64_t       id;           /* private guest value, echoed in resp  */
+       blkif_sector_t sector_number;/* start sector idx on disk             */
+       uint64_t       nr_sectors;   /* number of contiguous sectors         */
+};
+struct blkif_x86_32_response {
+       uint64_t        id;              /* copied from request */
+       uint8_t         operation;       /* copied from request */
+       int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_discard blkif_x86_32_discard_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {
+       uint8_t        operation;    /* BLKIF_OP_???                         */
+       uint8_t        nr_segments;  /* number of segments                   */
+       blkif_vdev_t   handle;       /* only for read/write requests         */
+       uint64_t       __attribute__((__aligned__(8))) id;
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+struct blkif_x86_64_discard {
+       uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
+       uint8_t        flag;         /* BLKIF_DISCARD_*                      */
+       blkif_vdev_t   handle;       /* same as for read/write requests      */
+       uint64_t       __attribute__((__aligned__(8))) id;
+       blkif_sector_t sector_number;/* start sector idx on disk             */
+       uint64_t       nr_sectors;   /* number of contiguous sectors         */
+};
+struct blkif_x86_64_response {
+       uint64_t       __attribute__((__aligned__(8))) id;
+       uint8_t         operation;       /* copied from request */
+       int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_discard blkif_x86_64_discard_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+#define blkif_native_sring blkif_sring
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {
+       blkif_back_ring_t        native;
+       blkif_common_back_ring_t common;
+       blkif_x86_32_back_ring_t x86_32;
+       blkif_x86_64_back_ring_t x86_64;
+};
+typedef union blkif_back_rings blkif_back_rings_t;
+
+enum blkif_protocol {
+       BLKIF_PROTOCOL_NATIVE = 1,
+       BLKIF_PROTOCOL_X86_32 = 2,
+       BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
+{
+       int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; /* n: cap on how many segments get copied */
+       dst->operation = src->operation;
+       dst->nr_segments = src->nr_segments;
+       dst->handle = src->handle;
+       dst->id = src->id;
+       dst->sector_number = src->sector_number;
+       barrier(); /* compiler barrier: the checks below use the copy in dst */
+       if (unlikely(dst->operation == BLKIF_OP_DISCARD)) {
+               blkif_request_discard_t *d = (void *)dst;
+               const blkif_x86_32_discard_t *s = (const void *)src;
+
+               /* We should be doing "d->flag = s->flag;" but the
+                * copying of nr_segments does it for us already. */
+               d->nr_sectors = s->nr_sectors;
+               return;
+       }
+       if (n > dst->nr_segments)
+               n = dst->nr_segments;
+       for (i = 0; i < n; i++)
+               dst->seg[i] = src->seg[i];
+}
+
+static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
+{
+       int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; /* n: cap on how many segments get copied */
+       dst->operation = src->operation;
+       dst->nr_segments = src->nr_segments;
+       dst->handle = src->handle;
+       dst->id = src->id;
+       dst->sector_number = src->sector_number;
+       barrier(); /* compiler barrier: the checks below use the copy in dst */
+       if (unlikely(dst->operation == BLKIF_OP_DISCARD)) {
+               blkif_request_discard_t *d = (void *)dst;
+               const blkif_x86_64_discard_t *s = (const void *)src;
+
+               /* We should be doing "d->flag = s->flag" but the
+                * copying of nr_segments does it for us already. */
+               d->nr_sectors = s->nr_sectors;
+               return;
+       }
+       if (n > dst->nr_segments)
+               n = dst->nr_segments;
+       for (i = 0; i < n; i++)
+               dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/include/xen/clock.h b/include/xen/clock.h

new file mode 100644 (file)

index 0000000..e45552f
--- /dev/null
+++ b/include/xen/clock.h
@@ -0,0 +1,18 @@
+#ifndef __XEN_CPU_CLOCK_H__
+#define __XEN_CPU_CLOCK_H__
+
+void setup_runstate_area(unsigned int cpu);
+
+unsigned long long xen_local_clock(void);
+void xen_check_wallclock_update(void);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+void xen_clockevents_init(void);
+void xen_setup_cpu_clockevents(void);
+void xen_clockevents_resume(void);
+#else
+static inline void xen_setup_cpu_clockevents(void) {}
+static inline void xen_clockevents_resume(void) {}
+#endif
+
+#endif /* __XEN_CPU_CLOCK_H__ */
diff --git a/include/xen/compat_ioctl.h b/include/xen/compat_ioctl.h

new file mode 100644 (file)

index 0000000..975afb6
--- /dev/null
+++ b/include/xen/compat_ioctl.h
@@ -0,0 +1,75 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Jimi Xenidis <jimix@watson.ibm.com>
+ *          Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __LINUX_XEN_COMPAT_H__ 
+#define __LINUX_XEN_COMPAT_H__ 
+
+#include <linux/compat.h>
+#include <linux/compiler.h>
+
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#define xen_pfn32_t __u32
+#endif
+
+extern int privcmd_ioctl_32(int fd, unsigned int cmd, void __user *arg);
+/* Argument layout whose size is encoded in IOCTL_PRIVCMD_MMAP_32. */
+struct privcmd_mmap_32 {
+       int num;
+       domid_t dom;
+       compat_uptr_t entry; /* user pointer in its compat (32-bit) form */
+};
+
+/*
+ * Argument layout whose size is encoded in IOCTL_PRIVCMD_MMAPBATCH_32.
+ * NOTE(review): on x86/ia64 the packed 64-bit addr is overlaid with a __u32,
+ * presumably so the struct matches a 32-bit caller's layout - confirm.
+ */
+struct privcmd_mmapbatch_32 {
+       int num;     /* number of pages to populate */
+       domid_t dom; /* target domain */
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+       union {      /* virtual address */
+               __u64 addr __attribute__((packed));
+               __u32 va; /* ensures union is 4-byte aligned */
+       };
+#else
+       __u64 addr;  /* virtual address */
+#endif
+       compat_uptr_t arr; /* array of mfns - top nibble set on err */
+};
+
+/*
+ * Argument layout whose size is encoded in IOCTL_PRIVCMD_MMAPBATCH_V2_32.
+ * Compared with struct privcmd_mmapbatch_32, num is unsigned and error codes
+ * go to a separate err array.
+ * NOTE(review): on x86/ia64 the packed 64-bit addr is overlaid with a __u32,
+ * presumably so the struct matches a 32-bit caller's layout - confirm.
+ */
+struct privcmd_mmapbatch_v2_32 {
+       unsigned int num; /* number of pages to populate */
+       domid_t dom;      /* target domain */
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+       union {      /* virtual address */
+               __u64 addr __attribute__((packed));
+               __u32 va; /* ensures union is 4-byte aligned */
+       };
+#else
+       __u64 addr;  /* virtual address */
+#endif
+       compat_uptr_t arr; /* array of mfns */
+       compat_uptr_t err; /* array of error codes */
+};
+
+#define IOCTL_PRIVCMD_MMAP_32                   \
+       _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap_32))
+#define IOCTL_PRIVCMD_MMAPBATCH_32              \
+       _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch_32))
+#define IOCTL_PRIVCMD_MMAPBATCH_V2_32           \
+       _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2_32))
+
+#endif /* __LINUX_XEN_COMPAT_H__ */
diff --git a/include/xen/cpu_hotplug.h b/include/xen/cpu_hotplug.h

new file mode 100644 (file)

index 0000000..9c0f5b8
--- /dev/null
+++ b/include/xen/cpu_hotplug.h
@@ -0,0 +1,39 @@
+#ifndef __XEN_CPU_HOTPLUG_H__
+#define __XEN_CPU_HOTPLUG_H__
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+
+#if defined(CONFIG_X86) && defined(CONFIG_SMP)
+extern cpumask_var_t vcpu_initialized_mask;
+#endif
+
+#if defined(CONFIG_HOTPLUG_CPU)
+
+int cpu_up_check(unsigned int cpu);
+void init_xenbus_allowed_cpumask(void);
+int smp_suspend(void);
+void smp_resume(void);
+
+#else /* !defined(CONFIG_HOTPLUG_CPU) */
+
+#define cpu_up_check(cpu)              (0)
+#define init_xenbus_allowed_cpumask()  ((void)0)
+
+/*
+ * Variant used when CONFIG_HOTPLUG_CPU is not set: succeeds (returns 0) only
+ * while at most one CPU is online, otherwise warns and returns -EOPNOTSUPP.
+ */
+static inline int smp_suspend(void)
+{
+       if (num_online_cpus() <= 1)
+               return 0;
+
+       pr_warning("Can't suspend SMP guests without"
+                  " CONFIG_HOTPLUG_CPU\n");
+       return -EOPNOTSUPP;
+}
+
+/* No-op stub used when CONFIG_HOTPLUG_CPU is not set. */
+static inline void smp_resume(void)
+{
+}
+
+#endif /* !defined(CONFIG_HOTPLUG_CPU) */
+
+#endif /* __XEN_CPU_HOTPLUG_H__ */
diff --git a/include/xen/driver_util.h b/include/xen/driver_util.h

new file mode 100644 (file)

index 0000000..12d10f7
--- /dev/null
+++ b/include/xen/driver_util.h
@@ -0,0 +1,14 @@
+#ifndef __XEN_DRIVER_UTIL_H__
+#define __XEN_DRIVER_UTIL_H__
+
+#include <linux/compiler.h>
+#include <linux/device.h>
+
+extern struct class *get_xen_class(void);
+extern struct device *xen_class_device_create(struct device_type *,
+                                             struct device *parent,
+                                             dev_t devt, void *drvdata,
+                                             const char *fmt, ...)
+                     __printf(5, 6);
+
+#endif /* __XEN_DRIVER_UTIL_H__ */
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h

index 14e833e..5ada11d 100644 (file)
--- a/include/xen/evtchn.h
+++ b/include/xen/evtchn.h
@@ -1,7 +1,11 @@
+#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
+#include "public/evtchn.h"
+#else
  /******************************************************************************
   * evtchn.h
   *
- * Interface to /dev/xen/evtchn.
+ * Communication via Xen event channels.
+ * Also definitions for the device that demuxes notifications to userspace.
   *
   * Copyright (c) 2003-2005, K A Fraser
   *
@@ -30,59 +34,202 @@
   * IN THE SOFTWARE.
   */
  
-#ifndef __LINUX_PUBLIC_EVTCHN_H__
-#define __LINUX_PUBLIC_EVTCHN_H__
+#ifndef __ASM_EVTCHN_H__
+#define __ASM_EVTCHN_H__
  
-/*
- * Bind a fresh port to VIRQ @virq.
- * Return allocated port.
- */
-#define IOCTL_EVTCHN_BIND_VIRQ                         \
-       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
-struct ioctl_evtchn_bind_virq {
-       unsigned int virq;
-};
+#include <linux/interrupt.h>
+#include <asm/hypervisor.h>
+#include <asm/ptrace.h>
+#include <asm/sync_bitops.h>
+#include <xen/interface/event_channel.h>
+#include <linux/smp.h>
  
  /*
- * Bind a fresh port to remote <@remote_domain, @remote_port>.
- * Return allocated port.
+ * LOW-LEVEL DEFINITIONS
   */
-#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
-       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
-struct ioctl_evtchn_bind_interdomain {
-       unsigned int remote_domain, remote_port;
+
+#ifdef CONFIG_XEN
+/*
+ * IRQ configuration record, as returned by alloc_irq_and_cfg_at().
+ * The anonymous union holds the bind count or, when CONFIG_X86_IO_APIC is
+ * set, the vector.
+ * NOTE(review): the encoding of 'info' is not visible here - see its users.
+ */
+struct irq_cfg {
+       u32 info;
+       union {
+               int bindcount; /* for dynamic IRQs */
+#ifdef CONFIG_X86_IO_APIC
+               u8 vector; /* for physical IRQs */
+#endif
+       };
+};
+struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
+static inline int evtchn_make_refcounted(unsigned int evtchn) { return 0; }
+#endif
  
  /*
- * Allocate a fresh port for binding to @remote_domain.
- * Return allocated port.
+ * Dynamically bind an event source to an IRQ-like callback handler.
+ * On some platforms this may not be implemented via the Linux IRQ subsystem.
+ * The IRQ argument passed to the callback handler is the same as returned
+ * from the bind call. It may not correspond to a Linux IRQ number.
+ * Returns IRQ or negative errno.
   */
-#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
-       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
-struct ioctl_evtchn_bind_unbound_port {
-       unsigned int remote_domain;
-};
+int bind_caller_port_to_irqhandler(
+       unsigned int caller_port,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id);
+int bind_listening_port_to_irqhandler(
+       unsigned int remote_domain,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id);
+int bind_interdomain_evtchn_to_irqhandler(
+       unsigned int remote_domain,
+       unsigned int remote_port,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id);
+int bind_virq_to_irqhandler(
+       unsigned int virq,
+       unsigned int cpu,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id);
+#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86)
+int bind_virq_to_irqaction(
+       unsigned int virq,
+       unsigned int cpu,
+       struct irqaction *action);
+#else
+#define bind_virq_to_irqaction(virq, cpu, action) \
+       bind_virq_to_irqhandler(virq, cpu, (action)->handler, \
+                               (action)->flags | IRQF_NOBALANCING, \
+                               (action)->name, action)
+#endif
+#if defined(CONFIG_SMP) && !defined(MODULE)
+#ifndef CONFIG_X86
+int bind_ipi_to_irqhandler(
+       unsigned int ipi,
+       unsigned int cpu,
+       irq_handler_t handler,
+       unsigned long irqflags,
+       const char *devname,
+       void *dev_id);
+#else
+int bind_ipi_to_irqaction(
+       unsigned int cpu,
+       struct irqaction *action);
+DECLARE_PER_CPU(DECLARE_BITMAP(, NR_IPIS), ipi_pending);
+#endif
+#endif
  
  /*
- * Unbind previously allocated @port.
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (except for bindings
+ * made with bind_caller_port_to_irqhandler()).
   */
-#define IOCTL_EVTCHN_UNBIND                            \
-       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
-struct ioctl_evtchn_unbind {
-       unsigned int port;
-};
+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_XEN) && defined(CONFIG_X86)
+/* Specialized unbind function for per-CPU IRQs. */
+void unbind_from_per_cpu_irq(unsigned int irq, unsigned int cpu,
+                            struct irqaction *);
+#else
+#define unbind_from_per_cpu_irq(irq, cpu, action) \
+       unbind_from_irqhandler(irq, action)
+#endif
+
+#ifndef CONFIG_XEN
+void irq_resume(void);
+#endif
+
+/* Entry point for notifications into Linux subsystems. */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs);
+
+/* Mark a PIRQ as unavailable for dynamic allocation. */
+void evtchn_register_pirq(int irq);
+/* Map a Xen-supplied PIRQ to a dynamically allocated one. */
+int evtchn_map_pirq(int irq, int xen_pirq);
+/* Look up a Xen-supplied PIRQ for a dynamically allocated one. */
+int evtchn_get_xen_pirq(int irq);
+
+void mask_evtchn(int port);
+void disable_all_local_evtchn(void);
+void unmask_evtchn(int port);
+unsigned int irq_from_evtchn(unsigned int port);
+
+/*
+ * Set @port's bit in the shared-info evtchn_mask bitmap and hand back what
+ * sync_test_and_set_bit() returns for it.
+ */
+static inline int test_and_set_evtchn_mask(int port)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+
+       return sync_test_and_set_bit(port, shinfo->evtchn_mask);
+}
+
+/* Clear @port's bit in the shared-info evtchn_pending bitmap. */
+static inline void clear_evtchn(int port)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+
+       sync_clear_bit(port, shinfo->evtchn_pending);
+}
+
+/* Set @port's bit in the shared-info evtchn_pending bitmap. */
+static inline void set_evtchn(int port)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+
+       sync_set_bit(port, shinfo->evtchn_pending);
+}
+
+/* Report the state of @port's bit in the shared-info evtchn_pending bitmap. */
+static inline int test_evtchn(int port)
+{
+       shared_info_t *shinfo = HYPERVISOR_shared_info;
+
+       return sync_test_bit(port, shinfo->evtchn_pending);
+}
+
+/* Issue EVTCHNOP_send for @port; the hypercall's result is discarded. */
+static inline void notify_remote_via_evtchn(int port)
+{
+       struct evtchn_send op = { .port = port };
+
+       VOID(HYPERVISOR_event_channel_op(EVTCHNOP_send, &op));
+}
+
+/*
+ * Fill multicall entry @mcl so that it describes an EVTCHNOP_send on @port;
+ * nothing is issued here.  The struct evtchn_send argument is stored inside
+ * the entry itself, in the args[] slots from index 2 on, and args[1] holds
+ * its address.
+ */
+static inline void
+multi_notify_remote_via_evtchn(multicall_entry_t *mcl, int port)
+{
+       struct evtchn_send *send = (void *)(mcl->args + 2);
+
+       /* Compile-time check that the struct fits into args[2] onwards. */
+       BUILD_BUG_ON(sizeof(*send) > sizeof(mcl->args) - 2 * sizeof(*mcl->args));
+       send->port = port;
+       mcl->op = __HYPERVISOR_event_channel_op;
+       mcl->args[0] = EVTCHNOP_send;
+       mcl->args[1] = (unsigned long)send;
+}
+
+/* Issue EVTCHNOP_close for @port and return the hypercall's result. */
+static inline int close_evtchn(int port)
+{
+       struct evtchn_close op = { .port = port };
+
+       return HYPERVISOR_event_channel_op(EVTCHNOP_close, &op);
+}
+
+/* Test an irq's pending state. */
+int xen_test_irq_pending(int irq);
  
  /*
- * Unbind previously allocated @port.
+ * Use these to access the event channel underlying the IRQ handle returned
+ * by bind_*_to_irqhandler().
   */
-#define IOCTL_EVTCHN_NOTIFY                            \
-       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
-struct ioctl_evtchn_notify {
-       unsigned int port;
-};
+void notify_remote_via_irq(int irq);
+int multi_notify_remote_via_irq(multicall_entry_t *, int irq);
+int irq_to_evtchn_port(int irq);
+
+#if defined(CONFIG_SMP) && !defined(MODULE) && defined(CONFIG_X86)
+void notify_remote_via_ipi(unsigned int ipi, unsigned int cpu);
+void clear_ipi_evtchn(void);
+#endif
  
-/* Clear and reinitialise the event buffer. Clear error condition. */
-#define IOCTL_EVTCHN_RESET                             \
-       _IOC(_IOC_NONE, 'E', 5, 0)
+#if defined(CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING) \
+    && CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING
+void xen_spin_irq_enter(void);
+void xen_spin_irq_exit(void);
+#else
+static inline void xen_spin_irq_enter(void) {}
+static inline void xen_spin_irq_exit(void) {}
+#endif
  
-#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
+#endif /* __ASM_EVTCHN_H__ */
+#endif /* CONFIG_PARAVIRT_XEN || !__KERNEL__ */
diff --git a/include/xen/features.h b/include/xen/features.h

index 27292d4..6c89605 100644 (file)
--- a/include/xen/features.h
+++ b/include/xen/features.h
@@ -10,6 +10,7 @@
  #define __XEN_FEATURES_H__
  
  #include <xen/interface/features.h>
+#include <xen/interface/version.h>
  
  void xen_setup_features(void);
  
@@ -20,4 +21,4 @@ static inline int xen_feature(int flag)
         return xen_features[flag];
  }
  
-#endif /* __ASM_XEN_FEATURES_H__ */
+#endif /* __XEN_FEATURES_H__ */
diff --git a/include/xen/firmware.h b/include/xen/firmware.h

new file mode 100644 (file)

index 0000000..3be378c
--- /dev/null
+++ b/include/xen/firmware.h
@@ -0,0 +1,14 @@
+#ifndef __XEN_FIRMWARE_H__
+#define __XEN_FIRMWARE_H__
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+void copy_edd(void);
+#endif
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+void copy_edid(void);
+#else
+static inline void copy_edid(void) {}
+#endif
+
+#endif /* __XEN_FIRMWARE_H__ */
diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h

index 5304bd3..ce4936d 100644 (file)
--- a/include/xen/gntdev.h
+++ b/include/xen/gntdev.h
@@ -1,150 +1,3 @@
-/******************************************************************************
- * gntdev.h
- * 
- * Interface to /dev/xen/gntdev.
- * 
- * Copyright (c) 2007, D G Murray
- * 
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __LINUX_PUBLIC_GNTDEV_H__
-#define __LINUX_PUBLIC_GNTDEV_H__
-
-struct ioctl_gntdev_grant_ref {
-       /* The domain ID of the grant to be mapped. */
-       uint32_t domid;
-       /* The grant reference of the grant to be mapped. */
-       uint32_t ref;
-};
-
-/*
- * Inserts the grant references into the mapping table of an instance
- * of gntdev. N.B. This does not perform the mapping, which is deferred
- * until mmap() is called with @index as the offset.
- */
-#define IOCTL_GNTDEV_MAP_GRANT_REF \
-_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
-struct ioctl_gntdev_map_grant_ref {
-       /* IN parameters */
-       /* The number of grants to be mapped. */
-       uint32_t count;
-       uint32_t pad;
-       /* OUT parameters */
-       /* The offset to be used on a subsequent call to mmap(). */
-       uint64_t index;
-       /* Variable IN parameter. */
-       /* Array of grant references, of size @count. */
-       struct ioctl_gntdev_grant_ref refs[1];
-};
-
-/*
- * Removes the grant references from the mapping table of an instance of
- * of gntdev. N.B. munmap() must be called on the relevant virtual address(es)
- * before this ioctl is called, or an error will result.
- */
-#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
-_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
-struct ioctl_gntdev_unmap_grant_ref {
-       /* IN parameters */
-       /* The offset was returned by the corresponding map operation. */
-       uint64_t index;
-       /* The number of pages to be unmapped. */
-       uint32_t count;
-       uint32_t pad;
-};
-
-/*
- * Returns the offset in the driver's address space that corresponds
- * to @vaddr. This can be used to perform a munmap(), followed by an
- * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
- * the caller. The number of pages that were allocated at the same time as
- * @vaddr is returned in @count.
- *
- * N.B. Where more than one page has been mapped into a contiguous range, the
- *      supplied @vaddr must correspond to the start of the range; otherwise
- *      an error will result. It is only possible to munmap() the entire
- *      contiguously-allocated range at once, and not any subrange thereof.
- */
-#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
-_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
-struct ioctl_gntdev_get_offset_for_vaddr {
-       /* IN parameters */
-       /* The virtual address of the first mapped page in a range. */
-       uint64_t vaddr;
-       /* OUT parameters */
-       /* The offset that was used in the initial mmap() operation. */
-       uint64_t offset;
-       /* The number of pages mapped in the VM area that begins at @vaddr. */
-       uint32_t count;
-       uint32_t pad;
-};
-
-/*
- * Sets the maximum number of grants that may mapped at once by this gntdev
- * instance.
- *
- * N.B. This must be called before any other ioctl is performed on the device.
- */
-#define IOCTL_GNTDEV_SET_MAX_GRANTS \
-_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
-struct ioctl_gntdev_set_max_grants {
-       /* IN parameter */
-       /* The maximum number of grants that may be mapped at once. */
-       uint32_t count;
-};
-
-/*
- * Sets up an unmap notification within the page, so that the other side can do
- * cleanup if this side crashes. Required to implement cross-domain robust
- * mutexes or close notification on communication channels.
- *
- * Each mapped page only supports one notification; multiple calls referring to
- * the same page overwrite the previous notification. You must clear the
- * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
- * to occur.
- */
-#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
-_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
-struct ioctl_gntdev_unmap_notify {
-       /* IN parameters */
-       /* Offset in the file descriptor for a byte within the page (same as
-        * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
-        * be cleared. Otherwise, it can be any byte in the page whose
-        * notification we are adjusting.
-        */
-       uint64_t index;
-       /* Action(s) to take on unmap */
-       uint32_t action;
-       /* Event channel to notify */
-       uint32_t event_channel_port;
-};
-
-/* Clear (set to zero) the byte specified by index */
-#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
-/* Send an interrupt on the indicated event channel */
-#define UNMAP_NOTIFY_SEND_EVENT 0x2
-
-#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
+#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
+#include "public/gntdev.h"
+#endif
diff --git a/include/xen/gnttab.h b/include/xen/gnttab.h

new file mode 100644 (file)

index 0000000..4d6e52f
--- /dev/null
+++ b/include/xen/gnttab.h
@@ -0,0 +1,209 @@
+/******************************************************************************
+ * gnttab.h
+ * 
+ * Two sets of functionality:
+ * 1. Granting foreign access to our memory reservation.
+ * 2. Accessing others' memory reservations via grant references.
+ * (i.e., mechanisms for both sender and recipient of grant references)
+ * 
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2005, Christopher Clark
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_GNTTAB_H__
+#define __ASM_GNTTAB_H__
+
+#include <asm/hypervisor.h>
+#include <asm/maddr.h> /* maddr_t */
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <xen/interface/grant_table.h>
+#include <xen/features.h>
+
+#define GRANT_INVALID_REF      0
+
+/*
+ * Callback record passed to gnttab_request_free_callback() and
+ * gnttab_cancel_free_callback(); fn, arg and count mirror the parameters of
+ * gnttab_request_free_callback().
+ * NOTE(review): 'next' and 'queued' look like list linkage/state owned by
+ * the grant-table code - confirm against the implementation.
+ */
+struct gnttab_free_callback {
+       struct gnttab_free_callback *next;
+       void (*fn)(void *);
+       void *arg;
+       u16 count;
+       u8 queued;
+};
+
+int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
+                               int flags);
+
+/*
+ * End access through the given grant reference, iff the grant entry is no
+ * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
+ * use.
+ */
+int gnttab_end_foreign_access_ref(grant_ref_t ref);
+
+/*
+ * Eventually end access through the given grant reference, and once that
+ * access has been ended, free the given page too.  Access will be ended
+ * immediately iff the grant entry is not in use, otherwise it will happen
+ * some time later.  page may be 0, in which case no freeing will occur.
+ */
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
+
+int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
+
+unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
+unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
+
+int gnttab_query_foreign_access(grant_ref_t ref);
+
+/*
+ * operations on reserved batches of grant references
+ */
+int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head);
+
+void gnttab_free_grant_reference(grant_ref_t ref);
+
+void gnttab_free_grant_references(grant_ref_t head);
+
+int gnttab_empty_grant_references(const grant_ref_t *pprivate_head);
+
+int gnttab_claim_grant_reference(grant_ref_t *pprivate_head);
+
+void gnttab_release_grant_reference(grant_ref_t *private_head,
+                                   grant_ref_t release);
+
+void gnttab_request_free_callback(struct gnttab_free_callback *callback,
+                                 void (*fn)(void *), void *arg, u16 count);
+void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
+
+void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
+                                    unsigned long frame, int flags);
+
+void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
+                                      unsigned long pfn);
+
+int gnttab_copy_grant_page(grant_ref_t ref, struct page **pagep);
+#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE)
+void __gnttab_dma_map_page(struct page *page);
+#else
+#define __gnttab_dma_map_page __gnttab_dma_unmap_page
+#endif
+static inline void __gnttab_dma_unmap_page(struct page *page)
+{
+}
+
+void gnttab_reset_grant_page(struct page *page);
+
+#ifndef CONFIG_XEN
+int gnttab_resume(void);
+#endif
+
+void *arch_gnttab_alloc_shared(unsigned long *frames);
+
+/*
+ * Fill in a struct gnttab_map_grant_ref.  The host address is stored
+ * unchanged when @flags has GNTMAP_contains_pte set or when
+ * XENFEAT_auto_translated_physmap is off; otherwise it is converted with
+ * __pa() first.
+ */
+static inline void
+gnttab_set_map_op(struct gnttab_map_grant_ref *map, maddr_t addr,
+                 uint32_t flags, grant_ref_t ref, domid_t domid)
+{
+       map->dom = domid;
+       map->ref = ref;
+       map->flags = flags;
+
+       if (!(flags & GNTMAP_contains_pte) &&
+           xen_feature(XENFEAT_auto_translated_physmap))
+               map->host_addr = __pa(addr);
+       else
+               map->host_addr = addr;
+}
+
+/*
+ * Fill in a struct gnttab_unmap_grant_ref.  The host address is stored
+ * unchanged when @flags has GNTMAP_contains_pte set or when
+ * XENFEAT_auto_translated_physmap is off; otherwise it is converted with
+ * __pa() first.  dev_bus_addr is always zeroed.
+ */
+static inline void
+gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, maddr_t addr,
+                   uint32_t flags, grant_handle_t handle)
+{
+       unmap->dev_bus_addr = 0;
+       unmap->handle = handle;
+
+       if (!(flags & GNTMAP_contains_pte) &&
+           xen_feature(XENFEAT_auto_translated_physmap))
+               unmap->host_addr = __pa(addr);
+       else
+               unmap->host_addr = addr;
+}
+
+/*
+ * Fill in a struct gnttab_unmap_and_replace.  Both addresses are stored
+ * as given unless XENFEAT_auto_translated_physmap is on, in which case each
+ * is converted with __pa() first.
+ */
+static inline void
+gnttab_set_replace_op(struct gnttab_unmap_and_replace *unmap, maddr_t addr,
+                     maddr_t new_addr, grant_handle_t handle)
+{
+       unmap->handle = handle;
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               unmap->host_addr = addr;
+               unmap->new_addr = new_addr;
+               return;
+       }
+
+       unmap->host_addr = __pa(addr);
+       unmap->new_addr = __pa(new_addr);
+}
+
+/*
+ * Re-issue grant-table operation __HCop on __HCarg_p for as long as its
+ * status reads GNTST_eagain.
+ *
+ * The status is examined before any hypercall is made here, so the caller is
+ * expected to have issued the operation once already.  Before each retry
+ * the macro sleeps for __hc_delay milliseconds (1, 2, 3, ...).  __hc_delay
+ * is a u8, so after the 255th retry it wraps to 0 and the loop ends.  If the
+ * status is still GNTST_eagain at that point, the macro gives up and forces
+ * the status to GNTST_bad_page; a result delivered by that last retry is
+ * left untouched.  Any final status other than GNTST_okay is logged.
+ *
+ * A non-zero return from the hypercall itself triggers BUG_ON().
+ *
+ * The expansion is a bare brace block, not do { } while (0): do not use it
+ * as the unbraced body of an if that is followed by an else.
+ */
+#define gnttab_check_GNTST_eagain_while(__HCop, __HCarg_p)                     \
+{                                                                              \
+       u8 __hc_delay = 1;                                                      \
+       int __ret;                                                              \
+       while (unlikely((__HCarg_p)->status == GNTST_eagain && __hc_delay)) {   \
+               msleep(__hc_delay++);                                           \
+               __ret = HYPERVISOR_grant_table_op(__HCop, (__HCarg_p), 1);      \
+               BUG_ON(__ret);                                                  \
+       }                                                                       \
+       if (__hc_delay == 0 && (__HCarg_p)->status == GNTST_eagain) {           \
+               pr_err("%s: %s gnt busy\n", __func__, current->comm);           \
+               (__HCarg_p)->status = GNTST_bad_page;                           \
+       }                                                                       \
+       if ((__HCarg_p)->status != GNTST_okay)                                  \
+               pr_err("%s: %s gnt status %x\n",                                \
+                       __func__, current->comm, (__HCarg_p)->status);          \
+}
+
+/*
+ * Issue grant-table operation __HCop on __HCarg_p, repeating it for as long
+ * as its status reads GNTST_eagain.
+ *
+ * The hypercall is made at least once.  After each GNTST_eagain result the
+ * macro sleeps for __hc_delay milliseconds (1, 2, 3, ...).  __hc_delay is a
+ * u8, so after the 255th sleep it wraps to 0; the loop then ends and the
+ * status is forced to GNTST_bad_page.  Any final status other than
+ * GNTST_okay is logged.
+ *
+ * A non-zero return from the hypercall itself triggers BUG_ON().
+ *
+ * The expansion is a bare brace block, not do { } while (0): do not use it
+ * as the unbraced body of an if that is followed by an else.
+ */
+#define gnttab_check_GNTST_eagain_do_while(__HCop, __HCarg_p)                  \
+{                                                                              \
+       u8 __hc_delay = 1;                                                      \
+       int __ret;                                                              \
+       do {                                                                    \
+               __ret = HYPERVISOR_grant_table_op(__HCop, (__HCarg_p), 1);      \
+               BUG_ON(__ret);                                                  \
+               if ((__HCarg_p)->status == GNTST_eagain)                        \
+                       msleep(__hc_delay++);                                   \
+       } while ((__HCarg_p)->status == GNTST_eagain && __hc_delay);            \
+       if (__hc_delay == 0) {                                                  \
+               pr_err("%s: %s gnt busy\n", __func__, current->comm);           \
+               (__HCarg_p)->status = GNTST_bad_page;                           \
+       }                                                                       \
+       if ((__HCarg_p)->status != GNTST_okay)                                  \
+               pr_err("%s: %s gnt status %x\n",                                \
+                       __func__, current->comm, (__HCarg_p)->status);          \
+}
+
+#endif /* __ASM_GNTTAB_H__ */
diff --git a/include/xen/hvm.h b/include/xen/hvm.h

index b193fa2..b883740 100644 (file)
--- a/include/xen/hvm.h
+++ b/include/xen/hvm.h
@@ -3,7 +3,9 @@
  #define XEN_HVM_H__
  
  #include <xen/interface/hvm/params.h>
+#ifndef HAVE_XEN_PLATFORM_COMPAT_H
  #include <asm/xen/hypercall.h>
+#endif
  
  static inline int hvm_get_parameter(int idx, uint64_t *value)
  {
@@ -14,8 +16,7 @@ static inline int hvm_get_parameter(int idx, uint64_t *value)
         xhv.index = idx;
         r = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv);
         if (r < 0) {
-               printk(KERN_ERR "Cannot get hvm parameter %d: %d!\n",
-                       idx, r);
+               pr_err("Cannot get hvm parameter %d: %d!\n", idx, r);
                 return r;
         }
         *value = xhv.value;
diff --git a/include/xen/hypercall.h b/include/xen/hypercall.h

new file mode 100644 (file)

index 0000000..62071ea
--- /dev/null
+++ b/include/xen/hypercall.h
@@ -0,0 +1,30 @@
+#ifndef __XEN_HYPERCALL_H__
+#define __XEN_HYPERCALL_H__
+
+#include <asm/hypercall.h>
+
+/*
+ * Issue a multicall batch and compare every entry's result with the value
+ * expected for it: the matching element of @rc_list, or 0 for all entries
+ * when @rc_list is NULL.
+ *
+ * Returns the (negative) result of HYPERVISOR_multicall() if that call
+ * fails, 0 if all entries match, and otherwise the number of entries that
+ * had not yet been checked, the mismatching one included.  A positive
+ * hypercall result, or an @nr_calls that is negative when viewed as int,
+ * trips BUG_ON().
+ */
+static inline int __must_check
+HYPERVISOR_multicall_check(
+       multicall_entry_t *call_list, unsigned int nr_calls,
+       const unsigned long *rc_list)
+{
+       int rc = HYPERVISOR_multicall(call_list, nr_calls);
+
+       if (unlikely(rc < 0))
+               return rc;
+       BUG_ON(rc);
+       BUG_ON((int)nr_calls < 0);
+
+       while (nr_calls > 0) {
+               unsigned long expected = rc_list ? *rc_list++ : 0;
+
+               if (unlikely(call_list->result != expected))
+                       return nr_calls;
+               ++call_list;
+               --nr_calls;
+       }
+
+       return 0;
+}
+
+/* A construct to ignore the return value of hypercall wrappers in a few
+ * exceptional cases (simply casting the function result to void doesn't
+ * avoid the compiler warning): */
+#define VOID(expr) ((void)((expr)?:0))
+
+#endif /* __XEN_HYPERCALL_H__ */
diff --git a/include/xen/interface/COPYING b/include/xen/interface/COPYING

new file mode 100644 (file)

index 0000000..ffc6d61
--- /dev/null
+++ b/include/xen/interface/COPYING
@@ -0,0 +1,38 @@
+XEN NOTICE
+==========
+
+This copyright applies to all files within this subdirectory and its
+subdirectories:
+  include/public/*.h
+  include/public/hvm/*.h
+  include/public/io/*.h
+
+The intention is that these files can be freely copied into the source
+tree of an operating system when porting that OS to run on Xen. Doing
+so does *not* cause the OS to become subject to the terms of the GPL.
+
+All other files in the Xen source distribution are covered by version
+2 of the GNU General Public License except where explicitly stated
+otherwise within individual source files.
+
+ -- Keir Fraser (on behalf of the Xen team)
+
+=====================================================================
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
+DEALINGS IN THE SOFTWARE.
diff --git a/include/xen/interface/arch-x86/cpuid.h b/include/xen/interface/arch-x86/cpuid.h

new file mode 100644 (file)

index 0000000..d9bd627
--- /dev/null
+++ b/include/xen/interface/arch-x86/cpuid.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ * 
+ * CPUID interface to Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ * 
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ * 
+ * Authors:
+ *    Keir Fraser <keir@xen.org>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/* Xen identification leaves start at 0x40000000. */
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i)    (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000000)
+ * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
+ *      are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ *      of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000001)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000002)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ *      to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/include/xen/interface/arch-x86/hvm/save.h b/include/xen/interface/arch-x86/hvm/save.h

new file mode 100644 (file)

index 0000000..9ae645d
--- /dev/null
+++ b/include/xen/interface/arch-x86/hvm/save.h
@@ -0,0 +1,589 @@
+/* 
+ * Structure definitions for HVM state that is held by Xen and must
+ * be saved along with the domain's memory and device-model state.
+ * 
+ * Copyright (c) 2007 XenSource Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_SAVE_X86_H__
+#define __XEN_PUBLIC_HVM_SAVE_X86_H__
+
+/* 
+ * Save/restore header: general info about the save file. 
+ */
+
+#define HVM_FILE_MAGIC   0x54381286
+#define HVM_FILE_VERSION 0x00000001
+
+struct hvm_save_header {
+    uint32_t magic;             /* Must be HVM_FILE_MAGIC */
+    uint32_t version;           /* File format version */
+    uint64_t changeset;         /* Version of Xen that saved this file */
+    uint32_t cpuid;             /* CPUID[0x01][%eax] on the saving machine */
+    uint32_t gtsc_khz;        /* Guest's TSC frequency in kHz */
+};
+
+DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
+
+
+/*
+ * Processor
+ *
+ * Compat: Pre-3.4 didn't have msr_tsc_aux
+ */
+
+struct hvm_hw_cpu {
+    uint8_t  fpu_regs[512];
+
+    uint64_t rax;
+    uint64_t rbx;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t rsp;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+
+    uint64_t rip;
+    uint64_t rflags;
+
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+
+    uint64_t dr0;
+    uint64_t dr1;
+    uint64_t dr2;
+    uint64_t dr3;
+    uint64_t dr6;
+    uint64_t dr7;    
+
+    uint32_t cs_sel;
+    uint32_t ds_sel;
+    uint32_t es_sel;
+    uint32_t fs_sel;
+    uint32_t gs_sel;
+    uint32_t ss_sel;
+    uint32_t tr_sel;
+    uint32_t ldtr_sel;
+
+    uint32_t cs_limit;
+    uint32_t ds_limit;
+    uint32_t es_limit;
+    uint32_t fs_limit;
+    uint32_t gs_limit;
+    uint32_t ss_limit;
+    uint32_t tr_limit;
+    uint32_t ldtr_limit;
+    uint32_t idtr_limit;
+    uint32_t gdtr_limit;
+
+    uint64_t cs_base;
+    uint64_t ds_base;
+    uint64_t es_base;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint64_t ss_base;
+    uint64_t tr_base;
+    uint64_t ldtr_base;
+    uint64_t idtr_base;
+    uint64_t gdtr_base;
+
+    uint32_t cs_arbytes;
+    uint32_t ds_arbytes;
+    uint32_t es_arbytes;
+    uint32_t fs_arbytes;
+    uint32_t gs_arbytes;
+    uint32_t ss_arbytes;
+    uint32_t tr_arbytes;
+    uint32_t ldtr_arbytes;
+
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+
+    /* msr for em64t */
+    uint64_t shadow_gs;
+
+    /* msr content saved/restored. */
+    uint64_t msr_flags;
+    uint64_t msr_lstar;
+    uint64_t msr_star;
+    uint64_t msr_cstar;
+    uint64_t msr_syscall_mask;
+    uint64_t msr_efer;
+    uint64_t msr_tsc_aux;
+
+    /* guest's idea of what rdtsc() would return */
+    uint64_t tsc;
+
+    /* pending event, if any */
+    union {
+        uint32_t pending_event;
+        struct {
+            uint8_t  pending_vector:8;
+            uint8_t  pending_type:3;
+            uint8_t  pending_error_valid:1;
+            uint32_t pending_reserved:19;
+            uint8_t  pending_valid:1;
+        };
+    };
+    /* error code for pending event */
+    uint32_t error_code;
+};
+
+struct hvm_hw_cpu_compat {
+    uint8_t  fpu_regs[512];
+
+    uint64_t rax;
+    uint64_t rbx;
+    uint64_t rcx;
+    uint64_t rdx;
+    uint64_t rbp;
+    uint64_t rsi;
+    uint64_t rdi;
+    uint64_t rsp;
+    uint64_t r8;
+    uint64_t r9;
+    uint64_t r10;
+    uint64_t r11;
+    uint64_t r12;
+    uint64_t r13;
+    uint64_t r14;
+    uint64_t r15;
+
+    uint64_t rip;
+    uint64_t rflags;
+
+    uint64_t cr0;
+    uint64_t cr2;
+    uint64_t cr3;
+    uint64_t cr4;
+
+    uint64_t dr0;
+    uint64_t dr1;
+    uint64_t dr2;
+    uint64_t dr3;
+    uint64_t dr6;
+    uint64_t dr7;    
+
+    uint32_t cs_sel;
+    uint32_t ds_sel;
+    uint32_t es_sel;
+    uint32_t fs_sel;
+    uint32_t gs_sel;
+    uint32_t ss_sel;
+    uint32_t tr_sel;
+    uint32_t ldtr_sel;
+
+    uint32_t cs_limit;
+    uint32_t ds_limit;
+    uint32_t es_limit;
+    uint32_t fs_limit;
+    uint32_t gs_limit;
+    uint32_t ss_limit;
+    uint32_t tr_limit;
+    uint32_t ldtr_limit;
+    uint32_t idtr_limit;
+    uint32_t gdtr_limit;
+
+    uint64_t cs_base;
+    uint64_t ds_base;
+    uint64_t es_base;
+    uint64_t fs_base;
+    uint64_t gs_base;
+    uint64_t ss_base;
+    uint64_t tr_base;
+    uint64_t ldtr_base;
+    uint64_t idtr_base;
+    uint64_t gdtr_base;
+
+    uint32_t cs_arbytes;
+    uint32_t ds_arbytes;
+    uint32_t es_arbytes;
+    uint32_t fs_arbytes;
+    uint32_t gs_arbytes;
+    uint32_t ss_arbytes;
+    uint32_t tr_arbytes;
+    uint32_t ldtr_arbytes;
+
+    uint64_t sysenter_cs;
+    uint64_t sysenter_esp;
+    uint64_t sysenter_eip;
+
+    /* msr for em64t */
+    uint64_t shadow_gs;
+
+    /* msr content saved/restored. */
+    uint64_t msr_flags;
+    uint64_t msr_lstar;
+    uint64_t msr_star;
+    uint64_t msr_cstar;
+    uint64_t msr_syscall_mask;
+    uint64_t msr_efer;
+    /*uint64_t msr_tsc_aux; COMPAT */
+
+    /* guest's idea of what rdtsc() would return */
+    uint64_t tsc;
+
+    /* pending event, if any */
+    union {
+        uint32_t pending_event;
+        struct {
+            uint8_t  pending_vector:8;
+            uint8_t  pending_type:3;
+            uint8_t  pending_error_valid:1;
+            uint32_t pending_reserved:19;
+            uint8_t  pending_valid:1;
+        };
+    };
+    /* error code for pending event */
+    uint32_t error_code;
+};
+
+static inline int _hvm_hw_fix_cpu(void *h) {
+    struct hvm_hw_cpu *new=h;
+    struct hvm_hw_cpu_compat *old=h;
+
+    /* If we copy from the end backwards, we should
+     * be able to do the modification in-place */
+    new->error_code=old->error_code;
+    new->pending_event=old->pending_event;
+    new->tsc=old->tsc;
+    new->msr_tsc_aux=0;
+
+    return 0;
+}
+
+DECLARE_HVM_SAVE_TYPE_COMPAT(CPU, 2, struct hvm_hw_cpu, \
+                             struct hvm_hw_cpu_compat, _hvm_hw_fix_cpu);
+
+/*
+ * PIC
+ */
+
+struct hvm_hw_vpic {
+    /* IR line bitmasks. */
+    uint8_t irr;
+    uint8_t imr;
+    uint8_t isr;
+
+    /* Line IRx maps to IRQ irq_base+x */
+    uint8_t irq_base;
+
+    /*
+     * Where are we in ICW2-4 initialisation (0 means no init in progress)?
+     * Bits 0-1 (=x): Next write at A=1 sets ICW(x+1).
+     * Bit 2: ICW1.IC4  (1 == ICW4 included in init sequence)
+     * Bit 3: ICW1.SNGL (0 == ICW3 included in init sequence)
+     */
+    uint8_t init_state:4;
+
+    /* IR line with highest priority. */
+    uint8_t priority_add:4;
+
+    /* Reads from A=0 obtain ISR or IRR? */
+    uint8_t readsel_isr:1;
+
+    /* Reads perform a polling read? */
+    uint8_t poll:1;
+
+    /* Automatically clear IRQs from the ISR during INTA? */
+    uint8_t auto_eoi:1;
+
+    /* Automatically rotate IRQ priorities during AEOI? */
+    uint8_t rotate_on_auto_eoi:1;
+
+    /* Exclude slave inputs when considering in-service IRQs? */
+    uint8_t special_fully_nested_mode:1;
+
+    /* Special mask mode excludes masked IRs from AEOI and priority checks. */
+    uint8_t special_mask_mode:1;
+
+    /* Is this a master PIC or slave PIC? (NB. This is not programmable.) */
+    uint8_t is_master:1;
+
+    /* Edge/trigger selection. */
+    uint8_t elcr;
+
+    /* Virtual INT output. */
+    uint8_t int_output;
+};
+
+DECLARE_HVM_SAVE_TYPE(PIC, 3, struct hvm_hw_vpic);
+
+
+/*
+ * IO-APIC
+ */
+
+#define VIOAPIC_NUM_PINS  48 /* 16 ISA IRQs, 32 non-legacy PCI IRQS. */
+
+struct hvm_hw_vioapic {
+    uint64_t base_address;
+    uint32_t ioregsel;
+    uint32_t id;
+    union vioapic_redir_entry
+    {
+        uint64_t bits;
+        struct {
+            uint8_t vector;
+            uint8_t delivery_mode:3;
+            uint8_t dest_mode:1;
+            uint8_t delivery_status:1;
+            uint8_t polarity:1;
+            uint8_t remote_irr:1;
+            uint8_t trig_mode:1;
+            uint8_t mask:1;
+            uint8_t reserve:7;
+            uint8_t reserved[4];
+            uint8_t dest_id;
+        } fields;
+    } redirtbl[VIOAPIC_NUM_PINS];
+};
+
+DECLARE_HVM_SAVE_TYPE(IOAPIC, 4, struct hvm_hw_vioapic);
+
+
+/*
+ * LAPIC
+ */
+
+struct hvm_hw_lapic {
+    uint64_t             apic_base_msr;
+    uint32_t             disabled; /* VLAPIC_xx_DISABLED */
+    uint32_t             timer_divisor;
+    uint64_t             tdt_msr;
+};
+
+DECLARE_HVM_SAVE_TYPE(LAPIC, 5, struct hvm_hw_lapic);
+
+struct hvm_hw_lapic_regs {
+    uint8_t data[1024];
+};
+
+DECLARE_HVM_SAVE_TYPE(LAPIC_REGS, 6, struct hvm_hw_lapic_regs);
+
+
+/*
+ * IRQs
+ */
+
+struct hvm_hw_pci_irqs {
+    /*
+     * Virtual interrupt wires for a single PCI bus.
+     * Indexed by: device*4 + INTx#.
+     */
+    union {
+        unsigned long i[16 / sizeof (unsigned long)]; /* DECLARE_BITMAP(i, 32*4); */
+        uint64_t pad[2];
+    };
+};
+
+DECLARE_HVM_SAVE_TYPE(PCI_IRQ, 7, struct hvm_hw_pci_irqs);
+
+struct hvm_hw_isa_irqs {
+    /*
+     * Virtual interrupt wires for ISA devices.
+     * Indexed by ISA IRQ (assumes no ISA-device IRQ sharing).
+     */
+    union {
+        unsigned long i[1];  /* DECLARE_BITMAP(i, 16); */
+        uint64_t pad[1];
+    };
+};
+
+DECLARE_HVM_SAVE_TYPE(ISA_IRQ, 8, struct hvm_hw_isa_irqs);
+
+struct hvm_hw_pci_link {
+    /*
+     * PCI-ISA interrupt router.
+     * Each PCI <device:INTx#> is 'wire-ORed' into one of four links using
+     * the traditional 'barber's pole' mapping ((device + INTx#) & 3).
+     * The router provides a programmable mapping from each link to a GSI.
+     */
+    uint8_t route[4];
+    uint8_t pad0[4];
+};
+
+DECLARE_HVM_SAVE_TYPE(PCI_LINK, 9, struct hvm_hw_pci_link);
+
+/* 
+ *  PIT
+ */
+
+struct hvm_hw_pit {
+    struct hvm_hw_pit_channel {
+        uint32_t count; /* can be 65536 */
+        uint16_t latched_count;
+        uint8_t count_latched;
+        uint8_t status_latched;
+        uint8_t status;
+        uint8_t read_state;
+        uint8_t write_state;
+        uint8_t write_latch;
+        uint8_t rw_mode;
+        uint8_t mode;
+        uint8_t bcd; /* not supported */
+        uint8_t gate; /* timer start */
+    } channels[3];  /* 3 x 16 bytes */
+    uint32_t speaker_data_on;
+    uint32_t pad0;
+};
+
+DECLARE_HVM_SAVE_TYPE(PIT, 10, struct hvm_hw_pit);
+
+
+/* 
+ * RTC
+ */ 
+
+#define RTC_CMOS_SIZE 14
+struct hvm_hw_rtc {
+    /* CMOS bytes */
+    uint8_t cmos_data[RTC_CMOS_SIZE];
+    /* Index register for 2-part operations */
+    uint8_t cmos_index;
+    uint8_t pad0;
+};
+
+DECLARE_HVM_SAVE_TYPE(RTC, 11, struct hvm_hw_rtc);
+
+
+/*
+ * HPET
+ */
+
+#define HPET_TIMER_NUM     3    /* 3 timers supported now */
+struct hvm_hw_hpet {
+    /* Memory-mapped, software visible registers */
+    uint64_t capability;        /* capabilities */
+    uint64_t res0;              /* reserved */
+    uint64_t config;            /* configuration */
+    uint64_t res1;              /* reserved */
+    uint64_t isr;               /* interrupt status reg */
+    uint64_t res2[25];          /* reserved */
+    uint64_t mc64;              /* main counter */
+    uint64_t res3;              /* reserved */
+    struct {                    /* timers */
+        uint64_t config;        /* configuration/cap */
+        uint64_t cmp;           /* comparator */
+        uint64_t fsb;           /* FSB route, not supported now */
+        uint64_t res4;          /* reserved */
+    } timers[HPET_TIMER_NUM];
+    uint64_t res5[4*(24-HPET_TIMER_NUM)];  /* reserved, up to 0x3ff */
+
+    /* Hidden register state */
+    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
+};
+
+DECLARE_HVM_SAVE_TYPE(HPET, 12, struct hvm_hw_hpet);
+
+
+/*
+ * PM timer
+ */
+
+struct hvm_hw_pmtimer {
+    uint32_t tmr_val;   /* PM_TMR_BLK.TMR_VAL: 32bit free-running counter */
+    uint16_t pm1a_sts;  /* PM1a_EVT_BLK.PM1a_STS: status register */
+    uint16_t pm1a_en;   /* PM1a_EVT_BLK.PM1a_EN: enable register */
+};
+
+DECLARE_HVM_SAVE_TYPE(PMTIMER, 13, struct hvm_hw_pmtimer);
+
+/*
+ * MTRR MSRs
+ */
+
+struct hvm_hw_mtrr {
+#define MTRR_VCNT 8
+#define NUM_FIXED_MSR 11
+    uint64_t msr_pat_cr;
+    /* mtrr physbase & physmask msr pair*/
+    uint64_t msr_mtrr_var[MTRR_VCNT*2];
+    uint64_t msr_mtrr_fixed[NUM_FIXED_MSR];
+    uint64_t msr_mtrr_cap;
+    uint64_t msr_mtrr_def_type;
+};
+
+DECLARE_HVM_SAVE_TYPE(MTRR, 14, struct hvm_hw_mtrr);
+
+/*
+ * The save area of XSAVE/XRSTOR.
+ */
+
+struct hvm_hw_cpu_xsave {
+    uint64_t xfeature_mask;
+    uint64_t xcr0;                 /* Updated by XSETBV */
+    uint64_t xcr0_accum;           /* Updated by XSETBV */
+    struct {
+        struct { char x[512]; } fpu_sse;
+
+        struct {
+            uint64_t xstate_bv;         /* Updated by XRSTOR */
+            uint64_t reserved[7];
+        } xsave_hdr;                    /* The 64-byte header */
+
+        struct { char x[0]; } ymm;    /* YMM */
+    } save_area;
+};
+
+#define CPU_XSAVE_CODE  16
+
+/*
+ * Viridian hypervisor context.
+ */
+
+struct hvm_viridian_domain_context {
+    uint64_t hypercall_gpa;
+    uint64_t guest_os_id;
+};
+
+DECLARE_HVM_SAVE_TYPE(VIRIDIAN_DOMAIN, 15, struct hvm_viridian_domain_context);
+
+struct hvm_viridian_vcpu_context {
+    uint64_t apic_assist;
+};
+
+DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
+
+struct hvm_vmce_vcpu {
+    uint64_t caps;
+};
+
+DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
+
+/* 
+ * Largest type-code in use
+ */
+#define HVM_SAVE_CODE_MAX 18
+
+#endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
diff --git a/include/xen/interface/arch-x86/xen-mca.h b/include/xen/interface/arch-x86/xen-mca.h

new file mode 100644 (file)

index 0000000..dca6b3e
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-mca.h
@@ -0,0 +1,440 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ * 
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger <Christoph.Egger@amd.com>
+ *
+ * Guest OS machine check interface to x86 Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following Usecases from the guest side:
+ *
+ * Must have's:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ *    (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ *    (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetches machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
+ *
+ * Nice to have's:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ *    This is better done as separate task, physical CPU hotplugging,
+ *    and hypercall(s) should be sysctl's
+ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
+ *    move a DomU (or Dom0 itself) away from a malicious page
+ *    producing correctable errors.
+ * 9. offlining physical page:
+ *    Xen frees and never re-uses a certain physical page.
+ * 10. Testfacility: Allow Dom0 to write values into machine check MSR's
+ *     and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+/*
+ * The xen-unstable repo has interface version 0x03000001; our interface
+ * is incompatible with that and any future minor revisions, so we
+ * choose a different version number range that is numerically less
+ * than that used in xen-unstable.
+ */
+#define XEN_MCA_INTERFACE_VERSION 0x01ecc003
+
+/* IN: Dom0 calls hypercall to retrieve nonurgent telemetry */
+#define XEN_MC_NONURGENT  0x0001
+/* IN: Dom0/DomU calls hypercall to retrieve urgent telemetry */
+#define XEN_MC_URGENT     0x0002
+/* IN: Dom0 acknowledges previously-fetched telemetry */
+#define XEN_MC_ACK        0x0004
+
+/* OUT: All is ok */
+#define XEN_MC_OK           0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED  0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA       0x2
+/* OUT: Between notification time and this hypercall another
+ *  (most likely) correctable error happened. The fetched data,
+ *  does not match the original machine check data. */
+#define XEN_MC_NOMATCH      0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
+ */
+#define MC_TYPE_GLOBAL          0
+#define MC_TYPE_BANK            1
+#define MC_TYPE_EXTENDED        2
+#define MC_TYPE_RECOVERY        3
+
+struct mcinfo_common {
+    uint16_t type;      /* structure type */
+    uint16_t size;      /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE     (1 << 0)
+#define MC_FLAG_UNCORRECTABLE   (1 << 1)
+#define MC_FLAG_RECOVERABLE    (1 << 2)
+#define MC_FLAG_POLLED         (1 << 3)
+#define MC_FLAG_RESET          (1 << 4)
+#define MC_FLAG_CMCI           (1 << 5)
+#define MC_FLAG_MCE            (1 << 6)
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    /* running domain at the time in error (most likely the impacted one) */
+    uint16_t mc_domid;
+    uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */
+    uint32_t mc_socketid; /* physical socket of the physical core */
+    uint16_t mc_coreid; /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint32_t mc_apicid;
+    uint32_t mc_flags;
+    uint64_t mc_gstatus; /* global status */
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank; /* bank nr */
+    uint16_t mc_domid; /* Usecase 5: domain referenced by mc_addr on dom0
+                        * and if mc_addr is valid. Never valid on DomU. */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;   /* bank address, only valid
+                         * if addr bit is set in mc_status */
+    uint64_t mc_misc;
+    uint64_t mc_ctrl2;
+    uint64_t mc_tsc;
+};
+
+
+struct mcinfo_msr {
+    uint64_t reg;   /* MSR */
+    uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */ 
+struct mcinfo_extended {
+    struct mcinfo_common common;
+
+    /* You can fill up to five registers.
+     * If you need more, then use this structure
+     * multiple times. */
+
+    uint32_t mc_msrs; /* Number of msr with valid values. */
+    /*
+     * Currently Intel extended MSR (32/64) include all gp registers
+     * and E(R)FLAGS, E(R)IP, E(R)MISC, up to 11/19 of them might be
+     * useful at present. So expand this array to 16/32 to leave room.
+     */
+    struct mcinfo_msr mc_msr[sizeof(void *) * 4];
+};
+
+/* Recovery Action flags. Giving recovery result information to DOM0 */
+
+/* Xen takes successful recovery action, the error is recovered */
+#define REC_ACTION_RECOVERED (0x1 << 0)
+/* No action is performed by XEN */
+#define REC_ACTION_NONE (0x1 << 1)
+/* It's possible DOM0 might take action ownership in some case */
+#define REC_ACTION_NEED_RESET (0x1 << 2)
+
+/* Different Recovery Action types, if the action is performed successfully,
+ * REC_ACTION_RECOVERED flag will be returned.
+ */
+
+/* Page Offline Action */
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+/* CPU offline Action */
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+/* L3 cache disable Action */
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+
+/* Below interface used between XEN/DOM0 for passing XEN's recovery action 
+ * information to DOM0. 
+ * Usage scenario: After offlining broken page, XEN might pass its page offline
+ * recovery action result to DOM0. DOM0 will save the information in 
+ * non-volatile memory for further proactive actions, such as offlining the
+ * easy broken page earlier when doing next reboot.
+*/
+struct page_offline_action
+{
+    /* Params for passing the offlined page number to DOM0 */
+    uint64_t mfn;
+    uint64_t status;
+};
+
+struct cpu_offline_action
+{
+    /* Params for passing the identity of the offlined CPU to DOM0 */
+    uint32_t mc_socketid;
+    uint16_t mc_coreid;
+    uint16_t mc_core_threadid;
+};
+
+#define MAX_UNION_SIZE 16
+struct mcinfo_recovery
+{
+    struct mcinfo_common common;
+    uint16_t mc_bank; /* bank nr */
+    uint8_t action_flags;
+    uint8_t action_types;
+    union {
+        struct page_offline_action page_retire;
+        struct cpu_offline_action cpu_offline;
+        uint8_t pad[MAX_UNION_SIZE];
+    } action_info;
+};
+
+
+#define MCINFO_HYPERCALLSIZE   1024
+#define MCINFO_MAXSIZE         768
+
+#define MCINFO_FLAGS_UNCOMPLETE 0x1
+struct mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+    uint32_t flags;
+    uint64_t mi_data[(MCINFO_MAXSIZE - 1) / 8];
+};
+typedef struct mc_info mc_info_t;
+DEFINE_XEN_GUEST_HANDLE(mc_info_t);
+
+#define __MC_MSR_ARRAYSIZE 8
+#define __MC_NMSRS 1
+#define MC_NCAPS       7       /* 7 CPU feature flag words */
+#define MC_CAPS_STD_EDX        0       /* cpuid level 0x00000001 (%edx) */
+#define MC_CAPS_AMD_EDX        1       /* cpuid level 0x80000001 (%edx) */
+#define MC_CAPS_TM     2       /* cpuid level 0x80860001 (TransMeta) */
+#define MC_CAPS_LINUX  3       /* Linux-defined */
+#define MC_CAPS_STD_ECX        4       /* cpuid level 0x00000001 (%ecx) */
+#define MC_CAPS_VIA    5       /* cpuid level 0xc0000001 */
+#define MC_CAPS_AMD_ECX        6       /* cpuid level 0x80000001 (%ecx) */
+
+struct mcinfo_logical_cpu {
+    uint32_t mc_cpunr;          
+    uint32_t mc_chipid; 
+    uint16_t mc_coreid;
+    uint16_t mc_threadid;
+    uint32_t mc_apicid;
+    uint32_t mc_clusterid;
+    uint32_t mc_ncores;
+    uint32_t mc_ncores_active;
+    uint32_t mc_nthreads;
+    int32_t mc_cpuid_level;
+    uint32_t mc_family;
+    uint32_t mc_vendor;
+    uint32_t mc_model;
+    uint32_t mc_step;
+    char mc_vendorid[16];
+    char mc_brandid[64];
+    uint32_t mc_cpu_caps[MC_NCAPS];
+    uint32_t mc_cache_size;
+    uint32_t mc_cache_alignment;
+    int32_t mc_nmsrvals;
+    struct mcinfo_msr mc_msrvalues[__MC_MSR_ARRAYSIZE];
+};
+typedef struct mcinfo_logical_cpu xen_mc_logical_cpu_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_logical_cpu_t);
+
+
+/* 
+ * OS's should use these instead of writing their own lookup function
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ *    uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+    (_mi)->mi_nentries
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+    ((struct mcinfo_common *)(_mi)->mi_data)
+/* Prototype:
+ *    struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+    ((struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size))
+
+/* Prototype:
+ *    void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _mi, _type)    \
+    do {                                                        \
+        uint32_t found, i;                                      \
+        struct mcinfo_common *_mic;                             \
+                                                                \
+        found = 0;                                              \
+       (_ret) = NULL;                                          \
+       if (_mi == NULL) break;                                 \
+        _mic = x86_mcinfo_first(_mi);                           \
+        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {        \
+            if (_mic->type == (_type)) {                        \
+                found = 1;                                      \
+                break;                                          \
+            }                                                   \
+            _mic = x86_mcinfo_next(_mic);                       \
+        }                                                       \
+        (_ret) = found ? _mic : NULL;                           \
+    } while (0)
+
+
+/* Usecase 1
+ * Register machine check trap callback handler
+ *    (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note, this hypercall is special, because both Dom0 and DomU must use this.
+ */
+#define XEN_MC_fetch            1
+struct xen_mc_fetch {
+    /* IN/OUT variables. */
+    uint32_t flags;    /* IN: XEN_MC_NONURGENT, XEN_MC_URGENT,
+                           XEN_MC_ACK if ack'ing an earlier fetch */
+                       /* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED,
+                          XEN_MC_NODATA, XEN_MC_NOMATCH */
+    uint32_t _pad0;
+    uint64_t fetch_id; /* OUT: id for ack, IN: id we are ack'ing */
+
+    /* OUT variables. */
+    XEN_GUEST_HANDLE(mc_info_t) data;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
+
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain     2
+struct xen_mc_notifydomain {
+    /* IN variables. */
+    uint16_t mc_domid;    /* The unprivileged domain to notify. */
+    uint16_t mc_vcpuid;   /* The vcpu in mc_domid to notify.
+                           * Usually echo'd value from the fetch hypercall. */
+
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
+
+#define XEN_MC_physcpuinfo 3
+struct xen_mc_physcpuinfo {
+       /* IN/OUT */
+       uint32_t ncpus;
+       uint32_t _pad0;
+       /* OUT */
+       XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
+};
+
+#define XEN_MC_msrinject    4
+#define MC_MSRINJ_MAXMSRS       8
+struct xen_mc_msrinject {
+       /* IN */
+       uint32_t mcinj_cpunr;           /* target processor id */
+       uint32_t mcinj_flags;           /* see MC_MSRINJ_F_* below */
+       uint32_t mcinj_count;           /* 0 .. count-1 in array are valid */
+       uint32_t _pad0;
+       struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS];
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE   0x1
+
+#define XEN_MC_mceinject    5
+struct xen_mc_mceinject {
+       unsigned int mceinj_cpunr;      /* target processor id */
+};
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+#define XEN_MC_inject_v2        6
+#define XEN_MC_INJECT_TYPE_MASK     0x7
+#define XEN_MC_INJECT_TYPE_MCE      0x0
+#define XEN_MC_INJECT_TYPE_CMCI     0x1
+
+#define XEN_MC_INJECT_CPU_BROADCAST 0x8
+
+struct xen_mc_inject_v2 {
+       uint32_t flags;
+       struct xenctl_cpumap cpumap;
+};
+#endif
+
+struct xen_mc {
+    uint32_t cmd;
+    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+    union {
+        struct xen_mc_fetch        mc_fetch;
+        struct xen_mc_notifydomain mc_notifydomain;
+        struct xen_mc_physcpuinfo  mc_physcpuinfo;
+        struct xen_mc_msrinject    mc_msrinject;
+        struct xen_mc_mceinject    mc_mceinject;
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+        struct xen_mc_inject_v2    mc_inject_v2;
+#endif
+    } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff --git a/include/xen/interface/arch-x86/xen-x86_32.h b/include/xen/interface/arch-x86/xen-x86_32.h

new file mode 100644 (file)

index 0000000..de584ea
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-x86_32.h
@@ -0,0 +1,171 @@
+/******************************************************************************
+ * xen-x86_32.h
+ * 
+ * Guest OS interface to x86 32-bit Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2007, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__
+
+/*
+ * Hypercall interface:
+ *  Input:  %ebx, %ecx, %edx, %esi, %edi (arguments 1-5)
+ *  Output: %eax
+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
+ *  call hypercall_page + hypercall-number * 32
+ * Clobbered: Argument registers (e.g., 2-arg hypercall clobbers %ebx,%ecx)
+ */
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019    /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021    /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021    /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b    /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033    /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033    /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS    FLAT_RING3_CS
+#define FLAT_USER_DS    FLAT_RING3_DS
+#define FLAT_USER_SS    FLAT_RING3_SS
+
+#define __HYPERVISOR_VIRT_START_PAE    0xF5800000
+#define __MACH2PHYS_VIRT_START_PAE     0xF5800000
+#define __MACH2PHYS_VIRT_END_PAE       0xF6800000
+#define HYPERVISOR_VIRT_START_PAE      \
+    mk_unsigned_long(__HYPERVISOR_VIRT_START_PAE)
+#define MACH2PHYS_VIRT_START_PAE       \
+    mk_unsigned_long(__MACH2PHYS_VIRT_START_PAE)
+#define MACH2PHYS_VIRT_END_PAE         \
+    mk_unsigned_long(__MACH2PHYS_VIRT_END_PAE)
+
+/* Non-PAE bounds are obsolete. */
+#define __HYPERVISOR_VIRT_START_NONPAE 0xFC000000
+#define __MACH2PHYS_VIRT_START_NONPAE  0xFC000000
+#define __MACH2PHYS_VIRT_END_NONPAE    0xFC400000
+#define HYPERVISOR_VIRT_START_NONPAE   \
+    mk_unsigned_long(__HYPERVISOR_VIRT_START_NONPAE)
+#define MACH2PHYS_VIRT_START_NONPAE    \
+    mk_unsigned_long(__MACH2PHYS_VIRT_START_NONPAE)
+#define MACH2PHYS_VIRT_END_NONPAE      \
+    mk_unsigned_long(__MACH2PHYS_VIRT_END_NONPAE)
+
+#define __HYPERVISOR_VIRT_START __HYPERVISOR_VIRT_START_PAE
+#define __MACH2PHYS_VIRT_START  __MACH2PHYS_VIRT_START_PAE
+#define __MACH2PHYS_VIRT_END    __MACH2PHYS_VIRT_END_PAE
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>2)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)MACH2PHYS_VIRT_START)
+#endif
+
+/* 32-/64-bit invariability for control interfaces (domctl/sysctl). */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+#undef ___DEFINE_XEN_GUEST_HANDLE
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type)                  \
+    typedef struct { type *p; }                                 \
+        __guest_handle_ ## name;                                \
+    typedef struct { union { type *p; uint64_aligned_t q; }; }  \
+        __guest_handle_64_ ## name
+#undef set_xen_guest_handle_raw
+#define set_xen_guest_handle_raw(hnd, val)                  \
+    do { if ( sizeof(hnd) == 8 ) *(uint64_t *)&(hnd) = 0;   \
+         (hnd).p = val;                                     \
+    } while ( 0 )
+#define uint64_aligned_t uint64_t __attribute__((aligned(8)))
+#define __XEN_GUEST_HANDLE_64(name) __guest_handle_64_ ## name
+#define XEN_GUEST_HANDLE_64(name) __XEN_GUEST_HANDLE_64(name)
+#endif
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+    uint32_t ebx;
+    uint32_t ecx;
+    uint32_t edx;
+    uint32_t esi;
+    uint32_t edi;
+    uint32_t ebp;
+    uint32_t eax;
+    uint16_t error_code;    /* private */
+    uint16_t entry_vector;  /* private */
+    uint32_t eip;
+    uint16_t cs;
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad0;
+    uint32_t eflags;        /* eflags.IF == !saved_upcall_mask */
+    uint32_t esp;
+    uint16_t ss, _pad1;
+    uint16_t es, _pad2;
+    uint16_t ds, _pad3;
+    uint16_t fs, _pad4;
+    uint16_t gs, _pad5;
+};
+typedef struct cpu_user_regs cpu_user_regs_t;
+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */
+};
+typedef struct arch_vcpu_info arch_vcpu_info_t;
+
+struct xen_callback {
+    unsigned long cs;
+    unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_32_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/arch-x86/xen-x86_64.h b/include/xen/interface/arch-x86/xen-x86_64.h

new file mode 100644 (file)

index 0000000..0bdd868
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen-x86_64.h
@@ -0,0 +1,202 @@
+/******************************************************************************
+ * xen-x86_64.h
+ * 
+ * Guest OS interface to x86 64-bit Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__
+
+/*
+ * Hypercall interface:
+ *  Input:  %rdi, %rsi, %rdx, %r10, %r8 (arguments 1-5)
+ *  Output: %rax
+ * Access is via hypercall page (set up by guest loader or via a Xen MSR):
+ *  call hypercall_page + hypercall-number * 32
+ * Clobbered: argument registers (e.g., 2-arg hypercall clobbers %rdi,%rsi)
+ */
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023  /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033  /* GDT index 262 */
+#define FLAT_RING3_DS32 0xe02b  /* GDT index 261 */
+#define FLAT_RING3_DS64 0x0000  /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b  /* GDT index 261 */
+#define FLAT_RING3_SS64 0xe02b  /* GDT index 261 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS   FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS   FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS   FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END   0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START  0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END    0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ *  @which == SEGBASE_*  ;  @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS          0
+#define SEGBASE_GS_USER     1
+#define SEGBASE_GS_KERNEL   2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ *   RING0 -> RING3 kernel mode.
+ *   RING1 -> RING3 kernel mode.
+ *   RING2 -> RING3 kernel mode.
+ *   RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to itself
+ * directly with
+ *      orb   $3,1*8(%rsp)
+ *      iretq
+ * If flags contains VGCF_in_syscall:
+ *   Restore RAX, RIP, RFLAGS, RSP.
+ *   Discard R11, RCX, CS, SS.
+ * Otherwise:
+ *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall  (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL  VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+    /* Top of stack (%rsp at point of hypercall). */
+    uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+    uint64_t r ## name, e ## name; \
+    uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+    uint64_t r15;
+    uint64_t r14;
+    uint64_t r13;
+    uint64_t r12;
+    __DECL_REG(bp);
+    __DECL_REG(bx);
+    uint64_t r11;
+    uint64_t r10;
+    uint64_t r9;
+    uint64_t r8;
+    __DECL_REG(ax);
+    __DECL_REG(cx);
+    __DECL_REG(dx);
+    __DECL_REG(si);
+    __DECL_REG(di);
+    uint32_t error_code;    /* private */
+    uint32_t entry_vector;  /* private */
+    __DECL_REG(ip);
+    uint16_t cs, _pad0[1];
+    uint8_t  saved_upcall_mask;
+    uint8_t  _pad1[3];
+    __DECL_REG(flags);      /* rflags.IF == !saved_upcall_mask */
+    __DECL_REG(sp);
+    uint16_t ss, _pad2[3];
+    uint16_t es, _pad3[3];
+    uint16_t ds, _pad4[3];
+    uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.     */
+    uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+typedef struct cpu_user_regs cpu_user_regs_t;
+DEFINE_XEN_GUEST_HANDLE(cpu_user_regs_t);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+    unsigned long cr2;
+    unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+typedef struct arch_vcpu_info arch_vcpu_info_t;
+
+typedef unsigned long xen_callback_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_X86_64_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/arch-x86/xen.h b/include/xen/interface/arch-x86/xen.h

new file mode 100644 (file)

index 0000000..691541a
--- /dev/null
+++ b/include/xen/interface/arch-x86/xen.h
@@ -0,0 +1,210 @@
+/******************************************************************************
+ * arch-x86/xen.h
+ * 
+ * Guest OS interface to x86 Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "../xen.h"
+
+#ifndef __XEN_PUBLIC_ARCH_X86_XEN_H__
+#define __XEN_PUBLIC_ARCH_X86_XEN_H__
+
+/* Structural guest handles introduced in 0x00030201. */
+#if __XEN_INTERFACE_VERSION__ >= 0x00030201
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+    typedef struct { type *p; } __guest_handle_ ## name
+#else
+#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \
+    typedef type * __guest_handle_ ## name
+#endif
+
+#define __DEFINE_XEN_GUEST_HANDLE(name, type) \
+    ___DEFINE_XEN_GUEST_HANDLE(name, type);   \
+    ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type)
+#define DEFINE_XEN_GUEST_HANDLE(name)   __DEFINE_XEN_GUEST_HANDLE(name, name)
+#define __XEN_GUEST_HANDLE(name)        __guest_handle_ ## name
+#define XEN_GUEST_HANDLE(name)          __XEN_GUEST_HANDLE(name)
+#define set_xen_guest_handle_raw(hnd, val)  do { (hnd).p = val; } while (0)
+#ifdef __XEN_TOOLS__
+#define get_xen_guest_handle(val, hnd)  do { val = (hnd).p; } while (0)
+#endif
+#define set_xen_guest_handle(hnd, val) set_xen_guest_handle_raw(hnd, val)
+
+/* Allow co-existing Linux 2.6.23+ Xen interface definitions. */
+#define DEFINE_GUEST_HANDLE_STRUCT(name) struct name
+
+#if defined(__i386__)
+#include "xen-x86_32.h"
+#elif defined(__x86_64__)
+#include "xen-x86_64.h"
+#endif
+
+#ifndef __ASSEMBLY__
+typedef unsigned long xen_pfn_t;
+#define PRI_xen_pfn "lx"
+#endif
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE  14
+#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/* Maximum number of virtual CPUs in legacy multi-processor guests. */
+#define XEN_LEGACY_MAX_VCPUS 32
+
+#ifndef __ASSEMBLY__
+
+typedef unsigned long xen_ulong_t;
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_set_trap_table(const struct trap_info traps[]);
+ * `
+ */
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ *  Level == 0: No one may enter
+ *  Level == 1: Kernel may enter
+ *  Level == 2: Kernel may enter
+ *  Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
+#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
+struct trap_info {
+    uint8_t       vector;  /* exception vector                              */
+    uint8_t       flags;   /* bits 0-1: privilege level; bit 2: clear event enable? */
+    uint16_t      cs;      /* code selector                                 */
+    unsigned long address; /* code offset                                   */
+};
+typedef struct trap_info trap_info_t;
+DEFINE_XEN_GUEST_HANDLE(trap_info_t);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled 
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+struct vcpu_guest_context {
+    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online                   5
+#define VGCF_online                    (1<<_VGCF_online)
+    unsigned long flags;                    /* VGCF_* flags                 */
+    struct cpu_user_regs user_regs;         /* User-level CPU registers     */
+    struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
+    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
+    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
+    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
+    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
+#ifdef __i386__
+    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
+    unsigned long failsafe_callback_eip;
+#else
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_eip;
+#ifdef __XEN__
+    union {
+        unsigned long syscall_callback_eip;
+        struct {
+            unsigned int event_callback_cs;    /* compat CS of event cb     */
+            unsigned int failsafe_callback_cs; /* compat CS of failsafe cb  */
+        };
+    };
+#else
+    unsigned long syscall_callback_eip;
+#endif
+#endif
+    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+    /* Segment base addresses. */
+    uint64_t      fs_base;
+    uint64_t      gs_base_kernel;
+    uint64_t      gs_base_user;
+#endif
+};
+typedef struct vcpu_guest_context vcpu_guest_context_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_guest_context_t);
+
+struct arch_shared_info {
+    unsigned long max_pfn;                  /* max pfn that appears in table */
+    /* Frame containing list of mfns containing list of mfns containing p2m. */
+    xen_pfn_t     pfn_to_mfn_frame_list_list;
+    unsigned long nmi_reason;
+    uint64_t pad[32];
+};
+typedef struct arch_shared_info arch_shared_info_t;
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * Prefix forces emulation of some non-trapping instructions.
+ * Currently only CPUID.
+ */
+#ifdef __ASSEMBLY__
+#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
+#define XEN_CPUID          XEN_EMULATE_PREFIX cpuid
+#else
+#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
+#define XEN_CPUID          XEN_EMULATE_PREFIX "cpuid"
+#endif
+
+#endif /* __XEN_PUBLIC_ARCH_X86_XEN_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/arch-x86_32.h b/include/xen/interface/arch-x86_32.h

new file mode 100644 (file)

index 0000000..45842b2
--- /dev/null
+++ b/include/xen/interface/arch-x86_32.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch-x86_32.h
+ * 
+ * Guest OS interface to x86 32-bit Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "arch-x86/xen.h"
diff --git a/include/xen/interface/arch-x86_64.h b/include/xen/interface/arch-x86_64.h

new file mode 100644 (file)

index 0000000..fbb2639
--- /dev/null
+++ b/include/xen/interface/arch-x86_64.h
@@ -0,0 +1,27 @@
+/******************************************************************************
+ * arch-x86_64.h
+ * 
+ * Guest OS interface to x86 64-bit Xen.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
+ */
+
+#include "arch-x86/xen.h"
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h

index 2ae3cd2..0323e51 100644 (file)
--- a/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@ -86,6 +86,8 @@ struct callback_register {
         uint16_t flags;
         xen_callback_t address;
  };
+typedef struct callback_register callback_register_t;
+DEFINE_XEN_GUEST_HANDLE(callback_register_t);
  
  /*
   * Unregister a callback.
@@ -98,5 +100,12 @@ struct callback_unregister {
      uint16_t type;
      uint16_t _unused;
  };
+typedef struct callback_unregister callback_unregister_t;
+DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030207
+#undef CALLBACKTYPE_sysenter
+#define CALLBACKTYPE_sysenter CALLBACKTYPE_sysenter_deprecated
+#endif
  
  #endif /* __XEN_PUBLIC_CALLBACK_H__ */
diff --git a/include/xen/interface/dom0_ops.h b/include/xen/interface/dom0_ops.h

new file mode 100644 (file)

index 0000000..5d2b324
--- /dev/null
+++ b/include/xen/interface/dom0_ops.h
@@ -0,0 +1,120 @@
+/******************************************************************************
+ * dom0_ops.h
+ * 
+ * Process command requests from domain-0 guest OS.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_DOM0_OPS_H__
+#define __XEN_PUBLIC_DOM0_OPS_H__
+
+#include "xen.h"
+#include "platform.h"
+
+#if __XEN_INTERFACE_VERSION__ >= 0x00030204
+#error "dom0_ops.h is a compatibility interface only"
+#endif
+
+#define DOM0_INTERFACE_VERSION XENPF_INTERFACE_VERSION
+
+#define DOM0_SETTIME          XENPF_settime
+#define dom0_settime          xenpf_settime
+#define dom0_settime_t        xenpf_settime_t
+
+#define DOM0_ADD_MEMTYPE      XENPF_add_memtype
+#define dom0_add_memtype      xenpf_add_memtype
+#define dom0_add_memtype_t    xenpf_add_memtype_t
+
+#define DOM0_DEL_MEMTYPE      XENPF_del_memtype
+#define dom0_del_memtype      xenpf_del_memtype
+#define dom0_del_memtype_t    xenpf_del_memtype_t
+
+#define DOM0_READ_MEMTYPE     XENPF_read_memtype
+#define dom0_read_memtype     xenpf_read_memtype
+#define dom0_read_memtype_t   xenpf_read_memtype_t
+
+#define DOM0_MICROCODE        XENPF_microcode_update
+#define dom0_microcode        xenpf_microcode_update
+#define dom0_microcode_t      xenpf_microcode_update_t
+
+#define DOM0_PLATFORM_QUIRK   XENPF_platform_quirk
+#define dom0_platform_quirk   xenpf_platform_quirk
+#define dom0_platform_quirk_t xenpf_platform_quirk_t
+
+typedef uint64_t cpumap_t;
+
+/* Unsupported legacy operation -- defined for API compatibility. */
+#define DOM0_MSR                 15
+struct dom0_msr {
+    /* IN variables. */
+    uint32_t write;
+    cpumap_t cpu_mask;
+    uint32_t msr;
+    uint32_t in1;
+    uint32_t in2;
+    /* OUT variables. */
+    uint32_t out1;
+    uint32_t out2;
+};
+typedef struct dom0_msr dom0_msr_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_msr_t);
+
+/* Unsupported legacy operation -- defined for API compatibility. */
+#define DOM0_PHYSICAL_MEMORY_MAP 40
+struct dom0_memory_map_entry {
+    uint64_t start, end;
+    uint32_t flags; /* reserved */
+    uint8_t  is_ram;
+};
+typedef struct dom0_memory_map_entry dom0_memory_map_entry_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_memory_map_entry_t);
+
+struct dom0_op {
+    uint32_t cmd;
+    uint32_t interface_version; /* DOM0_INTERFACE_VERSION */
+    union {
+        struct dom0_msr               msr;
+        struct dom0_settime           settime;
+        struct dom0_add_memtype       add_memtype;
+        struct dom0_del_memtype       del_memtype;
+        struct dom0_read_memtype      read_memtype;
+        struct dom0_microcode         microcode;
+        struct dom0_platform_quirk    platform_quirk;
+        struct dom0_memory_map_entry  physical_memory_map;
+        uint8_t                       pad[128];
+    } u;
+};
+typedef struct dom0_op dom0_op_t;
+DEFINE_XEN_GUEST_HANDLE(dom0_op_t);
+
+#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/domctl.h b/include/xen/interface/domctl.h

new file mode 100644 (file)

index 0000000..4945207
--- /dev/null
+++ b/include/xen/interface/domctl.h
@@ -0,0 +1,979 @@
+/******************************************************************************
+ * domctl.h
+ * 
+ * Domain management operations. For use by node control stack.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_DOMCTL_H__
+#define __XEN_PUBLIC_DOMCTL_H__
+
+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
+#error "domctl operations are intended for use by node control tools only"
+#endif
+
+#include "xen.h"
+#include "grant_table.h"
+
+#define XEN_DOMCTL_INTERFACE_VERSION 0x00000008
+
+/*
+ * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
+ * If it is specified as zero, an id is auto-allocated and returned.
+ */
+/* XEN_DOMCTL_createdomain */
+struct xen_domctl_createdomain {
+    /* IN parameters */
+    uint32_t ssidref;
+    xen_domain_handle_t handle;
+ /* Is this an HVM guest (as opposed to a PV guest)? */
+#define _XEN_DOMCTL_CDF_hvm_guest     0
+#define XEN_DOMCTL_CDF_hvm_guest      (1U<<_XEN_DOMCTL_CDF_hvm_guest)
+ /* Use hardware-assisted paging if available? */
+#define _XEN_DOMCTL_CDF_hap           1
+#define XEN_DOMCTL_CDF_hap            (1U<<_XEN_DOMCTL_CDF_hap)
+ /* Should domain memory integrity be verified by tboot during Sx? */
+#define _XEN_DOMCTL_CDF_s3_integrity  2
+#define XEN_DOMCTL_CDF_s3_integrity   (1U<<_XEN_DOMCTL_CDF_s3_integrity)
+ /* Disable out-of-sync shadow page tables? */
+#define _XEN_DOMCTL_CDF_oos_off       3
+#define XEN_DOMCTL_CDF_oos_off        (1U<<_XEN_DOMCTL_CDF_oos_off)
+    uint32_t flags;
+};
+typedef struct xen_domctl_createdomain xen_domctl_createdomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_createdomain_t);
+
+/* XEN_DOMCTL_getdomaininfo */
+struct xen_domctl_getdomaininfo {
+    /* OUT variables. */
+    domid_t  domain;              /* Also echoed in domctl.domain */
+ /* Domain is scheduled to die. */
+#define _XEN_DOMINF_dying     0
+#define XEN_DOMINF_dying      (1U<<_XEN_DOMINF_dying)
+ /* Domain is an HVM guest (as opposed to a PV guest). */
+#define _XEN_DOMINF_hvm_guest 1
+#define XEN_DOMINF_hvm_guest  (1U<<_XEN_DOMINF_hvm_guest)
+ /* The guest OS has shut down. */
+#define _XEN_DOMINF_shutdown  2
+#define XEN_DOMINF_shutdown   (1U<<_XEN_DOMINF_shutdown)
+ /* Currently paused by control software. */
+#define _XEN_DOMINF_paused    3
+#define XEN_DOMINF_paused     (1U<<_XEN_DOMINF_paused)
+ /* Currently blocked pending an event.     */
+#define _XEN_DOMINF_blocked   4
+#define XEN_DOMINF_blocked    (1U<<_XEN_DOMINF_blocked)
+ /* Domain is currently running.            */
+#define _XEN_DOMINF_running   5
+#define XEN_DOMINF_running    (1U<<_XEN_DOMINF_running)
+ /* Being debugged.  */
+#define _XEN_DOMINF_debugged  6
+#define XEN_DOMINF_debugged   (1U<<_XEN_DOMINF_debugged)
+ /* XEN_DOMINF_shutdown guest-supplied code.  */
+#define XEN_DOMINF_shutdownmask 255
+#define XEN_DOMINF_shutdownshift 16
+    uint32_t flags;              /* XEN_DOMINF_* */
+    uint64_aligned_t tot_pages;
+    uint64_aligned_t max_pages;
+    uint64_aligned_t shr_pages;
+    uint64_aligned_t paged_pages;
+    uint64_aligned_t shared_info_frame; /* GMFN of shared_info struct */
+    uint64_aligned_t cpu_time;
+    uint32_t nr_online_vcpus;    /* Number of VCPUs currently online. */
+    uint32_t max_vcpu_id;        /* Maximum VCPUID in use by this domain. */
+    uint32_t ssidref;
+    xen_domain_handle_t handle;
+    uint32_t cpupool;
+};
+typedef struct xen_domctl_getdomaininfo xen_domctl_getdomaininfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getdomaininfo_t);
+
+
+/* XEN_DOMCTL_getmemlist */
+struct xen_domctl_getmemlist {
+    /* IN variables. */
+    /* Max entries to write to output buffer. */
+    uint64_aligned_t max_pfns;
+    /* Start index in guest's page list. */
+    uint64_aligned_t start_pfn;
+    XEN_GUEST_HANDLE_64(uint64) buffer;
+    /* OUT variables. */
+    uint64_aligned_t num_pfns;
+};
+typedef struct xen_domctl_getmemlist xen_domctl_getmemlist_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getmemlist_t);
+
+
+/* XEN_DOMCTL_getpageframeinfo */
+
+#define XEN_DOMCTL_PFINFO_LTAB_SHIFT 28
+#define XEN_DOMCTL_PFINFO_NOTAB   (0x0U<<28)
+#define XEN_DOMCTL_PFINFO_L1TAB   (0x1U<<28)
+#define XEN_DOMCTL_PFINFO_L2TAB   (0x2U<<28)
+#define XEN_DOMCTL_PFINFO_L3TAB   (0x3U<<28)
+#define XEN_DOMCTL_PFINFO_L4TAB   (0x4U<<28)
+#define XEN_DOMCTL_PFINFO_LTABTYPE_MASK (0x7U<<28)
+#define XEN_DOMCTL_PFINFO_LPINTAB (0x1U<<31)
+#define XEN_DOMCTL_PFINFO_XTAB    (0xfU<<28) /* invalid page */
+#define XEN_DOMCTL_PFINFO_XALLOC  (0xeU<<28) /* allocate-only page */
+#define XEN_DOMCTL_PFINFO_PAGEDTAB (0x8U<<28)
+#define XEN_DOMCTL_PFINFO_LTAB_MASK (0xfU<<28)
+
+struct xen_domctl_getpageframeinfo {
+    /* IN variables. */
+    uint64_aligned_t gmfn; /* GMFN to query */
+    /* OUT variables. */
+    /* Is the page PINNED to a type? */
+    uint32_t type;         /* see above type defs */
+};
+typedef struct xen_domctl_getpageframeinfo xen_domctl_getpageframeinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo_t);
+
+
+/* XEN_DOMCTL_getpageframeinfo2 */
+struct xen_domctl_getpageframeinfo2 {
+    /* IN variables. */
+    uint64_aligned_t num;
+    /* IN/OUT variables. */
+    XEN_GUEST_HANDLE_64(uint32) array;
+};
+typedef struct xen_domctl_getpageframeinfo2 xen_domctl_getpageframeinfo2_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getpageframeinfo2_t);
+
+/* XEN_DOMCTL_getpageframeinfo3 */
+struct xen_domctl_getpageframeinfo3 {
+    /* IN variables. */
+    uint64_aligned_t num;
+    /* IN/OUT variables. */
+    XEN_GUEST_HANDLE_64(xen_pfn_t) array;
+};
+
+
+/*
+ * Control shadow pagetables operation
+ */
+/* XEN_DOMCTL_shadow_op */
+
+/* Disable shadow mode. */
+#define XEN_DOMCTL_SHADOW_OP_OFF         0
+
+/* Enable shadow mode (mode contains ORed XEN_DOMCTL_SHADOW_ENABLE_* flags). */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE      32
+
+/* Log-dirty bitmap operations. */
+ /* Return the bitmap and clean internal copy for next round. */
+#define XEN_DOMCTL_SHADOW_OP_CLEAN       11
+ /* Return the bitmap but do not modify internal copy. */
+#define XEN_DOMCTL_SHADOW_OP_PEEK        12
+
+/* Memory allocation accessors. */
+#define XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION   30
+#define XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION   31
+
+/* Legacy enable operations. */
+ /* Equiv. to ENABLE with no mode flags. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TEST       1
+ /* Equiv. to ENABLE with mode flag ENABLE_LOG_DIRTY. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY   2
+ /* Equiv. to ENABLE with mode flags ENABLE_REFCOUNT and ENABLE_TRANSLATE. */
+#define XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE  3
+
+/* Mode flags for XEN_DOMCTL_SHADOW_OP_ENABLE. */
+ /*
+  * Shadow pagetables are refcounted: guest does not use explicit mmu
+  * operations nor write-protect its pagetables.
+  */
+#define XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT  (1 << 1)
+ /*
+  * Log pages in a bitmap as they are dirtied.
+  * Used for live relocation to determine which pages must be re-sent.
+  */
+#define XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY (1 << 2)
+ /*
+  * Automatically translate GPFNs into MFNs.
+  */
+#define XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE (1 << 3)
+ /*
+  * Xen does not steal virtual address space from the guest.
+  * Requires HVM support.
+  */
+#define XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL  (1 << 4)
+
+struct xen_domctl_shadow_op_stats {
+    uint32_t fault_count;
+    uint32_t dirty_count;
+};
+typedef struct xen_domctl_shadow_op_stats xen_domctl_shadow_op_stats_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_stats_t);
+
+struct xen_domctl_shadow_op {
+    /* IN variables. */
+    uint32_t       op;       /* XEN_DOMCTL_SHADOW_OP_* */
+
+    /* OP_ENABLE */
+    uint32_t       mode;     /* XEN_DOMCTL_SHADOW_ENABLE_* */
+
+    /* OP_GET_ALLOCATION / OP_SET_ALLOCATION */
+    uint32_t       mb;       /* Shadow memory allocation in MB */
+
+    /* OP_PEEK / OP_CLEAN */
+    XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
+    uint64_aligned_t pages; /* Size of buffer. Updated with actual size. */
+    struct xen_domctl_shadow_op_stats stats;
+};
+typedef struct xen_domctl_shadow_op xen_domctl_shadow_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_shadow_op_t);
+
+
+/* XEN_DOMCTL_max_mem */
+struct xen_domctl_max_mem {
+    /* IN variables. */
+    uint64_aligned_t max_memkb;
+};
+typedef struct xen_domctl_max_mem xen_domctl_max_mem_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_mem_t);
+
+
+/* XEN_DOMCTL_setvcpucontext */
+/* XEN_DOMCTL_getvcpucontext */
+struct xen_domctl_vcpucontext {
+    uint32_t              vcpu;                  /* IN */
+    XEN_GUEST_HANDLE_64(vcpu_guest_context_t) ctxt; /* IN/OUT */
+};
+typedef struct xen_domctl_vcpucontext xen_domctl_vcpucontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpucontext_t);
+
+
+/* XEN_DOMCTL_getvcpuinfo */
+struct xen_domctl_getvcpuinfo {
+    /* IN variables. */
+    uint32_t vcpu;
+    /* OUT variables. */
+    uint8_t  online;                  /* currently online (not hotplugged)? */
+    uint8_t  blocked;                 /* blocked waiting for an event? */
+    uint8_t  running;                 /* currently scheduled on its CPU? */
+    uint64_aligned_t cpu_time;        /* total cpu time consumed (ns) */
+    uint32_t cpu;                     /* current mapping   */
+};
+typedef struct xen_domctl_getvcpuinfo xen_domctl_getvcpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
+
+
+/* Get/set which physical cpus a vcpu can execute on. */
+/* XEN_DOMCTL_setvcpuaffinity */
+/* XEN_DOMCTL_getvcpuaffinity */
+struct xen_domctl_vcpuaffinity {
+    uint32_t  vcpu;              /* IN */
+    struct xenctl_cpumap cpumap; /* IN/OUT */
+};
+typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
+
+
+/* XEN_DOMCTL_max_vcpus */
+struct xen_domctl_max_vcpus {
+    uint32_t max;           /* maximum number of vcpus */
+};
+typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
+
+
+/* XEN_DOMCTL_scheduler_op */
+/* Scheduler types. */
+#define XEN_SCHEDULER_SEDF     4
+#define XEN_SCHEDULER_CREDIT   5
+#define XEN_SCHEDULER_CREDIT2  6
+#define XEN_SCHEDULER_ARINC653 7
+/* Set or get info? */
+#define XEN_DOMCTL_SCHEDOP_putinfo 0
+#define XEN_DOMCTL_SCHEDOP_getinfo 1
+struct xen_domctl_scheduler_op {
+    uint32_t sched_id;  /* XEN_SCHEDULER_* */
+    uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
+    union {
+        struct xen_domctl_sched_sedf {
+            uint64_aligned_t period;
+            uint64_aligned_t slice;
+            uint64_aligned_t latency;
+            uint32_t extratime;
+            uint32_t weight;
+        } sedf;
+        struct xen_domctl_sched_credit {
+            uint16_t weight;
+            uint16_t cap;
+        } credit;
+        struct xen_domctl_sched_credit2 {
+            uint16_t weight;
+        } credit2;
+    } u;
+};
+typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_scheduler_op_t);
+
+
+/* XEN_DOMCTL_setdomainhandle */
+struct xen_domctl_setdomainhandle {
+    xen_domain_handle_t handle;
+};
+typedef struct xen_domctl_setdomainhandle xen_domctl_setdomainhandle_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdomainhandle_t);
+
+
+/* XEN_DOMCTL_setdebugging */
+struct xen_domctl_setdebugging {
+    uint8_t enable;
+};
+typedef struct xen_domctl_setdebugging xen_domctl_setdebugging_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_setdebugging_t);
+
+
+/* XEN_DOMCTL_irq_permission */
+struct xen_domctl_irq_permission {
+    uint8_t pirq;
+    uint8_t allow_access;    /* flag to specify enable/disable of IRQ access */
+};
+typedef struct xen_domctl_irq_permission xen_domctl_irq_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_irq_permission_t);
+
+
+/* XEN_DOMCTL_iomem_permission */
+struct xen_domctl_iomem_permission {
+    uint64_aligned_t first_mfn;/* first page (physical page number) in range */
+    uint64_aligned_t nr_mfns;  /* number of pages in range (>0) */
+    uint8_t  allow_access;     /* allow (!0) or deny (0) access to range? */
+};
+typedef struct xen_domctl_iomem_permission xen_domctl_iomem_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_iomem_permission_t);
+
+
+/* XEN_DOMCTL_ioport_permission */
+struct xen_domctl_ioport_permission {
+    uint32_t first_port;              /* first port in range */
+    uint32_t nr_ports;                /* size of port range */
+    uint8_t  allow_access;            /* allow or deny access to range? */
+};
+typedef struct xen_domctl_ioport_permission xen_domctl_ioport_permission_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_permission_t);
+
+
+/* XEN_DOMCTL_hypercall_init */
+struct xen_domctl_hypercall_init {
+    uint64_aligned_t  gmfn;           /* GMFN to be initialised */
+};
+typedef struct xen_domctl_hypercall_init xen_domctl_hypercall_init_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hypercall_init_t);
+
+
+/* XEN_DOMCTL_arch_setup */
+#define _XEN_DOMAINSETUP_hvm_guest 0
+#define XEN_DOMAINSETUP_hvm_guest  (1UL<<_XEN_DOMAINSETUP_hvm_guest)
+#define _XEN_DOMAINSETUP_query 1 /* Get parameters (for save)  */
+#define XEN_DOMAINSETUP_query  (1UL<<_XEN_DOMAINSETUP_query)
+#define _XEN_DOMAINSETUP_sioemu_guest 2
+#define XEN_DOMAINSETUP_sioemu_guest  (1UL<<_XEN_DOMAINSETUP_sioemu_guest)
+typedef struct xen_domctl_arch_setup {
+    uint64_aligned_t flags;  /* XEN_DOMAINSETUP_* */
+#ifdef __ia64__
+    uint64_aligned_t bp;     /* mpaddr of boot param area */
+    uint64_aligned_t maxmem; /* Highest memory address for MDT.  */
+    uint64_aligned_t xsi_va; /* Xen shared_info area virtual address.  */
+    uint32_t hypercall_imm;  /* Break imm for Xen hypercalls.  */
+    int8_t vhpt_size_log2;   /* Log2 of VHPT size. */
+#endif
+} xen_domctl_arch_setup_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_arch_setup_t);
+
+
+/* XEN_DOMCTL_settimeoffset */
+struct xen_domctl_settimeoffset {
+    int32_t  time_offset_seconds; /* applied to domain wallclock time */
+};
+typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
+
+/* XEN_DOMCTL_gethvmcontext */
+/* XEN_DOMCTL_sethvmcontext */
+typedef struct xen_domctl_hvmcontext {
+    uint32_t size; /* IN/OUT: size of buffer / bytes filled */
+    XEN_GUEST_HANDLE_64(uint8) buffer; /* IN/OUT: data, or call
+                                        * gethvmcontext with NULL
+                                        * buffer to get size req'd */
+} xen_domctl_hvmcontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_t);
+
+
+/* XEN_DOMCTL_set_address_size */
+/* XEN_DOMCTL_get_address_size */
+typedef struct xen_domctl_address_size {
+    uint32_t size;
+} xen_domctl_address_size_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_address_size_t);
+
+
+/* XEN_DOMCTL_real_mode_area */
+struct xen_domctl_real_mode_area {
+    uint32_t log; /* log2 of Real Mode Area size */
+};
+typedef struct xen_domctl_real_mode_area xen_domctl_real_mode_area_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_real_mode_area_t);
+
+
+/* XEN_DOMCTL_sendtrigger */
+#define XEN_DOMCTL_SENDTRIGGER_NMI    0
+#define XEN_DOMCTL_SENDTRIGGER_RESET  1
+#define XEN_DOMCTL_SENDTRIGGER_INIT   2
+#define XEN_DOMCTL_SENDTRIGGER_POWER  3
+#define XEN_DOMCTL_SENDTRIGGER_SLEEP  4
+struct xen_domctl_sendtrigger {
+    uint32_t  trigger;  /* IN */
+    uint32_t  vcpu;     /* IN */
+};
+typedef struct xen_domctl_sendtrigger xen_domctl_sendtrigger_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_sendtrigger_t);
+
+
+/* Assign PCI device to HVM guest. Sets up IOMMU structures. */
+/* XEN_DOMCTL_assign_device */
+/* XEN_DOMCTL_test_assign_device */
+/* XEN_DOMCTL_deassign_device */
+struct xen_domctl_assign_device {
+    uint32_t  machine_sbdf;   /* machine PCI ID of assigned device */
+};
+typedef struct xen_domctl_assign_device xen_domctl_assign_device_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_assign_device_t);
+
+/* Retrieve sibling device information for machine_sbdf */
+/* XEN_DOMCTL_get_device_group */
+struct xen_domctl_get_device_group {
+    uint32_t  machine_sbdf;     /* IN */
+    uint32_t  max_sdevs;        /* IN */
+    uint32_t  num_sdevs;        /* OUT */
+    XEN_GUEST_HANDLE_64(uint32)  sdev_array;   /* OUT */
+};
+typedef struct xen_domctl_get_device_group xen_domctl_get_device_group_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_get_device_group_t);
+
+/* Pass-through interrupts: bind real irq -> hvm devfn. */
+/* XEN_DOMCTL_bind_pt_irq */
+/* XEN_DOMCTL_unbind_pt_irq */
+typedef enum pt_irq_type_e {
+    PT_IRQ_TYPE_PCI,
+    PT_IRQ_TYPE_ISA,
+    PT_IRQ_TYPE_MSI,
+    PT_IRQ_TYPE_MSI_TRANSLATE,
+} pt_irq_type_t;
+struct xen_domctl_bind_pt_irq {
+    uint32_t machine_irq;
+    pt_irq_type_t irq_type;
+    uint32_t hvm_domid;
+
+    union {
+        struct {
+            uint8_t isa_irq;
+        } isa;
+        struct {
+            uint8_t bus;
+            uint8_t device;
+            uint8_t intx;
+        } pci;
+        struct {
+            uint8_t gvec;
+            uint32_t gflags;
+            uint64_aligned_t gtable;
+        } msi;
+    } u;
+};
+typedef struct xen_domctl_bind_pt_irq xen_domctl_bind_pt_irq_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_bind_pt_irq_t);
+
+
+/* Bind machine I/O address range -> HVM address range. */
+/* XEN_DOMCTL_memory_mapping */
+#define DPCI_ADD_MAPPING         1
+#define DPCI_REMOVE_MAPPING      0
+struct xen_domctl_memory_mapping {
+    uint64_aligned_t first_gfn; /* first page (hvm guest phys page) in range */
+    uint64_aligned_t first_mfn; /* first page (machine page) in range */
+    uint64_aligned_t nr_mfns;   /* number of pages in range (>0) */
+    uint32_t add_mapping;       /* add or remove mapping */
+    uint32_t padding;           /* padding for 64-bit aligned structure */
+};
+typedef struct xen_domctl_memory_mapping xen_domctl_memory_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_memory_mapping_t);
+
+
+/* Bind machine I/O port range -> HVM I/O port range. */
+/* XEN_DOMCTL_ioport_mapping */
+struct xen_domctl_ioport_mapping {
+    uint32_t first_gport;     /* first guest IO port */
+    uint32_t first_mport;     /* first machine IO port */
+    uint32_t nr_ports;        /* size of port range */
+    uint32_t add_mapping;     /* add or remove mapping */
+};
+typedef struct xen_domctl_ioport_mapping xen_domctl_ioport_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ioport_mapping_t);
+
+
+/*
+ * Pin caching type of RAM space for x86 HVM domU.
+ */
+/* XEN_DOMCTL_pin_mem_cacheattr */
+/* Caching types: these happen to be the same as x86 MTRR/PAT type codes. */
+#define XEN_DOMCTL_MEM_CACHEATTR_UC  0
+#define XEN_DOMCTL_MEM_CACHEATTR_WC  1
+#define XEN_DOMCTL_MEM_CACHEATTR_WT  4
+#define XEN_DOMCTL_MEM_CACHEATTR_WP  5
+#define XEN_DOMCTL_MEM_CACHEATTR_WB  6
+#define XEN_DOMCTL_MEM_CACHEATTR_UCM 7
+struct xen_domctl_pin_mem_cacheattr {
+    uint64_aligned_t start, end;
+    uint32_t type; /* XEN_DOMCTL_MEM_CACHEATTR_* */
+};
+typedef struct xen_domctl_pin_mem_cacheattr xen_domctl_pin_mem_cacheattr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
+
+
+/* XEN_DOMCTL_set_ext_vcpucontext */
+/* XEN_DOMCTL_get_ext_vcpucontext */
+struct xen_domctl_ext_vcpucontext {
+    /* IN: VCPU that this call applies to. */
+    uint32_t         vcpu;
+    /*
+     * SET: Size of struct (IN)
+     * GET: Size of struct (OUT, up to 128 bytes)
+     */
+    uint32_t         size;
+#if defined(__i386__) || defined(__x86_64__)
+    /* SYSCALL from 32-bit mode and SYSENTER callback information. */
+    /* NB. SYSCALL from 64-bit mode is contained in vcpu_guest_context_t */
+    uint64_aligned_t syscall32_callback_eip;
+    uint64_aligned_t sysenter_callback_eip;
+    uint16_t         syscall32_callback_cs;
+    uint16_t         sysenter_callback_cs;
+    uint8_t          syscall32_disables_events;
+    uint8_t          sysenter_disables_events;
+    uint64_aligned_t mcg_cap;
+#endif
+};
+typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpucontext_t);
+
+/*
+ * Set optimization features for a domain
+ */
+/* XEN_DOMCTL_set_opt_feature */
+struct xen_domctl_set_opt_feature {
+#if defined(__ia64__)
+    struct xen_ia64_opt_feature optf;
+#else
+    /* Make struct non-empty: do not depend on this field name! */
+    uint64_t dummy;
+#endif
+};
+typedef struct xen_domctl_set_opt_feature xen_domctl_set_opt_feature_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_opt_feature_t);
+
+/*
+ * Set the target domain for a domain
+ */
+/* XEN_DOMCTL_set_target */
+struct xen_domctl_set_target {
+    domid_t target;
+};
+typedef struct xen_domctl_set_target xen_domctl_set_target_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_target_t);
+
+#if defined(__i386__) || defined(__x86_64__)
+# define XEN_CPUID_INPUT_UNUSED  0xFFFFFFFF
+/* XEN_DOMCTL_set_cpuid */
+struct xen_domctl_cpuid {
+  uint32_t input[2];
+  uint32_t eax;
+  uint32_t ebx;
+  uint32_t ecx;
+  uint32_t edx;
+};
+typedef struct xen_domctl_cpuid xen_domctl_cpuid_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t);
+#endif
+
+/* XEN_DOMCTL_subscribe */
+struct xen_domctl_subscribe {
+    uint32_t port; /* IN */
+};
+typedef struct xen_domctl_subscribe xen_domctl_subscribe_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t);
+
+/*
+ * Define the maximum machine address size which should be allocated
+ * to a guest.
+ */
+/* XEN_DOMCTL_set_machine_address_size */
+/* XEN_DOMCTL_get_machine_address_size */
+
+/*
+ * Do not inject spurious page faults into this domain.
+ */
+/* XEN_DOMCTL_suppress_spurious_page_faults */
+
+/* XEN_DOMCTL_debug_op */
+#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF         0
+#define XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON          1
+struct xen_domctl_debug_op {
+    uint32_t op;   /* IN */
+    uint32_t vcpu; /* IN */
+};
+typedef struct xen_domctl_debug_op xen_domctl_debug_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_debug_op_t);
+
+/*
+ * Request a particular record from the HVM context
+ */
+/* XEN_DOMCTL_gethvmcontext_partial */
+typedef struct xen_domctl_hvmcontext_partial {
+    uint32_t type;                      /* IN: Type of record required */
+    uint32_t instance;                  /* IN: Instance of that type */
+    XEN_GUEST_HANDLE_64(uint8) buffer;  /* OUT: buffer to write record into */
+} xen_domctl_hvmcontext_partial_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+
+/* XEN_DOMCTL_disable_migrate */
+typedef struct xen_domctl_disable_migrate {
+    uint32_t disable; /* IN: 1: disable migration and restore */
+} xen_domctl_disable_migrate_t;
+
+
+/* XEN_DOMCTL_gettscinfo */
+/* XEN_DOMCTL_settscinfo */
+struct xen_guest_tsc_info {
+    uint32_t tsc_mode;
+    uint32_t gtsc_khz;
+    uint32_t incarnation;
+    uint32_t pad;
+    uint64_aligned_t elapsed_nsec;
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
+    xen_guest_tsc_info_t info; /* IN */
+} xen_domctl_tsc_info_t;
+
+/* XEN_DOMCTL_gdbsx_guestmemio      guest mem io */
+struct xen_domctl_gdbsx_memio {
+    /* IN */
+    uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
+    uint64_aligned_t gva;    /* guest virtual address */
+    uint64_aligned_t uva;    /* user buffer virtual address */
+    uint32_t         len;    /* number of bytes to read/write */
+    uint8_t          gwr;    /* 0 = read from guest. 1 = write to guest */
+    /* OUT */
+    uint32_t         remain; /* bytes remaining to be copied */
+};
+
+/* XEN_DOMCTL_gdbsx_pausevcpu */
+/* XEN_DOMCTL_gdbsx_unpausevcpu */
+struct xen_domctl_gdbsx_pauseunp_vcpu { /* pause/unpause a vcpu */
+    uint32_t         vcpu;         /* which vcpu */
+};
+
+/* XEN_DOMCTL_gdbsx_domstatus */
+struct xen_domctl_gdbsx_domstatus {
+    /* OUT */
+    uint8_t          paused;     /* is the domain paused */
+    uint32_t         vcpu_id;    /* any vcpu in an event? */
+    uint32_t         vcpu_ev;    /* if yes, what event? */
+};
+
+/*
+ * Memory event operations
+ */
+
+/* XEN_DOMCTL_mem_event_op */
+
+/*
+ * Domain memory paging
+ * Page memory in and out.
+ * Domctl interface to set up and tear down the 
+ * pager<->hypervisor interface. Use XENMEM_paging_op*
+ * to perform per-page operations.
+ */
+#define XEN_DOMCTL_MEM_EVENT_OP_PAGING            1
+
+#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_ENABLE     0
+#define XEN_DOMCTL_MEM_EVENT_OP_PAGING_DISABLE    1
+
+/*
+ * Access permissions.
+ *
+ * As with paging, use the domctl for teardown/setup of the
+ * helper<->hypervisor interface.
+ *
+ * There are HVM hypercalls to set the per-page access permissions of every
+ * page in a domain.  When one of these permissions--independent, read, 
+ * write, and execute--is violated, the VCPU is paused and a memory event 
+ * is sent with what happened.  (See public/mem_event.h.)
+ *
+ * The memory event handler can then resume the VCPU and redo the access 
+ * with a XENMEM_access_op_resume hypercall.
+ */
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS            2
+
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_ENABLE     0
+#define XEN_DOMCTL_MEM_EVENT_OP_ACCESS_DISABLE    1
+
+/*
+ * Sharing ENOMEM helper.
+ *
+ * As with paging, use the domctl for teardown/setup of the
+ * helper<->hypervisor interface.
+ *
+ * If setup, this ring is used to communicate failed allocations
+ * in the unshare path. XENMEM_sharing_op_resume is used to wake up
+ * vcpus that could not unshare.
+ *
+ * Note that sharing can be turned on (as per the domctl below)
+ * *without* this ring being setup.
+ */
+#define XEN_DOMCTL_MEM_EVENT_OP_SHARING           3
+
+#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_ENABLE    0
+#define XEN_DOMCTL_MEM_EVENT_OP_SHARING_DISABLE   1
+
+/* Use for teardown/setup of helper<->hypervisor interface for paging, 
+ * access and sharing.*/
+struct xen_domctl_mem_event_op {
+    uint32_t       op;           /* XEN_DOMCTL_MEM_EVENT_OP_*_* */
+    uint32_t       mode;         /* XEN_DOMCTL_MEM_EVENT_OP_* */
+
+    uint32_t port;              /* OUT: event channel for ring */
+};
+typedef struct xen_domctl_mem_event_op xen_domctl_mem_event_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_event_op_t);
+
+/*
+ * Memory sharing operations
+ */
+/* XEN_DOMCTL_mem_sharing_op.
+ * The CONTROL sub-domctl is used for bringup/teardown. */
+#define XEN_DOMCTL_MEM_SHARING_CONTROL          0
+
+struct xen_domctl_mem_sharing_op {
+    uint8_t op; /* XEN_DOMCTL_MEM_SHARING_* */
+
+    union {
+        uint8_t enable;                   /* CONTROL */
+    } u;
+};
+typedef struct xen_domctl_mem_sharing_op xen_domctl_mem_sharing_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_mem_sharing_op_t);
+
+struct xen_domctl_audit_p2m {
+    /* OUT error counts */
+    uint64_t orphans;
+    uint64_t m2p_bad;
+    uint64_t p2m_bad;
+};
+typedef struct xen_domctl_audit_p2m xen_domctl_audit_p2m_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_audit_p2m_t);
+
+struct xen_domctl_set_virq_handler {
+    uint32_t virq; /* IN */
+};
+typedef struct xen_domctl_set_virq_handler xen_domctl_set_virq_handler_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_virq_handler_t);
+
+#if defined(__i386__) || defined(__x86_64__)
+/* XEN_DOMCTL_setvcpuextstate */
+/* XEN_DOMCTL_getvcpuextstate */
+struct xen_domctl_vcpuextstate {
+    /* IN: VCPU that this call applies to. */
+    uint32_t         vcpu;
+    /*
+     * SET: xfeature support mask of struct (IN)
+     * GET: xfeature support mask of struct (IN/OUT)
+     * The xfeature mask serves as an identifier of the saved format,
+     * so that a compatible CPU can check the format to decide
+     * whether it can restore the state.
+     */
+    uint64_aligned_t         xfeature_mask;
+    /*
+     * SET: Size of struct (IN)
+     * GET: Size of struct (IN/OUT)
+     */
+    uint64_aligned_t         size;
+    XEN_GUEST_HANDLE_64(uint64) buffer;
+};
+typedef struct xen_domctl_vcpuextstate xen_domctl_vcpuextstate_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuextstate_t);
+#endif
+
+/* XEN_DOMCTL_set_access_required: sets whether a memory event listener
+ * must be present to handle page access events: if false, the page
+ * access will revert to full permissions if no one is listening.
+ */
+struct xen_domctl_set_access_required {
+    uint8_t access_required;
+};
+typedef struct xen_domctl_set_access_required xen_domctl_set_access_required_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_access_required_t);
+
+struct xen_domctl {
+    uint32_t cmd;
+#define XEN_DOMCTL_createdomain                   1
+#define XEN_DOMCTL_destroydomain                  2
+#define XEN_DOMCTL_pausedomain                    3
+#define XEN_DOMCTL_unpausedomain                  4
+#define XEN_DOMCTL_getdomaininfo                  5
+#define XEN_DOMCTL_getmemlist                     6
+#define XEN_DOMCTL_getpageframeinfo               7
+#define XEN_DOMCTL_getpageframeinfo2              8
+#define XEN_DOMCTL_setvcpuaffinity                9
+#define XEN_DOMCTL_shadow_op                     10
+#define XEN_DOMCTL_max_mem                       11
+#define XEN_DOMCTL_setvcpucontext                12
+#define XEN_DOMCTL_getvcpucontext                13
+#define XEN_DOMCTL_getvcpuinfo                   14
+#define XEN_DOMCTL_max_vcpus                     15
+#define XEN_DOMCTL_scheduler_op                  16
+#define XEN_DOMCTL_setdomainhandle               17
+#define XEN_DOMCTL_setdebugging                  18
+#define XEN_DOMCTL_irq_permission                19
+#define XEN_DOMCTL_iomem_permission              20
+#define XEN_DOMCTL_ioport_permission             21
+#define XEN_DOMCTL_hypercall_init                22
+#define XEN_DOMCTL_arch_setup                    23
+#define XEN_DOMCTL_settimeoffset                 24
+#define XEN_DOMCTL_getvcpuaffinity               25
+#define XEN_DOMCTL_real_mode_area                26
+#define XEN_DOMCTL_resumedomain                  27
+#define XEN_DOMCTL_sendtrigger                   28
+#define XEN_DOMCTL_subscribe                     29
+#define XEN_DOMCTL_gethvmcontext                 33
+#define XEN_DOMCTL_sethvmcontext                 34
+#define XEN_DOMCTL_set_address_size              35
+#define XEN_DOMCTL_get_address_size              36
+#define XEN_DOMCTL_assign_device                 37
+#define XEN_DOMCTL_bind_pt_irq                   38
+#define XEN_DOMCTL_memory_mapping                39
+#define XEN_DOMCTL_ioport_mapping                40
+#define XEN_DOMCTL_pin_mem_cacheattr             41
+#define XEN_DOMCTL_set_ext_vcpucontext           42
+#define XEN_DOMCTL_get_ext_vcpucontext           43
+#define XEN_DOMCTL_set_opt_feature               44
+#define XEN_DOMCTL_test_assign_device            45
+#define XEN_DOMCTL_set_target                    46
+#define XEN_DOMCTL_deassign_device               47
+#define XEN_DOMCTL_unbind_pt_irq                 48
+#define XEN_DOMCTL_set_cpuid                     49
+#define XEN_DOMCTL_get_device_group              50
+#define XEN_DOMCTL_set_machine_address_size      51
+#define XEN_DOMCTL_get_machine_address_size      52
+#define XEN_DOMCTL_suppress_spurious_page_faults 53
+#define XEN_DOMCTL_debug_op                      54
+#define XEN_DOMCTL_gethvmcontext_partial         55
+#define XEN_DOMCTL_mem_event_op                  56
+#define XEN_DOMCTL_mem_sharing_op                57
+#define XEN_DOMCTL_disable_migrate               58
+#define XEN_DOMCTL_gettscinfo                    59
+#define XEN_DOMCTL_settscinfo                    60
+#define XEN_DOMCTL_getpageframeinfo3             61
+#define XEN_DOMCTL_setvcpuextstate               62
+#define XEN_DOMCTL_getvcpuextstate               63
+#define XEN_DOMCTL_set_access_required           64
+#define XEN_DOMCTL_audit_p2m                     65
+#define XEN_DOMCTL_set_virq_handler              66
+#define XEN_DOMCTL_gdbsx_guestmemio            1000
+#define XEN_DOMCTL_gdbsx_pausevcpu             1001
+#define XEN_DOMCTL_gdbsx_unpausevcpu           1002
+#define XEN_DOMCTL_gdbsx_domstatus             1003
+    uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
+    domid_t  domain;
+    union {
+        struct xen_domctl_createdomain      createdomain;
+        struct xen_domctl_getdomaininfo     getdomaininfo;
+        struct xen_domctl_getmemlist        getmemlist;
+        struct xen_domctl_getpageframeinfo  getpageframeinfo;
+        struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
+        struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
+        struct xen_domctl_vcpuaffinity      vcpuaffinity;
+        struct xen_domctl_shadow_op         shadow_op;
+        struct xen_domctl_max_mem           max_mem;
+        struct xen_domctl_vcpucontext       vcpucontext;
+        struct xen_domctl_getvcpuinfo       getvcpuinfo;
+        struct xen_domctl_max_vcpus         max_vcpus;
+        struct xen_domctl_scheduler_op      scheduler_op;
+        struct xen_domctl_setdomainhandle   setdomainhandle;
+        struct xen_domctl_setdebugging      setdebugging;
+        struct xen_domctl_irq_permission    irq_permission;
+        struct xen_domctl_iomem_permission  iomem_permission;
+        struct xen_domctl_ioport_permission ioport_permission;
+        struct xen_domctl_hypercall_init    hypercall_init;
+        struct xen_domctl_arch_setup        arch_setup;
+        struct xen_domctl_settimeoffset     settimeoffset;
+        struct xen_domctl_disable_migrate   disable_migrate;
+        struct xen_domctl_tsc_info          tsc_info;
+        struct xen_domctl_real_mode_area    real_mode_area;
+        struct xen_domctl_hvmcontext        hvmcontext;
+        struct xen_domctl_hvmcontext_partial hvmcontext_partial;
+        struct xen_domctl_address_size      address_size;
+        struct xen_domctl_sendtrigger       sendtrigger;
+        struct xen_domctl_get_device_group  get_device_group;
+        struct xen_domctl_assign_device     assign_device;
+        struct xen_domctl_bind_pt_irq       bind_pt_irq;
+        struct xen_domctl_memory_mapping    memory_mapping;
+        struct xen_domctl_ioport_mapping    ioport_mapping;
+        struct xen_domctl_pin_mem_cacheattr pin_mem_cacheattr;
+        struct xen_domctl_ext_vcpucontext   ext_vcpucontext;
+        struct xen_domctl_set_opt_feature   set_opt_feature;
+        struct xen_domctl_set_target        set_target;
+        struct xen_domctl_subscribe         subscribe;
+        struct xen_domctl_debug_op          debug_op;
+        struct xen_domctl_mem_event_op      mem_event_op;
+        struct xen_domctl_mem_sharing_op    mem_sharing_op;
+#if defined(__i386__) || defined(__x86_64__)
+        struct xen_domctl_cpuid             cpuid;
+        struct xen_domctl_vcpuextstate      vcpuextstate;
+#endif
+        struct xen_domctl_set_access_required access_required;
+        struct xen_domctl_audit_p2m         audit_p2m;
+        struct xen_domctl_set_virq_handler  set_virq_handler;
+        struct xen_domctl_gdbsx_memio       gdbsx_guest_memio;
+        struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
+        struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        uint8_t                             pad[128];
+    } u;
+};
+typedef struct xen_domctl xen_domctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_t);
+
+#endif /* __XEN_PUBLIC_DOMCTL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h

index 0360b15..08222ee 100644 (file)
--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -3,6 +3,24 @@
   *
   * Definitions used for the Xen ELF notes.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
   */
  
@@ -10,7 +28,7 @@
  #define __XEN_PUBLIC_ELFNOTE_H__
  
  /*
- * The notes should live in a SHT_NOTE segment and have "Xen" in the
+ * The notes should live in a PT_NOTE segment and have "Xen" in the
   * name field.
   *
   * Numeric types are either 4 or 8 bytes depending on the content of
@@ -22,8 +40,6 @@
  
  /*
   * NAME=VALUE pair (string).
- *
- * LEGACY: FEATURES and PAE
   */
  #define XEN_ELFNOTE_INFO           0
  
@@ -90,7 +106,12 @@
  #define XEN_ELFNOTE_LOADER         8
  
  /*
- * The kernel supports PAE (x86/32 only, string = "yes" or "no").
+ * The kernel supports PAE (x86/32 only, string = "yes", "no" or
+ * "bimodal").
+ *
+ * For compatibility with Xen 3.0.3 and earlier the "bimodal" setting
+ * may be given as "yes,bimodal" which will cause older Xen to treat
+ * this kernel as PAE.
   *
   * LEGACY: PAE (n.b. The legacy interface included a provision to
   * indicate 'extended-cr3' support allowing L3 page tables to be
@@ -140,6 +161,95 @@
   */
  #define XEN_ELFNOTE_SUSPEND_CANCEL 14
  
+/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M      15
+
+/*
+ * Whether or not the guest can deal with being passed an initrd not
+ * mapped through its initial page tables.
+ */
+#define XEN_ELFNOTE_MOD_START_PFN 16
+
+/*
+ * The features supported by this kernel (numeric).
+ *
+ * Unlike XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a
+ * kernel to specify support for features that older hypervisors don't
+ * know about. The set of features 4.2 and newer hypervisors will
+ * consider supported by the kernel is the combination of the sets
+ * specified through this and the string note.
+ *
+ * LEGACY: FEATURES
+ */
+#define XEN_ELFNOTE_SUPPORTED_FEATURES 17
+
+/*
+ * The number of the highest elfnote defined.
+ */
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
+
+/*
+ * System information exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_INFO
+ * note in case of a system crash. This note will contain various
+ * information about the system, see xen/include/xen/elfcore.h.
+ */
+#define XEN_ELFNOTE_CRASH_INFO 0x1000001
+
+/*
+ * System registers exported through crash notes.
+ *
+ * The kexec / kdump code will create one XEN_ELFNOTE_CRASH_REGS
+ * note per cpu in case of a system crash. This note is architecture
+ * specific and will contain registers not saved in the "CORE" note.
+ * See xen/include/xen/elfcore.h for more information.
+ */
+#define XEN_ELFNOTE_CRASH_REGS 0x1000002
+
+
+/*
+ * xen dump-core none note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_NONE
+ * in its dump file to indicate that the file is a xen dump-core
+ * file. This note doesn't have any other information.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_NONE               0x2000000
+
+/*
+ * xen dump-core header note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_HEADER
+ * in its dump file.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_HEADER             0x2000001
+
+/*
+ * xen dump-core xen version note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_XEN_VERSION
+ * in its dump file. It contains the xen version obtained via the
+ * XENVER hypercall.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_XEN_VERSION        0x2000002
+
+/*
+ * xen dump-core format version note.
+ * xm dump-core code will create one XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION
+ * in its dump file. It contains a format version identifier.
+ * See tools/libxc/xc_core.h for more information.
+ */
+#define XEN_ELFNOTE_DUMPCORE_FORMAT_VERSION     0x2000003
+
  #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
  
  /*
diff --git a/include/xen/interface/event_channel.h b/include/xen/interface/event_channel.h

index 2090881..a9ee6b1 100644 (file)
--- a/include/xen/interface/event_channel.h
+++ b/include/xen/interface/event_channel.h
@@ -3,6 +3,24 @@
   *
   * Event channels between domains.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2003-2004, K A Fraser.
   */
  
@@ -11,8 +29,52 @@
  
  #include <xen/interface/xen.h>
  
+/*
+ * `incontents 150 evtchn Event Channels
+ *
+ * Event channels are the basic primitive provided by Xen for event
+ * notifications. An event is the Xen equivalent of a hardware
+ * interrupt. They essentially store one bit of information, the event
+ * of interest is signalled by transitioning this bit from 0 to 1.
+ *
+ * Notifications are received by a guest via an upcall from Xen,
+ * indicating when an event arrives (setting the bit). Further
+ * notifications are masked until the bit is cleared again (therefore,
+ * guests must check the value of the bit after re-enabling event
+ * delivery to ensure no missed notifications).
+ *
+ * Event notifications can be masked by setting a flag; this is
+ * equivalent to disabling interrupts and can be used to ensure
+ * atomicity of certain operations in the guest kernel.
+ *
+ * Event channels are represented by the evtchn_* fields in
+ * struct shared_info and struct vcpu_info.
+ */
+
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_event_channel_op(enum event_channel_op cmd, void *args)
+ * `
+ * @cmd  == EVTCHNOP_* (event-channel operation).
+ * @args == struct evtchn_* Operation-specific extra arguments (NULL if none).
+ */
+
+/* ` enum event_channel_op { // EVTCHNOP_* => struct evtchn_* */
+#define EVTCHNOP_bind_interdomain 0 /* construct an interdomain channel */
+#define EVTCHNOP_bind_virq        1 /* bind a local channel to a VIRQ */
+#define EVTCHNOP_bind_pirq        2 /* bind a local channel to a PIRQ */
+#define EVTCHNOP_close            3 /* close a local channel */
+#define EVTCHNOP_send             4 /* send an event to the remote end */
+#define EVTCHNOP_status           5 /* get the current channel status */
+#define EVTCHNOP_alloc_unbound    6 /* allocate a port in domain <dom> */
+#define EVTCHNOP_bind_ipi         7 /* bind a local channel to receive events */
+#define EVTCHNOP_bind_vcpu        8 /* choose the vcpu a channel notifies */
+#define EVTCHNOP_unmask           9 /* unmask a local port */
+#define EVTCHNOP_reset           10 /* close all channels of a domain */
+/* ` } */
+
  typedef uint32_t evtchn_port_t;
-DEFINE_GUEST_HANDLE(evtchn_port_t);
+DEFINE_XEN_GUEST_HANDLE(evtchn_port_t);
  
  /*
   * EVTCHNOP_alloc_unbound: Allocate a port in domain <dom> and mark as
@@ -22,13 +84,13 @@ DEFINE_GUEST_HANDLE(evtchn_port_t);
   *  1. If the caller is unprivileged then <dom> must be DOMID_SELF.
   *  2. <rdom> may be DOMID_SELF, allowing loopback connections.
   */
-#define EVTCHNOP_alloc_unbound   6
  struct evtchn_alloc_unbound {
         /* IN parameters */
         domid_t dom, remote_dom;
         /* OUT parameters */
         evtchn_port_t port;
  };
+typedef struct evtchn_alloc_unbound evtchn_alloc_unbound_t;
  
  /*
   * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
@@ -37,9 +99,8 @@ struct evtchn_alloc_unbound {
   * domain. A fresh port is allocated in the calling domain and returned as
   * <local_port>.
   * NOTES:
- *  2. <remote_dom> may be DOMID_SELF, allowing loopback connections.
+ *  1. <remote_dom> may be DOMID_SELF, allowing loopback connections.
   */
-#define EVTCHNOP_bind_interdomain 0
  struct evtchn_bind_interdomain {
         /* IN parameters. */
         domid_t remote_dom;
@@ -47,31 +108,35 @@ struct evtchn_bind_interdomain {
         /* OUT parameters. */
         evtchn_port_t local_port;
  };
+typedef struct evtchn_bind_interdomain evtchn_bind_interdomain_t;
  
  /*
   * EVTCHNOP_bind_virq: Bind a local event channel to VIRQ <irq> on specified
   * vcpu.
   * NOTES:
- *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
- *  2. The allocated event channel is bound to the specified vcpu. The binding
- *     may not be changed.
+ *  1. Virtual IRQs are classified as per-vcpu or global. See the VIRQ list
+ *     in xen.h for the classification of each VIRQ.
+ *  2. Global VIRQs must be allocated on VCPU0 but can subsequently be
+ *     re-bound via EVTCHNOP_bind_vcpu.
+ *  3. Per-vcpu VIRQs may be bound to at most one event channel per vcpu.
+ *     The allocated event channel is bound to the specified vcpu and the
+ *     binding cannot be changed.
   */
-#define EVTCHNOP_bind_virq       1
  struct evtchn_bind_virq {
         /* IN parameters. */
-       uint32_t virq;
+       uint32_t virq; /* enum virq */
         uint32_t vcpu;
         /* OUT parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_bind_virq evtchn_bind_virq_t;
  
  /*
- * EVTCHNOP_bind_pirq: Bind a local event channel to PIRQ <irq>.
+ * EVTCHNOP_bind_pirq: Bind a local event channel to a real IRQ (PIRQ <irq>).
   * NOTES:
   *  1. A physical IRQ may be bound to at most one event channel per domain.
   *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
   */
-#define EVTCHNOP_bind_pirq       2
  struct evtchn_bind_pirq {
         /* IN parameters. */
         uint32_t pirq;
@@ -80,6 +145,7 @@ struct evtchn_bind_pirq {
         /* OUT parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_bind_pirq evtchn_bind_pirq_t;
  
  /*
   * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
@@ -87,33 +153,33 @@ struct evtchn_bind_pirq {
   *  1. The allocated event channel is bound to the specified vcpu. The binding
   *     may not be changed.
   */
-#define EVTCHNOP_bind_ipi        7
  struct evtchn_bind_ipi {
         uint32_t vcpu;
         /* OUT parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_bind_ipi evtchn_bind_ipi_t;
  
  /*
   * EVTCHNOP_close: Close a local event channel <port>. If the channel is
   * interdomain then the remote end is placed in the unbound state
   * (EVTCHNSTAT_unbound), awaiting a new connection.
   */
-#define EVTCHNOP_close           3
  struct evtchn_close {
         /* IN parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_close evtchn_close_t;
  
  /*
   * EVTCHNOP_send: Send an event to the remote end of the channel whose local
   * endpoint is <port>.
   */
-#define EVTCHNOP_send            4
  struct evtchn_send {
         /* IN parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_send evtchn_send_t;
  
  /*
   * EVTCHNOP_status: Get the current status of the communication channel which
@@ -123,7 +189,6 @@ struct evtchn_send {
   *  2. Only a sufficiently-privileged domain may obtain the status of an event
   *     channel for which <dom> is not DOMID_SELF.
   */
-#define EVTCHNOP_status                  5
  struct evtchn_status {
         /* IN parameters */
         domid_t  dom;
@@ -149,36 +214,57 @@ struct evtchn_status {
                 uint32_t virq;      /* EVTCHNSTAT_virq        */
         } u;
  };
+typedef struct evtchn_status evtchn_status_t;
  
  /*
   * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
   * event is pending.
   * NOTES:
- *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
- *     the binding. This binding cannot be changed.
- *  2. All other channels notify vcpu0 by default. This default is set when
+ *  1. IPI-bound channels always notify the vcpu specified at bind time.
+ *     This binding cannot be changed.
+ *  2. Per-VCPU VIRQ channels always notify the vcpu specified at bind time.
+ *     This binding cannot be changed.
+ *  3. All other channels notify vcpu0 by default. This default is set when
   *     the channel is allocated (a port that is freed and subsequently reused
   *     has its binding reset to vcpu0).
   */
-#define EVTCHNOP_bind_vcpu       8
  struct evtchn_bind_vcpu {
         /* IN parameters. */
         evtchn_port_t port;
         uint32_t vcpu;
  };
+typedef struct evtchn_bind_vcpu evtchn_bind_vcpu_t;
  
  /*
   * EVTCHNOP_unmask: Unmask the specified local event-channel port and deliver
   * a notification to the appropriate VCPU if an event is pending.
   */
-#define EVTCHNOP_unmask                  9
  struct evtchn_unmask {
         /* IN parameters. */
         evtchn_port_t port;
  };
+typedef struct evtchn_unmask evtchn_unmask_t;
+
+/*
+ * EVTCHNOP_reset: Close all event channels associated with specified domain.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify other than DOMID_SELF.
+ */
+struct evtchn_reset {
+    /* IN parameters. */
+    domid_t dom; /* domain whose event channels are closed (DOMID_SELF ok) */
+};
+typedef struct evtchn_reset evtchn_reset_t;
  
+/*
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
+ * `
+ * Superseded by new event_channel_op() hypercall since 0x00030202.
+ */
  struct evtchn_op {
-       uint32_t cmd; /* EVTCHNOP_* */
+       uint32_t cmd; /* enum event_channel_op */
         union {
                 struct evtchn_alloc_unbound    alloc_unbound;
                 struct evtchn_bind_interdomain bind_interdomain;
@@ -193,5 +279,7 @@ struct evtchn_op {
         } u;
  };
  DEFINE_GUEST_HANDLE_STRUCT(evtchn_op);
+typedef struct evtchn_op evtchn_op_t;
+DEFINE_XEN_GUEST_HANDLE(evtchn_op_t);
  
  #endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
diff --git a/include/xen/interface/features.h b/include/xen/interface/features.h

index b6ca39a..d8fad89 100644 (file)
--- a/include/xen/interface/features.h
+++ b/include/xen/interface/features.h
@@ -3,6 +3,24 @@
   *
   * Feature flags, reported by XENVER_get_features.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
   */
  
@@ -41,6 +59,15 @@
  /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
  #define XENFEAT_mmu_pt_update_preserve_ad  5
  
+/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
+#define XENFEAT_highmem_assist             6
+
+/*
+ * If set, GNTTABOP_map_grant_ref honors flags to be placed into guest kernel
+ * available pte bits.
+ */
+#define XENFEAT_gnttab_map_avail_bits      7
+
  /* x86: Does this Xen host support the HVM callback vector type? */
  #define XENFEAT_hvm_callback_vector        8
  
@@ -48,7 +75,10 @@
  #define XENFEAT_hvm_safe_pvclock           9
  
  /* x86: pirq can be used by HVM guests */
-#define XENFEAT_hvm_pirqs           10
+#define XENFEAT_hvm_pirqs                 10
+
+/* operation as Dom0 is supported */
+#define XENFEAT_dom0                      11
  
  #define XENFEAT_NR_SUBMAPS 1
  
diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h

index a17d844..f54bfba 100644 (file)
--- a/include/xen/interface/grant_table.h
+++ b/include/xen/interface/grant_table.h
@@ -100,6 +100,12 @@ typedef uint32_t grant_ref_t;
   * Version 1 of the grant table entry structure is maintained purely
   * for backwards compatibility.  New guests should use version 2.
   */
+#if defined(CONFIG_PARAVIRT_XEN)
+#define grant_entry grant_entry_v1
+#elif __XEN_INTERFACE_VERSION__ < 0x0003020a
+#define grant_entry_v1 grant_entry
+#define grant_entry_v1_t grant_entry_t
+#endif
  struct grant_entry_v1 {
      /* GTF_xxx: various type and flag information.  [XEN,GST] */
      uint16_t flags;
@@ -111,6 +117,14 @@ struct grant_entry_v1 {
       */
      uint32_t frame;
  };
+typedef struct grant_entry_v1 grant_entry_v1_t;
+
+/* The first few grant table entries will be preserved across grant table
+ * version changes and may be pre-populated at domain creation by tools.
+ */
+#define GNTTAB_NR_RESERVED_ENTRIES     8
+#define GNTTAB_RESERVED_CONSOLE        0
+#define GNTTAB_RESERVED_XENSTORE       1
  
  /*
   * Type of grant entry.
@@ -132,6 +146,7 @@ struct grant_entry_v1 {
   *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
   *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
   *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ *  GTF_PAT, GTF_PWT, GTF_PCD: (x86) cache attribute flags for the grant [GST]
   *  GTF_sub_page: Grant access to only a subrange of the page.  @domid
   *                will only be allowed to copy from the grant, and not
   *                map it. [GST]
@@ -142,6 +157,12 @@ struct grant_entry_v1 {
  #define GTF_reading         (1U<<_GTF_reading)
  #define _GTF_writing        (4)
  #define GTF_writing         (1U<<_GTF_writing)
+#define _GTF_PWT            (5)
+#define GTF_PWT             (1U<<_GTF_PWT)
+#define _GTF_PCD            (6)
+#define GTF_PCD             (1U<<_GTF_PCD)
+#define _GTF_PAT            (7)
+#define GTF_PAT             (1U<<_GTF_PAT)
  #define _GTF_sub_page       (8)
  #define GTF_sub_page        (1U<<_GTF_sub_page)
  
@@ -169,7 +190,7 @@ struct grant_entry_v1 {
   * The interface by which domains use grant references does not depend
   * on the grant table version in use by the other domain.
   */
-
+#if defined(CONFIG_PARAVIRT_XEN) || __XEN_INTERFACE_VERSION__ >= 0x0003020a
  /*
   * Version 1 and version 2 grant entries share a common prefix.  The
   * fields of the prefix are documented as part of struct
@@ -179,10 +200,11 @@ struct grant_entry_header {
      uint16_t flags;
      domid_t  domid;
  };
+typedef struct grant_entry_header grant_entry_header_t;
  
  /*
- * Version 2 of the grant entry structure, here is an union because three
- * different types are suppotted: full_page, sub_page and transitive.
+ * Version 2 of the grant entry structure, here is a union because three
+ * different types are supported: full_page, sub_page and transitive.
   */
  union grant_entry_v2 {
      struct grant_entry_header hdr;
@@ -219,6 +241,9 @@ union grant_entry_v2 {
       * grant @gref in domain @trans_domid, as if it was the local
       * domain.  Obviously, the transitive access must be compatible
       * with the original grant.
+     *
+     * The current version of Xen does not allow transitive grants
+     * to be mapped.
       */
      struct {
         struct grant_entry_header hdr;
@@ -229,9 +254,12 @@ union grant_entry_v2 {
  
      uint32_t __spacer[4]; /* Pad to a power of two */
  };
+typedef union grant_entry_v2 grant_entry_v2_t;
  
  typedef uint16_t grant_status_t;
  
+#endif /* __XEN_INTERFACE_VERSION__ */
+
  /***********************************
   * GRANT TABLE QUERIES AND USES
   */
@@ -271,6 +299,8 @@ struct gnttab_map_grant_ref {
      uint64_t dev_bus_addr;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_map_grant_ref);
+typedef struct gnttab_map_grant_ref gnttab_map_grant_ref_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_map_grant_ref_t);
  
  /*
   * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
@@ -293,6 +323,8 @@ struct gnttab_unmap_grant_ref {
      int16_t  status;              /* GNTST_* */
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_grant_ref);
+typedef struct gnttab_unmap_grant_ref gnttab_unmap_grant_ref_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_grant_ref_t);
  
  /*
   * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
@@ -310,9 +342,11 @@ struct gnttab_setup_table {
      uint32_t nr_frames;
      /* OUT parameters. */
      int16_t  status;              /* GNTST_* */
-    GUEST_HANDLE(ulong) frame_list;
+    XEN_GUEST_HANDLE(ulong) frame_list;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table);
+typedef struct gnttab_setup_table gnttab_setup_table_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_setup_table_t);
  
  /*
   * GNTTABOP_dump_table: Dump the contents of the grant table to the
@@ -326,6 +360,8 @@ struct gnttab_dump_table {
      int16_t status;               /* GNTST_* */
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
+typedef struct gnttab_dump_table gnttab_dump_table_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_dump_table_t);
  
  /*
   * GNTTABOP_transfer_grant_ref: Transfer <frame> to a foreign domain. The
@@ -338,13 +374,16 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_dump_table);
  #define GNTTABOP_transfer                4
  struct gnttab_transfer {
      /* IN parameters. */
-    unsigned long mfn;
+    xen_pfn_t     mfn;
      domid_t       domid;
      grant_ref_t   ref;
      /* OUT parameters. */
      int16_t       status;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
+typedef struct gnttab_transfer gnttab_transfer_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_transfer_t);
+
  
  /*
   * GNTTABOP_copy: Hypervisor based copy
@@ -368,14 +407,16 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_transfer);
  #define GNTCOPY_source_gref       (1<<_GNTCOPY_source_gref)
  #define _GNTCOPY_dest_gref        (1)
  #define GNTCOPY_dest_gref         (1<<_GNTCOPY_dest_gref)
+#define _GNTCOPY_can_fail         (2)
+#define GNTCOPY_can_fail          (1<<_GNTCOPY_can_fail)
  
  #define GNTTABOP_copy                 5
-struct gnttab_copy {
+typedef struct gnttab_copy {
         /* IN parameters. */
         struct {
                 union {
                         grant_ref_t ref;
-                       unsigned long   gmfn;
+                       xen_pfn_t   gmfn;
                 } u;
                 domid_t  domid;
                 uint16_t offset;
@@ -384,8 +425,9 @@ struct gnttab_copy {
         uint16_t      flags;          /* GNTCOPY_* */
         /* OUT parameters. */
         int16_t       status;
-};
+} gnttab_copy_t;
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_copy);
+DEFINE_XEN_GUEST_HANDLE(gnttab_copy_t);
  
  /*
   * GNTTABOP_query_size: Query the current and maximum sizes of the shared
@@ -404,6 +446,8 @@ struct gnttab_query_size {
      int16_t  status;              /* GNTST_* */
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_query_size);
+typedef struct gnttab_query_size gnttab_query_size_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_query_size_t);
  
  /*
   * GNTTABOP_unmap_and_replace: Destroy one or more grant-reference mappings
@@ -426,7 +470,10 @@ struct gnttab_unmap_and_replace {
      int16_t  status;              /* GNTST_* */
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
+typedef struct gnttab_unmap_and_replace gnttab_unmap_and_replace_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_unmap_and_replace_t);
  
+#if defined(CONFIG_PARAVIRT_XEN) || __XEN_INTERFACE_VERSION__ >= 0x0003020a
  /*
   * GNTTABOP_set_version: Request a particular version of the grant
   * table shared table structure.  This operation can only be performed
@@ -436,10 +483,13 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_unmap_and_replace);
   */
  #define GNTTABOP_set_version          8
  struct gnttab_set_version {
-    /* IN parameters */
+    /* IN/OUT parameters */
      uint32_t version;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_set_version);
+typedef struct gnttab_set_version gnttab_set_version_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_set_version_t);
+
  
  /*
   * GNTTABOP_get_status_frames: Get the list of frames used to store grant
@@ -460,9 +510,11 @@ struct gnttab_get_status_frames {
      domid_t  dom;
      /* OUT parameters. */
      int16_t  status;              /* GNTST_* */
-    GUEST_HANDLE(uint64_t) frame_list;
+    XEN_GUEST_HANDLE(uint64_t) frame_list;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_status_frames);
+typedef struct gnttab_get_status_frames gnttab_get_status_frames_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_get_status_frames_t);
  
  /*
   * GNTTABOP_get_version: Get the grant table version which is in
@@ -477,9 +529,27 @@ struct gnttab_get_version {
      uint32_t version;
  };
  DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
+typedef struct gnttab_get_version gnttab_get_version_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_get_version_t);
+
+/*
+ * GNTTABOP_swap_grant_ref: Swap the contents of two grant entries.
+ */
+#define GNTTABOP_swap_grant_ref              11
+struct gnttab_swap_grant_ref {
+    /* IN parameters */
+    grant_ref_t ref_a;          /* one of the two grant entries to swap */
+    grant_ref_t ref_b;          /* the other grant entry to swap */
+    /* OUT parameters */
+    int16_t status;             /* GNTST_* */
+};
+typedef struct gnttab_swap_grant_ref gnttab_swap_grant_ref_t;
+DEFINE_XEN_GUEST_HANDLE(gnttab_swap_grant_ref_t);
+
+#endif /* __XEN_INTERFACE_VERSION__ */
  
  /*
- * Bitfield values for update_pin_status.flags.
+ * Bitfield values for gnttab_map_grant_ref.flags.
   */
   /* Map the grant entry for access by I/O devices. */
  #define _GNTMAP_device_map      (0)
@@ -506,6 +576,16 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
  #define _GNTMAP_contains_pte    (4)
  #define GNTMAP_contains_pte     (1<<_GNTMAP_contains_pte)
  
+#define _GNTMAP_can_fail        (5)
+#define GNTMAP_can_fail         (1<<_GNTMAP_can_fail)
+
+/*
+ * Bits to be placed in guest kernel available PTE bits (architecture
+ * dependent; only supported when XENFEAT_gnttab_map_avail_bits is set).
+ */
+#define _GNTMAP_guest_avail0    (16)
+#define GNTMAP_guest_avail_mask ((uint32_t)~0 << _GNTMAP_guest_avail0)
+
  /*
   * Values for error status returns. All errors are -ve.
   */
@@ -519,7 +599,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
  #define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
  #define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
  #define GNTST_bad_page         (-9) /* Specified page was invalid for op.    */
-#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary */
+#define GNTST_bad_copy_arg    (-10) /* copy arguments cross page boundary.   */
+#define GNTST_address_too_big (-11) /* transfer page address too large.      */
+#define GNTST_eagain          (-12) /* Operation not done; try again.        */
  
  #define GNTTABOP_error_msgs {                   \
      "okay",                                     \
@@ -532,7 +614,9 @@ DEFINE_GUEST_HANDLE_STRUCT(gnttab_get_version);
      "no spare translation slot in the I/O MMU", \
      "permission denied",                        \
      "bad page",                                 \
-    "copy arguments cross page boundary"        \
+    "copy arguments cross page boundary",       \
+    "page address size too large",              \
+    "operation not done; try again"             \
  }
  
  #endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
diff --git a/include/xen/interface/hvm/e820.h b/include/xen/interface/hvm/e820.h

new file mode 100644 (file)

index 0000000..5bdc227
--- /dev/null
+++ b/include/xen/interface/hvm/e820.h
@@ -0,0 +1,34 @@
+
+/*
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_E820_H__
+#define __XEN_PUBLIC_HVM_E820_H__
+
+/* E820 location in HVM virtual address space. */
+#define HVM_E820_PAGE        0x00090000
+#define HVM_E820_NR_OFFSET   0x000001E8
+#define HVM_E820_OFFSET      0x000002D0
+
+#define HVM_BELOW_4G_RAM_END        0xF0000000
+#define HVM_BELOW_4G_MMIO_START     HVM_BELOW_4G_RAM_END
+#define HVM_BELOW_4G_MMIO_LENGTH    ((1ULL << 32) - HVM_BELOW_4G_MMIO_START)
+
+#endif /* __XEN_PUBLIC_HVM_E820_H__ */
diff --git a/include/xen/interface/hvm/hvm_info_table.h b/include/xen/interface/hvm/hvm_info_table.h

new file mode 100644 (file)

index 0000000..36085fa
--- /dev/null
+++ b/include/xen/interface/hvm/hvm_info_table.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * hvm/hvm_info_table.h
+ * 
+ * HVM parameter and information table, written into guest memory map.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__
+
+#define HVM_INFO_PFN         0x09F
+#define HVM_INFO_OFFSET      0x800
+#define HVM_INFO_PADDR       ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET)
+
+/* Maximum we can support with current vLAPIC ID mapping. */
+#define HVM_MAX_VCPUS        128
+
+struct hvm_info_table {
+    char        signature[8]; /* "HVM INFO" */
+    uint32_t    length;
+    uint8_t     checksum;
+
+    /* Should firmware build APIC descriptors (APIC MADT / MP BIOS)? */
+    uint8_t     apic_mode;
+
+    /* How many CPUs does this domain have? */
+    uint32_t    nr_vcpus;
+
+    /*
+     * MEMORY MAP provided by HVM domain builder.
+     * Notes:
+     *  1. page_to_phys(x) = x << 12
+     *  2. If a field is zero, the corresponding range does not exist.
+     */
+    /*
+     *  0x0 to page_to_phys(low_mem_pgend)-1:
+     *    RAM below 4GB (except for VGA hole 0xA0000-0xBFFFF)
+     */
+    uint32_t    low_mem_pgend;
+    /*
+     *  page_to_phys(reserved_mem_pgstart) to 0xFFFFFFFF:
+     *    Reserved for special memory mappings
+     */
+    uint32_t    reserved_mem_pgstart;
+    /*
+     *  0x100000000 to page_to_phys(high_mem_pgend)-1:
+     *    RAM above 4GB
+     */
+    uint32_t    high_mem_pgend;
+
+    /* Bitmap of which CPUs are online at boot time. */
+    uint8_t     vcpu_online[(HVM_MAX_VCPUS + 7)/8];
+};
+
+#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */
diff --git a/include/xen/interface/hvm/hvm_op.h b/include/xen/interface/hvm/hvm_op.h

index a4827f4..80f167b 100644 (file)
--- a/include/xen/interface/hvm/hvm_op.h
+++ b/include/xen/interface/hvm/hvm_op.h
@@ -21,6 +21,9 @@
  #ifndef __XEN_PUBLIC_HVM_HVM_OP_H__
  #define __XEN_PUBLIC_HVM_HVM_OP_H__
  
+#include "../xen.h"
+#include "../trace.h"
+
  /* Get/set subcommands: the second argument of the hypercall is a
   * pointer to a xen_hvm_param struct. */
  #define HVMOP_set_param           0
@@ -31,16 +34,234 @@ struct xen_hvm_param {
      uint64_t value;    /* IN/OUT */
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_param);
+typedef struct xen_hvm_param xen_hvm_param_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_param_t);
+
+/* Set the logical level of one of a domain's PCI INTx wires. */
+#define HVMOP_set_pci_intx_level  2
+struct xen_hvm_set_pci_intx_level {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* PCI INTx identification in PCI topology (domain:bus:device:intx). */
+    uint8_t  domain, bus, device, intx;
+    /* Assertion level (0 = unasserted, 1 = asserted). */
+    uint8_t  level;
+};
+typedef struct xen_hvm_set_pci_intx_level xen_hvm_set_pci_intx_level_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_intx_level_t);
+
+/* Set the logical level of one of a domain's ISA IRQ wires. */
+#define HVMOP_set_isa_irq_level   3
+struct xen_hvm_set_isa_irq_level {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* ISA device identification, by ISA IRQ (0-15). */
+    uint8_t  isa_irq;
+    /* Assertion level (0 = unasserted, 1 = asserted). */
+    uint8_t  level;
+};
+typedef struct xen_hvm_set_isa_irq_level xen_hvm_set_isa_irq_level_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_isa_irq_level_t);
+
+#define HVMOP_set_pci_link_route  4
+struct xen_hvm_set_pci_link_route {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* PCI link identifier (0-3). */
+    uint8_t  link;
+    /* ISA IRQ (1-15), or 0 (disable link). */
+    uint8_t  isa_irq;
+};
+typedef struct xen_hvm_set_pci_link_route xen_hvm_set_pci_link_route_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
+
+/* Flushes all VCPU TLBs: @arg must be NULL. */
+#define HVMOP_flush_tlbs          5
+
+typedef enum {
+    HVMMEM_ram_rw,             /* Normal read/write guest RAM */
+    HVMMEM_ram_ro,             /* Read-only; writes are discarded */
+    HVMMEM_mmio_dm,            /* Reads and writes go to the device model */
+} hvmmem_type_t;
+
+/* Following tools-only interfaces may change in future. */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+/* Track dirty VRAM. */
+#define HVMOP_track_dirty_vram    6
+struct xen_hvm_track_dirty_vram {
+    /* Domain to be tracked. */
+    domid_t  domid;
+    /* First pfn to track. */
+    uint64_aligned_t first_pfn;
+    /* Number of pages to track. */
+    uint64_aligned_t nr;
+    /* OUT variable. */
+    /* Dirty bitmap buffer. */
+    XEN_GUEST_HANDLE_64(uint8) dirty_bitmap;
+};
+typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
+
+/* Notify that some pages got modified by the Device Model. */
+#define HVMOP_modified_memory    7
+struct xen_hvm_modified_memory {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* First pfn. */
+    uint64_aligned_t first_pfn;
+    /* Number of pages. */
+    uint64_aligned_t nr;
+};
+typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
+
+#define HVMOP_set_mem_type    8
+/* Notify that a region of memory is to be treated in a specific way. */
+struct xen_hvm_set_mem_type {
+    /* Domain to be updated. */
+    domid_t domid;
+    /* Memory type */
+    uint16_t hvmmem_type;
+    /* Number of pages. */
+    uint32_t nr;
+    /* First pfn. */
+    uint64_aligned_t first_pfn;
+};
+typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
  
  /* Hint from PV drivers for pagetable destruction. */
  #define HVMOP_pagetable_dying       9
  struct xen_hvm_pagetable_dying {
      /* Domain with a pagetable about to be destroyed. */
      domid_t  domid;
+    uint16_t pad[3]; /* align next field on 8-byte boundary */
      /* guest physical address of the toplevel pagetable dying */
-    aligned_u64 gpa;
+    uint64_t gpa;
  };
+DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying);
  typedef struct xen_hvm_pagetable_dying xen_hvm_pagetable_dying_t;
-DEFINE_GUEST_HANDLE_STRUCT(xen_hvm_pagetable_dying_t);
- 
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_pagetable_dying_t);
+
+/* Get the current Xen time, in nanoseconds since system boot. */
+#define HVMOP_get_time              10
+struct xen_hvm_get_time {
+    uint64_t now;      /* OUT */
+};
+typedef struct xen_hvm_get_time xen_hvm_get_time_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_time_t);
+
+#define HVMOP_xentrace              11
+struct xen_hvm_xentrace {
+    uint16_t event, extra_bytes;
+    uint8_t extra[TRACE_EXTRA_MAX * sizeof(uint32_t)];
+};
+typedef struct xen_hvm_xentrace xen_hvm_xentrace_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_xentrace_t);
+
+/* Following tools-only interfaces may change in future. */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#define HVMOP_set_mem_access        12
+typedef enum {
+    HVMMEM_access_n,
+    HVMMEM_access_r,
+    HVMMEM_access_w,
+    HVMMEM_access_rw,
+    HVMMEM_access_x,
+    HVMMEM_access_rx,
+    HVMMEM_access_wx,
+    HVMMEM_access_rwx,
+    HVMMEM_access_rx2rw,       /* Page starts off as r-x, but automatically
+                                * changes to r-w on a write */
+    HVMMEM_access_n2rwx,       /* Log access: starts off as n, automatically
+                                * goes to rwx, generating an event without
+                                * pausing the vcpu */
+    HVMMEM_access_default      /* Take the domain default */
+} hvmmem_access_t;
+/* Notify that a region of memory is to have specific access types */
+struct xen_hvm_set_mem_access {
+    /* Domain to be updated. */
+    domid_t domid;
+    /* Memory access type */
+    uint16_t hvmmem_access; /* hvmmem_access_t */
+    /* Number of pages, ignored on setting default access */
+    uint32_t nr;
+    /* First pfn, or ~0ull to set the default access for new pages */
+    uint64_aligned_t first_pfn;
+};
+typedef struct xen_hvm_set_mem_access xen_hvm_set_mem_access_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_access_t);
+
+#define HVMOP_get_mem_access        13
+/* Get the specific access type for that region of memory */
+struct xen_hvm_get_mem_access {
+    /* Domain to be queried. */
+    domid_t domid;
+    /* Memory access type: OUT */
+    uint16_t hvmmem_access; /* hvmmem_access_t */
+    /* pfn, or ~0ull for default access for new pages.  IN */
+    uint64_aligned_t pfn;
+};
+typedef struct xen_hvm_get_mem_access xen_hvm_get_mem_access_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_access_t);
+
+#define HVMOP_inject_trap            14
+/* Inject a trap into a VCPU, which will get taken up on the next
+ * scheduling of it. Note that the caller should know enough of the
+ * state of the CPU before injecting, to know what the effect of
+ * injecting the trap will be.
+ */
+struct xen_hvm_inject_trap {
+    /* Domain whose VCPU receives the injected trap. */
+    domid_t domid;
+    /* VCPU */
+    uint32_t vcpuid;
+    /* Trap number */
+    uint32_t trap;
+    /* Error code, or -1 to skip */
+    uint32_t error_code;
+    /* CR2 for page faults */
+    uint64_aligned_t cr2;
+};
+typedef struct xen_hvm_inject_trap xen_hvm_inject_trap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_trap_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
+#define HVMOP_get_mem_type    15
+/* Return hvmmem_type_t for the specified pfn. */
+struct xen_hvm_get_mem_type {
+    /* Domain to be queried. */
+    domid_t domid;
+    /* OUT variable. */
+    uint16_t mem_type;
+    uint16_t pad[2]; /* align next field on 8-byte boundary */
+    /* IN variable. */
+    uint64_t pfn;
+};
+typedef struct xen_hvm_get_mem_type xen_hvm_get_mem_type_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_get_mem_type_t);
+
+/* Following tools-only interfaces may change in future. */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+/* MSI injection for emulated devices */
+#define HVMOP_inject_msi         16
+struct xen_hvm_inject_msi {
+    /* Domain to be injected */
+    domid_t   domid;
+    /* Data -- lower 32 bits */
+    uint32_t  data;
+    /* Address (0xfeexxxxx) */
+    uint64_t  addr;
+};
+typedef struct xen_hvm_inject_msi xen_hvm_inject_msi_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_inject_msi_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
  #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
diff --git a/include/xen/interface/hvm/ioreq.h b/include/xen/interface/hvm/ioreq.h

new file mode 100644 (file)

index 0000000..4022a1d
--- /dev/null
+++ b/include/xen/interface/hvm/ioreq.h
@@ -0,0 +1,140 @@
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+
+#define STATE_IOREQ_NONE        0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+
+#define IOREQ_TYPE_PIO          0 /* pio */
+#define IOREQ_TYPE_COPY         1 /* mmio ops */
+#define IOREQ_TYPE_TIMEOFFSET   7
+#define IOREQ_TYPE_INVALIDATE   8 /* mapcache */
+
+/*
+ * VMExit dispatcher should cooperate with instruction decoder to
+ * prepare this structure and notify service OS and DM by sending
+ * virq
+ */
+struct ioreq {
+    uint64_t addr;          /* physical address */
+    uint64_t data;          /* data (or paddr of data) */
+    uint32_t count;         /* for rep prefixes */
+    uint32_t size;          /* size in bytes */
+    uint32_t vp_eport;      /* evtchn for notifications to/from device model */
+    uint16_t _pad0;
+    uint8_t state:4;
+    uint8_t data_is_ptr:1;  /* if 1, data above is the guest paddr 
+                             * of the real data to use. */
+    uint8_t dir:1;          /* 1=read, 0=write */
+    uint8_t df:1;
+    uint8_t _pad1:1;
+    uint8_t type;           /* I/O type */
+};
+typedef struct ioreq ioreq_t;
+
+struct shared_iopage {
+    struct ioreq vcpu_ioreq[1];
+};
+typedef struct shared_iopage shared_iopage_t;
+
+struct buf_ioreq {
+    uint8_t  type;   /* I/O type                    */
+    uint8_t  pad:1;
+    uint8_t  dir:1;  /* 1=read, 0=write             */
+    uint8_t  size:2; /* 0=>1, 1=>2, 2=>4, 3=>8. If 8, use two buf_ioreqs */
+    uint32_t addr:20;/* physical address            */
+    uint32_t data;   /* data                        */
+};
+typedef struct buf_ioreq buf_ioreq_t;
+
+#define IOREQ_BUFFER_SLOT_NUM     511 /* 8 bytes each, plus 2 4-byte indexes */
+struct buffered_iopage {
+    unsigned int read_pointer;
+    unsigned int write_pointer;
+    buf_ioreq_t buf_ioreq[IOREQ_BUFFER_SLOT_NUM];
+}; /* NB. Size of this structure must be no greater than one page. */
+typedef struct buffered_iopage buffered_iopage_t;
+
+#if defined(__ia64__)
+struct pio_buffer {
+    uint32_t page_offset;
+    uint32_t pointer;
+    uint32_t data_end;
+    uint32_t buf_size;
+    void *opaque;
+};
+
+#define PIO_BUFFER_IDE_PRIMARY   0 /* I/O port = 0x1F0 */
+#define PIO_BUFFER_IDE_SECONDARY 1 /* I/O port = 0x170 */
+#define PIO_BUFFER_ENTRY_NUM     2
+struct buffered_piopage {
+    struct pio_buffer pio[PIO_BUFFER_ENTRY_NUM];
+    uint8_t buffer[1];
+};
+#endif /* defined(__ia64__) */
+
+/*
+ * ACPI Control/Event register locations. Location is controlled by a 
+ * version number in HVM_PARAM_ACPI_IOPORTS_LOCATION.
+ */
+
+/* Version 0 (default): Traditional Xen locations. */
+#define ACPI_PM1A_EVT_BLK_ADDRESS_V0 0x1f40
+#define ACPI_PM1A_CNT_BLK_ADDRESS_V0 (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS_V0   (ACPI_PM1A_EVT_BLK_ADDRESS_V0 + 0x08)
+#define ACPI_GPE0_BLK_ADDRESS_V0     (ACPI_PM_TMR_BLK_ADDRESS_V0 + 0x20)
+#define ACPI_GPE0_BLK_LEN_V0         0x08
+
+/* Version 1: Locations preferred by modern Qemu. */
+#define ACPI_PM1A_EVT_BLK_ADDRESS_V1 0xb000
+#define ACPI_PM1A_CNT_BLK_ADDRESS_V1 (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x04)
+#define ACPI_PM_TMR_BLK_ADDRESS_V1   (ACPI_PM1A_EVT_BLK_ADDRESS_V1 + 0x08)
+#define ACPI_GPE0_BLK_ADDRESS_V1     0xafe0
+#define ACPI_GPE0_BLK_LEN_V1         0x04
+
+/* Compatibility definitions for the default location (version 0). */
+#define ACPI_PM1A_EVT_BLK_ADDRESS    ACPI_PM1A_EVT_BLK_ADDRESS_V0
+#define ACPI_PM1A_CNT_BLK_ADDRESS    ACPI_PM1A_CNT_BLK_ADDRESS_V0
+#define ACPI_PM_TMR_BLK_ADDRESS      ACPI_PM_TMR_BLK_ADDRESS_V0
+#define ACPI_GPE0_BLK_ADDRESS        ACPI_GPE0_BLK_ADDRESS_V0
+#define ACPI_GPE0_BLK_LEN            ACPI_GPE0_BLK_LEN_V0
+
+
+#endif /* _IOREQ_H_ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/hvm/params.h b/include/xen/interface/hvm/params.h

index 1b4f923..ef698ea 100644 (file)
--- a/include/xen/interface/hvm/params.h
+++ b/include/xen/interface/hvm/params.h
@@ -33,11 +33,17 @@
   * val[63:56] == 1: val[55:0] is a delivery PCI INTx line, as follows:
   *                  Domain = val[47:32], Bus  = val[31:16],
   *                  DevFn  = val[15: 8], IntX = val[ 1: 0]
- * val[63:56] == 2: val[7:0] is a vector number.
+ * val[63:56] == 2: val[7:0] is a vector number, check for
+ *                  XENFEAT_hvm_callback_vector to know if this delivery
+ *                  method is available.
   * If val == 0 then CPU0 event-channel notifications are not delivered.
   */
  #define HVM_PARAM_CALLBACK_IRQ 0
  
+/*
+ * These are not used by Xen. They are here for convenience of HVM-guest
+ * xenbus implementations.
+ */
  #define HVM_PARAM_STORE_PFN    1
  #define HVM_PARAM_STORE_EVTCHN 2
  
@@ -46,6 +52,20 @@
  #define HVM_PARAM_IOREQ_PFN    5
  
  #define HVM_PARAM_BUFIOREQ_PFN 6
+#define HVM_PARAM_BUFIOREQ_EVTCHN 26
+
+#ifdef __ia64__
+
+#define HVM_PARAM_NVRAM_FD     7
+#define HVM_PARAM_VHPT_SIZE    8
+#define HVM_PARAM_BUFPIOREQ_PFN        9
+
+#elif defined(__i386__) || defined(__x86_64__)
+
+/* Expose Viridian interfaces to this HVM guest? */
+#define HVM_PARAM_VIRIDIAN     9
+
+#endif
  
  /*
   * Set mode for virtual timers (currently x86 only):
@@ -94,6 +114,39 @@
  #define HVM_PARAM_CONSOLE_PFN    17
  #define HVM_PARAM_CONSOLE_EVTCHN 18
  
-#define HVM_NR_PARAMS          19
+/*
+ * Select location of ACPI PM1a and TMR control blocks. Currently two locations
+ * are supported, specified by version 0 or 1 in this parameter:
+ *   - 0: default, use the old addresses
+ *        PM1A_EVT == 0x1f40; PM1A_CNT == 0x1f44; PM_TMR == 0x1f48
+ *   - 1: use the new default qemu addresses
+ *        PM1A_EVT == 0xb000; PM1A_CNT == 0xb004; PM_TMR == 0xb008
+ * You can find these address definitions in <hvm/ioreq.h>
+ */
+#define HVM_PARAM_ACPI_IOPORTS_LOCATION 19
+
+/* Enable blocking memory events, async or sync (pause vcpu until response)
+ * onchangeonly indicates messages only on a change of value */
+#define HVM_PARAM_MEMORY_EVENT_CR0          20
+#define HVM_PARAM_MEMORY_EVENT_CR3          21
+#define HVM_PARAM_MEMORY_EVENT_CR4          22
+#define HVM_PARAM_MEMORY_EVENT_INT3         23
+#define HVM_PARAM_MEMORY_EVENT_SINGLE_STEP  25
+
+#define HVMPME_MODE_MASK       (3 << 0)
+#define HVMPME_mode_disabled   0
+#define HVMPME_mode_async      1
+#define HVMPME_mode_sync       2
+#define HVMPME_onchangeonly    (1 << 2)
+
+/* Boolean: Enable nestedhvm (hvm only) */
+#define HVM_PARAM_NESTEDHVM    24
+
+/* Params for the mem event rings */
+#define HVM_PARAM_PAGING_RING_PFN   27
+#define HVM_PARAM_ACCESS_RING_PFN   28
+#define HVM_PARAM_SHARING_RING_PFN  29
+
+#define HVM_NR_PARAMS          30
  
  #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
diff --git a/include/xen/interface/hvm/save.h b/include/xen/interface/hvm/save.h

new file mode 100644 (file)

index 0000000..58f8433
--- /dev/null
+++ b/include/xen/interface/hvm/save.h
@@ -0,0 +1,113 @@
+/* 
+ * hvm/save.h
+ *
+ * Structure definitions for HVM state that is held by Xen and must
+ * be saved along with the domain's memory and device-model state.
+ * 
+ * Copyright (c) 2007 XenSource Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_HVM_SAVE_H__
+#define __XEN_PUBLIC_HVM_SAVE_H__
+
+/*
+ * Structures in this header *must* have the same layout in 32bit 
+ * and 64bit environments: this means that all fields must be explicitly 
+ * sized types and aligned to their sizes, and the structs must be 
+ * a multiple of eight bytes long.
+ *
+ * Only the state necessary for saving and restoring (i.e. fields 
+ * that are analogous to actual hardware state) should go in this file. 
+ * Internal mechanisms should be kept in Xen-private headers.
+ */
+
+#if !defined(__GNUC__) || defined(__STRICT_ANSI__)
+#error "Anonymous structs/unions are a GNU extension."
+#endif
+
+/* 
+ * Each entry is preceded by a descriptor giving its type and length
+ */
+struct hvm_save_descriptor {
+    uint16_t typecode;          /* Used to demux the various types below */
+    uint16_t instance;          /* Further demux within a type */
+    uint32_t length;            /* In bytes, *not* including this descriptor */
+};
+
+
+/* 
+ * Each entry has a datatype associated with it: for example, the CPU state 
+ * is saved as a HVM_SAVE_TYPE(CPU), which has HVM_SAVE_LENGTH(CPU), 
+ * and is identified by a descriptor with typecode HVM_SAVE_CODE(CPU).
+ * DECLARE_HVM_SAVE_TYPE binds these things together with some type-system
+ * ugliness.
+ */
+
+#ifdef __XEN__
+# define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix)     \
+    static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { return _fix(h); } \
+    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];}; \
+    struct __HVM_SAVE_TYPE_COMPAT_##_x { _ctype t; }                   
+
+# include <xen/lib.h> /* BUG() */
+# define DECLARE_HVM_SAVE_TYPE(_x, _code, _type)                         \
+    static inline int __HVM_SAVE_FIX_COMPAT_##_x(void *h) { BUG(); return -1; } \
+    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];}; \
+    struct __HVM_SAVE_TYPE_COMPAT_##_x { _type t; }                   
+#else
+# define DECLARE_HVM_SAVE_TYPE_COMPAT(_x, _code, _type, _ctype, _fix)     \
+    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[2];} 
+
+# define DECLARE_HVM_SAVE_TYPE(_x, _code, _type)                         \
+    struct __HVM_SAVE_TYPE_##_x { _type t; char c[_code]; char cpt[1];} 
+#endif
+
+#define HVM_SAVE_TYPE(_x) typeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->t)
+#define HVM_SAVE_LENGTH(_x) (sizeof (HVM_SAVE_TYPE(_x)))
+#define HVM_SAVE_CODE(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->c))
+
+#ifdef __XEN__
+# define HVM_SAVE_TYPE_COMPAT(_x) typeof (((struct __HVM_SAVE_TYPE_COMPAT_##_x *)(0))->t)
+# define HVM_SAVE_LENGTH_COMPAT(_x) (sizeof (HVM_SAVE_TYPE_COMPAT(_x)))
+
+# define HVM_SAVE_HAS_COMPAT(_x) (sizeof (((struct __HVM_SAVE_TYPE_##_x *)(0))->cpt)-1)
+# define HVM_SAVE_FIX_COMPAT(_x, _dst) __HVM_SAVE_FIX_COMPAT_##_x(_dst)
+#endif
+
+/* 
+ * The series of save records is terminated by a zero-type, zero-length
+ * descriptor.
+ */
+
+struct hvm_save_end {};
+DECLARE_HVM_SAVE_TYPE(END, 0, struct hvm_save_end);
+
+#if defined(__i386__) || defined(__x86_64__)
+#include "../arch-x86/hvm/save.h"
+#elif defined(__ia64__)
+#include "../arch-ia64/hvm/save.h"
+#elif defined(__arm__)
+#include "../arch-arm/hvm/save.h"
+#else
+#error "unsupported architecture"
+#endif
+
+#endif /* __XEN_PUBLIC_HVM_SAVE_H__ */
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h

index ee338bf..446741a 100644 (file)
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -3,7 +3,26 @@
   *
   * Unified block-device I/O interface for Xen guest OSes.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2003-2004, Keir Fraser
+ * Copyright (c) 2012, Spectra Logic Corporation
   */
  
  #ifndef __XEN_PUBLIC_IO_BLKIF_H__
@@ -24,8 +43,316 @@
   * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()).
   */
  
-typedef uint16_t blkif_vdev_t;
-typedef uint64_t blkif_sector_t;
+#ifndef blkif_vdev_t
+#define blkif_vdev_t   uint16_t
+#endif
+#define blkif_sector_t uint64_t
+
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen block driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * All data in the XenStore is stored as strings.  Nodes specifying numeric
+ * values are encoded in decimal.  Integer value ranges listed below are
+ * expressed as fixed sized integer types capable of storing the conversion
+ * of a properly formatted node string, without loss of information.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ * XenStore nodes marked "DEPRECATED" in their notes section should only be
+ * used to provide interoperability with legacy implementations.
+ *
+ * See the XenBus state transition diagram below for details on when XenBus
+ * nodes must be published and when they can be queried.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * mode
+ *      Values:         "r" (read only), "w" (writable)
+ *
+ *      The read or write access permissions to the backing store to be
+ *      granted to the frontend.
+ *
+ * params
+ *      Values:         string
+ *
+ *      A free formatted string providing sufficient information for the
+ *      backend driver to open the backing device.  (e.g. the path to the
+ *      file or block device representing the backing store.)
+ *
+ * type
+ *      Values:         "file", "phy", "tap"
+ *
+ *      The type of the backing device/object.
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-barrier
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_WRITE_BARRIER request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-flush-cache
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_FLUSH_DISKCACHE request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ * feature-discard
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process requests
+ *      containing the BLKIF_OP_DISCARD request opcode.  Requests
+ *      of this type may still be returned at any time with the
+ *      BLKIF_RSP_EOPNOTSUPP result code.
+ *
+ *----------------------- Request Transport Parameters ------------------------
+ *
+ * max-ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          1, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * max-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Notes:          DEPRECATED, 2, 3
+ *
+ *      The maximum supported size of the request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ *------------------------- Backend Device Properties -------------------------
+ *
+ * discard-alignment
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Notes:          4, 5
+ *
+ *      The offset, in bytes from the beginning of the virtual block device,
+ *      to the first, addressable, discard extent on the underlying device.
+ *
+ * discard-granularity
+ *      Values:         <uint32_t>
+ *      Default Value:  <"sector-size">
+ *      Notes:          4
+ *
+ *      The size, in bytes, of the individually addressable discard extents
+ *      of the underlying device.
+ *
+ * discard-secure
+ *      Values:         0/1 (boolean)
+ *      Default Value:  0
+ *
+ *      A value of "1" indicates that the backend can process BLKIF_OP_DISCARD
+ *      requests with the BLKIF_DISCARD_SECURE flag set.
+ *
+ * info
+ *      Values:         <uint32_t> (bitmap)
+ *
+ *      A collection of bit flags describing attributes of the backing
+ *      device.  The VDISK_* macros define the meaning of each bit
+ *      location.
+ *
+ * sector-size
+ *      Values:         <uint32_t>
+ *
+ *      The native sector size, in bytes, of the backend device.
+ *
+ * sectors
+ *      Values:         <uint64_t>
+ *
+ *      The size of the backend device, expressed in units of its native
+ *      sector size ("sector-size").
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         <uint32_t>
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * ring-ref%u
+ *      Values:         <uint32_t>
+ *      Notes:          6
+ *
+ *      For a frontend providing a multi-page ring, a "number of ring pages"
+ *      sized list of nodes, each containing a Xen grant reference granting
+ *      permission for the backend to map the page of the ring located
+ *      at page index "%u".  Page indexes are zero based.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ *
+ * ring-page-order
+ *      Values:         <uint32_t>
+ *      Default Value:  0
+ *      Maximum Value:  MAX(ffs(max-ring-pages) - 1, max-ring-page-order)
+ *      Notes:          1, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units
+ *      of lb(machine pages). (e.g. 0 == 1 page, 1 == 2 pages, 2 == 4 pages,
+ *      etc.).
+ *
+ * num-ring-pages
+ *      Values:         <uint32_t>
+ *      Default Value:  1
+ *      Maximum Value:  MAX(max-ring-pages,(0x1 << max-ring-page-order))
+ *      Notes:          DEPRECATED, 2, 3
+ *
+ *      The size of the frontend allocated request ring buffer in units of
+ *      machine pages.  The value must be a power of 2.
+ *
+ *------------------------- Virtual Device Properties -------------------------
+ *
+ * device-type
+ *      Values:         "disk", "cdrom", "floppy", etc.
+ *
+ * virtual-device
+ *      Values:         <uint32_t>
+ *
+ *      A value indicating the physical device to virtualize within the
+ *      frontend's domain.  (e.g. "The first ATA disk", "The third SCSI
+ *      disk", etc.)
+ *
+ *      See docs/misc/vbd-interface.txt for details on the format of this
+ *      value.
+ *
+ * Notes
+ * -----
+ * (1) Multi-page ring buffer scheme first developed in the Citrix XenServer
+ *     PV drivers.
+ * (2) Multi-page ring buffer scheme first used in some RedHat distributions
+ *     including a distribution deployed on certain nodes of the Amazon
+ *     EC2 cluster.
+ * (3) Support for multi-page ring buffers was implemented independently,
+ *     in slightly different forms, by both Citrix and RedHat/Amazon.
+ *     For full interoperability, block front and backends should publish
+ *     identical ring parameters, adjusted for unit differences, to the
+ *     XenStore nodes used in both schemes.
+ * (4) Devices that support discard functionality may internally allocate
+ *     space (discardable extents) in units that are larger than the
+ *     exported logical block size.
+ * (5) The discard-alignment parameter allows a physical device to be
+ *     partitioned into virtual devices that do not necessarily begin or
+ *     end on a discardable extent boundary.
+ * (6) When there is only a single page allocated to the request ring,
+ *     'ring-ref' is used to communicate the grant reference for this
+ *     page to the backend.  When using a multi-page ring, the 'ring-ref'
+ *     node is not created.  Instead 'ring-ref0' - 'ring-refN' are used.
+ */
+
+/*
+ * STATE DIAGRAMS
+ *
+ *****************************************************************************
+ *                                   Startup                                 *
+ *****************************************************************************
+ *
+ * Tool stack creates front and back nodes with state XenbusStateInitialising.
+ *
+ * Front                                Back
+ * =================================    =====================================
+ * XenbusStateInitialising              XenbusStateInitialising
+ *  o Query virtual device               o Query backend device identification
+ *    properties.                          data.
+ *  o Setup OS device instance.          o Open and validate backend device.
+ *                                       o Publish backend features and
+ *                                         transport parameters.
+ *                                                      |
+ *                                                      |
+ *                                                      V
+ *                                      XenbusStateInitWait
+ *
+ * o Query backend features and
+ *   transport parameters.
+ * o Allocate and initialize the
+ *   request ring.
+ * o Publish transport parameters
+ *   that will be in effect during
+ *   this connection.
+ *              |
+ *              |
+ *              V
+ * XenbusStateInitialised
+ *
+ *                                       o Query frontend transport parameters.
+ *                                       o Connect to the request ring and
+ *                                         event channel.
+ *                                       o Publish backend device properties.
+ *                                                      |
+ *                                                      |
+ *                                                      V
+ *                                      XenbusStateConnected
+ *
+ *  o Query backend device properties.
+ *  o Finalize OS virtual device
+ *    instance.
+ *              |
+ *              |
+ *              V
+ * XenbusStateConnected
+ *
+ * Note: Drivers that do not support any optional features, or the negotiation
+ *       of transport parameters, can skip certain states in the state machine:
+ *
+ *       o A frontend may transition to XenbusStateInitialised without
+ *         waiting for the backend to enter XenbusStateInitWait.  In this
+ *         case, default transport parameters are in effect and any
+ *         transport parameters published by the frontend must contain
+ *         their default values.
+ *
+ *       o A backend may transition to XenbusStateInitialised, bypassing
+ *         XenbusStateInitWait, without waiting for the frontend to first
+ *         enter the XenbusStateInitialised state.  In this case, default
+ *         transport parameters are in effect and any transport parameters
+ *         published by the backend must contain their default values.
+ *
+ *       Drivers that support optional features and/or transport parameter
+ *       negotiation must tolerate these additional state transition paths.
+ *       In general this means performing the work of any skipped state
+ *       transition, if it has not already been performed, in addition to the
+ *       work associated with entry into the current state.
+ */
  
  /*
   * REQUEST CODES.
@@ -33,72 +360,44 @@ typedef uint64_t blkif_sector_t;
  #define BLKIF_OP_READ              0
  #define BLKIF_OP_WRITE             1
  /*
- * Recognised only if "feature-barrier" is present in backend xenbus info.
- * The "feature_barrier" node contains a boolean indicating whether barrier
- * requests are likely to succeed or fail. Either way, a barrier request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt barrier requests.
- * If a backend does not recognise BLKIF_OP_WRITE_BARRIER, it should *not*
- * create the "feature-barrier" node!
+ * All writes issued prior to a request with the BLKIF_OP_WRITE_BARRIER
+ * operation code ("barrier request") must be completed prior to the
+ * execution of the barrier request.  All writes issued after the barrier
+ * request must not execute until after the completion of the barrier request.
+ *
+ * Optional.  See "feature-barrier" XenBus node documentation above.
   */
  #define BLKIF_OP_WRITE_BARRIER     2
-
  /*
- * Recognised if "feature-flush-cache" is present in backend xenbus
- * info.  A flush will ask the underlying storage hardware to flush its
- * non-volatile caches as appropriate.  The "feature-flush-cache" node
- * contains a boolean indicating whether flush requests are likely to
- * succeed or fail. Either way, a flush request may fail at any time
- * with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
- * block-device hardware. The boolean simply indicates whether or not it
- * is worthwhile for the frontend to attempt flushes.  If a backend does
- * not recognise BLKIF_OP_WRITE_FLUSH_CACHE, it should *not* create the
- * "feature-flush-cache" node!
+ * Commit any uncommitted contents of the backing device's volatile cache
+ * to stable storage.
+ *
+ * Optional.  See "feature-flush-cache" XenBus node documentation above.
   */
  #define BLKIF_OP_FLUSH_DISKCACHE   3
-
  /*
- * Recognised only if "feature-discard" is present in backend xenbus info.
- * The "feature-discard" node contains a boolean indicating whether trim
- * (ATA) or unmap (SCSI) - conviently called discard requests are likely
- * to succeed or fail. Either way, a discard request
- * may fail at any time with BLKIF_RSP_EOPNOTSUPP if it is unsupported by
- * the underlying block-device hardware. The boolean simply indicates whether
- * or not it is worthwhile for the frontend to attempt discard requests.
- * If a backend does not recognise BLKIF_OP_DISCARD, it should *not*
- * create the "feature-discard" node!
- *
- * Discard operation is a request for the underlying block device to mark
- * extents to be erased. However, discard does not guarantee that the blocks
- * will be erased from the device - it is just a hint to the device
- * controller that these blocks are no longer in use. What the device
- * controller does with that information is left to the controller.
- * Discard operations are passed with sector_number as the
- * sector index to begin discard operations at and nr_sectors as the number of
- * sectors to be discarded. The specified sectors should be discarded if the
- * underlying block device supports trim (ATA) or unmap (SCSI) operations,
- * or a BLKIF_RSP_EOPNOTSUPP  should be returned.
- * More information about trim/unmap operations at:
+ * Device specific command packet contained within the request
+ */
+#define BLKIF_OP_PACKET            4
+/*
+ * Indicate to the backend device that a region of storage is no longer in
+ * use, and may be discarded at any time without impact to the client.  If
+ * the BLKIF_DISCARD_SECURE flag is set on the request, all copies of the
+ * discarded region on the device must be rendered unrecoverable before the
+ * command returns.
+ *
+ * This operation is analogous to performing a trim (ATA) or unmap (SCSI)
+ * command on a native device.
+ *
+ * More information about trim/unmap operations can be found at:
   * http://t13.org/Documents/UploadedDocuments/docs2008/
   *     e07154r6-Data_Set_Management_Proposal_for_ATA-ACS2.doc
   * http://www.seagate.com/staticfiles/support/disc/manuals/
   *     Interface%20manuals/100293068c.pdf
- * The backend can optionally provide three extra XenBus attributes to
- * further optimize the discard functionality:
- * 'discard-aligment' - Devices that support discard functionality may
- * internally allocate space in units that are bigger than the exported
- * logical block size. The discard-alignment parameter indicates how many bytes
- * the beginning of the partition is offset from the internal allocation unit's
- * natural alignment.
- * 'discard-granularity'  - Devices that support discard functionality may
- * internally allocate space using units that are bigger than the logical block
- * size. The discard-granularity parameter indicates the size of the internal
- * allocation unit in bytes if reported by the device. Otherwise the
- * discard-granularity will be set to match the device's physical block size.
- * 'discard-secure' - All copies of the discarded sectors (potentially created
- * by garbage collection) must also be erased.  To use this feature, the flag
- * BLKIF_DISCARD_SECURE must be set in the blkif_request_trim.
+ *
+ * Optional.  See "feature-discard", "discard-alignment",
+ * "discard-granularity", and "discard-secure" in the XenBus node
+ * documentation above.
   */
  #define BLKIF_OP_DISCARD           5
  
@@ -109,48 +408,84 @@ typedef uint64_t blkif_sector_t;
   */
  #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
  
-struct blkif_request_rw {
-       uint8_t        nr_segments;  /* number of segments                   */
-       blkif_vdev_t   handle;       /* only for read/write requests         */
+/*
+ * NB. first_sect and last_sect in blkif_request_segment, as well as
+ * sector_number in blkif_request, are always expressed in 512-byte units.
+ * However they must be properly aligned to the real sector size of the
+ * physical disk, which is reported in the "sector-size" node in the backend
+ * xenbus info. Also the xenbus "sectors" node is expressed in 512-byte units.
+ */
+struct blkif_request_segment {
+    grant_ref_t gref;        /* reference to I/O buffer frame        */
+    /* @first_sect: first sector in frame to transfer (inclusive).   */
+    /* @last_sect: last sector in frame to transfer (inclusive).     */
+    uint8_t     first_sect, last_sect;
+};
+
+/*
+ * Starting ring element for any I/O request.
+ */
+struct blkif_request {
+    uint8_t        operation;    /* BLKIF_OP_???                         */
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+    uint8_t        nr_segments;  /* number of segments                   */
+    blkif_vdev_t   handle;       /* only for read/write requests         */
+    uint64_t       id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+    struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+};
+#else
+    union {
+        struct __attribute__((__packed__)) blkif_request_rw {
+            uint8_t        nr_segments;  /* number of segments                  */
+            blkif_vdev_t   handle;       /* only for read/write requests        */
  #ifdef CONFIG_X86_64
-       uint32_t       _pad1;        /* offsetof(blkif_request,u.rw.id) == 8 */
+            uint32_t       _pad1;        /* offsetof(blkif_request,u.rw.id) == 8 */
  #endif
-       uint64_t       id;           /* private guest value, echoed in resp  */
-       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
-       struct blkif_request_segment {
-               grant_ref_t gref;        /* reference to I/O buffer frame        */
-               /* @first_sect: first sector in frame to transfer (inclusive).   */
-               /* @last_sect: last sector in frame to transfer (inclusive).     */
-               uint8_t     first_sect, last_sect;
-       } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-} __attribute__((__packed__));
-
-struct blkif_request_discard {
-       uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero.        */
-#define BLKIF_DISCARD_SECURE (1<<0)  /* ignored if discard-secure=0          */
-       blkif_vdev_t   _pad1;        /* only for read/write requests         */
+            uint64_t       id;           /* private guest value, echoed in resp */
+            blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+            struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+        } rw;
+        struct __attribute__((__packed__)) blkif_request_discard {
+            uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero.        */
+#define BLKIF_DISCARD_SECURE (1<<0)      /* ignored if discard-secure=0          */
+            blkif_vdev_t   _pad1;        /* only for read/write requests         */
  #ifdef CONFIG_X86_64
-       uint32_t       _pad2;        /* offsetof(blkif_req..,u.discard.id)==8*/
+            uint32_t       _pad2;        /* offsetof(blkif_req..,u.discard.id)==8*/
  #endif
-       uint64_t       id;           /* private guest value, echoed in resp  */
-       blkif_sector_t sector_number;
-       uint64_t       nr_sectors;
-       uint8_t        _pad3;
+            uint64_t       id;           /* private guest value, echoed in resp  */
+            blkif_sector_t sector_number;
+            uint64_t       nr_sectors;
+            uint8_t        _pad3;
+        } discard;
+    } u;
  } __attribute__((__packed__));
+#endif
+typedef struct blkif_request blkif_request_t;
  
-struct blkif_request {
-       uint8_t        operation;    /* BLKIF_OP_???                         */
-       union {
-               struct blkif_request_rw rw;
-               struct blkif_request_discard discard;
-       } u;
-} __attribute__((__packed__));
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/*
+ * Cast to this structure when blkif_request.operation == BLKIF_OP_DISCARD
+ * sizeof(struct blkif_request_discard) <= sizeof(struct blkif_request)
+ */
+struct blkif_request_discard {
+    uint8_t        operation;    /* BLKIF_OP_DISCARD                     */
+    uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+#define BLKIF_DISCARD_SECURE (1<<0)  /* ignored if discard-secure=0      */
+    blkif_vdev_t   handle;       /* same as for read/write requests      */
+    uint64_t       id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk             */
+    uint64_t       nr_sectors;   /* number of contiguous sectors to discard*/
+};
+typedef struct blkif_request_discard blkif_request_discard_t;
+#endif
  
  struct blkif_response {
-       uint64_t        id;              /* copied from request */
-       uint8_t         operation;       /* copied from request */
-       int16_t         status;          /* BLKIF_RSP_???       */
+    uint64_t        id;              /* copied from request */
+    uint8_t         operation;       /* copied from request */
+    int16_t         status;          /* BLKIF_RSP_???       */
  };
+typedef struct blkif_response blkif_response_t;
  
  /*
   * STATUS RETURN CODES.
@@ -165,7 +500,6 @@ struct blkif_response {
  /*
   * Generate blkif ring structures and types.
   */
-
  DEFINE_RING_TYPES(blkif, struct blkif_request, struct blkif_response);
  
  #define VDISK_CDROM        0x1
diff --git a/include/xen/interface/io/cdromif.h b/include/xen/interface/io/cdromif.h

new file mode 100644 (file)

index 0000000..b691056
--- /dev/null
+++ b/include/xen/interface/io/cdromif.h
@@ -0,0 +1,120 @@
+/******************************************************************************
+ * cdromif.h
+ *
+ * Shared definitions between backend driver and Xen guest Virtual CDROM
+ * block device.
+ *
+ * Copyright (c) 2008, Pat Campell  plc@novell.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_IO_CDROMIF_H__
+#define __XEN_PUBLIC_IO_CDROMIF_H__
+
+/*
+ * Queries backend for CDROM support
+ */
+#define XEN_TYPE_CDROM_SUPPORT         _IO('c', 1)
+
+struct xen_cdrom_support
+{
+       uint32_t type;
+       int8_t ret;                  /* returned, 0 succeeded, -1 error */
+       int8_t err;                  /* returned, backend errno */
+       int8_t supported;            /* returned, 1 supported */
+};
+
+/*
+ * Opens backend device, returns drive geometry or
+ * any encountered errors
+ */
+#define XEN_TYPE_CDROM_OPEN            _IO('c', 2)
+
+struct xen_cdrom_open
+{
+       uint32_t type;
+       int8_t ret;
+       int8_t err;
+       int8_t pad;
+       int8_t media_present;        /* returned */
+       uint32_t sectors;            /* returned */
+       uint32_t sector_size;        /* returned */
+       int32_t payload_offset;      /* offset to backend node name payload */
+};
+
+/*
+ * Queries backend for media changed status
+ */
+#define XEN_TYPE_CDROM_MEDIA_CHANGED   _IO('c', 3)
+
+struct xen_cdrom_media_changed
+{
+       uint32_t type;
+       int8_t ret;
+       int8_t err;
+       int8_t media_changed;        /* returned */
+};
+
+/*
+ * Sends vcd generic CDROM packet to backend, followed
+ * immediately by the vcd_generic_command payload
+ */
+#define XEN_TYPE_CDROM_PACKET          _IO('c', 4)
+
+struct xen_cdrom_packet
+{
+       uint32_t type;
+       int8_t ret;
+       int8_t err;
+       int8_t pad[2];
+       int32_t payload_offset;      /* offset to vcd_generic_command payload */
+};
+
+/* CDROM_PACKET_COMMAND, payload for XEN_TYPE_CDROM_PACKET */
+struct vcd_generic_command
+{
+       uint8_t  cmd[CDROM_PACKET_SIZE];
+       uint8_t  pad[4];
+       uint32_t buffer_offset;
+       uint32_t buflen;
+       int32_t  stat;
+       uint32_t sense_offset;
+       uint8_t  data_direction;
+       uint8_t  pad1[3];
+       int32_t  quiet;
+       int32_t  timeout;
+};
+
+union xen_block_packet
+{
+       uint32_t type;
+       struct xen_cdrom_support xcs;
+       struct xen_cdrom_open xco;
+       struct xen_cdrom_media_changed xcmc;
+       struct xen_cdrom_packet xcp;
+};
+
+#define PACKET_PAYLOAD_OFFSET (sizeof(struct xen_cdrom_packet))
+#define PACKET_SENSE_OFFSET (PACKET_PAYLOAD_OFFSET + sizeof(struct vcd_generic_command))
+#define PACKET_BUFFER_OFFSET (PACKET_SENSE_OFFSET + sizeof(struct request_sense))
+#define MAX_PACKET_DATA (PAGE_SIZE - sizeof(struct xen_cdrom_packet) - \
+            sizeof(struct vcd_generic_command) - sizeof(struct request_sense))
+
+#endif
diff --git a/include/xen/interface/io/console.h b/include/xen/interface/io/console.h

index e563de7..70906df 100644 (file)
--- a/include/xen/interface/io/console.h
+++ b/include/xen/interface/io/console.h
@@ -3,6 +3,24 @@
   *
   * Console I/O interface for Xen guest OSes.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2005, Keir Fraser
   */
  
diff --git a/include/xen/interface/io/fbif.h b/include/xen/interface/io/fbif.h

index 974a51e..4d6dbca 100644 (file)
--- a/include/xen/interface/io/fbif.h
+++ b/include/xen/interface/io/fbif.h
@@ -77,13 +77,32 @@ union xenfb_out_event {
  
  /*
   * Frontends should ignore unknown in events.
- * No in events currently defined.
   */
  
+/*
+ * Framebuffer refresh period advice
+ * Backend sends it to advise the frontend their preferred period of
+ * refresh.  Frontends that keep the framebuffer constantly up-to-date
+ * just ignore it.  Frontends that use the advice should immediately
+ * refresh the framebuffer (and send an update notification event if
+ * those have been requested), then use the update frequency to guide
+ * their periodic refreshes.
+ */
+#define XENFB_TYPE_REFRESH_PERIOD 1
+#define XENFB_NO_REFRESH 0
+
+struct xenfb_refresh_period
+{
+    uint8_t type;    /* XENFB_TYPE_REFRESH_PERIOD */
+    uint32_t period; /* period of refresh, in ms,
+                      * XENFB_NO_REFRESH if no refresh is needed */
+};
+
  #define XENFB_IN_EVENT_SIZE 40
  
  union xenfb_in_event {
         uint8_t type;
+       struct xenfb_refresh_period refresh_period;
         char pad[XENFB_IN_EVENT_SIZE];
  };
  
@@ -127,7 +146,12 @@ struct xenfb_page {
          * Should be enough for a while with room leftover for
          * expansion.
          */
+#ifndef CONFIG_PARAVIRT_XEN
         unsigned long pd[256];
+#else
+       /* Two directory pages should be enough for a while. */
+       unsigned long pd[2];
+#endif
  };
  
  /*
diff --git a/include/xen/interface/io/fsif.h b/include/xen/interface/io/fsif.h

new file mode 100644 (file)

index 0000000..8fc2174
--- /dev/null
+++ b/include/xen/interface/io/fsif.h
@@ -0,0 +1,192 @@
+/******************************************************************************
+ * fsif.h
+ * 
+ * Interface to FS level split device drivers.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007, Grzegorz Milos, <gm281@cam.ac.uk>.
+ */
+
+#ifndef __XEN_PUBLIC_IO_FSIF_H__
+#define __XEN_PUBLIC_IO_FSIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+#define REQ_FILE_OPEN        1
+#define REQ_FILE_CLOSE       2
+#define REQ_FILE_READ        3
+#define REQ_FILE_WRITE       4
+#define REQ_STAT             5
+#define REQ_FILE_TRUNCATE    6
+#define REQ_REMOVE           7
+#define REQ_RENAME           8
+#define REQ_CREATE           9
+#define REQ_DIR_LIST        10
+#define REQ_CHMOD           11
+#define REQ_FS_SPACE        12
+#define REQ_FILE_SYNC       13
+
+struct fsif_open_request {
+    grant_ref_t gref;
+};
+
+struct fsif_close_request {
+    uint32_t fd;
+};
+
+struct fsif_read_request {
+    uint32_t fd;
+    int32_t pad;
+    uint64_t len;
+    uint64_t offset;
+    grant_ref_t grefs[1];  /* Variable length */
+};
+
+struct fsif_write_request {
+    uint32_t fd;
+    int32_t pad;
+    uint64_t len;
+    uint64_t offset;
+    grant_ref_t grefs[1];  /* Variable length */
+};
+
+struct fsif_stat_request {
+    uint32_t fd;
+};
+
+/* This structure is a copy of some fields from stat structure, returned
+ * via the ring. */
+struct fsif_stat_response {
+    int32_t  stat_mode;
+    uint32_t stat_uid;
+    uint32_t stat_gid;
+    int32_t  stat_ret;
+    int64_t  stat_size;
+    int64_t  stat_atime;
+    int64_t  stat_mtime;
+    int64_t  stat_ctime;
+};
+
+struct fsif_truncate_request {
+    uint32_t fd;
+    int32_t pad;
+    int64_t length;
+};
+
+struct fsif_remove_request {
+    grant_ref_t gref;
+};
+
+struct fsif_rename_request {
+    uint16_t old_name_offset;
+    uint16_t new_name_offset;
+    grant_ref_t gref;
+};
+
+struct fsif_create_request {
+    int8_t directory;
+    int8_t pad;
+    int16_t pad2;
+    int32_t mode;
+    grant_ref_t gref;
+};
+
+struct fsif_list_request {
+    uint32_t offset;
+    grant_ref_t gref;
+};
+
+#define NR_FILES_SHIFT  0
+#define NR_FILES_SIZE   16   /* 16 bits for the number of files mask */
+#define NR_FILES_MASK   (((1ULL << NR_FILES_SIZE) - 1) << NR_FILES_SHIFT)
+#define ERROR_SIZE      32   /* 32 bits for the error mask */
+#define ERROR_SHIFT     (NR_FILES_SIZE + NR_FILES_SHIFT)
+#define ERROR_MASK      (((1ULL << ERROR_SIZE) - 1) << ERROR_SHIFT)
+#define HAS_MORE_SHIFT  (ERROR_SHIFT + ERROR_SIZE)    
+#define HAS_MORE_FLAG   (1ULL << HAS_MORE_SHIFT)
+
+struct fsif_chmod_request {
+    uint32_t fd;
+    int32_t mode;
+};
+
+struct fsif_space_request {
+    grant_ref_t gref;
+};
+
+struct fsif_sync_request {
+    uint32_t fd;
+};
+
+
+/* FS operation request */
+struct fsif_request {
+    uint8_t type;                 /* Type of the request                  */
+    uint8_t pad;
+    uint16_t id;                  /* Request ID, copied to the response   */
+    uint32_t pad2;
+    union {
+        struct fsif_open_request     fopen;
+        struct fsif_close_request    fclose;
+        struct fsif_read_request     fread;
+        struct fsif_write_request    fwrite;
+        struct fsif_stat_request     fstat;
+        struct fsif_truncate_request ftruncate;
+        struct fsif_remove_request   fremove;
+        struct fsif_rename_request   frename;
+        struct fsif_create_request   fcreate;
+        struct fsif_list_request     flist;
+        struct fsif_chmod_request    fchmod;
+        struct fsif_space_request    fspace;
+        struct fsif_sync_request     fsync;
+    } u;
+};
+typedef struct fsif_request fsif_request_t;
+
+/* FS operation response */
+struct fsif_response {
+    uint16_t id;
+    uint16_t pad1;
+    uint32_t pad2;
+    union {
+        uint64_t ret_val;
+        struct fsif_stat_response fstat;
+    } u;
+};
+
+typedef struct fsif_response fsif_response_t;
+
+#define FSIF_RING_ENTRY_SIZE   64
+
+#define FSIF_NR_READ_GNTS  ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_read_request)) /  \
+                                sizeof(grant_ref_t) + 1)
+#define FSIF_NR_WRITE_GNTS ((FSIF_RING_ENTRY_SIZE - sizeof(struct fsif_write_request)) / \
+                                sizeof(grant_ref_t) + 1)
+
+DEFINE_RING_TYPES(fsif, struct fsif_request, struct fsif_response);
+
+#define STATE_INITIALISED     "init"
+#define STATE_READY           "ready"
+#define STATE_CLOSING         "closing"
+#define STATE_CLOSED          "closed"
+
+
+#endif
diff --git a/include/xen/interface/io/libxenvchan.h b/include/xen/interface/io/libxenvchan.h

new file mode 100644 (file)

index 0000000..5c3d3d4
--- /dev/null
+++ b/include/xen/interface/io/libxenvchan.h
@@ -0,0 +1,97 @@
+/**
+ * @file
+ * @section AUTHORS
+ *
+ * Copyright (C) 2010  Rafal Wojtczuk  <rafal@invisiblethingslab.com>
+ *
+ *  Authors:
+ *       Rafal Wojtczuk  <rafal@invisiblethingslab.com>
+ *       Daniel De Graaf <dgdegra@tycho.nsa.gov>
+ *
+ * @section LICENSE
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ * @section DESCRIPTION
+ *
+ *  Originally borrowed from the Qubes OS Project, http://www.qubes-os.org,
+ *  this code has been substantially rewritten to use the gntdev and gntalloc
+ *  devices instead of raw MFNs and map_foreign_range.
+ *
+ *  This is a library for inter-domain communication.  A standard Xen ring
+ *  buffer is used, with a datagram-based interface built on top.  The grant
+ *  reference and event channels are shared in XenStore under a user-specified
+ *  path.
+ *
+ *  The ring.h macros define an asymmetric interface to a shared data structure
+ *  that assumes all rings reside in a single contiguous memory space. This is
+ *  not suitable for vchan because the interface to the ring is symmetric except
+ *  for the setup. Unlike the producer-consumer rings defined in ring.h, the
+ *  size of the rings used in vchan are determined at execution time instead of
+ *  compile time, so the macros in ring.h cannot be used to access the rings.
+ */
+
+#include <stdint.h>
+#include <sys/types.h>
+
+struct ring_shared {
+       uint32_t cons, prod;
+};
+
+#define VCHAN_NOTIFY_WRITE 0x1
+#define VCHAN_NOTIFY_READ 0x2
+
+/**
+ * vchan_interface: primary shared data structure
+ */
+struct vchan_interface {
+       /**
+        * Standard consumer/producer interface, one pair per buffer
+        * left is client write, server read
+        * right is client read, server write
+        */
+       struct ring_shared left, right;
+       /**
+        * size of the rings, which determines their location
+        * 10   - at offset 1024 in ring's page
+        * 11   - at offset 2048 in ring's page
+        * 12+  - uses 2^(N-12) grants to describe the multi-page ring
+        * These should remain constant once the page is shared.
+        * Only one of the two orders can be 10 (or 11).
+        */
+       uint16_t left_order, right_order;
+       /**
+        * Shutdown detection:
+        *  0: client (or server) has exited
+        *  1: client (or server) is connected
+        *  2: client has not yet connected
+        */
+       uint8_t cli_live, srv_live;
+       /**
+        * Notification bits:
+        *  VCHAN_NOTIFY_WRITE: send notify when data is written
+        *  VCHAN_NOTIFY_READ: send notify when data is read (consumed)
+        * cli_notify is used for the client to inform the server of its action
+        */
+       uint8_t cli_notify, srv_notify;
+       /**
+        * Grant list: ordering is left, right. Must not extend into actual ring
+        * or grow beyond the end of the initial shared page.
+        * These should remain constant once the page is shared, to allow
+        * for possible remapping by a client that restarts.
+        */
+       uint32_t grants[0];
+};
+
diff --git a/include/xen/interface/io/netif.h b/include/xen/interface/io/netif.h

index cb94668..c4763fe 100644 (file)
--- a/include/xen/interface/io/netif.h
+++ b/include/xen/interface/io/netif.h
@@ -3,6 +3,24 @@
   *
   * Unified network-device I/O interface for Xen guest OSes.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2003-2004, Keir Fraser
   */
  
@@ -47,18 +65,21 @@
  #define _XEN_NETTXF_extra_info         (3)
  #define  XEN_NETTXF_extra_info         (1U<<_XEN_NETTXF_extra_info)
  
-struct xen_netif_tx_request {
+struct netif_tx_request {
      grant_ref_t gref;      /* Reference to buffer page */
      uint16_t offset;       /* Offset within buffer page */
      uint16_t flags;        /* XEN_NETTXF_* */
      uint16_t id;           /* Echoed in response message. */
      uint16_t size;         /* Packet size in bytes.       */
  };
+typedef struct netif_tx_request netif_tx_request_t;
  
-/* Types of xen_netif_extra_info descriptors. */
+/* Types of netif_extra_info descriptors. */
  #define XEN_NETIF_EXTRA_TYPE_NONE      (0)  /* Never used - invalid */
  #define XEN_NETIF_EXTRA_TYPE_GSO       (1)  /* u.gso */
-#define XEN_NETIF_EXTRA_TYPE_MAX       (2)
+#define XEN_NETIF_EXTRA_TYPE_MCAST_ADD (2)  /* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_MCAST_DEL (3)  /* u.mcast */
+#define XEN_NETIF_EXTRA_TYPE_MAX       (4)
  
  /* xen_netif_extra_info flags. */
  #define _XEN_NETIF_EXTRA_FLAG_MORE     (0)
@@ -71,11 +92,14 @@ struct xen_netif_tx_request {
   * This structure needs to fit within both netif_tx_request and
   * netif_rx_response for compatibility.
   */
-struct xen_netif_extra_info {
+struct netif_extra_info {
         uint8_t type;  /* XEN_NETIF_EXTRA_TYPE_* */
         uint8_t flags; /* XEN_NETIF_EXTRA_FLAG_* */
  
         union {
+               /*
+                * XEN_NETIF_EXTRA_TYPE_GSO:
+                */
                 struct {
                         /*
                          * Maximum payload size of each segment. For
@@ -101,19 +125,39 @@ struct xen_netif_extra_info {
                         uint16_t features; /* XEN_NETIF_GSO_FEAT_* */
                 } gso;
  
+               /*
+                * XEN_NETIF_EXTRA_TYPE_MCAST_{ADD,DEL}:
+                * Backend advertises availability via
+                * 'feature-multicast-control' xenbus node containing value
+                * '1'.
+                * Frontend requests this feature by advertising
+                * 'request-multicast-control' xenbus node containing value
+                * '1'. If multicast control is requested then multicast
+                * flooding is disabled and the frontend must explicitly
+                * register its interest in multicast groups using dummy
+                * transmit requests containing MCAST_{ADD,DEL} extra-info
+                * fragments.
+                */
+               struct {
+                       uint8_t addr[6]; /* Address to add/remove. */
+               } mcast;
+
                 uint16_t pad[3];
         } u;
  };
+typedef struct netif_extra_info netif_extra_info_t;
  
-struct xen_netif_tx_response {
+struct netif_tx_response {
         uint16_t id;
         int16_t  status;       /* XEN_NETIF_RSP_* */
  };
+typedef struct netif_tx_response netif_tx_response_t;
  
-struct xen_netif_rx_request {
+struct netif_rx_request {
         uint16_t    id;        /* Echoed in response message.        */
         grant_ref_t gref;      /* Reference to incoming granted frame */
  };
+typedef struct netif_rx_request netif_rx_request_t;
  
  /* Packet data has been validated against protocol checksum. */
  #define _XEN_NETRXF_data_validated     (0)
@@ -135,28 +179,39 @@ struct xen_netif_rx_request {
  #define _XEN_NETRXF_gso_prefix         (4)
  #define  XEN_NETRXF_gso_prefix         (1U<<_XEN_NETRXF_gso_prefix)
  
-struct xen_netif_rx_response {
+struct netif_rx_response {
      uint16_t id;
      uint16_t offset;       /* Offset in page of start of received packet  */
      uint16_t flags;        /* XEN_NETRXF_* */
      int16_t  status;       /* -ve: BLKIF_RSP_* ; +ve: Rx'ed pkt size. */
  };
+typedef struct netif_rx_response netif_rx_response_t;
  
  /*
   * Generate netif ring structures and types.
   */
  
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+DEFINE_RING_TYPES(netif_tx, struct netif_tx_request, struct netif_tx_response);
+DEFINE_RING_TYPES(netif_rx, struct netif_rx_request, struct netif_rx_response);
+#else
+#define xen_netif_tx_request netif_tx_request
+#define xen_netif_rx_request netif_rx_request
+#define xen_netif_tx_response netif_tx_response
+#define xen_netif_rx_response netif_rx_response
  DEFINE_RING_TYPES(xen_netif_tx,
                   struct xen_netif_tx_request,
                   struct xen_netif_tx_response);
  DEFINE_RING_TYPES(xen_netif_rx,
                   struct xen_netif_rx_request,
                   struct xen_netif_rx_response);
+#define xen_netif_extra_info netif_extra_info
+#endif
  
  #define XEN_NETIF_RSP_DROPPED  -2
  #define XEN_NETIF_RSP_ERROR    -1
  #define XEN_NETIF_RSP_OKAY      0
-/* No response: used for auxiliary requests (e.g., xen_netif_extra_info). */
+/* No response: used for auxiliary requests (e.g., netif_extra_info). */
  #define XEN_NETIF_RSP_NULL      1
  
  #endif
diff --git a/include/xen/interface/io/protocols.h b/include/xen/interface/io/protocols.h

index 01fc8ae..0682ddc 100644 (file)
--- a/include/xen/interface/io/protocols.h
+++ b/include/xen/interface/io/protocols.h
@@ -1,3 +1,25 @@
+/******************************************************************************
+ * protocols.h
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
  #ifndef __XEN_PROTOCOLS_H__
  #define __XEN_PROTOCOLS_H__
  
@@ -5,6 +27,7 @@
  #define XEN_IO_PROTO_ABI_X86_64     "x86_64-abi"
  #define XEN_IO_PROTO_ABI_IA64       "ia64-abi"
  #define XEN_IO_PROTO_ABI_POWERPC64  "powerpc64-abi"
+#define XEN_IO_PROTO_ABI_ARM        "arm-abi"
  
  #if defined(__i386__)
  # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_X86_32
@@ -14,6 +37,8 @@
  # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_IA64
  #elif defined(__powerpc64__)
  # define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_POWERPC64
+#elif defined(__arm__)
+# define XEN_IO_PROTO_ABI_NATIVE XEN_IO_PROTO_ABI_ARM
  #else
  # error arch fixup needed here
  #endif
diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h

index 75271b9..ae95c7b 100644 (file)
--- a/include/xen/interface/io/ring.h
+++ b/include/xen/interface/io/ring.h
@@ -3,12 +3,38 @@
   *
   * Shared producer-consumer ring macros.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Tim Deegan and Andrew Warfield November 2004.
   */
  
  #ifndef __XEN_PUBLIC_IO_RING_H__
  #define __XEN_PUBLIC_IO_RING_H__
  
+#include "../xen-compat.h"
+
+#if __XEN_INTERFACE_VERSION__ < 0x00030208
+#define xen_mb()  mb()
+#define xen_rmb() rmb()
+#define xen_wmb() wmb()
+#endif
+
  typedef unsigned int RING_IDX;
  
  /* Round a 32-bit unsigned constant down to the nearest power of two. */
@@ -38,34 +64,32 @@ typedef unsigned int RING_IDX;
   * Macros to make the correct C datatypes for a new kind of ring.
   *
   * To make a new ring datatype, you need to have two message structures,
- * let's say struct request, and struct response already defined.
+ * let's say request_t, and response_t already defined.
   *
   * In a header where you want the ring datatype declared, you then do:
   *
- *     DEFINE_RING_TYPES(mytag, struct request, struct response);
+ *     DEFINE_RING_TYPES(mytag, request_t, response_t);
   *
   * These expand out to give you a set of types, as you can see below.
   * The most important of these are:
   *
- *     struct mytag_sring      - The shared ring.
- *     struct mytag_front_ring - The 'front' half of the ring.
- *     struct mytag_back_ring  - The 'back' half of the ring.
+ *     mytag_sring_t      - The shared ring.
+ *     mytag_front_ring_t - The 'front' half of the ring.
+ *     mytag_back_ring_t  - The 'back' half of the ring.
   *
   * To initialize a ring in your code you need to know the location and size
   * of the shared memory area (PAGE_SIZE, for instance). To initialise
   * the front half:
   *
- *     struct mytag_front_ring front_ring;
- *     SHARED_RING_INIT((struct mytag_sring *)shared_page);
- *     FRONT_RING_INIT(&front_ring, (struct mytag_sring *)shared_page,
- *                    PAGE_SIZE);
+ *     mytag_front_ring_t front_ring;
+ *     SHARED_RING_INIT((mytag_sring_t *)shared_page);
+ *     FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
   *
   * Initializing the back follows similarly (note that only the front
   * initializes the shared ring):
   *
- *     struct mytag_back_ring back_ring;
- *     BACK_RING_INIT(&back_ring, (struct mytag_sring *)shared_page,
- *                   PAGE_SIZE);
+ *     mytag_back_ring_t back_ring;
+ *     BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
   */
  
  #define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                    \
@@ -80,7 +104,16 @@ union __name##_sring_entry {                                                \
  struct __name##_sring {                                                        \
      RING_IDX req_prod, req_event;                                      \
      RING_IDX rsp_prod, rsp_event;                                      \
-    uint8_t  pad[48];                                                  \
+    union {                                                            \
+        struct {                                                       \
+            uint8_t smartpoll_active;                                  \
+        } netif;                                                       \
+        struct {                                                       \
+            uint8_t msg;                                               \
+        } tapif_user;                                                  \
+        uint8_t pvt_pad[4];                                            \
+    } private;                                                         \
+    uint8_t __pad[44];                                                 \
      union __name##_sring_entry ring[1]; /* variable-length */          \
  };                                                                     \
                                                                         \
@@ -98,7 +131,12 @@ struct __name##_back_ring {                                         \
      RING_IDX req_cons;                                                 \
      unsigned int nr_ents;                                              \
      struct __name##_sring *sring;                                      \
-};
+};                                                                     \
+                                                                       \
+/* Syntactic sugar */                                                  \
+typedef struct __name##_sring __name##_sring_t;                        \
+typedef struct __name##_front_ring __name##_front_ring_t;              \
+typedef struct __name##_back_ring __name##_back_ring_t
  
  /*
   * Macros for manipulating rings.
@@ -119,7 +157,8 @@ struct __name##_back_ring {                                         \
  #define SHARED_RING_INIT(_s) do {                                      \
      (_s)->req_prod  = (_s)->rsp_prod  = 0;                             \
      (_s)->req_event = (_s)->rsp_event = 1;                             \
-    memset((_s)->pad, 0, sizeof((_s)->pad));                           \
+    (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \
+    (void)memset((_s)->__pad, 0, sizeof((_s)->__pad));                 \
  } while(0)
  
  #define FRONT_RING_INIT(_r, _s, __size) do {                           \
@@ -169,6 +208,7 @@ struct __name##_back_ring {                                         \
  #define RING_HAS_UNCONSUMED_RESPONSES(_r)                              \
      ((_r)->sring->rsp_prod - (_r)->rsp_cons)
  
+#ifdef __GNUC__
  #define RING_HAS_UNCONSUMED_REQUESTS(_r)                               \
      ({                                                                 \
         unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;      \
@@ -176,6 +216,14 @@ struct __name##_back_ring {                                                \
                            ((_r)->req_cons - (_r)->rsp_prod_pvt);       \
         req < rsp ? req : rsp;                                          \
      })
+#else
+/* Same as above, but without the nice GCC ({ ... }) syntax. */
+#define RING_HAS_UNCONSUMED_REQUESTS(_r)                               \
+    ((((_r)->sring->req_prod - (_r)->req_cons) <                       \
+      (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ?       \
+     ((_r)->sring->req_prod - (_r)->req_cons) :                        \
+     (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt)))
+#endif
  
  /* Direct access to individual ring elements, by index. */
  #define RING_GET_REQUEST(_r, _idx)                                     \
@@ -189,12 +237,12 @@ struct __name##_back_ring {                                               \
      (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
  
  #define RING_PUSH_REQUESTS(_r) do {                                    \
-    wmb(); /* back sees requests /before/ updated producer index */    \
+    xen_wmb(); /* back sees requests /before/ updated producer index */        \
      (_r)->sring->req_prod = (_r)->req_prod_pvt;                                \
  } while (0)
  
  #define RING_PUSH_RESPONSES(_r) do {                                   \
-    wmb(); /* front sees responses /before/ updated producer index */  \
+    xen_wmb(); /* front sees resps /before/ updated producer index */  \
      (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                                \
  } while (0)
  
@@ -231,9 +279,9 @@ struct __name##_back_ring {                                         \
  #define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do {          \
      RING_IDX __old = (_r)->sring->req_prod;                            \
      RING_IDX __new = (_r)->req_prod_pvt;                               \
-    wmb(); /* back sees requests /before/ updated producer index */    \
+    xen_wmb(); /* back sees requests /before/ updated producer index */        \
      (_r)->sring->req_prod = __new;                                     \
-    mb(); /* back sees new requests /before/ we check req_event */     \
+    xen_mb(); /* back sees new requests /before/ we check req_event */ \
      (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) <          \
                  (RING_IDX)(__new - __old));                            \
  } while (0)
@@ -241,9 +289,9 @@ struct __name##_back_ring {                                         \
  #define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do {         \
      RING_IDX __old = (_r)->sring->rsp_prod;                            \
      RING_IDX __new = (_r)->rsp_prod_pvt;                               \
-    wmb(); /* front sees responses /before/ updated producer index */  \
+    xen_wmb(); /* front sees resps /before/ updated producer index */  \
      (_r)->sring->rsp_prod = __new;                                     \
-    mb(); /* front sees new responses /before/ we check rsp_event */   \
+    xen_mb(); /* front sees new resps /before/ we check rsp_event */   \
      (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) <          \
                  (RING_IDX)(__new - __old));                            \
  } while (0)
@@ -252,7 +300,7 @@ struct __name##_back_ring {                                         \
      (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                  \
      if (_work_to_do) break;                                            \
      (_r)->sring->req_event = (_r)->req_cons + 1;                       \
-    mb();                                                              \
+    xen_mb();                                                          \
      (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r);                  \
  } while (0)
  
@@ -260,7 +308,7 @@ struct __name##_back_ring {                                         \
      (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                 \
      if (_work_to_do) break;                                            \
      (_r)->sring->rsp_event = (_r)->rsp_cons + 1;                       \
-    mb();                                                              \
+    xen_mb();                                                          \
      (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r);                 \
  } while (0)
  
diff --git a/include/xen/interface/io/tpmif.h b/include/xen/interface/io/tpmif.h

new file mode 100644 (file)

index 0000000..02ccdab
--- /dev/null
+++ b/include/xen/interface/io/tpmif.h
@@ -0,0 +1,77 @@
+/******************************************************************************
+ * tpmif.h
+ *
+ * TPM I/O interface for Xen guest OSes.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2005, IBM Corporation
+ *
+ * Author: Stefan Berger, stefanb@us.ibm.com
+ * Grant table support: Mahadevan Gomathisankaran
+ *
+ * This code has been derived from tools/libxc/xen/io/netif.h
+ *
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_TPMIF_H__
+#define __XEN_PUBLIC_IO_TPMIF_H__
+
+#include "../grant_table.h"
+
+struct tpmif_tx_request {
+    unsigned long addr;   /* Machine address of packet.   */
+    grant_ref_t ref;      /* grant table access reference */
+    uint16_t unused;
+    uint16_t size;        /* Packet size in bytes.        */
+};
+typedef struct tpmif_tx_request tpmif_tx_request_t;
+
+/*
+ * The TPMIF_TX_RING_SIZE defines the number of pages the
+ * front-end and backend can exchange (= size of array).
+ */
+typedef uint32_t TPMIF_RING_IDX;
+
+#define TPMIF_TX_RING_SIZE 1
+
+/* This structure must fit in a memory page. */
+
+struct tpmif_ring {
+    struct tpmif_tx_request req;
+};
+typedef struct tpmif_ring tpmif_ring_t;
+
+struct tpmif_tx_interface {
+    struct tpmif_ring ring[TPMIF_TX_RING_SIZE];
+};
+typedef struct tpmif_tx_interface tpmif_tx_interface_t;
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/io/usbif.h b/include/xen/interface/io/usbif.h

new file mode 100644 (file)

index 0000000..6099c29
--- /dev/null
+++ b/include/xen/interface/io/usbif.h
@@ -0,0 +1,151 @@
+/*
+ * usbif.h
+ *
+ * USB I/O interface for Xen guest OSes.
+ *
+ * Copyright (C) 2009, FUJITSU LABORATORIES LTD.
+ * Author: Noboru Iwamatsu <n_iwamatsu@jp.fujitsu.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_PUBLIC_IO_USBIF_H__
+#define __XEN_PUBLIC_IO_USBIF_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+enum usb_spec_version {
+       USB_VER_UNKNOWN = 0,
+       USB_VER_USB11,  /* = 1 */
+       USB_VER_USB20,  /* = 2 */
+       USB_VER_USB30,  /* = 3; not supported yet */
+};
+
+/*
+ *  USB pipe in usbif_request
+ *
+ *  Bits 0-5 are specific to the virtual USB driver.
+ *  Bits 7-31 follow the standard urb pipe layout.
+ *
+ *  - port number(NEW):        bits 0-4
+ *                             (USB_MAXCHILDREN is 31)
+ *
+ *  - operation flag(NEW):     bit 5
+ *                             (0 = submit urb,
+ *                              1 = unlink urb)
+ *
+ *  - direction:               bit 7
+ *                             (0 = Host-to-Device [Out],
+ *                              1 = Device-to-Host [In])
+ *
+ *  - device address:          bits 8-14
+ *
+ *  - endpoint:                bits 15-18
+ *
+ *  - pipe type:               bits 30-31
+ *                             (00 = isochronous, 01 = interrupt,
+ *                              10 = control, 11 = bulk)
+ */
+#define usbif_pipeportnum(pipe) ((pipe) & 0x1f) /* port number: bits 0-4 of pipe */
+#define usbif_setportnum_pipe(pipe, portnum) \
+       ((pipe)|(portnum)) /* ORs portnum into pipe; portnum is NOT masked to bits 0-4 */
+
+#define usbif_pipeunlink(pipe) ((pipe) & 0x20) /* nonzero when bit 5 (unlink) is set */
+#define usbif_pipesubmit(pipe) (!usbif_pipeunlink(pipe)) /* true when bit 5 is clear */
+#define usbif_setunlink_pipe(pipe) ((pipe)|(0x20)) /* pipe value with bit 5 (unlink) set */
+
+#define USBIF_BACK_MAX_PENDING_REQS (128)
+#define USBIF_MAX_SEGMENTS_PER_REQUEST (16) /* length of seg[] in struct usbif_urb_request */
+
+/*
+ * RING for transferring urbs.
+ */
+struct usbif_request_segment {
+       grant_ref_t gref; /* grant reference for this segment */
+       uint16_t offset; /* presumably byte offset into the granted page -- TODO confirm */
+       uint16_t length; /* presumably segment length in bytes -- TODO confirm */
+};
+
+struct usbif_urb_request {
+       uint16_t id; /* request id */
+       uint16_t nr_buffer_segs; /* number of urb->transfer_buffer segments */
+
+       /* basic urb parameter */
+       uint32_t pipe; /* bit layout: see "USB pipe in usbif_request" above */
+       uint16_t transfer_flags;
+       uint16_t buffer_length;
+       union { /* every member below is 8 bytes wide */
+               uint8_t ctrl[8]; /* setup_packet (Ctrl) */
+
+               struct {
+                       uint16_t interval; /* maximum (1024*8) in usb core */
+                       uint16_t start_frame; /* start frame */
+                       uint16_t number_of_packets; /* number of ISO packet */
+                       uint16_t nr_frame_desc_segs; /* number of iso_frame_desc segments */
+               } isoc; /* parameters for isochronous requests */
+
+               struct {
+                       uint16_t interval; /* maximum (1024*8) in usb core */
+                       uint16_t pad[3]; /* fills the member out to 8 bytes */
+               } intr; /* parameters for interrupt requests */
+
+               struct {
+                       uint16_t unlink_id; /* unlink request id */
+                       uint16_t pad[3]; /* fills the member out to 8 bytes */
+               } unlink; /* parameters for unlink requests */
+
+       } u;
+
+       /* urb data segments */
+       struct usbif_request_segment seg[USBIF_MAX_SEGMENTS_PER_REQUEST];
+};
+typedef struct usbif_urb_request usbif_urb_request_t;
+
+struct usbif_urb_response {
+       uint16_t id; /* request id */
+       uint16_t start_frame;  /* start frame (ISO) */
+       int32_t status; /* status (non-ISO) */
+       int32_t actual_length; /* actual transfer length */
+       int32_t error_count; /* number of ISO errors */
+};
+typedef struct usbif_urb_response usbif_urb_response_t;
+
+DEFINE_RING_TYPES(usbif_urb, struct usbif_urb_request, struct usbif_urb_response);
+#define USB_URB_RING_SIZE __CONST_RING_SIZE(usbif_urb, PAGE_SIZE)
+
+/*
+ * RING for notifying connect/disconnect events to frontend
+ */
+struct usbif_conn_request {
+       uint16_t id; /* request id */
+};
+typedef struct usbif_conn_request usbif_conn_request_t;
+
+struct usbif_conn_response {
+       uint16_t id; /* request id */
+       uint8_t portnum; /* port number */
+       uint8_t speed; /* usb_device_speed */
+};
+typedef struct usbif_conn_response usbif_conn_response_t;
+
+DEFINE_RING_TYPES(usbif_conn, struct usbif_conn_request, struct usbif_conn_response);
+#define USB_CONN_RING_SIZE __CONST_RING_SIZE(usbif_conn, PAGE_SIZE)
+
+#endif /* __XEN_PUBLIC_IO_USBIF_H__ */
diff --git a/include/xen/interface/io/vscsiif.h b/include/xen/interface/io/vscsiif.h

new file mode 100644 (file)

index 0000000..3ce2914
--- /dev/null
+++ b/include/xen/interface/io/vscsiif.h
@@ -0,0 +1,105 @@
+/******************************************************************************
+ * vscsiif.h
+ * 
+ * Based on the blkif.h code.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright(c) FUJITSU Limited 2008.
+ */
+
+#ifndef __XEN__PUBLIC_IO_SCSI_H__
+#define __XEN__PUBLIC_IO_SCSI_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/* command between backend and frontend */
+#define VSCSIIF_ACT_SCSI_CDB         1    /* SCSI CDB command */
+#define VSCSIIF_ACT_SCSI_ABORT       2    /* SCSI Device(Lun) Abort */
+#define VSCSIIF_ACT_SCSI_RESET       3    /* SCSI Device(Lun) Reset */
+
+
+#define VSCSIIF_BACK_MAX_PENDING_REQS    128
+
+/*
+ * Maximum scatter/gather segments per request.
+ *
+ * Balancing the goal of fitting at least 16 "vscsiif_request"
+ * structures in one page (4096 bytes) against the number of
+ * scatter/gather segments needed, we decided to use 26 as a magic number.
+ */
+#define VSCSIIF_SG_TABLESIZE             26
+
+/*
+ * Based on Linux kernel 2.6.18.
+ */
+#define VSCSIIF_MAX_COMMAND_SIZE         16
+#define VSCSIIF_SENSE_BUFFERSIZE         96
+
+
+struct vscsiif_request {
+    uint16_t rqid;          /* private guest value, echoed in resp  */
+    uint8_t act;            /* command between backend and frontend */
+    uint8_t cmd_len;        /* presumably the used length of cmnd[] -- TODO confirm */
+
+    uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
+    uint16_t timeout_per_command;     /* The backend issues the command
+                                         with twice this value. */
+    uint16_t channel, id, lun;
+    uint16_t padding;
+    uint8_t sc_data_direction;        /* for DMA_TO_DEVICE(1)
+                                         DMA_FROM_DEVICE(2)
+                                         DMA_NONE(3) requests  */
+    uint8_t nr_segments;              /* Number of scatter-gather segments */
+
+    struct scsiif_request_segment {
+        grant_ref_t gref;   /* grant reference for this segment */
+        uint16_t offset;    /* presumably byte offset into the page -- TODO confirm */
+        uint16_t length;    /* presumably length in bytes -- TODO confirm */
+    } seg[VSCSIIF_SG_TABLESIZE];
+    uint32_t reserved[3];
+};
+typedef struct vscsiif_request vscsiif_request_t;
+
+struct vscsiif_response {
+    uint16_t rqid;
+    uint8_t padding;
+    uint8_t sense_len;         /* presumably bytes used in sense_buffer -- TODO confirm */
+    uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+    int32_t rslt;
+    uint32_t residual_len;     /* request bufflen minus the value returned by
+                                  the physical device (NOTE(review): confirm) */
+    uint32_t reserved[36];
+};
+typedef struct vscsiif_response vscsiif_response_t;
+
+DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
+
+
+#endif  /*__XEN__PUBLIC_IO_SCSI_H__*/
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h

index 9fda532..b17afba 100644 (file)
--- a/include/xen/interface/io/xenbus.h
+++ b/include/xen/interface/io/xenbus.h
@@ -36,6 +36,7 @@ enum xenbus_state
  
         XenbusStateReconfigured  = 8
  };
+typedef enum xenbus_state XenbusState;
  
  #endif /* _XEN_PUBLIC_IO_XENBUS_H */
  
diff --git a/include/xen/interface/io/xs_wire.h b/include/xen/interface/io/xs_wire.h

index 7cdfca2..debb025 100644 (file)
--- a/include/xen/interface/io/xs_wire.h
+++ b/include/xen/interface/io/xs_wire.h
@@ -1,6 +1,25 @@
  /*
   * Details of the "wire" protocol between Xen Store Daemon and client
   * library or guest kernel.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (C) 2005 Rusty Russell IBM Corporation
   */
  
@@ -29,7 +48,8 @@ enum xsd_sockmsg_type
      XS_IS_DOMAIN_INTRODUCED,
      XS_RESUME,
      XS_SET_TARGET,
-    XS_RESTRICT
+    XS_RESTRICT,
+    XS_RESET_WATCHES
  };
  
  #define XS_WRITE_NONE "NONE"
@@ -42,8 +62,14 @@ struct xsd_errors
      int errnum;
      const char *errstring;
  };
+#ifdef EINVAL
  #define XSD_ERROR(x) { x, #x }
-static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
+/* LINTED: static unused */
+static struct xsd_errors xsd_errors[]
+#if defined(__GNUC__)
+__attribute__((unused))
+#endif
+    = {
      XSD_ERROR(EINVAL),
      XSD_ERROR(EACCES),
      XSD_ERROR(EEXIST),
@@ -59,6 +85,7 @@ static struct xsd_errors xsd_errors[] __attribute__((unused)) = {
      XSD_ERROR(EAGAIN),
      XSD_ERROR(EISCONN)
  };
+#endif
  
  struct xsd_sockmsg
  {
@@ -90,4 +117,8 @@ struct xenstore_domain_interface {
  /* Violating this is very bad.  See docs/misc/xenstore.txt. */
  #define XENSTORE_PAYLOAD_MAX 4096
  
+/* Violating these just gets you an error back */
+#define XENSTORE_ABS_PATH_MAX 3072
+#define XENSTORE_REL_PATH_MAX 2048
+
  #endif /* _XS_WIRE_H */
diff --git a/include/xen/interface/kexec.h b/include/xen/interface/kexec.h

new file mode 100644 (file)

index 0000000..0425222
--- /dev/null
+++ b/include/xen/interface/kexec.h
@@ -0,0 +1,168 @@
+/******************************************************************************
+ * kexec.h - Public portion
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ * 
+ * Xen port written by:
+ * - Simon 'Horms' Horman <horms@verge.net.au>
+ * - Magnus Damm <magnus@valinux.co.jp>
+ */
+
+#ifndef _XEN_PUBLIC_KEXEC_H
+#define _XEN_PUBLIC_KEXEC_H
+
+
+/* This file describes the Kexec / Kdump hypercall interface for Xen.
+ *
+ * Kexec under vanilla Linux allows a user to reboot the physical machine 
+ * into a new user-specified kernel. The Xen port extends this idea
+ * to allow rebooting of the machine from dom0. When kexec for dom0
+ * is used to reboot,  both the hypervisor and the domains get replaced
+ * with some other kernel. It is possible to kexec between vanilla
+ * Linux and Xen and back again. Xen to Xen works well too.
+ *
+ * The hypercall interface for kexec can be divided into three main
+ * types of hypercall operations:
+ *
+ * 1) Range information:
+ *    This is used by the dom0 kernel to ask the hypervisor about various 
+ *    address information. This information is needed to allow kexec-tools 
+ *    to fill in the ELF headers for /proc/vmcore properly.
+ *
+ * 2) Load and unload of images:
+ *    There are no big surprises here, the kexec binary from kexec-tools
+ *    runs in userspace in dom0. The tool loads/unloads data into the
+ *    dom0 kernel such as new kernel, initramfs and hypervisor. When
+ *    loaded the dom0 kernel performs a load hypercall operation, and
+ *    before releasing all page references the dom0 kernel calls unload.
+ *
+ * 3) Kexec operation:
+ *    This is used to start a previously loaded kernel.
+ */
+
+#include "xen.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#define KEXEC_XEN_NO_PAGES 17
+#endif
+
+/*
+ * Prototype for this hypercall is:
+ *  int kexec_op(int cmd, void *args)
+ * @cmd  == KEXEC_CMD_... 
+ *          KEXEC operation to perform
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+/*
+ * Kexec supports two types of operation:
+ * - kexec into a regular kernel, very similar to a standard reboot
+ *   - KEXEC_TYPE_DEFAULT is used to specify this type
+ * - kexec into a special "crash kernel", aka kexec-on-panic
+ *   - KEXEC_TYPE_CRASH is used to specify this type
+ *   - parts of our system may be broken at kexec-on-panic time
+ *     - the code should be kept as simple and self-contained as possible
+ */
+
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+
+
+/* The kexec implementation for Xen allows the user to load two
+ * types of kernels, KEXEC_TYPE_DEFAULT and KEXEC_TYPE_CRASH.
+ * All data needed for a kexec reboot is kept in one xen_kexec_image_t
+ * per "instance". The data mainly consists of machine address lists to pages
+ * together with destination addresses. The data in xen_kexec_image_t
+ * is passed to the "code page" which is one page of code that performs
+ * the final relocations before jumping to the new kernel.
+ */
+ 
+typedef struct xen_kexec_image {
+#if defined(__i386__) || defined(__x86_64__)
+    unsigned long page_list[KEXEC_XEN_NO_PAGES]; /* x86 builds only */
+#endif
+#if defined(__ia64__)
+    unsigned long reboot_code_buffer; /* ia64 builds only */
+#endif
+    unsigned long indirection_page; /* present on every architecture */
+    unsigned long start_address;    /* present on every architecture */
+} xen_kexec_image_t;
+
+/*
+ * Perform kexec having previously loaded a kexec or kdump kernel
+ * as appropriate.
+ * type == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ */
+#define KEXEC_CMD_kexec                 0
+typedef struct xen_kexec_exec {
+    int type;
+} xen_kexec_exec_t;
+
+/*
+ * Load/Unload kernel image for kexec or kdump.
+ * type  == KEXEC_TYPE_DEFAULT or KEXEC_TYPE_CRASH [in]
+ * image == relocation information for kexec (ignored for unload) [in]
+ */
+#define KEXEC_CMD_kexec_load            1
+#define KEXEC_CMD_kexec_unload          2
+typedef struct xen_kexec_load {
+    int type;
+    xen_kexec_image_t image;
+} xen_kexec_load_t;
+
+#define KEXEC_RANGE_MA_CRASH      0 /* machine address and size of crash area */
+#define KEXEC_RANGE_MA_XEN        1 /* machine address and size of Xen itself */
+#define KEXEC_RANGE_MA_CPU        2 /* machine address and size of a CPU note */
+#define KEXEC_RANGE_MA_XENHEAP    3 /* machine address and size of xenheap.
+                                     * Note that although this is adjacent
+                                     * to Xen it exists in a separate EFI
+                                     * region on ia64, and thus needs to be
+                                     * inserted into iomem_machine separately */
+#define KEXEC_RANGE_MA_BOOT_PARAM 4 /* machine address and size of
+                                     * the ia64_boot_param */
+#define KEXEC_RANGE_MA_EFI_MEMMAP 5 /* machine address and size of
+                                     * the EFI Memory Map */
+#define KEXEC_RANGE_MA_VMCOREINFO 6 /* machine address and size of vmcoreinfo */
+
+/*
+ * Find the address and size of certain memory areas
+ * range == KEXEC_RANGE_... [in]
+ * nr    == physical CPU number (starting from 0) if KEXEC_RANGE_MA_CPU [in]
+ * size  == number of bytes reserved in window [out]
+ * start == address of the first byte in the window [out]
+ */
+#define KEXEC_CMD_kexec_get_range       3
+typedef struct xen_kexec_range {
+    int range;
+    int nr;
+    unsigned long size;
+    unsigned long start;
+} xen_kexec_range_t;
+
+#endif /* _XEN_PUBLIC_KEXEC_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/mem_event.h b/include/xen/interface/mem_event.h

new file mode 100644 (file)

index 0000000..5d0bd4c
--- /dev/null
+++ b/include/xen/interface/mem_event.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * mem_event.h
+ *
+ * Memory event common structures.
+ *
+ * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _XEN_PUBLIC_MEM_EVENT_H
+#define _XEN_PUBLIC_MEM_EVENT_H
+
+#include "xen.h"
+#include "io/ring.h"
+
+/* Memory event flags */
+#define MEM_EVENT_FLAG_VCPU_PAUSED  (1 << 0)
+#define MEM_EVENT_FLAG_DROP_PAGE    (1 << 1)
+#define MEM_EVENT_FLAG_EVICT_FAIL   (1 << 2)
+#define MEM_EVENT_FLAG_FOREIGN      (1 << 3)
+#define MEM_EVENT_FLAG_DUMMY        (1 << 4)
+
+/* Reasons for the memory event request */
+#define MEM_EVENT_REASON_UNKNOWN     0    /* typical reason */
+#define MEM_EVENT_REASON_VIOLATION   1    /* access violation, GFN is address */
+#define MEM_EVENT_REASON_CR0         2    /* CR0 was hit: gfn is CR0 value */
+#define MEM_EVENT_REASON_CR3         3    /* CR3 was hit: gfn is CR3 value */
+#define MEM_EVENT_REASON_CR4         4    /* CR4 was hit: gfn is CR4 value */
+#define MEM_EVENT_REASON_INT3        5    /* int3 was hit: gla/gfn are RIP */
+#define MEM_EVENT_REASON_SINGLESTEP  6    /* single step was invoked: gla/gfn are RIP */
+
+typedef struct mem_event_st {
+    uint32_t flags;   /* presumably MEM_EVENT_FLAG_* bits -- TODO confirm */
+    uint32_t vcpu_id;
+
+    uint64_t gfn;     /* meaning depends on the reason; see MEM_EVENT_REASON_* */
+    uint64_t offset;
+    uint64_t gla; /* if gla_valid */
+
+    uint32_t p2mt;
+
+    uint16_t access_r:1;
+    uint16_t access_w:1;
+    uint16_t access_x:1;
+    uint16_t gla_valid:1;   /* qualifies the gla field above */
+    uint16_t available:12;  /* remaining 12 of the 16 bitfield bits */
+
+    uint16_t reason;  /* presumably one of MEM_EVENT_REASON_* -- TODO confirm */
+} mem_event_request_t, mem_event_response_t; /* one layout, two type names */
+
+DEFINE_RING_TYPES(mem_event, mem_event_request_t, mem_event_response_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h

index eac3ce1..5b6d032 100644 (file)
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -3,13 +3,31 @@
   *
   * Memory reservation and information.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
   */
  
  #ifndef __XEN_PUBLIC_MEMORY_H__
  #define __XEN_PUBLIC_MEMORY_H__
  
-#include <linux/spinlock.h>
+#include "xen.h"
  
  /*
   * Increase or decrease the specified domain's memory reservation. Returns a
@@ -19,6 +37,26 @@
  #define XENMEM_increase_reservation 0
  #define XENMEM_decrease_reservation 1
  #define XENMEM_populate_physmap     6
+
+#if __XEN_INTERFACE_VERSION__ >= 0x00030209
+/*
+ * Maximum # bits addressable by the user of the allocated region (e.g., I/O
+ * devices often have a 32-bit limitation even in 64-bit systems). If zero
+ * then the user has no addressing restriction. This field is not used by
+ * XENMEM_decrease_reservation.
+ */
+#define XENMEMF_address_bits(x)     (x)
+#define XENMEMF_get_address_bits(x) ((x) & 0xffu)
+/* NUMA node to allocate from. */
+#define XENMEMF_node(x)     (((x) + 1) << 8)
+#define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
+/* Flag to request allocation only from the node specified */
+#define XENMEMF_exact_node_request  (1<<17)
+#define XENMEMF_exact_node(n) (XENMEMF_node(n) | XENMEMF_exact_node_request)
+#endif
+
  struct xen_memory_reservation {
  
      /*
@@ -31,12 +69,16 @@ struct xen_memory_reservation {
       *   OUT: GMFN bases of extents that were allocated
       *   (NB. This command also updates the mach_to_phys translation table)
       */
-    GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
  
      /* Number of extents, and size/alignment of each (2^extent_order pages). */
-    unsigned long  nr_extents;
+    xen_ulong_t    nr_extents;
      unsigned int   extent_order;
  
+#if __XEN_INTERFACE_VERSION__ >= 0x00030209
+    /* XENMEMF flags. */
+    unsigned int   mem_flags;
+#else
      /*
       * Maximum # bits addressable by the user of the allocated region (e.g.,
       * I/O devices often have a 32-bit limitation even in 64-bit systems). If
@@ -44,15 +86,17 @@ struct xen_memory_reservation {
       * This field is not used by XENMEM_decrease_reservation.
       */
      unsigned int   address_bits;
+#endif
  
      /*
       * Domain whose reservation is being changed.
       * Unprivileged domains can specify only DOMID_SELF.
       */
      domid_t        domid;
-
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_memory_reservation);
+typedef struct xen_memory_reservation xen_memory_reservation_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_reservation_t);
  
  /*
   * An atomic exchange of memory pages. If return code is zero then
@@ -92,10 +136,12 @@ struct xen_memory_exchange {
       *     command will be non-zero.
       *  5. THIS FIELD MUST BE INITIALISED TO ZERO BY THE CALLER!
       */
-    unsigned long nr_exchanged;
+    xen_ulong_t nr_exchanged;
  };
-
  DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
+typedef struct xen_memory_exchange xen_memory_exchange_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_exchange_t);
+
  /*
   * Returns the maximum machine frame number of mapped RAM in this system.
   * This command always succeeds (it never returns an error code).
@@ -112,6 +158,11 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_exchange);
  #define XENMEM_maximum_reservation  4
  
  /*
+ * Returns the maximum GPFN in use by the guest, or -ve errcode on failure.
+ */
+#define XENMEM_maximum_gpfn         14
+
+/*
   * Returns a list of MFN bases of 2MB extents comprising the machine_to_phys
   * mapping table. Architectures which do not have a m2p table do not implement
   * this command.
@@ -130,7 +181,7 @@ struct xen_machphys_mfn_list {
       * any large discontiguities in the machine address space, 2MB gaps in
       * the machphys table will be represented by an MFN base of zero.
       */
-    GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
  
      /*
       * Number of extents written to the above array. This will be smaller
@@ -139,6 +190,8 @@ struct xen_machphys_mfn_list {
      unsigned int nr_extents;
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
+typedef struct xen_machphys_mfn_list xen_machphys_mfn_list_t;
+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mfn_list_t);
  
  /*
   * Returns the location in virtual address space of the machine_to_phys
@@ -148,10 +201,12 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mfn_list);
   */
  #define XENMEM_machphys_mapping     12
  struct xen_machphys_mapping {
-    unsigned long v_start, v_end; /* Start and end virtual addresses.   */
-    unsigned long max_mfn;        /* Maximum MFN that can be looked up. */
+    xen_ulong_t v_start, v_end; /* Start and end virtual addresses.   */
+    xen_ulong_t max_mfn;        /* Maximum MFN that can be looked up. */
  };
-DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping_t);
+DEFINE_GUEST_HANDLE_STRUCT(xen_machphys_mapping);
+typedef struct xen_machphys_mapping xen_machphys_mapping_t;
+DEFINE_XEN_GUEST_HANDLE(xen_machphys_mapping_t);
  
  /*
   * Sets the GPFN at which a particular page appears in the specified guest's
@@ -163,41 +218,46 @@ struct xen_add_to_physmap {
      /* Which domain to change the mapping for. */
      domid_t domid;
  
+    /* Number of pages to go through for gmfn_range */
+    uint16_t    size;
+
      /* Source mapping space. */
  #define XENMAPSPACE_shared_info 0 /* shared info page */
  #define XENMAPSPACE_grant_table 1 /* grant table page */
+#define XENMAPSPACE_gmfn        2 /* GMFN */
+#define XENMAPSPACE_gmfn_range  3 /* GMFN range */
      unsigned int space;
  
+#define XENMAPIDX_grant_table_status 0x80000000
+
      /* Index into source mapping space. */
-    unsigned long idx;
+    xen_ulong_t idx;
  
      /* GPFN where the source mapping page should appear. */
-    unsigned long gpfn;
+    xen_pfn_t     gpfn;
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap);
+typedef struct xen_add_to_physmap xen_add_to_physmap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
  
  /*
- * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
- * code on failure. This call only works for auto-translated guests.
+ * Unmaps the page appearing at a particular GPFN from the specified guest's
+ * pseudophysical address space.
+ * arg == addr of xen_remove_from_physmap_t.
   */
-#define XENMEM_translate_gpfn_list  8
-struct xen_translate_gpfn_list {
-    /* Which domain to translate for? */
+#define XENMEM_remove_from_physmap      15
+struct xen_remove_from_physmap {
+    /* Which domain to change the mapping for. */
      domid_t domid;
  
-    /* Length of list. */
-    unsigned long nr_gpfns;
-
-    /* List of GPFNs to translate. */
-    GUEST_HANDLE(ulong) gpfn_list;
-
-    /*
-     * Output list to contain MFN translations. May be the same as the input
-     * list (in which case each input GPFN is overwritten with the output MFN).
-     */
-    GUEST_HANDLE(ulong) mfn_list;
+    /* GPFN of the current mapping of the page. */
+    xen_pfn_t     gpfn;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xen_translate_gpfn_list);
+typedef struct xen_remove_from_physmap xen_remove_from_physmap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_remove_from_physmap_t);
+
+/*** REMOVED ***/
+/*#define XENMEM_translate_gpfn_list  8*/
  
  /*
   * Returns the pseudo-physical memory map as it was when the domain
@@ -217,9 +277,11 @@ struct xen_memory_map {
       * Entries in the buffer are in the same format as returned by the
       * BIOS INT 0x15 EAX=0xE820 call.
       */
-    GUEST_HANDLE(void) buffer;
+    XEN_GUEST_HANDLE(void) buffer;
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
+typedef struct xen_memory_map xen_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_memory_map_t);
  
  /*
   * Returns the real physical memory map. Passes the same structure as
@@ -228,10 +290,135 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
   */
  #define XENMEM_machine_memory_map   10
  
+/*
+ * Set the pseudo-physical memory map of a domain, as returned by
+ * XENMEM_memory_map.
+ * arg == addr of xen_foreign_memory_map_t.
+ */
+#define XENMEM_set_memory_map       13
+struct xen_foreign_memory_map {
+    domid_t domid;
+    struct xen_memory_map map;
+};
+typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
+DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
+
+#define XENMEM_set_pod_target       16
+#define XENMEM_get_pod_target       17
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;
+    /* OUT */
+    uint64_t tot_pages;
+    uint64_t pod_cache_pages;
+    uint64_t pod_entries;
+    /* IN */
+    domid_t domid;
+};
+typedef struct xen_pod_target xen_pod_target_t;
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+
+/*
+ * Get the number of MFNs saved through memory sharing.
+ * The call never fails.
+ */
+#define XENMEM_get_sharing_freed_pages    18
+#define XENMEM_get_sharing_shared_pages   19
+
+#define XENMEM_paging_op                    20
+#define XENMEM_paging_op_nominate           0
+#define XENMEM_paging_op_evict              1
+#define XENMEM_paging_op_prep               2
+
+#define XENMEM_access_op                    21
+#define XENMEM_access_op_resume             0
+
+struct xen_mem_event_op {
+    uint8_t     op;         /* XENMEM_*_op_* */
+    domid_t     domain;
+
+
+    /* PAGING_PREP IN: buffer to immediately fill page in */
+    uint64_aligned_t    buffer;
+    /* Other OPs */
+    uint64_aligned_t    gfn;           /* IN:  gfn of page being operated on */
+};
+typedef struct xen_mem_event_op xen_mem_event_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mem_event_op_t);
+
+#define XENMEM_sharing_op                   22
+#define XENMEM_sharing_op_nominate_gfn      0
+#define XENMEM_sharing_op_nominate_gref     1
+#define XENMEM_sharing_op_share             2
+#define XENMEM_sharing_op_resume            3
+#define XENMEM_sharing_op_debug_gfn         4
+#define XENMEM_sharing_op_debug_mfn         5
+#define XENMEM_sharing_op_debug_gref        6
+#define XENMEM_sharing_op_add_physmap       7
+#define XENMEM_sharing_op_audit             8
+
+#define XENMEM_SHARING_OP_S_HANDLE_INVALID  (-10)
+#define XENMEM_SHARING_OP_C_HANDLE_INVALID  (-9)
+
+/* The following allows sharing of grant refs. This is useful
+ * for sharing utilities sitting as "filters" in IO backends
+ * (e.g. memshr + blktap(2)). The IO backend is only exposed
+ * to grant references, and this allows sharing of the grefs */
+#define XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG   (1ULL << 62)
+
+#define XENMEM_SHARING_OP_FIELD_MAKE_GREF(field, val)  \
+    (field) = (XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG | (val))
+#define XENMEM_SHARING_OP_FIELD_IS_GREF(field)         \
+    ((field) & XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG)
+#define XENMEM_SHARING_OP_FIELD_GET_GREF(field)        \
+    ((field) & (~XENMEM_SHARING_OP_FIELD_IS_GREF_FLAG))
+
+struct xen_mem_sharing_op {
+    uint8_t     op;     /* XENMEM_sharing_op_* */
+    domid_t     domain;
+
+    union {
+        struct mem_sharing_op_nominate {  /* OP_NOMINATE_xxx           */
+            union {
+                uint64_aligned_t gfn;     /* IN: gfn to nominate       */
+                uint32_t      grant_ref;  /* IN: grant ref to nominate */
+            } u;
+            uint64_aligned_t  handle;     /* OUT: the handle           */
+        } nominate;
+        struct mem_sharing_op_share {     /* OP_SHARE/ADD_PHYSMAP */
+            uint64_aligned_t source_gfn;    /* IN: the gfn of the source page */
+            uint64_aligned_t source_handle; /* IN: handle to the source page */
+            uint64_aligned_t client_gfn;    /* IN: the client gfn */
+            uint64_aligned_t client_handle; /* IN: handle to the client page */
+            domid_t  client_domain; /* IN: the client domain id */
+        } share;
+        struct mem_sharing_op_debug {     /* OP_DEBUG_xxx */
+            union {
+                uint64_aligned_t gfn;      /* IN: gfn to debug          */
+                uint64_aligned_t mfn;      /* IN: mfn to debug          */
+                uint32_t gref;     /* IN: gref to debug         */
+            } u;
+        } debug;
+    } u;
+};
+typedef struct xen_mem_sharing_op xen_mem_sharing_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
+#ifndef CONFIG_XEN
+#include <linux/spinlock.h>
  
  /*
   * Prevent the balloon driver from changing the memory reservation
   * during a driver critical region.
   */
  extern spinlock_t xen_reservation_lock;
+#endif
+
  #endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/include/xen/interface/nmi.h b/include/xen/interface/nmi.h

new file mode 100644 (file)

index 0000000..2fd21d2
--- /dev/null
+++ b/include/xen/interface/nmi.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ * nmi.h
+ * 
+ * NMI callback registration and reason codes.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
+ */
+
+#ifndef __XEN_PUBLIC_NMI_H__
+#define __XEN_PUBLIC_NMI_H__
+
+#include "xen.h"
+
+/*
+ * NMI reason codes:
+ * Currently these are x86-specific, stored in arch_shared_info.nmi_reason.
+ */
+ /* I/O-check error reported via ISA port 0x61, bit 6. */
+#define _XEN_NMIREASON_io_error     0
+#define XEN_NMIREASON_io_error      (1UL << _XEN_NMIREASON_io_error)
+ /* Parity error reported via ISA port 0x61, bit 7. */
+#define _XEN_NMIREASON_parity_error 1
+#define XEN_NMIREASON_parity_error  (1UL << _XEN_NMIREASON_parity_error)
+ /* Unknown hardware-generated NMI. */
+#define _XEN_NMIREASON_unknown      2
+#define XEN_NMIREASON_unknown       (1UL << _XEN_NMIREASON_unknown)
+
+/*
+ * long nmi_op(unsigned int cmd, void *arg)
+ * NB. All ops return zero on success, else a negative error code.
+ */
+
+/*
+ * Register NMI callback for this (calling) VCPU. Currently this only makes
+ * sense for domain 0, vcpu 0. All other callers will receive EINVAL.
+ * arg == pointer to xennmi_callback structure.
+ */
+#define XENNMI_register_callback   0
+struct xennmi_callback {
+    unsigned long handler_address;
+    unsigned long pad;
+};
+typedef struct xennmi_callback xennmi_callback_t;
+DEFINE_XEN_GUEST_HANDLE(xennmi_callback_t);
+
+/*
+ * Deregister NMI callback for this (calling) VCPU.
+ * arg == NULL.
+ */
+#define XENNMI_unregister_callback 1
+
+#endif /* __XEN_PUBLIC_NMI_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h

index 9ce788d..7ff4ff9 100644 (file)
--- a/include/xen/interface/physdev.h
+++ b/include/xen/interface/physdev.h
@@ -21,6 +21,8 @@
  #ifndef __XEN_PUBLIC_PHYSDEV_H__
  #define __XEN_PUBLIC_PHYSDEV_H__
  
+#include "xen.h"
+
  /*
   * Prototype for this hypercall is:
   *  int physdev_op(int cmd, void *args)
@@ -37,6 +39,8 @@ struct physdev_eoi {
         /* IN */
         uint32_t irq;
  };
+typedef struct physdev_eoi physdev_eoi_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_eoi_t);
  
  /*
   * Register a shared page for the hypervisor to indicate whether the guest
@@ -56,8 +60,10 @@ struct physdev_eoi {
  #define PHYSDEVOP_pirq_eoi_gmfn_v2       28
  struct physdev_pirq_eoi_gmfn {
      /* IN */
-    unsigned long gmfn;
+    xen_pfn_t gmfn;
  };
+typedef struct physdev_pirq_eoi_gmfn physdev_pirq_eoi_gmfn_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pirq_eoi_gmfn_t);
  
  /*
   * Query the status of an IRQ line.
@@ -70,6 +76,8 @@ struct physdev_irq_status_query {
         /* OUT */
         uint32_t flags; /* XENIRQSTAT_* */
  };
+typedef struct physdev_irq_status_query physdev_irq_status_query_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_irq_status_query_t);
  
  /* Need to call PHYSDEVOP_eoi when the IRQ has been serviced? */
  #define _XENIRQSTAT_needs_eoi  (0)
@@ -88,6 +96,8 @@ struct physdev_set_iopl {
         /* IN */
         uint32_t iopl;
  };
+typedef struct physdev_set_iopl physdev_set_iopl_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_set_iopl_t);
  
  /*
   * Set the current VCPU's I/O-port permissions bitmap.
@@ -96,9 +106,15 @@ struct physdev_set_iopl {
  #define PHYSDEVOP_set_iobitmap          7
  struct physdev_set_iobitmap {
         /* IN */
+#if __XEN_INTERFACE_VERSION__ >= 0x00030205
+       XEN_GUEST_HANDLE(uint8) bitmap;
+#else
         uint8_t * bitmap;
+#endif
         uint32_t nr_ports;
  };
+typedef struct physdev_set_iobitmap physdev_set_iobitmap_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_set_iobitmap_t);
  
  /*
   * Read or write an IO-APIC register.
@@ -113,6 +129,8 @@ struct physdev_apic {
         /* IN or OUT */
         uint32_t value;
  };
+typedef struct physdev_apic physdev_apic_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_apic_t);
  
  /*
   * Allocate or free a physical upcall vector for the specified IRQ line.
@@ -126,6 +144,8 @@ struct physdev_irq {
         /* IN or OUT */
         uint32_t vector;
  };
+typedef struct physdev_irq physdev_irq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_irq_t);
  
  #define MAP_PIRQ_TYPE_MSI              0x0
  #define MAP_PIRQ_TYPE_GSI              0x1
@@ -150,6 +170,8 @@ struct physdev_map_pirq {
      /* IN */
      uint64_t table_base;
  };
+typedef struct physdev_map_pirq physdev_map_pirq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_map_pirq_t);
  
  #define PHYSDEVOP_unmap_pirq           14
  struct physdev_unmap_pirq {
@@ -157,6 +179,8 @@ struct physdev_unmap_pirq {
      /* IN */
      int pirq;
  };
+typedef struct physdev_unmap_pirq physdev_unmap_pirq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_unmap_pirq_t);
  
  #define PHYSDEVOP_manage_pci_add       15
  #define PHYSDEVOP_manage_pci_remove    16
@@ -165,6 +189,8 @@ struct physdev_manage_pci {
         uint8_t bus;
         uint8_t devfn;
  };
+typedef struct physdev_manage_pci physdev_manage_pci_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_t);
  
  #define PHYSDEVOP_restore_msi            19
  struct physdev_restore_msi {
@@ -172,6 +198,8 @@ struct physdev_restore_msi {
         uint8_t bus;
         uint8_t devfn;
  };
+typedef struct physdev_restore_msi physdev_restore_msi_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_restore_msi_t);
  
  #define PHYSDEVOP_manage_pci_add_ext   20
  struct physdev_manage_pci_ext {
@@ -185,6 +213,8 @@ struct physdev_manage_pci_ext {
                 uint8_t devfn;
         } physfn;
  };
+typedef struct physdev_manage_pci_ext physdev_manage_pci_ext_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_manage_pci_ext_t);
  
  /*
   * Argument to physdev_op_compat() hypercall. Superceded by new physdev_op()
@@ -200,6 +230,8 @@ struct physdev_op {
                 struct physdev_irq                   irq_op;
         } u;
  };
+typedef struct physdev_op physdev_op_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_op_t);
  
  #define PHYSDEVOP_setup_gsi    21
  struct physdev_setup_gsi {
@@ -210,12 +242,10 @@ struct physdev_setup_gsi {
      uint8_t polarity;
      /* IN */
  };
+typedef struct physdev_setup_gsi physdev_setup_gsi_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_setup_gsi_t);
  
-#define PHYSDEVOP_get_nr_pirqs    22
-struct physdev_nr_pirqs {
-    /* OUT */
-    uint32_t nr_pirqs;
-};
+/* leave PHYSDEVOP 22 free */
  
  /* type is MAP_PIRQ_TYPE_GSI or MAP_PIRQ_TYPE_MSI
   * the hypercall returns a free pirq */
@@ -226,6 +256,21 @@ struct physdev_get_free_pirq {
      /* OUT */
      uint32_t pirq;
  };
+typedef struct physdev_get_free_pirq physdev_get_free_pirq_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_get_free_pirq_t);
+
+#define XEN_PCI_MMCFG_RESERVED         0x1
+
+#define PHYSDEVOP_pci_mmcfg_reserved    24
+struct physdev_pci_mmcfg_reserved {
+    uint64_t address;
+    uint16_t segment;
+    uint8_t start_bus;
+    uint8_t end_bus;
+    uint32_t flags;
+};
+typedef struct physdev_pci_mmcfg_reserved physdev_pci_mmcfg_reserved_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_mmcfg_reserved_t);
  
  #define XEN_PCI_DEV_EXTFN              0x1
  #define XEN_PCI_DEV_VIRTFN             0x2
@@ -248,6 +293,8 @@ struct physdev_pci_device_add {
      uint32_t optarr[0];
  #endif
  };
+typedef struct physdev_pci_device_add physdev_pci_device_add_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_add_t);
  
  #define PHYSDEVOP_pci_device_remove     26
  #define PHYSDEVOP_restore_msi_ext       27
@@ -257,6 +304,8 @@ struct physdev_pci_device {
      uint8_t bus;
      uint8_t devfn;
  };
+typedef struct physdev_pci_device physdev_pci_device_t;
+DEFINE_XEN_GUEST_HANDLE(physdev_pci_device_t);
  
  /*
   * Notify that some PIRQ-bound event channels have been unmasked.
@@ -279,4 +328,10 @@ struct physdev_pci_device {
  #define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY XENIRQSTAT_needs_eoi
  #define PHYSDEVOP_IRQ_SHARED            XENIRQSTAT_shared
  
+#if __XEN_INTERFACE_VERSION__ < 0x00040200
+#define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v1
+#else
+#define PHYSDEVOP_pirq_eoi_gmfn PHYSDEVOP_pirq_eoi_gmfn_v2
+#endif
+
  #endif /* __XEN_PUBLIC_PHYSDEV_H__ */
diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h

index 486653f..18b7493 100644 (file)
--- a/include/xen/interface/platform.h
+++ b/include/xen/interface/platform.h
@@ -42,7 +42,8 @@ struct xenpf_settime {
         uint32_t nsecs;
         uint64_t system_time;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t);
+typedef struct xenpf_settime xenpf_settime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_settime_t);
  
  /*
   * Request memory range (@mfn, @mfn+@nr_mfns-1) to have type @type.
@@ -54,14 +55,15 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_settime_t);
  #define XENPF_add_memtype         31
  struct xenpf_add_memtype {
         /* IN variables. */
-       unsigned long mfn;
+       xen_pfn_t mfn;
         uint64_t nr_mfns;
         uint32_t type;
         /* OUT variables. */
         uint32_t handle;
         uint32_t reg;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_add_memtype_t);
+typedef struct xenpf_add_memtype xenpf_add_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_add_memtype_t);
  
  /*
   * Tear down an existing memory-range type. If @handle is remembered then it
@@ -76,7 +78,8 @@ struct xenpf_del_memtype {
         uint32_t handle;
         uint32_t reg;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_del_memtype_t);
+typedef struct xenpf_del_memtype xenpf_del_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_del_memtype_t);
  
  /* Read current type of an MTRR (x86-specific). */
  #define XENPF_read_memtype        33
@@ -84,19 +87,21 @@ struct xenpf_read_memtype {
         /* IN variables. */
         uint32_t reg;
         /* OUT variables. */
-       unsigned long mfn;
+       xen_pfn_t mfn;
         uint64_t nr_mfns;
         uint32_t type;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_read_memtype_t);
+typedef struct xenpf_read_memtype xenpf_read_memtype_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_read_memtype_t);
  
  #define XENPF_microcode_update    35
  struct xenpf_microcode_update {
         /* IN variables. */
-       GUEST_HANDLE(void) data;          /* Pointer to microcode data */
+       XEN_GUEST_HANDLE(const_void) data;/* Pointer to microcode data */
         uint32_t length;                  /* Length of microcode data. */
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_microcode_update_t);
+typedef struct xenpf_microcode_update xenpf_microcode_update_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_microcode_update_t);
  
  #define XENPF_platform_quirk      39
  #define QUIRK_NOIRQBALANCING      1 /* Do not restrict IO-APIC RTE targets */
@@ -106,12 +111,113 @@ struct xenpf_platform_quirk {
         /* IN variables. */
         uint32_t quirk_id;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_platform_quirk_t);
+typedef struct xenpf_platform_quirk xenpf_platform_quirk_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_platform_quirk_t);
+
+#define XENPF_efi_runtime_call    49
+#define XEN_EFI_get_time                      1
+#define XEN_EFI_set_time                      2
+#define XEN_EFI_get_wakeup_time               3
+#define XEN_EFI_set_wakeup_time               4
+#define XEN_EFI_get_next_high_monotonic_count 5
+#define XEN_EFI_get_variable                  6
+#define XEN_EFI_set_variable                  7
+#define XEN_EFI_get_next_variable_name        8
+#define XEN_EFI_query_variable_info           9
+#define XEN_EFI_query_capsule_capabilities   10
+#define XEN_EFI_update_capsule               11
+struct xenpf_efi_runtime_call {
+    uint32_t function;
+    /*
+     * This field is generally used for per sub-function flags (defined
+     * below), except for the XEN_EFI_get_next_high_monotonic_count case,
+     * where it holds the single returned value.
+     */
+    uint32_t misc;
+    unsigned long status;
+    union {
+#define XEN_EFI_GET_TIME_SET_CLEARS_NS 0x00000001
+        struct {
+            struct xenpf_efi_time {
+                uint16_t year;
+                uint8_t month;
+                uint8_t day;
+                uint8_t hour;
+                uint8_t min;
+                uint8_t sec;
+                uint32_t ns;
+                int16_t tz;
+                uint8_t daylight;
+            } time;
+            uint32_t resolution;
+            uint32_t accuracy;
+        } get_time;
+
+        struct xenpf_efi_time set_time;
+
+#define XEN_EFI_GET_WAKEUP_TIME_ENABLED 0x00000001
+#define XEN_EFI_GET_WAKEUP_TIME_PENDING 0x00000002
+        struct xenpf_efi_time get_wakeup_time;
+
+#define XEN_EFI_SET_WAKEUP_TIME_ENABLE      0x00000001
+#define XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY 0x00000002
+        struct xenpf_efi_time set_wakeup_time;
+
+#define XEN_EFI_VARIABLE_NON_VOLATILE       0x00000001
+#define XEN_EFI_VARIABLE_BOOTSERVICE_ACCESS 0x00000002
+#define XEN_EFI_VARIABLE_RUNTIME_ACCESS     0x00000004
+        struct {
+            XEN_GUEST_HANDLE(void) name;  /* UCS-2/UTF-16 string */
+            unsigned long size;
+            XEN_GUEST_HANDLE(void) data;
+            struct xenpf_efi_guid {
+                uint32_t data1;
+                uint16_t data2;
+                uint16_t data3;
+                uint8_t data4[8];
+            } vendor_guid;
+        } get_variable, set_variable;
+
+        struct {
+            unsigned long size;
+            XEN_GUEST_HANDLE(void) name;  /* UCS-2/UTF-16 string */
+            struct xenpf_efi_guid vendor_guid;
+        } get_next_variable_name;
+
+        struct {
+            uint32_t attr;
+            uint64_t max_store_size;
+            uint64_t remain_store_size;
+            uint64_t max_size;
+        } query_variable_info;
+
+        struct {
+            XEN_GUEST_HANDLE(void) capsule_header_array;
+            unsigned long capsule_count;
+            uint64_t max_capsule_size;
+            unsigned int reset_type;
+        } query_capsule_capabilities;
+
+        struct {
+            XEN_GUEST_HANDLE(void) capsule_header_array;
+            unsigned long capsule_count;
+            uint64_t sg_list; /* machine address */
+        } update_capsule;
+    } u;
+};
+typedef struct xenpf_efi_runtime_call xenpf_efi_runtime_call_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_efi_runtime_call_t);
  
  #define XENPF_firmware_info       50
  #define XEN_FW_DISK_INFO          1 /* from int 13 AH=08/41/48 */
  #define XEN_FW_DISK_MBR_SIGNATURE 2 /* from MBR offset 0x1b8 */
  #define XEN_FW_VBEDDC_INFO        3 /* from int 10 AX=4f15 */
+#define XEN_FW_EFI_INFO           4 /* from EFI */
+#define  XEN_FW_EFI_VERSION        0
+#define  XEN_FW_EFI_CONFIG_TABLE   1
+#define  XEN_FW_EFI_VENDOR         2
+#define  XEN_FW_EFI_MEM_INFO       3
+#define  XEN_FW_EFI_RT_VERSION     4
  struct xenpf_firmware_info {
         /* IN variables. */
         uint32_t type;
@@ -129,7 +235,7 @@ struct xenpf_firmware_info {
                         uint8_t legacy_sectors_per_track; /* %cl[5:0]: max sector #  */
                         /* Int13, Fn41: Get Device Parameters (as filled into %ds:%esi). */
                         /* NB. First uint16_t of buffer must be set to buffer size.      */
-                       GUEST_HANDLE(void) edd_params;
+                       XEN_GUEST_HANDLE(void) edd_params;
                 } disk_info; /* XEN_FW_DISK_INFO */
                 struct {
                         uint8_t device;                   /* bios device number  */
@@ -140,11 +246,30 @@ struct xenpf_firmware_info {
                         uint8_t capabilities;
                         uint8_t edid_transfer_time;
                         /* must refer to 128-byte buffer */
-                       GUEST_HANDLE(uchar) edid;
+                       XEN_GUEST_HANDLE(uint8) edid;
                 } vbeddc_info; /* XEN_FW_VBEDDC_INFO */
+        union xenpf_efi_info {
+            uint32_t version;
+            struct {
+                uint64_t addr;                /* EFI_CONFIGURATION_TABLE */
+                uint32_t nent;
+            } cfg;
+            struct {
+                uint32_t revision;
+                uint32_t bufsz;               /* input, in bytes */
+                XEN_GUEST_HANDLE(void) name;  /* UCS-2/UTF-16 string */
+            } vendor;
+            struct {
+                uint64_t addr;
+                uint64_t size;
+                uint64_t attr;
+                uint32_t type;
+            } mem;
+        } efi_info; /* XEN_FW_EFI_INFO */
         } u;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_firmware_info_t);
+typedef struct xenpf_firmware_info xenpf_firmware_info_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_firmware_info_t);
  
  #define XENPF_enter_acpi_sleep    51
  struct xenpf_enter_acpi_sleep {
@@ -154,7 +279,8 @@ struct xenpf_enter_acpi_sleep {
         uint32_t sleep_state;       /* Which state to enter (Sn). */
         uint32_t flags;             /* Must be zero. */
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_enter_acpi_sleep_t);
+typedef struct xenpf_enter_acpi_sleep xenpf_enter_acpi_sleep_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_enter_acpi_sleep_t);
  
  #define XENPF_change_freq         52
  struct xenpf_change_freq {
@@ -163,7 +289,8 @@ struct xenpf_change_freq {
         uint32_t cpu;   /* Physical cpu. */
         uint64_t freq;  /* New frequency (Hz). */
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t);
+typedef struct xenpf_change_freq xenpf_change_freq_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_change_freq_t);
  
  /*
   * Get idle times (nanoseconds since boot) for physical CPUs specified in the
@@ -177,17 +304,18 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_change_freq_t);
  struct xenpf_getidletime {
         /* IN/OUT variables */
         /* IN: CPUs to interrogate; OUT: subset of IN which are present */
-       GUEST_HANDLE(uchar) cpumap_bitmap;
+       XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
         /* IN variables */
         /* Size of cpumap bitmap. */
         uint32_t cpumap_nr_cpus;
         /* Must be indexable for every cpu in cpumap_bitmap. */
-       GUEST_HANDLE(uint64_t) idletime;
+       XEN_GUEST_HANDLE(uint64) idletime;
         /* OUT variables */
         /* System time when the idletime snapshots were taken. */
         uint64_t now;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
+typedef struct xenpf_getidletime xenpf_getidletime_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_getidletime_t);
  
  #define XENPF_set_processor_pminfo      54
  
@@ -201,6 +329,7 @@ DEFINE_GUEST_HANDLE_STRUCT(xenpf_getidletime_t);
  #define XEN_PM_PX   1
  #define XEN_PM_TX   2
  #define XEN_PM_PDC  3
+
  /* Px sub info type */
  #define XEN_PX_PCT   1
  #define XEN_PX_PSS   2
@@ -221,6 +350,8 @@ struct xen_processor_csd {
         uint32_t    num;         /* number of processors in same domain */
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_processor_csd);
+typedef struct xen_processor_csd xen_processor_csd_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_csd_t);
  
  struct xen_processor_cx {
         struct xen_power_register  reg; /* GAS for Cx trigger register */
@@ -228,9 +359,11 @@ struct xen_processor_cx {
         uint32_t    latency;  /* worst latency (ms) to enter/exit this cstate */
         uint32_t    power;    /* average power consumption(mW) */
         uint32_t    dpcnt;    /* number of dependency entries */
-       GUEST_HANDLE(xen_processor_csd) dp; /* NULL if no dependency */
+       XEN_GUEST_HANDLE(xen_processor_csd_t) dp; /* NULL if no dependency */
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_processor_cx);
+typedef struct xen_processor_cx xen_processor_cx_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_cx_t);
  
  struct xen_processor_flags {
         uint32_t bm_control:1;
@@ -243,7 +376,7 @@ struct xen_processor_flags {
  struct xen_processor_power {
         uint32_t count;  /* number of C state entries in array below */
         struct xen_processor_flags flags;  /* global flags of this processor */
-       GUEST_HANDLE(xen_processor_cx) states; /* supported c states */
+       XEN_GUEST_HANDLE(xen_processor_cx_t) states; /* supported c states */
  };
  
  struct xen_pct_register {
@@ -261,10 +394,12 @@ struct xen_processor_px {
         uint64_t power;      /* milliWatts */
         uint64_t transition_latency; /* microseconds */
         uint64_t bus_master_latency; /* microseconds */
-       uint64_t control;        /* control value */
-       uint64_t status;     /* success indicator */
+       uint64_t control;        /* control value */
+       uint64_t status;     /* success indicator */
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_processor_px);
+typedef struct xen_processor_px xen_processor_px_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_px_t);
  
  struct xen_psd_package {
         uint64_t num_entries;
@@ -280,11 +415,13 @@ struct xen_processor_performance {
         struct xen_pct_register control_register;
         struct xen_pct_register status_register;
         uint32_t state_count;     /* total available performance states */
-       GUEST_HANDLE(xen_processor_px) states;
+       XEN_GUEST_HANDLE(xen_processor_px_t) states;
         struct xen_psd_package domain_info;
         uint32_t shared_type;     /* coordination type of this processor */
  };
  DEFINE_GUEST_HANDLE_STRUCT(xen_processor_performance);
+typedef struct xen_processor_performance xen_processor_performance_t;
+DEFINE_XEN_GUEST_HANDLE(xen_processor_performance_t);
  
  struct xenpf_set_processor_pminfo {
         /* IN variables */
@@ -293,10 +430,16 @@ struct xenpf_set_processor_pminfo {
         union {
                 struct xen_processor_power          power;/* Cx: _CST/_CSD */
                 struct xen_processor_performance    perf; /* Px: _PPC/_PCT/_PSS/_PSD */
-               GUEST_HANDLE(uint32_t)              pdc;
+               XEN_GUEST_HANDLE(uint32)            pdc;  /* _PDC */
+#ifdef CONFIG_XEN
+       } u;
+#else
         };
+#endif
  };
  DEFINE_GUEST_HANDLE_STRUCT(xenpf_set_processor_pminfo);
+typedef struct xenpf_set_processor_pminfo xenpf_set_processor_pminfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_set_processor_pminfo_t);
  
  #define XENPF_get_cpuinfo 55
  struct xenpf_pcpuinfo {
@@ -312,7 +455,73 @@ struct xenpf_pcpuinfo {
         uint32_t apic_id;
         uint32_t acpi_id;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xenpf_pcpuinfo);
+typedef struct xenpf_pcpuinfo xenpf_pcpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_pcpuinfo_t);
+
+#define XENPF_get_cpu_version 48
+struct xenpf_pcpu_version {
+    /* IN */
+    uint32_t xen_cpuid;
+    /* OUT */
+    /* The maximum cpu_id that is present */
+    uint32_t max_present;
+    char vendor_id[12];
+    uint32_t family;
+    uint32_t model;
+    uint32_t stepping;
+};
+typedef struct xenpf_pcpu_version xenpf_pcpu_version_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_pcpu_version_t);
+
+#define XENPF_cpu_online    56
+#define XENPF_cpu_offline   57
+struct xenpf_cpu_ol
+{
+    uint32_t cpuid;
+};
+typedef struct xenpf_cpu_ol xenpf_cpu_ol_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_cpu_ol_t);
+
+#define XENPF_cpu_hotadd    58
+struct xenpf_cpu_hotadd
+{
+       uint32_t apic_id;
+       uint32_t acpi_id;
+       uint32_t pxm;
+};
+
+#define XENPF_mem_hotadd    59
+struct xenpf_mem_hotadd
+{
+    uint64_t spfn;
+    uint64_t epfn;
+    uint32_t pxm;
+    uint32_t flags;
+};
+
+#define XENPF_core_parking  60
+
+#define XEN_CORE_PARKING_SET 1
+#define XEN_CORE_PARKING_GET 2
+struct xenpf_core_parking {
+    /* IN variables */
+    uint32_t type;
+    /* IN variables:  set cpu nums expected to be idled */
+    /* OUT variables: get cpu nums actually idled */
+    uint32_t idle_nums;
+};
+typedef struct xenpf_core_parking xenpf_core_parking_t;
+DEFINE_XEN_GUEST_HANDLE(xenpf_core_parking_t);
+
+#define XENPF_get_cpu_freq        ('N' << 24)
+#define XENPF_get_cpu_freq_min    (XENPF_get_cpu_freq + 1)
+#define XENPF_get_cpu_freq_max    (XENPF_get_cpu_freq_min + 1)
+struct xenpf_get_cpu_freq {
+    /* IN variables */
+    uint32_t vcpu;
+    /* OUT variables */
+    uint32_t freq; /* in kHz */
+};
  
  struct xen_platform_op {
         uint32_t cmd;
@@ -324,15 +533,23 @@ struct xen_platform_op {
                 struct xenpf_read_memtype      read_memtype;
                 struct xenpf_microcode_update  microcode;
                 struct xenpf_platform_quirk    platform_quirk;
+               struct xenpf_efi_runtime_call  efi_runtime_call;
                 struct xenpf_firmware_info     firmware_info;
                 struct xenpf_enter_acpi_sleep  enter_acpi_sleep;
                 struct xenpf_change_freq       change_freq;
                 struct xenpf_getidletime       getidletime;
                 struct xenpf_set_processor_pminfo set_pminfo;
                 struct xenpf_pcpuinfo          pcpu_info;
+               struct xenpf_pcpu_version      pcpu_version;
+               struct xenpf_cpu_ol            cpu_ol;
+               struct xenpf_cpu_hotadd        cpu_add;
+               struct xenpf_mem_hotadd        mem_add;
+               struct xenpf_core_parking      core_parking;
+               struct xenpf_get_cpu_freq      get_cpu_freq;
                 uint8_t                        pad[128];
         } u;
  };
-DEFINE_GUEST_HANDLE_STRUCT(xen_platform_op_t);
+typedef struct xen_platform_op xen_platform_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_platform_op_t);
  
  #endif /* __XEN_PUBLIC_PLATFORM_H__ */
diff --git a/include/xen/interface/sched.h b/include/xen/interface/sched.h

index dd55dac..3531702 100644 (file)
--- a/include/xen/interface/sched.h
+++ b/include/xen/interface/sched.h
@@ -3,6 +3,24 @@
   *
   * Scheduler state interactions
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
   */
  
@@ -13,17 +31,17 @@
  
  /*
   * The prototype for this hypercall is:
- *  long sched_op_new(int cmd, void *arg)
+ *  long sched_op(int cmd, void *arg)
   * @cmd == SCHEDOP_??? (scheduler operation).
   * @arg == Operation-specific extra argument(s), as described below.
   *
- * **NOTE**:
   * Versions of Xen prior to 3.0.2 provide only the following legacy version
   * of this hypercall, supporting only the commands yield, block and shutdown:
   *  long sched_op(int cmd, unsigned long arg)
   * @cmd == SCHEDOP_??? (scheduler operation).
   * @arg == 0               (SCHEDOP_yield and SCHEDOP_block)
   *      == SHUTDOWN_* code (SCHEDOP_shutdown)
+ * This legacy version is available to new guests as sched_op_compat().
   */
  
  /*
@@ -50,6 +68,8 @@ struct sched_shutdown {
      unsigned int reason; /* SHUTDOWN_* */
  };
  DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
+typedef struct sched_shutdown sched_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_shutdown_t);
  
  /*
   * Poll a set of event-channel ports. Return when one or more are pending. An
@@ -58,11 +78,13 @@ DEFINE_GUEST_HANDLE_STRUCT(sched_shutdown);
   */
  #define SCHEDOP_poll        3
  struct sched_poll {
-    GUEST_HANDLE(evtchn_port_t) ports;
+    XEN_GUEST_HANDLE(evtchn_port_t) ports;
      unsigned int nr_ports;
      uint64_t timeout;
  };
  DEFINE_GUEST_HANDLE_STRUCT(sched_poll);
+typedef struct sched_poll sched_poll_t;
+DEFINE_XEN_GUEST_HANDLE(sched_poll_t);
  
  /*
   * Declare a shutdown for another domain. The main use of this function is
@@ -75,6 +97,8 @@ struct sched_remote_shutdown {
      domid_t domain_id;         /* Remote domain ID */
      unsigned int reason;       /* SHUTDOWN_xxx reason */
  };
+typedef struct sched_remote_shutdown sched_remote_shutdown_t;
+DEFINE_XEN_GUEST_HANDLE(sched_remote_shutdown_t);
  
  /*
   * Latch a shutdown code, so that when the domain later shuts down it
@@ -96,6 +120,8 @@ struct sched_watchdog {
      uint32_t id;                /* watchdog ID */
      uint32_t timeout;           /* timeout */
  };
+typedef struct sched_watchdog sched_watchdog_t;
+DEFINE_XEN_GUEST_HANDLE(sched_watchdog_t);
  
  /*
   * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
diff --git a/include/xen/interface/sysctl.h b/include/xen/interface/sysctl.h

new file mode 100644 (file)

index 0000000..88ec803
--- /dev/null
+++ b/include/xen/interface/sysctl.h
@@ -0,0 +1,654 @@
+/******************************************************************************
+ * sysctl.h
+ * 
+ * System management operations. For use by node control stack.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2002-2006, K Fraser
+ */
+
+#ifndef __XEN_PUBLIC_SYSCTL_H__
+#define __XEN_PUBLIC_SYSCTL_H__
+
+#if !defined(__XEN__) && !defined(__XEN_TOOLS__)
+#error "sysctl operations are intended for use by node control tools only"
+#endif
+
+#include "xen.h"
+#include "domctl.h"
+
+#define XEN_SYSCTL_INTERFACE_VERSION 0x00000009
+
+/*
+ * Read console content from Xen buffer ring.
+ */
+/* XEN_SYSCTL_readconsole */
+struct xen_sysctl_readconsole {
+    /* IN: Non-zero -> clear after reading. */
+    uint8_t clear;
+    /* IN: Non-zero -> start index specified by @index field. */
+    uint8_t incremental;
+    uint8_t pad0, pad1;
+    /*
+     * IN:  Start index for consuming from ring buffer (if @incremental);
+     * OUT: End index after consuming from ring buffer.
+     */
+    uint32_t index; 
+    /* IN: Virtual address to write console data. */
+    XEN_GUEST_HANDLE_64(char) buffer;
+    /* IN: Size of buffer; OUT: Bytes written to buffer. */
+    uint32_t count;
+};
+typedef struct xen_sysctl_readconsole xen_sysctl_readconsole_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_readconsole_t);
+
+/* Get trace buffers machine base address */
+/* XEN_SYSCTL_tbuf_op */
+struct xen_sysctl_tbuf_op {
+    /* IN variables */
+#define XEN_SYSCTL_TBUFOP_get_info     0
+#define XEN_SYSCTL_TBUFOP_set_cpu_mask 1
+#define XEN_SYSCTL_TBUFOP_set_evt_mask 2
+#define XEN_SYSCTL_TBUFOP_set_size     3
+#define XEN_SYSCTL_TBUFOP_enable       4
+#define XEN_SYSCTL_TBUFOP_disable      5
+    uint32_t cmd;
+    /* IN/OUT variables */
+    struct xenctl_cpumap cpu_mask;
+    uint32_t             evt_mask;
+    /* OUT variables */
+    uint64_aligned_t buffer_mfn;
+    uint32_t size;  /* Also an IN variable! */
+};
+typedef struct xen_sysctl_tbuf_op xen_sysctl_tbuf_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_tbuf_op_t);
+
+/*
+ * Get physical information about the host machine
+ */
+/* XEN_SYSCTL_physinfo */
+ /* (x86) The platform supports HVM guests. */
+#define _XEN_SYSCTL_PHYSCAP_hvm          0
+#define XEN_SYSCTL_PHYSCAP_hvm           (1u<<_XEN_SYSCTL_PHYSCAP_hvm)
+ /* (x86) The platform supports HVM-guest direct access to I/O devices. */
+#define _XEN_SYSCTL_PHYSCAP_hvm_directio 1
+#define XEN_SYSCTL_PHYSCAP_hvm_directio  (1u<<_XEN_SYSCTL_PHYSCAP_hvm_directio)
+struct xen_sysctl_physinfo {
+    uint32_t threads_per_core;
+    uint32_t cores_per_socket;
+    uint32_t nr_cpus;     /* # CPUs currently online */
+    uint32_t max_cpu_id;  /* Largest possible CPU ID on this host */
+    uint32_t nr_nodes;    /* # nodes currently online */
+    uint32_t max_node_id; /* Largest possible node ID on this host */
+    uint32_t cpu_khz;
+    uint64_aligned_t total_pages;
+    uint64_aligned_t free_pages;
+    uint64_aligned_t scrub_pages;
+    uint32_t hw_cap[8];
+
+    /* XEN_SYSCTL_PHYSCAP_??? */
+    uint32_t capabilities;
+};
+typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t);
+
+/*
+ * Get the ID of the current scheduler.
+ */
+/* XEN_SYSCTL_sched_id */
+struct xen_sysctl_sched_id {
+    /* OUT variable */
+    uint32_t sched_id;
+};
+typedef struct xen_sysctl_sched_id xen_sysctl_sched_id_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_sched_id_t);
+
+/* Interface for controlling Xen software performance counters. */
+/* XEN_SYSCTL_perfc_op */
+/* Sub-operations: */
+#define XEN_SYSCTL_PERFCOP_reset 1   /* Reset all counters to zero. */
+#define XEN_SYSCTL_PERFCOP_query 2   /* Get perfctr information. */
+struct xen_sysctl_perfc_desc {
+    char         name[80];             /* name of perf counter */
+    uint32_t     nr_vals;              /* number of values for this counter */
+};
+typedef struct xen_sysctl_perfc_desc xen_sysctl_perfc_desc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_desc_t);
+typedef uint32_t xen_sysctl_perfc_val_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_val_t);
+
+struct xen_sysctl_perfc_op {
+    /* IN variables. */
+    uint32_t       cmd;                /*  XEN_SYSCTL_PERFCOP_??? */
+    /* OUT variables. */
+    uint32_t       nr_counters;       /*  number of counter descriptions  */
+    uint32_t       nr_vals;           /*  number of values  */
+    /* counter information (or NULL) */
+    XEN_GUEST_HANDLE_64(xen_sysctl_perfc_desc_t) desc;
+    /* counter values (or NULL) */
+    XEN_GUEST_HANDLE_64(xen_sysctl_perfc_val_t) val;
+};
+typedef struct xen_sysctl_perfc_op xen_sysctl_perfc_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_perfc_op_t);
+
+/* XEN_SYSCTL_getdomaininfolist */
+struct xen_sysctl_getdomaininfolist {
+    /* IN variables. */
+    domid_t               first_domain;
+    uint32_t              max_domains;
+    XEN_GUEST_HANDLE_64(xen_domctl_getdomaininfo_t) buffer;
+    /* OUT variables. */
+    uint32_t              num_domains;
+};
+typedef struct xen_sysctl_getdomaininfolist xen_sysctl_getdomaininfolist_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getdomaininfolist_t);
+
+/* Inject debug keys into Xen. */
+/* XEN_SYSCTL_debug_keys */
+struct xen_sysctl_debug_keys {
+    /* IN variables. */
+    XEN_GUEST_HANDLE_64(char) keys;
+    uint32_t nr_keys;
+};
+typedef struct xen_sysctl_debug_keys xen_sysctl_debug_keys_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_debug_keys_t);
+
+/* Get physical CPU information. */
+/* XEN_SYSCTL_getcpuinfo */
+struct xen_sysctl_cpuinfo {
+    uint64_aligned_t idletime;
+};
+typedef struct xen_sysctl_cpuinfo xen_sysctl_cpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpuinfo_t); 
+struct xen_sysctl_getcpuinfo {
+    /* IN variables. */
+    uint32_t max_cpus;
+    XEN_GUEST_HANDLE_64(xen_sysctl_cpuinfo_t) info;
+    /* OUT variables. */
+    uint32_t nr_cpus;
+}; 
+typedef struct xen_sysctl_getcpuinfo xen_sysctl_getcpuinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_getcpuinfo_t); 
+
+/* XEN_SYSCTL_availheap */
+struct xen_sysctl_availheap {
+    /* IN variables. */
+    uint32_t min_bitwidth;  /* Smallest address width (zero if don't care). */
+    uint32_t max_bitwidth;  /* Largest address width (zero if don't care). */
+    int32_t  node;          /* NUMA node of interest (-1 for all nodes). */
+    /* OUT variables. */
+    uint64_aligned_t avail_bytes;/* Bytes available in the specified region. */
+};
+typedef struct xen_sysctl_availheap xen_sysctl_availheap_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_availheap_t);
+
+/* XEN_SYSCTL_get_pmstat */
+struct pm_px_val {
+    uint64_aligned_t freq;        /* Px core frequency */
+    uint64_aligned_t residency;   /* Px residency time */
+    uint64_aligned_t count;       /* Px transition count */
+};
+typedef struct pm_px_val pm_px_val_t;
+DEFINE_XEN_GUEST_HANDLE(pm_px_val_t);
+
+struct pm_px_stat {
+    uint8_t total;        /* total Px states */
+    uint8_t usable;       /* usable Px states */
+    uint8_t last;         /* last Px state */
+    uint8_t cur;          /* current Px state */
+    XEN_GUEST_HANDLE_64(uint64) trans_pt;   /* Px transition table */
+    XEN_GUEST_HANDLE_64(pm_px_val_t) pt;
+};
+typedef struct pm_px_stat pm_px_stat_t;
+DEFINE_XEN_GUEST_HANDLE(pm_px_stat_t);
+
+struct pm_cx_stat {
+    uint32_t nr;    /* entry nr in triggers & residencies, including C0 */
+    uint32_t last;  /* last Cx state */
+    uint64_aligned_t idle_time;                 /* idle time from boot */
+    XEN_GUEST_HANDLE_64(uint64) triggers;    /* Cx trigger counts */
+    XEN_GUEST_HANDLE_64(uint64) residencies; /* Cx residencies */
+    uint64_aligned_t pc2;
+    uint64_aligned_t pc3;
+    uint64_aligned_t pc6;
+    uint64_aligned_t pc7;
+    uint64_aligned_t cc3;
+    uint64_aligned_t cc6;
+    uint64_aligned_t cc7;
+};
+
+struct xen_sysctl_get_pmstat {
+#define PMSTAT_CATEGORY_MASK 0xf0
+#define PMSTAT_PX            0x10
+#define PMSTAT_CX            0x20
+#define PMSTAT_get_max_px    (PMSTAT_PX | 0x1)
+#define PMSTAT_get_pxstat    (PMSTAT_PX | 0x2)
+#define PMSTAT_reset_pxstat  (PMSTAT_PX | 0x3)
+#define PMSTAT_get_max_cx    (PMSTAT_CX | 0x1)
+#define PMSTAT_get_cxstat    (PMSTAT_CX | 0x2)
+#define PMSTAT_reset_cxstat  (PMSTAT_CX | 0x3)
+    uint32_t type;
+    uint32_t cpuid;
+    union {
+        struct pm_px_stat getpx;
+        struct pm_cx_stat getcx;
+        /* other struct for tx, etc */
+    } u;
+};
+typedef struct xen_sysctl_get_pmstat xen_sysctl_get_pmstat_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_get_pmstat_t);
+
+/* XEN_SYSCTL_cpu_hotplug */
+struct xen_sysctl_cpu_hotplug {
+    /* IN variables */
+    uint32_t cpu;   /* Physical cpu. */
+#define XEN_SYSCTL_CPU_HOTPLUG_ONLINE  0
+#define XEN_SYSCTL_CPU_HOTPLUG_OFFLINE 1
+    uint32_t op;    /* hotplug opcode */
+};
+typedef struct xen_sysctl_cpu_hotplug xen_sysctl_cpu_hotplug_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpu_hotplug_t);
+
+/*
+ * Get/set Xen power management settings, including:
+ * 1. cpufreq governors and related parameters
+ */
+/* XEN_SYSCTL_pm_op */
+struct xen_userspace {
+    uint32_t scaling_setspeed;
+};
+typedef struct xen_userspace xen_userspace_t;
+
+struct xen_ondemand {
+    uint32_t sampling_rate_max;
+    uint32_t sampling_rate_min;
+
+    uint32_t sampling_rate;
+    uint32_t up_threshold;
+};
+typedef struct xen_ondemand xen_ondemand_t;
+
+/* 
+ * The cpufreq parameter names in this structure are the same as the
+ * corresponding sysfs file names in native Linux.
+ */
+#define CPUFREQ_NAME_LEN 16
+struct xen_get_cpufreq_para {
+    /* IN/OUT variable */
+    uint32_t cpu_num;
+    uint32_t freq_num;
+    uint32_t gov_num;
+
+    /* for all governors */
+    /* OUT variable */
+    XEN_GUEST_HANDLE_64(uint32) affected_cpus;
+    XEN_GUEST_HANDLE_64(uint32) scaling_available_frequencies;
+    XEN_GUEST_HANDLE_64(char)   scaling_available_governors;
+    char scaling_driver[CPUFREQ_NAME_LEN];
+
+    uint32_t cpuinfo_cur_freq;
+    uint32_t cpuinfo_max_freq;
+    uint32_t cpuinfo_min_freq;
+    uint32_t scaling_cur_freq;
+
+    char scaling_governor[CPUFREQ_NAME_LEN];
+    uint32_t scaling_max_freq;
+    uint32_t scaling_min_freq;
+
+    /* for specific governor */
+    union {
+        struct  xen_userspace userspace;
+        struct  xen_ondemand ondemand;
+    } u;
+
+    int32_t turbo_enabled;
+};
+
+struct xen_set_cpufreq_gov {
+    char scaling_governor[CPUFREQ_NAME_LEN];
+};
+
+struct xen_set_cpufreq_para {
+    #define SCALING_MAX_FREQ           1
+    #define SCALING_MIN_FREQ           2
+    #define SCALING_SETSPEED           3
+    #define SAMPLING_RATE              4
+    #define UP_THRESHOLD               5
+
+    uint32_t ctrl_type;
+    uint32_t ctrl_value;
+};
+
+struct xen_sysctl_pm_op {
+    #define PM_PARA_CATEGORY_MASK      0xf0
+    #define CPUFREQ_PARA               0x10
+
+    /* cpufreq command type */
+    #define GET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x01)
+    #define SET_CPUFREQ_GOV            (CPUFREQ_PARA | 0x02)
+    #define SET_CPUFREQ_PARA           (CPUFREQ_PARA | 0x03)
+    #define GET_CPUFREQ_AVGFREQ        (CPUFREQ_PARA | 0x04)
+
+    /* set/reset scheduler power saving option */
+    #define XEN_SYSCTL_pm_op_set_sched_opt_smt    0x21
+
+    /* cpuidle max_cstate access command */
+    #define XEN_SYSCTL_pm_op_get_max_cstate       0x22
+    #define XEN_SYSCTL_pm_op_set_max_cstate       0x23
+
+    /* set scheduler migration cost value */
+    #define XEN_SYSCTL_pm_op_set_vcpu_migration_delay   0x24
+    #define XEN_SYSCTL_pm_op_get_vcpu_migration_delay   0x25
+
+    /* enable/disable turbo mode when in dbs governor */
+    #define XEN_SYSCTL_pm_op_enable_turbo               0x26
+    #define XEN_SYSCTL_pm_op_disable_turbo              0x27
+
+    uint32_t cmd;
+    uint32_t cpuid;
+    union {
+        struct xen_get_cpufreq_para get_para;
+        struct xen_set_cpufreq_gov  set_gov;
+        struct xen_set_cpufreq_para set_para;
+        uint64_aligned_t get_avgfreq;
+        uint32_t                    set_sched_opt_smt;
+        uint32_t                    get_max_cstate;
+        uint32_t                    set_max_cstate;
+        uint32_t                    get_vcpu_migration_delay;
+        uint32_t                    set_vcpu_migration_delay;
+    } u;
+};
+
+/* XEN_SYSCTL_page_offline_op */
+struct xen_sysctl_page_offline_op {
+    /* IN: range of pages to be offlined */
+#define sysctl_page_offline     1
+#define sysctl_page_online      2
+#define sysctl_query_page_offline  3
+    uint32_t cmd;
+    uint32_t start;
+    uint32_t end;
+    /* OUT: result of page offline request */
+    /*
+     * bit 0~15: result flags
+     * bit 16~31: owner
+     */
+    XEN_GUEST_HANDLE(uint32) status;
+};
+
+#define PG_OFFLINE_STATUS_MASK    (0xFFUL)
+
+/* The result is invalid, i.e. HV does not handle it */
+#define PG_OFFLINE_INVALID   (0x1UL << 0)
+
+#define PG_OFFLINE_OFFLINED  (0x1UL << 1)
+#define PG_OFFLINE_PENDING   (0x1UL << 2)
+#define PG_OFFLINE_FAILED    (0x1UL << 3)
+#define PG_OFFLINE_AGAIN     (0x1UL << 4)
+
+#define PG_ONLINE_FAILED     PG_OFFLINE_FAILED
+#define PG_ONLINE_ONLINED    PG_OFFLINE_OFFLINED
+
+#define PG_OFFLINE_STATUS_OFFLINED              (0x1UL << 1)
+#define PG_OFFLINE_STATUS_ONLINE                (0x1UL << 2)
+#define PG_OFFLINE_STATUS_OFFLINE_PENDING       (0x1UL << 3)
+#define PG_OFFLINE_STATUS_BROKEN                (0x1UL << 4)
+
+#define PG_OFFLINE_MISC_MASK    (0xFFUL << 4)
+
+/* valid when PG_OFFLINE_FAILED or PG_OFFLINE_PENDING */
+#define PG_OFFLINE_XENPAGE   (0x1UL << 8)
+#define PG_OFFLINE_DOM0PAGE  (0x1UL << 9)
+#define PG_OFFLINE_ANONYMOUS (0x1UL << 10)
+#define PG_OFFLINE_NOT_CONV_RAM   (0x1UL << 11)
+#define PG_OFFLINE_OWNED     (0x1UL << 12)
+
+#define PG_OFFLINE_BROKEN    (0x1UL << 13)
+#define PG_ONLINE_BROKEN     PG_OFFLINE_BROKEN
+
+#define PG_OFFLINE_OWNER_SHIFT 16
+
+/* XEN_SYSCTL_lockprof_op */
+/* Sub-operations: */
+#define XEN_SYSCTL_LOCKPROF_reset 1   /* Reset all profile data to zero. */
+#define XEN_SYSCTL_LOCKPROF_query 2   /* Get lock profile information. */
+/* Record-type: */
+#define LOCKPROF_TYPE_GLOBAL      0   /* global lock, idx meaningless */
+#define LOCKPROF_TYPE_PERDOM      1   /* per-domain lock, idx is domid */
+#define LOCKPROF_TYPE_N           2   /* number of types */
+struct xen_sysctl_lockprof_data {
+    char     name[40];     /* lock name (may include up to 2 %d specifiers) */
+    int32_t  type;         /* LOCKPROF_TYPE_??? */
+    int32_t  idx;          /* index (e.g. domain id) */
+    uint64_aligned_t lock_cnt;     /* # of times locking succeeded */
+    uint64_aligned_t block_cnt;    /* # of waits for lock */
+    uint64_aligned_t lock_time;    /* nsecs lock held */
+    uint64_aligned_t block_time;   /* nsecs waited for lock */
+};
+typedef struct xen_sysctl_lockprof_data xen_sysctl_lockprof_data_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_data_t);
+struct xen_sysctl_lockprof_op {
+    /* IN variables. */
+    uint32_t       cmd;               /* XEN_SYSCTL_LOCKPROF_??? */
+    uint32_t       max_elem;          /* size of output buffer */
+    /* OUT variables (query only). */
+    uint32_t       nr_elem;           /* number of elements available */
+    uint64_aligned_t time;            /* nsecs of profile measurement */
+    /* profile information (or NULL) */
+    XEN_GUEST_HANDLE_64(xen_sysctl_lockprof_data_t) data;
+};
+typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);
+
+/* XEN_SYSCTL_topologyinfo */
+#define INVALID_TOPOLOGY_ID  (~0U)
+struct xen_sysctl_topologyinfo {
+    /*
+     * IN: maximum addressable entry in the caller-provided arrays.
+     * OUT: largest cpu identifier in the system.
+     * If OUT is greater than IN then the arrays are truncated!
+     * If OUT is less than IN then the array tails are not written by sysctl.
+     */
+    uint32_t max_cpu_index;
+
+    /*
+     * If not NULL, these arrays are filled with core/socket/node identifier
+     * for each cpu.
+     * If a cpu has no core/socket/node information (e.g., cpu not present) 
+     * then the sentinel value ~0u is written to each array.
+     * The number of array elements written by the sysctl is:
+     *   min(@max_cpu_index_IN,@max_cpu_index_OUT)+1
+     */
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
+    XEN_GUEST_HANDLE_64(uint32) cpu_to_node;
+};
+typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t);
+
+/* XEN_SYSCTL_numainfo */
+struct xen_sysctl_numainfo {
+    /*
+     * IN: maximum addressable entry in the caller-provided arrays.
+     * OUT: largest node identifier in the system.
+     * If OUT is greater than IN then the arrays are truncated!
+     */
+    uint32_t max_node_index;
+
+    /* NB. Entries are 0 if node is not present. */
+    XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
+    XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
+
+    /*
+     * Array, of size (max_node_index+1)^2, listing memory access distances
+     * between nodes. If an entry has no node distance information (e.g., node 
+     * not present) then the value ~0u is written.
+     * 
+     * Note that the array rows must be indexed by multiplying by the minimum 
+     * of the caller-provided max_node_index and the returned value of
+     * max_node_index. That is, if the largest node index in the system is
+     * smaller than the caller can handle, a smaller 2-d array is constructed
+     * within the space provided by the caller. When this occurs, trailing
+     * space provided by the caller is not modified. If the largest node index
+     * in the system is larger than the caller can handle, then a 2-d array of
+     * the maximum size handleable by the caller is constructed.
+     */
+    XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
+};
+typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
+
+/* XEN_SYSCTL_cpupool_op */
+#define XEN_SYSCTL_CPUPOOL_OP_CREATE                1  /* C */
+#define XEN_SYSCTL_CPUPOOL_OP_DESTROY               2  /* D */
+#define XEN_SYSCTL_CPUPOOL_OP_INFO                  3  /* I */
+#define XEN_SYSCTL_CPUPOOL_OP_ADDCPU                4  /* A */
+#define XEN_SYSCTL_CPUPOOL_OP_RMCPU                 5  /* R */
+#define XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN            6  /* M */
+#define XEN_SYSCTL_CPUPOOL_OP_FREEINFO              7  /* F */
+#define XEN_SYSCTL_CPUPOOL_PAR_ANY     0xFFFFFFFF
+struct xen_sysctl_cpupool_op {
+    uint32_t op;          /* IN */
+    uint32_t cpupool_id;  /* IN: CDIARM OUT: CI */
+    uint32_t sched_id;    /* IN: C      OUT: I  */
+    uint32_t domid;       /* IN: M              */
+    uint32_t cpu;         /* IN: AR             */
+    uint32_t n_dom;       /*            OUT: I  */
+    struct xenctl_cpumap cpumap; /*     OUT: IF */
+};
+typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t);
+
+#define ARINC653_MAX_DOMAINS_PER_SCHEDULE   64
+/*
+ * This structure is used to pass a new ARINC653 schedule from a
+ * privileged domain (ie dom0) to Xen.
+ */
+struct xen_sysctl_arinc653_schedule {
+    /* major_frame holds the time for the new schedule's major frame
+     * in nanoseconds. */
+    uint64_aligned_t     major_frame;
+    /* num_sched_entries holds how many of the entries in the
+     * sched_entries[] array are valid. */
+    uint8_t     num_sched_entries;
+    /* The sched_entries array holds the actual schedule entries. */
+    struct {
+        /* dom_handle must match a domain's UUID */
+        xen_domain_handle_t dom_handle;
+        /* If a domain has multiple VCPUs, vcpu_id specifies which one
+         * this schedule entry applies to. It should be set to 0 if
+         * there is only one VCPU for the domain. */
+        unsigned int vcpu_id;
+        /* runtime specifies the amount of time that should be allocated
+         * to this VCPU per major frame. It is specified in nanoseconds */
+        uint64_aligned_t runtime;
+    } sched_entries[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
+};
+typedef struct xen_sysctl_arinc653_schedule xen_sysctl_arinc653_schedule_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_arinc653_schedule_t);
+
+struct xen_sysctl_credit_schedule {
+    /* Length of timeslice in milliseconds */
+#define XEN_SYSCTL_CSCHED_TSLICE_MAX 1000
+#define XEN_SYSCTL_CSCHED_TSLICE_MIN 1
+    unsigned tslice_ms;
+    /* Rate limit (minimum timeslice) in microseconds */
+#define XEN_SYSCTL_SCHED_RATELIMIT_MAX 500000
+#define XEN_SYSCTL_SCHED_RATELIMIT_MIN 100
+    unsigned ratelimit_us;
+};
+typedef struct xen_sysctl_credit_schedule xen_sysctl_credit_schedule_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_credit_schedule_t);
+
+/* XEN_SYSCTL_scheduler_op */
+/* Set or get info? */
+#define XEN_SYSCTL_SCHEDOP_putinfo 0
+#define XEN_SYSCTL_SCHEDOP_getinfo 1
+struct xen_sysctl_scheduler_op {
+    uint32_t cpupool_id; /* Cpupool whose scheduler is to be targeted. */
+    uint32_t sched_id;   /* XEN_SCHEDULER_* (domctl.h) */
+    uint32_t cmd;        /* XEN_SYSCTL_SCHEDOP_* */
+    union {
+        struct xen_sysctl_sched_arinc653 {
+            XEN_GUEST_HANDLE_64(xen_sysctl_arinc653_schedule_t) schedule;
+        } sched_arinc653;
+        struct xen_sysctl_credit_schedule sched_credit;
+    } u;
+};
+typedef struct xen_sysctl_scheduler_op xen_sysctl_scheduler_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_scheduler_op_t);
+
+struct xen_sysctl {
+    uint32_t cmd;
+#define XEN_SYSCTL_readconsole                    1
+#define XEN_SYSCTL_tbuf_op                        2
+#define XEN_SYSCTL_physinfo                       3
+#define XEN_SYSCTL_sched_id                       4
+#define XEN_SYSCTL_perfc_op                       5
+#define XEN_SYSCTL_getdomaininfolist              6
+#define XEN_SYSCTL_debug_keys                     7
+#define XEN_SYSCTL_getcpuinfo                     8
+#define XEN_SYSCTL_availheap                      9
+#define XEN_SYSCTL_get_pmstat                    10
+#define XEN_SYSCTL_cpu_hotplug                   11
+#define XEN_SYSCTL_pm_op                         12
+#define XEN_SYSCTL_page_offline_op               14
+#define XEN_SYSCTL_lockprof_op                   15
+#define XEN_SYSCTL_topologyinfo                  16 
+#define XEN_SYSCTL_numainfo                      17
+#define XEN_SYSCTL_cpupool_op                    18
+#define XEN_SYSCTL_scheduler_op                  19
+    uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
+    union {
+        struct xen_sysctl_readconsole       readconsole;
+        struct xen_sysctl_tbuf_op           tbuf_op;
+        struct xen_sysctl_physinfo          physinfo;
+        struct xen_sysctl_topologyinfo      topologyinfo;
+        struct xen_sysctl_numainfo          numainfo;
+        struct xen_sysctl_sched_id          sched_id;
+        struct xen_sysctl_perfc_op          perfc_op;
+        struct xen_sysctl_getdomaininfolist getdomaininfolist;
+        struct xen_sysctl_debug_keys        debug_keys;
+        struct xen_sysctl_getcpuinfo        getcpuinfo;
+        struct xen_sysctl_availheap         availheap;
+        struct xen_sysctl_get_pmstat        get_pmstat;
+        struct xen_sysctl_cpu_hotplug       cpu_hotplug;
+        struct xen_sysctl_pm_op             pm_op;
+        struct xen_sysctl_page_offline_op   page_offline;
+        struct xen_sysctl_lockprof_op       lockprof_op;
+        struct xen_sysctl_cpupool_op        cpupool_op;
+        struct xen_sysctl_scheduler_op      scheduler_op;
+        uint8_t                             pad[128];
+    } u;
+};
+typedef struct xen_sysctl xen_sysctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_t);
+
+#endif /* __XEN_PUBLIC_SYSCTL_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/tmem.h b/include/xen/interface/tmem.h

new file mode 100644 (file)

index 0000000..74bd1c6
--- /dev/null
+++ b/include/xen/interface/tmem.h
@@ -0,0 +1,148 @@
+/******************************************************************************
+ * tmem.h
+ * 
+ * Guest OS interface to Xen Transcendent Memory.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_TMEM_H__
+#define __XEN_PUBLIC_TMEM_H__
+
+#include "xen.h"
+
+/* version of ABI */
+#define TMEM_SPEC_VERSION          1
+
+/* Commands to HYPERVISOR_tmem_op() */
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Privileged commands to HYPERVISOR_tmem_op() */
+#define TMEM_AUTH                 101 
+#define TMEM_RESTORE_NEW          102
+
+/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
+#define TMEMC_THAW                   0
+#define TMEMC_FREEZE                 1
+#define TMEMC_FLUSH                  2
+#define TMEMC_DESTROY                3
+#define TMEMC_LIST                   4
+#define TMEMC_SET_WEIGHT             5
+#define TMEMC_SET_CAP                6
+#define TMEMC_SET_COMPRESS           7
+#define TMEMC_QUERY_FREEABLE_MB      8
+#define TMEMC_SAVE_BEGIN             10
+#define TMEMC_SAVE_GET_VERSION       11
+#define TMEMC_SAVE_GET_MAXPOOLS      12
+#define TMEMC_SAVE_GET_CLIENT_WEIGHT 13
+#define TMEMC_SAVE_GET_CLIENT_CAP    14
+#define TMEMC_SAVE_GET_CLIENT_FLAGS  15
+#define TMEMC_SAVE_GET_POOL_FLAGS    16
+#define TMEMC_SAVE_GET_POOL_NPAGES   17
+#define TMEMC_SAVE_GET_POOL_UUID     18
+#define TMEMC_SAVE_GET_NEXT_PAGE     19
+#define TMEMC_SAVE_GET_NEXT_INV      20
+#define TMEMC_SAVE_END               21
+#define TMEMC_RESTORE_BEGIN          30
+#define TMEMC_RESTORE_PUT_PAGE       32
+#define TMEMC_RESTORE_FLUSH_PAGE     33
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PRECOMPRESSED    4
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_POOL_PAGESIZE_MASK  0xf
+#define TMEM_POOL_VERSION_SHIFT   24
+#define TMEM_POOL_VERSION_MASK  0xff
+#define TMEM_POOL_RESERVED_BITS  0x00ffff00
+
+/* Bits for client flags (save/restore) */
+#define TMEM_CLIENT_COMPRESS       1
+#define TMEM_CLIENT_FROZEN         2
+
+/* Special errno values */
+#define EFROZEN                 1000
+#define EEMPTY                  1001
+
+
+#ifndef __ASSEMBLY__
+typedef xen_pfn_t tmem_cli_mfn_t;
+typedef XEN_GUEST_HANDLE(char) tmem_cli_va_t;
+struct tmem_op {
+    uint32_t cmd;
+    int32_t pool_id;
+    union {
+        struct {
+            uint64_t uuid[2];
+            uint32_t flags;
+            uint32_t arg1;
+        } creat; /* for cmd == TMEM_NEW_POOL, TMEM_AUTH, TMEM_RESTORE_NEW */
+        struct { 
+            uint32_t subop;
+            uint32_t cli_id;
+            uint32_t arg1;
+            uint32_t arg2;
+            uint64_t oid[3];
+            tmem_cli_va_t buf;
+        } ctrl; /* for cmd == TMEM_CONTROL */
+        struct {
+            
+            uint64_t oid[3];
+            uint32_t index;
+            uint32_t tmem_offset;
+            uint32_t pfn_offset;
+            uint32_t len;
+            tmem_cli_mfn_t cmfn; /* client machine page frame */
+        } gen; /* for all other cmd ("generic") */
+    } u;
+};
+typedef struct tmem_op tmem_op_t;
+DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
+
+struct tmem_handle {
+    uint32_t pool_id;
+    uint32_t index;
+    uint64_t oid[3];
+};
+#endif
+
+#endif /* __XEN_PUBLIC_TMEM_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/trace.h b/include/xen/interface/trace.h

new file mode 100644 (file)

index 0000000..0dfabe9
--- /dev/null
+++ b/include/xen/interface/trace.h
@@ -0,0 +1,245 @@
+/******************************************************************************
+ * include/public/trace.h
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ * Copyright (C) 2005 Bin Ren
+ */
+
+#ifndef __XEN_PUBLIC_TRACE_H__
+#define __XEN_PUBLIC_TRACE_H__
+
+#define TRACE_EXTRA_MAX    7
+#define TRACE_EXTRA_SHIFT 28
+
+/* Trace classes */
+#define TRC_CLS_SHIFT 16
+#define TRC_GEN      0x0001f000    /* General trace            */
+#define TRC_SCHED    0x0002f000    /* Xen Scheduler trace      */
+#define TRC_DOM0OP   0x0004f000    /* Xen DOM0 operation trace */
+#define TRC_HVM      0x0008f000    /* Xen HVM trace            */
+#define TRC_MEM      0x0010f000    /* Xen memory trace         */
+#define TRC_PV       0x0020f000    /* Xen PV traces            */
+#define TRC_SHADOW   0x0040f000    /* Xen shadow tracing       */
+#define TRC_HW       0x0080f000    /* Xen hardware-related traces */
+#define TRC_GUEST    0x0800f000    /* Guest-generated traces   */
+#define TRC_ALL      0x0ffff000
+#define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff)
+#define TRC_HD_CYCLE_FLAG (1UL<<31)
+#define TRC_HD_INCLUDES_CYCLE_COUNT(x) ( !!( (x) & TRC_HD_CYCLE_FLAG ) )
+#define TRC_HD_EXTRA(x)    (((x)>>TRACE_EXTRA_SHIFT)&TRACE_EXTRA_MAX)
+
+/* Trace subclasses */
+#define TRC_SUBCLS_SHIFT 12
+
+/* Trace subclasses for HVM */
+#define TRC_HVM_ENTRYEXIT 0x00081000   /* VMENTRY and #VMEXIT       */
+#define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
+
+#define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
+#define TRC_SCHED_CLASS     0x00022000   /* Scheduler-specific    */
+#define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
+
+/* Trace subclasses for Hardware */
+#define TRC_HW_PM           0x00801000   /* Power management traces */
+#define TRC_HW_IRQ          0x00802000   /* Traces relating to the handling of IRQs */
+
+/* Trace events per class */
+#define TRC_LOST_RECORDS        (TRC_GEN + 1)
+#define TRC_TRACE_WRAP_BUFFER  (TRC_GEN + 2)
+#define TRC_TRACE_CPU_CHANGE    (TRC_GEN + 3)
+
+#define TRC_SCHED_RUNSTATE_CHANGE   (TRC_SCHED_MIN + 1)
+#define TRC_SCHED_CONTINUE_RUNNING  (TRC_SCHED_MIN + 2)
+#define TRC_SCHED_DOM_ADD        (TRC_SCHED_VERBOSE +  1)
+#define TRC_SCHED_DOM_REM        (TRC_SCHED_VERBOSE +  2)
+#define TRC_SCHED_SLEEP          (TRC_SCHED_VERBOSE +  3)
+#define TRC_SCHED_WAKE           (TRC_SCHED_VERBOSE +  4)
+#define TRC_SCHED_YIELD          (TRC_SCHED_VERBOSE +  5)
+#define TRC_SCHED_BLOCK          (TRC_SCHED_VERBOSE +  6)
+#define TRC_SCHED_SHUTDOWN       (TRC_SCHED_VERBOSE +  7)
+#define TRC_SCHED_CTL            (TRC_SCHED_VERBOSE +  8)
+#define TRC_SCHED_ADJDOM         (TRC_SCHED_VERBOSE +  9)
+#define TRC_SCHED_SWITCH         (TRC_SCHED_VERBOSE + 10)
+#define TRC_SCHED_S_TIMER_FN     (TRC_SCHED_VERBOSE + 11)
+#define TRC_SCHED_T_TIMER_FN     (TRC_SCHED_VERBOSE + 12)
+#define TRC_SCHED_DOM_TIMER_FN   (TRC_SCHED_VERBOSE + 13)
+#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14)
+#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15)
+#define TRC_SCHED_SHUTDOWN_CODE  (TRC_SCHED_VERBOSE + 16)
+
+#define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
+#define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
+#define TRC_MEM_PAGE_GRANT_TRANSFER (TRC_MEM + 3)
+#define TRC_MEM_SET_P2M_ENTRY       (TRC_MEM + 4)
+#define TRC_MEM_DECREASE_RESERVATION (TRC_MEM + 5)
+#define TRC_MEM_POD_POPULATE        (TRC_MEM + 16)
+#define TRC_MEM_POD_ZERO_RECLAIM    (TRC_MEM + 17)
+#define TRC_MEM_POD_SUPERPAGE_SPLINTER (TRC_MEM + 18)
+
+
+#define TRC_PV_HYPERCALL             (TRC_PV +  1)
+#define TRC_PV_TRAP                  (TRC_PV +  3)
+#define TRC_PV_PAGE_FAULT            (TRC_PV +  4)
+#define TRC_PV_FORCED_INVALID_OP     (TRC_PV +  5)
+#define TRC_PV_EMULATE_PRIVOP        (TRC_PV +  6)
+#define TRC_PV_EMULATE_4GB           (TRC_PV +  7)
+#define TRC_PV_MATH_STATE_RESTORE    (TRC_PV +  8)
+#define TRC_PV_PAGING_FIXUP          (TRC_PV +  9)
+#define TRC_PV_GDT_LDT_MAPPING_FAULT (TRC_PV + 10)
+#define TRC_PV_PTWR_EMULATION        (TRC_PV + 11)
+#define TRC_PV_PTWR_EMULATION_PAE    (TRC_PV + 12)
+  /* Indicates that addresses in trace record are 64 bits */
+#define TRC_64_FLAG               (0x100) 
+
+#define TRC_SHADOW_NOT_SHADOW                 (TRC_SHADOW +  1)
+#define TRC_SHADOW_FAST_PROPAGATE             (TRC_SHADOW +  2)
+#define TRC_SHADOW_FAST_MMIO                  (TRC_SHADOW +  3)
+#define TRC_SHADOW_FALSE_FAST_PATH            (TRC_SHADOW +  4)
+#define TRC_SHADOW_MMIO                       (TRC_SHADOW +  5)
+#define TRC_SHADOW_FIXUP                      (TRC_SHADOW +  6)
+#define TRC_SHADOW_DOMF_DYING                 (TRC_SHADOW +  7)
+#define TRC_SHADOW_EMULATE                    (TRC_SHADOW +  8)
+#define TRC_SHADOW_EMULATE_UNSHADOW_USER      (TRC_SHADOW +  9)
+#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ    (TRC_SHADOW + 10)
+#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11)
+#define TRC_SHADOW_WRMAP_BF                   (TRC_SHADOW + 12)
+#define TRC_SHADOW_PREALLOC_UNPIN             (TRC_SHADOW + 13)
+#define TRC_SHADOW_RESYNC_FULL                (TRC_SHADOW + 14)
+#define TRC_SHADOW_RESYNC_ONLY                (TRC_SHADOW + 15)
+
+/* trace events per subclass */
+#define TRC_HVM_NESTEDFLAG      (0x400)
+#define TRC_HVM_VMENTRY         (TRC_HVM_ENTRYEXIT + 0x01)
+#define TRC_HVM_VMEXIT          (TRC_HVM_ENTRYEXIT + 0x02)
+#define TRC_HVM_VMEXIT64        (TRC_HVM_ENTRYEXIT + TRC_64_FLAG + 0x02)
+#define TRC_HVM_PF_XEN          (TRC_HVM_HANDLER + 0x01)
+#define TRC_HVM_PF_XEN64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x01)
+#define TRC_HVM_PF_INJECT       (TRC_HVM_HANDLER + 0x02)
+#define TRC_HVM_PF_INJECT64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x02)
+#define TRC_HVM_INJ_EXC         (TRC_HVM_HANDLER + 0x03)
+#define TRC_HVM_INJ_VIRQ        (TRC_HVM_HANDLER + 0x04)
+#define TRC_HVM_REINJ_VIRQ      (TRC_HVM_HANDLER + 0x05)
+#define TRC_HVM_IO_READ         (TRC_HVM_HANDLER + 0x06)
+#define TRC_HVM_IO_WRITE        (TRC_HVM_HANDLER + 0x07)
+#define TRC_HVM_CR_READ         (TRC_HVM_HANDLER + 0x08)
+#define TRC_HVM_CR_READ64       (TRC_HVM_HANDLER + TRC_64_FLAG + 0x08)
+#define TRC_HVM_CR_WRITE        (TRC_HVM_HANDLER + 0x09)
+#define TRC_HVM_CR_WRITE64      (TRC_HVM_HANDLER + TRC_64_FLAG + 0x09)
+#define TRC_HVM_DR_READ         (TRC_HVM_HANDLER + 0x0A)
+#define TRC_HVM_DR_WRITE        (TRC_HVM_HANDLER + 0x0B)
+#define TRC_HVM_MSR_READ        (TRC_HVM_HANDLER + 0x0C)
+#define TRC_HVM_MSR_WRITE       (TRC_HVM_HANDLER + 0x0D)
+#define TRC_HVM_CPUID           (TRC_HVM_HANDLER + 0x0E)
+#define TRC_HVM_INTR            (TRC_HVM_HANDLER + 0x0F)
+#define TRC_HVM_NMI             (TRC_HVM_HANDLER + 0x10)
+#define TRC_HVM_SMI             (TRC_HVM_HANDLER + 0x11)
+#define TRC_HVM_VMMCALL         (TRC_HVM_HANDLER + 0x12)
+#define TRC_HVM_HLT             (TRC_HVM_HANDLER + 0x13)
+#define TRC_HVM_INVLPG          (TRC_HVM_HANDLER + 0x14)
+#define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
+#define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
+#define TRC_HVM_IOPORT_READ     (TRC_HVM_HANDLER + 0x16)
+#define TRC_HVM_IOMEM_READ      (TRC_HVM_HANDLER + 0x17)
+#define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
+#define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
+#define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
+#define TRC_HVM_RDTSC           (TRC_HVM_HANDLER + 0x1a)
+#define TRC_HVM_INTR_WINDOW     (TRC_HVM_HANDLER + 0x20)
+#define TRC_HVM_NPF             (TRC_HVM_HANDLER + 0x21)
+#define TRC_HVM_REALMODE_EMULATE (TRC_HVM_HANDLER + 0x22)
+#define TRC_HVM_TRAP             (TRC_HVM_HANDLER + 0x23)
+#define TRC_HVM_TRAP_DEBUG       (TRC_HVM_HANDLER + 0x24)
+#define TRC_HVM_VLAPIC           (TRC_HVM_HANDLER + 0x25)
+
+#define TRC_HVM_IOPORT_WRITE    (TRC_HVM_HANDLER + 0x216)
+#define TRC_HVM_IOMEM_WRITE     (TRC_HVM_HANDLER + 0x217)
+
+/* Trace events for power management */
+#define TRC_PM_FREQ_CHANGE      (TRC_HW_PM + 0x01)
+#define TRC_PM_IDLE_ENTRY       (TRC_HW_PM + 0x02)
+#define TRC_PM_IDLE_EXIT        (TRC_HW_PM + 0x03)
+
+/* Trace events for IRQs */
+#define TRC_HW_IRQ_MOVE_CLEANUP_DELAY (TRC_HW_IRQ + 0x1)
+#define TRC_HW_IRQ_MOVE_CLEANUP       (TRC_HW_IRQ + 0x2)
+#define TRC_HW_IRQ_BIND_VECTOR        (TRC_HW_IRQ + 0x3)
+#define TRC_HW_IRQ_CLEAR_VECTOR       (TRC_HW_IRQ + 0x4)
+#define TRC_HW_IRQ_MOVE_FINISH        (TRC_HW_IRQ + 0x5)
+#define TRC_HW_IRQ_ASSIGN_VECTOR      (TRC_HW_IRQ + 0x6)
+#define TRC_HW_IRQ_UNMAPPED_VECTOR    (TRC_HW_IRQ + 0x7)
+#define TRC_HW_IRQ_HANDLED            (TRC_HW_IRQ + 0x8)
+
+
+/* This structure represents a single trace buffer record. */
+struct t_rec {
+    uint32_t event:28;
+    uint32_t extra_u32:3;         /* # entries in trailing extra_u32[] array */
+    uint32_t cycles_included:1;   /* u.cycles or u.no_cycles? */
+    union {
+        struct {
+            uint32_t cycles_lo, cycles_hi; /* cycle counter timestamp */
+            uint32_t extra_u32[7];         /* event data items */
+        } cycles;
+        struct {
+            uint32_t extra_u32[7];         /* event data items */
+        } nocycles;
+    } u;
+};
+
+/*
+ * This structure contains the metadata for a single trace buffer.  The cons
+ * and prod fields are offsets into the record data that follows the header.
+ */
+struct t_buf {
+    /* Assume the data buffer size is X.  X is generally not a power of 2.
+     * CONS and PROD are incremented modulo (2*X):
+     *     0 <= cons < 2*X
+     *     0 <= prod < 2*X
+     * This is done because addition modulo X breaks at 2^32 when X is not a
+     * power of 2:
+     *     (((2^32 - 1) % X) + 1) % X != (2^32) % X
+     */
+    uint32_t cons;   /* Offset of next item to be consumed by control tools. */
+    uint32_t prod;   /* Offset of next item to be produced by Xen.           */
+    /*  Records follow immediately after the meta-data header.    */
+};
+
+/* Structure used to pass MFNs to the trace buffers back to trace consumers.
+ * Offset is an offset into the mapped structure where the mfn list will be held.
+ * MFNs will be at ((unsigned long *)(t_info))+(t_info->mfn_offset[cpu]).
+ */
+struct t_info {
+    uint16_t tbuf_size; /* Size in pages of each trace buffer */
+    uint16_t mfn_offset[];  /* Offset within t_info structure of the page list per cpu */
+    /* MFN lists immediately after the header */
+};
+
+#endif /* __XEN_PUBLIC_TRACE_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h

index 87e6f8a..768b5ff 100644 (file)
--- a/include/xen/interface/vcpu.h
+++ b/include/xen/interface/vcpu.h
@@ -27,6 +27,8 @@
  #ifndef __XEN_PUBLIC_VCPU_H__
  #define __XEN_PUBLIC_VCPU_H__
  
+#include "xen.h"
+
  /*
   * Prototype for this hypercall is:
   *     int vcpu_op(int cmd, int vcpuid, void *extra_args)
@@ -86,6 +88,8 @@ struct vcpu_runstate_info {
                 uint64_t time[4];
  };
  DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
+typedef struct vcpu_runstate_info vcpu_runstate_info_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_t);
  
  /* VCPU is currently running on a physical CPU. */
  #define RUNSTATE_running  0
@@ -120,11 +124,13 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_runstate_info);
  #define VCPUOP_register_runstate_memory_area 5
  struct vcpu_register_runstate_memory_area {
                 union {
-                               GUEST_HANDLE(vcpu_runstate_info) h;
+                               XEN_GUEST_HANDLE(vcpu_runstate_info_t) h;
                                 struct vcpu_runstate_info *v;
                                 uint64_t p;
                 } addr;
  };
+typedef struct vcpu_register_runstate_memory_area vcpu_register_runstate_memory_area_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_register_runstate_memory_area_t);
  
  /*
   * Set or stop a VCPU's periodic timer. Every VCPU has one periodic timer
@@ -137,6 +143,8 @@ struct vcpu_set_periodic_timer {
                 uint64_t period_ns;
  };
  DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_periodic_timer);
+typedef struct vcpu_set_periodic_timer vcpu_set_periodic_timer_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_set_periodic_timer_t);
  
  /*
   * Set or stop a VCPU's single-shot timer. Every VCPU has one single-shot
@@ -149,6 +157,8 @@ struct vcpu_set_singleshot_timer {
                 uint32_t flags;                    /* VCPU_SSHOTTMR_??? */
  };
  DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
+typedef struct vcpu_set_singleshot_timer vcpu_set_singleshot_timer_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_set_singleshot_timer_t);
  
  /* Flags to VCPUOP_set_singleshot_timer. */
   /* Require the timeout to be in the future (return -ETIME if it's passed). */
@@ -161,6 +171,8 @@ DEFINE_GUEST_HANDLE_STRUCT(vcpu_set_singleshot_timer);
   * structure in a convenient place, such as in a per-cpu data area.
   * The pointer need not be page aligned, but the structure must not
   * cross a page boundary.
+ *
+ * This may be called only once per vcpu.
   */
  #define VCPUOP_register_vcpu_info   10  /* arg == struct vcpu_info */
  struct vcpu_register_vcpu_info {
@@ -169,5 +181,53 @@ struct vcpu_register_vcpu_info {
      uint32_t rsvd;   /* unused */
  };
  DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info);
+typedef struct vcpu_register_vcpu_info vcpu_register_vcpu_info_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_register_vcpu_info_t);
+
+/* Send an NMI to the specified VCPU. @extra_arg == NULL. */
+#define VCPUOP_send_nmi             11
+
+/*
+ * Get the physical ID information for a pinned vcpu's underlying physical
+ * processor.  The physical ID information is architecture-specific.
+ * On x86: id[31:0]=apic_id, id[63:32]=acpi_id.
+ * This command returns -EINVAL if it is not a valid operation for this VCPU.
+ */
+#define VCPUOP_get_physid           12 /* arg == vcpu_get_physid_t */
+struct vcpu_get_physid {
+    uint64_t phys_id;
+};
+typedef struct vcpu_get_physid vcpu_get_physid_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_get_physid_t);
+#define xen_vcpu_physid_to_x86_apicid(physid) ((uint32_t)(physid))
+#define xen_vcpu_physid_to_x86_acpiid(physid) ((uint32_t)((physid) >> 32))
+
+/*
+ * Register a memory location to get a secondary copy of the vcpu time
+ * parameters.  The master copy still exists as part of the vcpu shared
+ * memory area, and this secondary copy is updated whenever the master copy
+ * is updated (and using the same versioning scheme for synchronisation).
+ *
+ * The intent is that this copy may be mapped (RO) into userspace so
+ * that usermode can compute system time using the time info and the
+ * tsc.  Usermode will see an array of vcpu_time_info structures, one
+ * for each vcpu, and choose the right one by an existing mechanism
+ * which allows it to get the current vcpu number (such as via a
+ * segment limit).  It can then apply the normal algorithm to compute
+ * system time from the tsc.
+ *
+ * @extra_arg == pointer to vcpu_register_time_memory_area structure.
+ */
+#define VCPUOP_register_vcpu_time_memory_area   13
+DEFINE_XEN_GUEST_HANDLE(vcpu_time_info_t);
+struct vcpu_register_time_memory_area {
+    union {
+        XEN_GUEST_HANDLE(vcpu_time_info_t) h;
+        struct vcpu_time_info *v;
+        uint64_t p;
+    } addr;
+};
+typedef struct vcpu_register_time_memory_area vcpu_register_time_memory_area_t;
+DEFINE_XEN_GUEST_HANDLE(vcpu_register_time_memory_area_t);
  
  #endif /* __XEN_PUBLIC_VCPU_H__ */
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h

index e8b6519..da54fd8 100644 (file)
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -3,6 +3,24 @@
   *
   * Xen version, type, and compile information.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2005, Nguyen Anh Quynh <aquynh@gmail.com>
   * Copyright (c) 2005, Keir Fraser <keir@xensource.com>
   */
@@ -10,17 +28,18 @@
  #ifndef __XEN_PUBLIC_VERSION_H__
  #define __XEN_PUBLIC_VERSION_H__
  
-/* NB. All ops return zero on success, except XENVER_version. */
+/* NB. All ops return zero on success, except XENVER_{version,pagesize} */
  
  /* arg == NULL; returns major:minor (16:16). */
  #define XENVER_version      0
  
  /* arg == xen_extraversion_t. */
  #define XENVER_extraversion 1
+typedef char xen_extraversion_t[16];
  struct xen_extraversion {
-    char extraversion[16];
+    xen_extraversion_t extraversion;
  };
-#define XEN_EXTRAVERSION_LEN (sizeof(struct xen_extraversion))
+#define XEN_EXTRAVERSION_LEN (sizeof(xen_extraversion_t))
  
  /* arg == xen_compile_info_t. */
  #define XENVER_compile_info 2
@@ -30,29 +49,34 @@ struct xen_compile_info {
      char compile_domain[32];
      char compile_date[32];
  };
+typedef struct xen_compile_info xen_compile_info_t;
  
  #define XENVER_capabilities 3
+typedef char xen_capabilities_info_t[1024];
  struct xen_capabilities_info {
-    char info[1024];
+    xen_capabilities_info_t info;
  };
-#define XEN_CAPABILITIES_INFO_LEN (sizeof(struct xen_capabilities_info))
+#define XEN_CAPABILITIES_INFO_LEN (sizeof(xen_capabilities_info_t))
  
  #define XENVER_changeset 4
+typedef char xen_changeset_info_t[64];
  struct xen_changeset_info {
-    char info[64];
+    xen_changeset_info_t info;
  };
-#define XEN_CHANGESET_INFO_LEN (sizeof(struct xen_changeset_info))
+#define XEN_CHANGESET_INFO_LEN (sizeof(xen_changeset_info_t))
  
  #define XENVER_platform_parameters 5
  struct xen_platform_parameters {
      unsigned long virt_start;
  };
+typedef struct xen_platform_parameters xen_platform_parameters_t;
  
  #define XENVER_get_features 6
  struct xen_feature_info {
      unsigned int submap_idx;    /* IN: which 32-bit submap to return */
      uint32_t     submap;        /* OUT: 32-bit submap */
  };
+typedef struct xen_feature_info xen_feature_info_t;
  
  /* Declares the features reported by XENVER_get_features. */
  #include "features.h"
@@ -60,4 +84,10 @@ struct xen_feature_info {
  /* arg == NULL; returns host memory page size. */
  #define XENVER_pagesize 7
  
+/* arg == xen_domain_handle_t. */
+#define XENVER_guest_handle 8
+
+#define XENVER_commandline 9
+typedef char xen_commandline_t[1024];
+
  #endif /* __XEN_PUBLIC_VERSION_H__ */
diff --git a/include/xen/interface/xen-compat.h b/include/xen/interface/xen-compat.h

new file mode 100644 (file)

index 0000000..d8c55bf
--- /dev/null
+++ b/include/xen/interface/xen-compat.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+ * xen-compat.h
+ * 
+ * Guest OS interface to Xen.  Compatibility layer.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Christian Limpach
+ */
+
+#ifndef __XEN_PUBLIC_XEN_COMPAT_H__
+#define __XEN_PUBLIC_XEN_COMPAT_H__
+
+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00040200
+
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+/* Xen is built with matching headers and implements the latest interface. */
+#define __XEN_INTERFACE_VERSION__ __XEN_LATEST_INTERFACE_VERSION__
+#elif !defined(__XEN_INTERFACE_VERSION__)
+/* Guests which do not specify a version get the legacy interface. */
+#define __XEN_INTERFACE_VERSION__ 0x00000000
+#endif
+
+#if __XEN_INTERFACE_VERSION__ > __XEN_LATEST_INTERFACE_VERSION__
+#error "These header files do not support the requested interface version."
+#endif
+
+#endif /* __XEN_PUBLIC_XEN_COMPAT_H__ */
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h

index a890804..c018426 100644 (file)
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -3,35 +3,77 @@
   *
   * Guest OS interface to Xen.
   *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
   * Copyright (c) 2004, K A Fraser
   */
  
  #ifndef __XEN_PUBLIC_XEN_H__
  #define __XEN_PUBLIC_XEN_H__
  
-#include <asm/xen/interface.h>
+#include "xen-compat.h"
+#ifdef CONFIG_PARAVIRT_XEN
  #include <asm/pvclock-abi.h>
+#endif
+
+#if defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H)
+#include <asm/xen/interface.h>
+#elif defined(__i386__) || defined(__x86_64__)
+#include "arch-x86/xen.h"
+#elif defined(__ia64__)
+#include "arch-ia64.h"
+#elif defined(__arm__)
+#include "arch-arm.h"
+#else
+#error "Unsupported architecture"
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handles for primitive C types. */
+DEFINE_XEN_GUEST_HANDLE(char);
+__DEFINE_XEN_GUEST_HANDLE(uchar, unsigned char);
+DEFINE_XEN_GUEST_HANDLE(int);
+__DEFINE_XEN_GUEST_HANDLE(uint,  unsigned int);
+DEFINE_XEN_GUEST_HANDLE(long);
+__DEFINE_XEN_GUEST_HANDLE(ulong, unsigned long);
+DEFINE_XEN_GUEST_HANDLE(void);
+
+DEFINE_XEN_GUEST_HANDLE(uint64_t);
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
+#endif
  
  /*
- * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
+ * HYPERCALLS
   */
  
-/*
- * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
- *         EAX = return value
- *         (argument registers may be clobbered on return)
- * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
- *         RAX = return value
- *         (argument registers not clobbered on return; RCX, R11 are)
+/* `incontents 100 hcalls List of hypercalls
+ * ` enum hypercall_num { // __HYPERVISOR_* => HYPERVISOR_*()
   */
+
  #define __HYPERVISOR_set_trap_table        0
  #define __HYPERVISOR_mmu_update            1
  #define __HYPERVISOR_set_gdt               2
  #define __HYPERVISOR_stack_switch          3
  #define __HYPERVISOR_set_callbacks         4
  #define __HYPERVISOR_fpu_taskswitch        5
-#define __HYPERVISOR_sched_op_compat       6
-#define __HYPERVISOR_dom0_op               7
+#define __HYPERVISOR_sched_op_compat       6 /* compat since 0x00030101 */
+#define __HYPERVISOR_platform_op           7
  #define __HYPERVISOR_set_debugreg          8
  #define __HYPERVISOR_get_debugreg          9
  #define __HYPERVISOR_update_descriptor    10
@@ -39,10 +81,10 @@
  #define __HYPERVISOR_multicall            13
  #define __HYPERVISOR_update_va_mapping    14
  #define __HYPERVISOR_set_timer_op         15
-#define __HYPERVISOR_event_channel_op_compat 16
+#define __HYPERVISOR_event_channel_op_compat 16 /* compat since 0x00030202 */
  #define __HYPERVISOR_xen_version          17
  #define __HYPERVISOR_console_io           18
-#define __HYPERVISOR_physdev_op_compat    19
+#define __HYPERVISOR_physdev_op_compat    19 /* compat since 0x00030202 */
  #define __HYPERVISOR_grant_table_op       20
  #define __HYPERVISOR_vm_assist            21
  #define __HYPERVISOR_update_va_mapping_otherdomain 22
@@ -50,15 +92,19 @@
  #define __HYPERVISOR_vcpu_op              24
  #define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
  #define __HYPERVISOR_mmuext_op            26
-#define __HYPERVISOR_acm_op               27
+#define __HYPERVISOR_xsm_op               27
  #define __HYPERVISOR_nmi_op               28
-#define __HYPERVISOR_sched_op             29
+#define __HYPERVISOR_sched_op_new         29
  #define __HYPERVISOR_callback_op          30
  #define __HYPERVISOR_xenoprof_op          31
  #define __HYPERVISOR_event_channel_op     32
  #define __HYPERVISOR_physdev_op           33
  #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_sysctl               35
+#define __HYPERVISOR_domctl               36
+#define __HYPERVISOR_kexec_op             37
  #define __HYPERVISOR_tmem_op              38
+#define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
  
  /* Architecture-specific hypercall definitions. */
  #define __HYPERVISOR_arch_0               48
@@ -70,16 +116,55 @@
  #define __HYPERVISOR_arch_6               54
  #define __HYPERVISOR_arch_7               55
  
+/* ` } */
+
+/*
+ * HYPERCALL COMPATIBILITY.
+ */
+
+/* New sched_op hypercall introduced in 0x00030101. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030101 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
+#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_compat
+#else
+#define __HYPERVISOR_sched_op __HYPERVISOR_sched_op_new
+#endif
+
+/* New event-channel and physdev hypercalls introduced in 0x00030202. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030202
+#undef __HYPERVISOR_event_channel_op
+#define __HYPERVISOR_event_channel_op __HYPERVISOR_event_channel_op_compat
+#undef __HYPERVISOR_physdev_op
+#define __HYPERVISOR_physdev_op __HYPERVISOR_physdev_op_compat
+#endif
+
+/* New platform_op hypercall introduced in 0x00030204. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030204 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
+#define __HYPERVISOR_dom0_op __HYPERVISOR_platform_op
+#endif
+
  /*
   * VIRTUAL INTERRUPTS
   *
   * Virtual interrupts that a guest OS may receive from Xen.
+ *
+ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
+ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
+ * The latter can be allocated only once per guest: they must initially be
+ * allocated to VCPU0 but can subsequently be re-bound.
   */
-#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
-#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
-#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
-#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
-#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
+/* ` enum virq { */
+#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
+#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
+#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
+#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
+#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
+#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
+#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
+#define VIRQ_CON_RING   8  /* G. (DOM0) Bytes received on console            */
+#define VIRQ_PCPU_STATE 9  /* G. (DOM0) PCPU state changed                   */
+#define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occurred          */
+#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
+#define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
  
  /* Architecture-specific VIRQ definitions. */
  #define VIRQ_ARCH_0    16
@@ -90,26 +175,73 @@
  #define VIRQ_ARCH_5    21
  #define VIRQ_ARCH_6    22
  #define VIRQ_ARCH_7    23
+/* ` } */
  
  #define NR_VIRQS       24
+
  /*
- * MMU-UPDATE REQUESTS
- *
- * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
- * A foreigndom (FD) can be specified (or DOMID_SELF for none).
- * Where the FD has some effect, it is described below.
- * ptr[1:0] specifies the appropriate MMU_* command.
+ * ` enum neg_errnoval
+ * ` HYPERVISOR_mmu_update(const struct mmu_update reqs[],
+ * `                       unsigned count, unsigned *done_out,
+ * `                       unsigned foreigndom)
+ * `
+ * @reqs is an array of mmu_update_t structures ((ptr, val) pairs).
+ * @count is the length of the above array.
+ * @done_out is an output parameter indicating number of completed operations
+ * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this
+ *                    hypercall invocation. Can be DOMID_SELF.
+ * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced
+ *                     in this hypercall invocation. The value of this field
+ *                     (x) encodes the PFD as follows:
+ *                     x == 0 => PFD == DOMID_SELF
+ *                     x != 0 => PFD == x - 1
   *
+ * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command.
+ * -------------
   * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
- * Updates an entry in a page table. If updating an L1 table, and the new
- * table entry is valid/present, the mapped frame must belong to the FD, if
- * an FD has been specified. If attempting to map an I/O page then the
- * caller assumes the privilege of the FD.
+ * Updates an entry in a page table belonging to PFD. If updating an L1 table,
+ * and the new table entry is valid/present, the mapped frame must belong to
+ * FD. If attempting to map an I/O page then the caller assumes the privilege
+ * of the FD.
   * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
   * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
   * ptr[:2]  -- Machine address of the page-table entry to modify.
   * val      -- Value to write.
   *
+ * There are also certain implicit requirements when using this hypercall. The
+ * pages that make up a pagetable must be mapped read-only in the guest.
+ * This prevents uncontrolled guest updates to the pagetable. Xen strictly
+ * enforces this, and will disallow any pagetable update which will end up
+ * mapping pagetable page RW, and will disallow using any writable page as a
+ * pagetable. In practice it means that when constructing a page table for a
+ * process, thread, etc, we MUST be very diligent in following these rules:
+ *  1). Start with top-level page (PGD or in Xen language: L4). Fill out
+ *      the entries.
+ *  2). Keep on going, filling out the upper (PUD or L3), and middle (PMD
+ *      or L2).
+ *  3). Start filling out the PTE table (L1) with the PTE entries. Once
+ *     done, make sure to set each of those entries to RO (so writeable bit
+ *     is unset). Once that has been completed, set the PMD (L2) for this
+ *     PTE table as RO.
+ *  4). When completed with all of the PMD (L2) entries, and all of them have
+ *     been set to RO, make sure to set RO the PUD (L3). Do the same
+ *     operation on PGD (L4) pagetable entries that have a PUD (L3) entry.
+ *  5). Now before you can use those pages (so setting the cr3), you MUST also
+ *      pin them so that the hypervisor can verify the entries. This is done
+ *      via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame
+ *      number of the PGD (L4)). At this point the HYPERVISOR_mmuext_op(
+ *      MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be
+ *      issued.
+ * For 32-bit guests, the L4 is not used (as there are fewer pagetables), so
+ * instead use L3.
+ * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE
+ * hypercall. Also if so desired the OS can also try to write to the PTE
+ * and be trapped by the hypervisor (as the PTE entry is RO).
+ *
+ * To deallocate the pages, the operations are the reverse of the steps
+ * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the
+ * pagetable MUST not be in use (meaning that the cr3 is not set to it).
+ *
   * ptr[1:0] == MMU_MACHPHYS_UPDATE:
   * Updates an entry in the machine->pseudo-physical mapping table.
   * ptr[:2]  -- Machine address within the frame whose mapping to modify.
@@ -119,6 +251,72 @@
   * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
   * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
   * with those in @val.
+ *
+ * @val is usually the machine frame number along with some attributes.
+ * The attributes by default follow the architecture defined bits. Meaning that
+ * if this is a X86_64 machine and four page table layout is used, the layout
+ * of val is:
+ *  - 63 if set means No execute (NX)
+ *  - 46-13 the machine frame number
+ *  - 12 available for guest
+ *  - 11 available for guest
+ *  - 10 available for guest
+ *  - 9 available for guest
+ *  - 8 global
+ *  - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages)
+ *  - 6 dirty
+ *  - 5 accessed
+ *  - 4 page cached disabled
+ *  - 3 page write through
+ *  - 2 userspace accessible
+ *  - 1 writeable
+ *  - 0 present
+ *
+ *  The one bit that does not fit with the default layout is the PAGE_PSE
+ *  (also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the
+ *  HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB
+ *  (or 2MB) instead of using the PAGE_PSE bit.
+ *
+ *  The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen
+ *  using it as the Page Attribute Table (PAT) bit - for details on it please
+ *  refer to Intel SDM 10.12. The PAT allows to set the caching attributes of
+ *  pages instead of using MTRRs.
+ *
+ *  The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits):
+ *             PAT4                 PAT0
+ *   +---+----+----+----+-----+----+----+
+ *    WC | WC | WB | UC | UC- | WC | WB |  <= Linux
+ *   +---+----+----+----+-----+----+----+
+ *    WC | WT | WB | UC | UC- | WT | WB |  <= BIOS (default when machine boots)
+ *   +---+----+----+----+-----+----+----+
+ *    WC | WP | WC | UC | UC- | WT | WB |  <= Xen
+ *   +---+----+----+----+-----+----+----+
+ *
+ *  The lookup of this index table translates to looking up
+ *  Bit 7, Bit 4, and Bit 3 of val entry:
+ *
+ *  PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3).
+ *
+ *  If all bits are off, then we are using PAT0. If bit 3 is turned on,
+ *  then we are using PAT1, if bit 3 and bit 4, then PAT2..
+ *
+ *  As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means
+ *  that if a guest that follows Linux's PAT setup and would like to set Write
+ *  Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is
+ *  set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the
+ *  caching as:
+ *
+ *   WB = none (so PAT0)
+ *   WC = PWT (bit 3 on)
+ *   UC = PWT | PCD (bit 3 and 4 are on).
+ *
+ * To make it work with Xen, it needs to translate the WC bit as so:
+ *
+ *  PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3
+ *
+ * And to translate back it would:
+ *
+ * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7.
   */
  #define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.       */
  #define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for  */
@@ -164,9 +362,23 @@
   * cmd: MMUEXT_FLUSH_CACHE
   * No additional arguments. Writes back and flushes cache contents.
   *
+ * cmd: MMUEXT_FLUSH_CACHE_GLOBAL
+ * No additional arguments. Writes back and flushes cache contents
+ * on all CPUs in the system.
+ *
   * cmd: MMUEXT_SET_LDT
   * linear_addr: Linear address of LDT base (NB. must be page-aligned).
   * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
+ *
+ * cmd: MMUEXT_[UN]MARK_SUPER
+ * mfn: Machine frame number of head of superpage to be [un]marked.
   */
  #define MMUEXT_PIN_L1_TABLE      0
  #define MMUEXT_PIN_L2_TABLE      1
@@ -183,13 +395,19 @@
  #define MMUEXT_FLUSH_CACHE      12
  #define MMUEXT_SET_LDT          13
  #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
+#define MMUEXT_FLUSH_CACHE_GLOBAL 18
+#define MMUEXT_MARK_SUPER       19
+#define MMUEXT_UNMARK_SUPER     20
  
  #ifndef __ASSEMBLY__
  struct mmuext_op {
         unsigned int cmd;
         union {
-               /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
-               unsigned long mfn;
+               /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+                * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
+               xen_pfn_t     mfn;
                 /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
                 unsigned long linear_addr;
         } arg1;
@@ -197,10 +415,18 @@ struct mmuext_op {
                 /* SET_LDT */
                 unsigned int nr_ents;
                 /* TLB_FLUSH_MULTI, INVLPG_MULTI */
-               void *vcpumask;
+#if __XEN_INTERFACE_VERSION__ >= 0x00030205
+               XEN_GUEST_HANDLE(const_void) vcpumask;
+#else
+               const void *vcpumask;
+#endif
+               /* COPY_PAGE */
+               xen_pfn_t src_mfn;
         } arg2;
  };
  DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
+typedef struct mmuext_op mmuext_op_t;
+DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  #endif
  
  /* These are passed as 'flags' to update_va_mapping. They can be ORed. */
@@ -225,11 +451,24 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
   */
  #define VMASST_CMD_enable                0
  #define VMASST_CMD_disable               1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
  #define VMASST_TYPE_4gb_segments         0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
  #define VMASST_TYPE_4gb_segments_notify  1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
  #define VMASST_TYPE_writable_pagetables  2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
  #define VMASST_TYPE_pae_extended_cr3     3
-#define MAX_VMASST_TYPE 3
+
+#define MAX_VMASST_TYPE                  3
  
  #ifndef __ASSEMBLY__
  
@@ -261,6 +500,16 @@ typedef uint16_t domid_t;
  #define DOMID_XEN  (0x7FF2U)
  
  /*
+ * DOMID_COW is used as the owner of sharable pages */
+#define DOMID_COW  (0x7FF3U)
+
+/* DOMID_INVALID is used to identify pages with unknown owner. */
+#define DOMID_INVALID (0x7FF4U)
+
+/* Idle domain. */
+#define DOMID_IDLE (0x7FFFU)
+
+/*
   * Send an array of these to HYPERVISOR_mmu_update().
   * NB. The fields are natural pointer/address size for this architecture.
   */
@@ -269,6 +518,8 @@ struct mmu_update {
      uint64_t val;       /* New contents of PTE.    */
  };
  DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
+typedef struct mmu_update mmu_update_t;
+DEFINE_XEN_GUEST_HANDLE(mmu_update_t);
  
  /*
   * Send an array of these to HYPERVISOR_multicall().
@@ -276,10 +527,16 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
   */
  struct multicall_entry {
      unsigned long op;
+#if !defined(CONFIG_PARAVIRT_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+    unsigned long result;
+#else
      long result;
+#endif
      unsigned long args[6];
  };
  DEFINE_GUEST_HANDLE_STRUCT(multicall_entry);
+typedef struct multicall_entry multicall_entry_t;
+DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
  
  /*
   * Event channel endpoints per domain:
@@ -312,6 +569,7 @@ struct vcpu_time_info {
         int8_t   tsc_shift;
         int8_t   pad1[3];
  }; /* 32 bytes */
+typedef struct vcpu_time_info vcpu_time_info_t;
  
  struct vcpu_info {
         /*
@@ -343,15 +601,26 @@ struct vcpu_info {
         uint8_t evtchn_upcall_mask;
         unsigned long evtchn_pending_sel;
         struct arch_vcpu_info arch;
+#ifdef CONFIG_PARAVIRT_XEN
         struct pvclock_vcpu_time_info time;
+#else
+       struct vcpu_time_info time;
+#endif
  }; /* 64 bytes (x86) */
+#ifndef __XEN__
+typedef struct vcpu_info vcpu_info_t;
+#endif
  
  /*
   * Xen/kernel shared data -- pointer provided in start_info.
- * NB. We expect that this struct is smaller than a page.
+ *
+ * This structure is defined to be both smaller than a page, and the
+ * only data on the shared page, but may vary in actual size even within
+ * compatible Xen versions; guests should not rely on the size
+ * of this structure remaining constant.
   */
  struct shared_info {
-       struct vcpu_info vcpu_info[MAX_VIRT_CPUS];
+       struct vcpu_info vcpu_info[XEN_LEGACY_MAX_VCPUS];
  
         /*
          * A domain can create "event channels" on which it can send and receive
@@ -391,33 +660,41 @@ struct shared_info {
          * Wallclock time: updated only by control software. Guests should base
          * their gettimeofday() syscall on this wallclock-base value.
          */
-       struct pvclock_wall_clock wc;
+#ifdef CONFIG_PARAVIRT_XEN
+    struct pvclock_wall_clock wc;
+#else
+    uint32_t wc_version;      /* Version counter: see vcpu_time_info_t. */
+    uint32_t wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
+    uint32_t wc_nsec;         /* Nsecs 00:00:00 UTC, Jan 1, 1970.  */
+#endif
  
-       struct arch_shared_info arch;
+    struct arch_shared_info arch;
  
  };
+#ifndef __XEN__
+typedef struct shared_info shared_info_t;
+#endif
  
  /*
- * Start-of-day memory layout for the initial domain (DOM0):
+ * Start-of-day memory layout:
   *  1. The domain is started within contiguous virtual-memory region.
- *  2. The contiguous region begins and ends on an aligned 4MB boundary.
- *  3. The region start corresponds to the load address of the OS image.
- *     If the load address is not 4MB aligned then the address is rounded down.
- *  4. This the order of bootstrap elements in the initial virtual region:
+ *  2. The contiguous region ends on an aligned 4MB boundary.
+ *  3. This is the order of bootstrap elements in the initial virtual region:
   *      a. relocated kernel image
   *      b. initial ram disk              [mod_start, mod_len]
   *      c. list of allocated page frames [mfn_list, nr_pages]
+ *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
   *      d. start_info_t structure        [register ESI (x86)]
   *      e. bootstrap page tables         [pt_base, CR3 (x86)]
   *      f. bootstrap stack               [register ESP (x86)]
- *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
- *  6. The initial ram disk may be omitted.
- *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *  4. Bootstrap elements are packed together, but each is 4kB-aligned.
+ *  5. The initial ram disk may be omitted.
+ *  6. The list of page frames forms a contiguous 'pseudo-physical' memory
   *     layout for the domain. In particular, the bootstrap virtual-memory
   *     region is a 1:1 mapping to the first section of the pseudo-physical map.
- *  8. All bootstrap elements are mapped read-writable for the guest OS. The
+ *  7. All bootstrap elements are mapped read-writable for the guest OS. The
   *     only exception is the bootstrap page table, which is mapped read-only.
- *  9. There is guaranteed to be at least 512kB padding after the final
+ *  8. There is guaranteed to be at least 512kB padding after the final
   *     bootstrap element. If necessary, the bootstrap virtual region is
   *     extended by an extra 4MB to ensure this.
   */
@@ -429,11 +706,11 @@ struct start_info {
         unsigned long nr_pages;     /* Total pages allocated to this domain.  */
         unsigned long shared_info;  /* MACHINE address of shared info struct. */
         uint32_t flags;             /* SIF_xxx flags.                         */
-       unsigned long store_mfn;    /* MACHINE page number of shared page.    */
+       xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
         uint32_t store_evtchn;      /* Event channel for store communication. */
         union {
                 struct {
-                       unsigned long mfn;  /* MACHINE page number of console page.   */
+                       xen_pfn_t mfn;      /* MACHINE page number of console page.   */
                         uint32_t  evtchn;   /* Event channel for console page.        */
                 } domU;
                 struct {
@@ -448,53 +725,93 @@ struct start_info {
         unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
         unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
         int8_t cmd_line[MAX_GUEST_CMDLINE];
+       /* The pfn range here covers both page table and p->m table frames.   */
+       unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+       unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
  };
+typedef struct start_info start_info_t;
  
-struct dom0_vga_console_info {
-       uint8_t video_type;
-#define XEN_VGATYPE_TEXT_MODE_3 0x03
-#define XEN_VGATYPE_VESA_LFB    0x23
-
-       union {
-               struct {
-                       /* Font height, in pixels. */
-                       uint16_t font_height;
-                       /* Cursor location (column, row). */
-                       uint16_t cursor_x, cursor_y;
-                       /* Number of rows and columns (dimensions in characters). */
-                       uint16_t rows, columns;
-               } text_mode_3;
-
-               struct {
-                       /* Width and height, in pixels. */
-                       uint16_t width, height;
-                       /* Bytes per scan line. */
-                       uint16_t bytes_per_line;
-                       /* Bits per pixel. */
-                       uint16_t bits_per_pixel;
-                       /* LFB physical address, and size (in units of 64kB). */
-                       uint32_t lfb_base;
-                       uint32_t lfb_size;
-                       /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
-                       uint8_t  red_pos, red_size;
-                       uint8_t  green_pos, green_size;
-                       uint8_t  blue_pos, blue_size;
-                       uint8_t  rsvd_pos, rsvd_size;
-
-                       /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
-                       uint32_t gbl_caps;
-                       /* Mode attributes (offset 0x0, VESA command 0x4f01). */
-                       uint16_t mode_attrs;
-               } vesa_lfb;
-       } u;
-};
+/* New console union for dom0 introduced in 0x00030203. */
+#if __XEN_INTERFACE_VERSION__ < 0x00030203
+#define console_mfn    console.domU.mfn
+#define console_evtchn console.domU.evtchn
+#endif
  
  /* These flags are passed in the 'flags' field of start_info_t. */
  #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
  #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
  #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
  
-typedef uint64_t cpumap_t;
+/*
+ * A multiboot module is a package containing modules very similar to a
+ * multiboot module array. The only differences are:
+ * - the array of module descriptors is by convention simply at the beginning
+ *   of the multiboot module,
+ * - addresses in the module descriptors are based on the beginning of the
+ *   multiboot module,
+ * - the number of modules is determined by a termination descriptor that has
+ *   mod_start == 0.
+ *
+ * This permits to both build it statically and reference it in a configuration
+ * file, and let the PV guest easily rebase the addresses to virtual addresses
+ * and at the same time count the number of modules.
+ */
+struct xen_multiboot_mod_list
+{
+    /* Address of first byte of the module */
+    uint32_t mod_start;
+    /* Address of last byte of the module (inclusive) */
+    uint32_t mod_end;
+    /* Address of zero-terminated command line */
+    uint32_t cmdline;
+    /* Unused, must be zero */
+    uint32_t pad;
+};
+
+typedef struct dom0_vga_console_info {
+    uint8_t video_type; /* DOM0_VGA_CONSOLE_??? */
+#define XEN_VGATYPE_TEXT_MODE_3 0x03
+#define XEN_VGATYPE_VESA_LFB    0x23
+#define XEN_VGATYPE_EFI_LFB     0x70
+
+    union {
+        struct {
+            /* Font height, in pixels. */
+            uint16_t font_height;
+            /* Cursor location (column, row). */
+            uint16_t cursor_x, cursor_y;
+            /* Number of rows and columns (dimensions in characters). */
+            uint16_t rows, columns;
+        } text_mode_3;
+
+        struct {
+            /* Width and height, in pixels. */
+            uint16_t width, height;
+            /* Bytes per scan line. */
+            uint16_t bytes_per_line;
+            /* Bits per pixel. */
+            uint16_t bits_per_pixel;
+            /* LFB physical address, and size (in units of 64kB). */
+            uint32_t lfb_base;
+            uint32_t lfb_size;
+            /* RGB mask offsets and sizes, as defined by VBE 1.2+ */
+            uint8_t  red_pos, red_size;
+            uint8_t  green_pos, green_size;
+            uint8_t  blue_pos, blue_size;
+            uint8_t  rsvd_pos, rsvd_size;
+#if __XEN_INTERFACE_VERSION__ >= 0x00030206 || (defined(CONFIG_PARAVIRT_XEN) && !defined(HAVE_XEN_PLATFORM_COMPAT_H))
+            /* VESA capabilities (offset 0xa, VESA command 0x4f00). */
+            uint32_t gbl_caps;
+            /* Mode attributes (offset 0x0, VESA command 0x4f01). */
+            uint16_t mode_attrs;
+#endif
+        } vesa_lfb;
+    } u;
+} dom0_vga_console_info_t;
+#define xen_vga_console_info dom0_vga_console_info
+#define xen_vga_console_info_t dom0_vga_console_info_t
  
  typedef uint8_t xen_domain_handle_t[16];
  
@@ -502,28 +819,10 @@ typedef uint8_t xen_domain_handle_t[16];
  #define __mk_unsigned_long(x) x ## UL
  #define mk_unsigned_long(x) __mk_unsigned_long(x)
  
-#define TMEM_SPEC_VERSION 1
-
-struct tmem_op {
-       uint32_t cmd;
-       int32_t pool_id;
-       union {
-               struct {  /* for cmd == TMEM_NEW_POOL */
-                       uint64_t uuid[2];
-                       uint32_t flags;
-               } new;
-               struct {
-                       uint64_t oid[3];
-                       uint32_t index;
-                       uint32_t tmem_offset;
-                       uint32_t pfn_offset;
-                       uint32_t len;
-                       GUEST_HANDLE(void) gmfn; /* guest machine page frame */
-               } gen;
-       } u;
-};
-
-DEFINE_GUEST_HANDLE(u64);
+__DEFINE_XEN_GUEST_HANDLE(uint8,  uint8_t);
+__DEFINE_XEN_GUEST_HANDLE(uint16, uint16_t);
+__DEFINE_XEN_GUEST_HANDLE(uint32, uint32_t);
+__DEFINE_XEN_GUEST_HANDLE(uint64, uint64_t);
  
  #else /* __ASSEMBLY__ */
  
@@ -532,4 +831,23 @@ DEFINE_GUEST_HANDLE(u64);
  
  #endif /* !__ASSEMBLY__ */
  
+/* Default definitions for macros used by domctl/sysctl. */
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+#ifndef XEN_GUEST_HANDLE_64
+#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name)
+#endif
+
+#ifndef __ASSEMBLY__
+struct xenctl_cpumap {
+    XEN_GUEST_HANDLE_64(uint8) bitmap;
+    uint32_t nr_cpus;
+};
+#endif
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
  #endif /* __XEN_PUBLIC_XEN_H__ */
diff --git a/include/xen/interface/xenoprof.h b/include/xen/interface/xenoprof.h

new file mode 100644 (file)

index 0000000..a0c6987
--- /dev/null
+++ b/include/xen/interface/xenoprof.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * xenoprof.h
+ * 
+ * Interface for enabling system wide profiling based on hardware performance
+ * counters
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (C) 2005 Hewlett-Packard Co.
+ * Written by Aravind Menon & Jose Renato Santos
+ */
+
+#ifndef __XEN_PUBLIC_XENOPROF_H__
+#define __XEN_PUBLIC_XENOPROF_H__
+
+#include "xen.h"
+
+/*
+ * Commands to HYPERVISOR_xenoprof_op().
+ */
+#define XENOPROF_init                0
+#define XENOPROF_reset_active_list   1
+#define XENOPROF_reset_passive_list  2
+#define XENOPROF_set_active          3
+#define XENOPROF_set_passive         4
+#define XENOPROF_reserve_counters    5
+#define XENOPROF_counter             6
+#define XENOPROF_setup_events        7
+#define XENOPROF_enable_virq         8
+#define XENOPROF_start               9
+#define XENOPROF_stop               10
+#define XENOPROF_disable_virq       11
+#define XENOPROF_release_counters   12
+#define XENOPROF_shutdown           13
+#define XENOPROF_get_buffer         14
+#define XENOPROF_set_backtrace      15
+
+/* AMD IBS support */
+#define XENOPROF_get_ibs_caps       16
+#define XENOPROF_ibs_counter        17
+#define XENOPROF_last_op            17
+
+#define MAX_OPROF_EVENTS    32
+#define MAX_OPROF_DOMAINS   25
+#define XENOPROF_CPU_TYPE_SIZE 64
+
+/* Xenoprof performance events (not Xen events) */
+struct event_log {
+    uint64_t eip;
+    uint8_t mode;
+    uint8_t event;
+};
+
+/* PC value that indicates a special code */
+#define XENOPROF_ESCAPE_CODE (~0ULL)
+/* Transient events for the xenoprof->oprofile cpu buf */
+#define XENOPROF_TRACE_BEGIN 1
+
+/* Xenoprof buffer shared between Xen and domain - 1 per VCPU */
+struct xenoprof_buf {
+    uint32_t event_head;
+    uint32_t event_tail;
+    uint32_t event_size;
+    uint32_t vcpu_id;
+    uint64_t xen_samples;
+    uint64_t kernel_samples;
+    uint64_t user_samples;
+    uint64_t lost_samples;
+    struct event_log event_log[1];
+};
+#ifndef __XEN__
+typedef struct xenoprof_buf xenoprof_buf_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_buf_t);
+#endif
+
+struct xenoprof_init {
+    int32_t  num_events;
+    int32_t  is_primary;
+    char cpu_type[XENOPROF_CPU_TYPE_SIZE];
+};
+typedef struct xenoprof_init xenoprof_init_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_init_t);
+
+struct xenoprof_get_buffer {
+    int32_t  max_samples;
+    int32_t  nbuf;
+    int32_t  bufsize;
+    uint64_t buf_gmaddr;
+};
+typedef struct xenoprof_get_buffer xenoprof_get_buffer_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_get_buffer_t);
+
+struct xenoprof_counter {
+    uint32_t ind;
+    uint64_t count;
+    uint32_t enabled;
+    uint32_t event;
+    uint32_t hypervisor;
+    uint32_t kernel;
+    uint32_t user;
+    uint64_t unit_mask;
+};
+typedef struct xenoprof_counter xenoprof_counter_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_counter_t);
+
+typedef struct xenoprof_passive {
+    uint16_t domain_id;
+    int32_t  max_samples;
+    int32_t  nbuf;
+    int32_t  bufsize;
+    uint64_t buf_gmaddr;
+} xenoprof_passive_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_passive_t);
+
+struct xenoprof_ibs_counter {
+    uint64_t op_enabled;
+    uint64_t fetch_enabled;
+    uint64_t max_cnt_fetch;
+    uint64_t max_cnt_op;
+    uint64_t rand_en;
+    uint64_t dispatched_ops;
+};
+typedef struct xenoprof_ibs_counter xenoprof_ibs_counter_t;
+DEFINE_XEN_GUEST_HANDLE(xenoprof_ibs_counter_t);
+
+#endif /* __XEN_PUBLIC_XENOPROF_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/xsm/acm.h b/include/xen/interface/xsm/acm.h

new file mode 100644 (file)

index 0000000..b6ac8d5
--- /dev/null
+++ b/include/xen/interface/xsm/acm.h
@@ -0,0 +1,223 @@
+/*
+ * acm.h: Xen access control module interface definitions
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005, International Business Machines Corporation.
+ */
+
+#ifndef _XEN_PUBLIC_ACM_H
+#define _XEN_PUBLIC_ACM_H
+
+#include "../xen.h"
+
+/* default ssid reference value if not supplied */
+#define ACM_DEFAULT_SSID  0x0
+#define ACM_DEFAULT_LOCAL_SSID  0x0
+
+/* Internal ACM ERROR types */
+#define ACM_OK     0
+#define ACM_UNDEF   -1
+#define ACM_INIT_SSID_ERROR  -2
+#define ACM_INIT_SOID_ERROR  -3
+#define ACM_ERROR          -4
+
+/* External ACCESS DECISIONS */
+#define ACM_ACCESS_PERMITTED        0
+#define ACM_ACCESS_DENIED           -111
+#define ACM_NULL_POINTER_ERROR      -200
+
+/*
+   Error codes reported when trying to test for a new policy.
+   These error codes are reported in an array of tuples where
+   each error code is followed by a parameter describing the error
+   more closely, such as a domain id.
+*/
+#define ACM_EVTCHN_SHARING_VIOLATION       0x100
+#define ACM_GNTTAB_SHARING_VIOLATION       0x101
+#define ACM_DOMAIN_LOOKUP                  0x102
+#define ACM_CHWALL_CONFLICT                0x103
+#define ACM_SSIDREF_IN_USE                 0x104
+
+
+/* primary policy in lower 4 bits */
+#define ACM_NULL_POLICY 0
+#define ACM_CHINESE_WALL_POLICY 1
+#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
+#define ACM_POLICY_UNDEFINED 15
+
+/* combinations have secondary policy component in higher 4bit */
+#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY \
+    ((ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY << 4) | ACM_CHINESE_WALL_POLICY)
+
+/* policy code -> printable name; X is evaluated more than once */
+#define ACM_POLICY_NAME(X) \
+ (((X) == (ACM_NULL_POLICY)) ? "NULL" :                        \
+    ((X) == (ACM_CHINESE_WALL_POLICY)) ? "CHINESE WALL" :        \
+    ((X) == (ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "SIMPLE TYPE ENFORCEMENT" : \
+    ((X) == (ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY)) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT" : \
+     "UNDEFINED")
+
+/* the following policy versions must be increased
+ * whenever the interpretation of the related
+ * policy's data structure changes
+ */
+#define ACM_POLICY_VERSION 4
+#define ACM_CHWALL_VERSION 1
+#define ACM_STE_VERSION  1
+
+/* defines a ssid reference used by xen */
+typedef uint32_t ssidref_t;
+
+/* hooks that are known to domains */
+#define ACMHOOK_none          0
+#define ACMHOOK_sharing       1
+#define ACMHOOK_authorization 2
+#define ACMHOOK_conflictset   3
+
+/* -------security policy relevant type definitions-------- */
+
+/* type identifier; compares to "equal" or "not equal" */
+typedef uint16_t domaintype_t;
+
+/* CHINESE WALL POLICY DATA STRUCTURES
+ *
+ * current accumulated conflict type set:
+ * When a domain is started and has a type that is in
+ * a conflict set, the conflicting types are incremented in
+ * the aggregate set. When a domain is destroyed, the 
+ * conflicting types to its type are decremented.
+ * If a domain has multiple types, this procedure works over
+ * all those types.
+ *
+ * conflict_aggregate_set[i] holds the number of
+ *   running domains that have a conflict with type i.
+ *
+ * running_types[i] holds the number of running domains
+ *        that include type i in their ssidref-referenced type set
+ *
+ * conflict_sets[i][j] is "0" if type j has no conflict
+ *    with type i and is "1" otherwise.
+ */
+/* high-16 = version, low-16 = check magic */
+#define ACM_MAGIC  0x0001debc
+
+/* size of the SHA1 hash identifying the XML policy from which the
+   binary policy was created */
+#define ACM_SHA1_HASH_SIZE    20
+
+/* each offset in bytes from start of the struct they
+ * are part of */
+
+/* V3 of the policy buffer added a version structure */
+struct acm_policy_version
+{
+    uint32_t major;
+    uint32_t minor;
+};
+
+
+/* each buffer consists of all policy information for
+ * the respective policy given in the policy code
+ *
+ * acm_policy_buffer, acm_chwall_policy_buffer,
+ * and acm_ste_policy_buffer need to stay 32-bit aligned
+ * because we create binary policies also with external
+ * tools that assume packed representations (e.g. the java tool)
+ */
+struct acm_policy_buffer {
+    uint32_t magic;
+    uint32_t policy_version; /* ACM_POLICY_VERSION */
+    uint32_t len;
+    uint32_t policy_reference_offset;
+    uint32_t primary_policy_code;
+    uint32_t primary_buffer_offset;
+    uint32_t secondary_policy_code;
+    uint32_t secondary_buffer_offset;
+    struct acm_policy_version xml_pol_version; /* add in V3 */
+    uint8_t xml_policy_hash[ACM_SHA1_HASH_SIZE]; /* added in V4 */
+};
+
+
+struct acm_policy_reference_buffer {
+    uint32_t len;
+};
+
+struct acm_chwall_policy_buffer {
+    uint32_t policy_version; /* ACM_CHWALL_VERSION */
+    uint32_t policy_code;
+    uint32_t chwall_max_types;
+    uint32_t chwall_max_ssidrefs;
+    uint32_t chwall_max_conflictsets;
+    uint32_t chwall_ssid_offset;
+    uint32_t chwall_conflict_sets_offset;
+    uint32_t chwall_running_types_offset;
+    uint32_t chwall_conflict_aggregate_offset;
+};
+
+struct acm_ste_policy_buffer {
+    uint32_t policy_version; /* ACM_STE_VERSION */
+    uint32_t policy_code;
+    uint32_t ste_max_types;
+    uint32_t ste_max_ssidrefs;
+    uint32_t ste_ssid_offset;
+};
+
+struct acm_stats_buffer {
+    uint32_t magic;
+    uint32_t len;
+    uint32_t primary_policy_code;
+    uint32_t primary_stats_offset;
+    uint32_t secondary_policy_code;
+    uint32_t secondary_stats_offset;
+};
+
+struct acm_ste_stats_buffer {
+    uint32_t ec_eval_count;
+    uint32_t gt_eval_count;
+    uint32_t ec_denied_count;
+    uint32_t gt_denied_count;
+    uint32_t ec_cachehit_count;
+    uint32_t gt_cachehit_count;
+};
+
+struct acm_ssid_buffer {
+    uint32_t len;
+    ssidref_t ssidref;
+    uint32_t policy_reference_offset;
+    uint32_t primary_policy_code;
+    uint32_t primary_max_types;
+    uint32_t primary_types_offset;
+    uint32_t secondary_policy_code;
+    uint32_t secondary_max_types;
+    uint32_t secondary_types_offset;
+};
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/xsm/acm_ops.h b/include/xen/interface/xsm/acm_ops.h

new file mode 100644 (file)

index 0000000..1fef7a0
--- /dev/null
+++ b/include/xen/interface/xsm/acm_ops.h
@@ -0,0 +1,159 @@
+/*
+ * acm_ops.h: Xen access control module hypervisor commands
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Reiner Sailer <sailer@watson.ibm.com>
+ * Copyright (c) 2005,2006 International Business Machines Corporation.
+ */
+
+#ifndef __XEN_PUBLIC_ACM_OPS_H__
+#define __XEN_PUBLIC_ACM_OPS_H__
+
+#include "../xen.h"
+#include "acm.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of acm tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define ACM_INTERFACE_VERSION   0xAAAA000A
+
+/************************************************************************/
+
+/*
+ * Prototype for this hypercall is:
+ *  int acm_op(int cmd, void *args)
+ * @cmd  == ACMOP_??? (access control module operation).
+ * @args == Operation-specific extra arguments (NULL if none).
+ */
+
+
+#define ACMOP_setpolicy         1
+struct acm_setpolicy {
+    /* IN */
+    XEN_GUEST_HANDLE_64(void) pushcache;
+    uint32_t pushcache_size;
+};
+
+
+#define ACMOP_getpolicy         2
+struct acm_getpolicy {
+    /* IN */
+    XEN_GUEST_HANDLE_64(void) pullcache;
+    uint32_t pullcache_size;
+};
+
+
+#define ACMOP_dumpstats         3
+struct acm_dumpstats {
+    /* IN */
+    XEN_GUEST_HANDLE_64(void) pullcache;
+    uint32_t pullcache_size;
+};
+
+
+#define ACMOP_getssid           4
+#define ACM_GETBY_ssidref  1
+#define ACM_GETBY_domainid 2
+struct acm_getssid {
+    /* IN */
+    uint32_t get_ssid_by; /* ACM_GETBY_* */
+    union {
+        domaintype_t domainid;
+        ssidref_t    ssidref;
+    } id;
+    XEN_GUEST_HANDLE_64(void) ssidbuf;
+    uint32_t ssidbuf_size;
+};
+
+#define ACMOP_getdecision      5
+struct acm_getdecision {
+    /* IN */
+    uint32_t get_decision_by1; /* ACM_GETBY_* */
+    uint32_t get_decision_by2; /* ACM_GETBY_* */
+    union {
+        domaintype_t domainid;
+        ssidref_t    ssidref;
+    } id1;
+    union {
+        domaintype_t domainid;
+        ssidref_t    ssidref;
+    } id2;
+    uint32_t hook;
+    /* OUT */
+    uint32_t acm_decision;
+};
+
+
+#define ACMOP_chgpolicy        6
+struct acm_change_policy {
+    /* IN */
+    XEN_GUEST_HANDLE_64(void) policy_pushcache;
+    uint32_t policy_pushcache_size;
+    XEN_GUEST_HANDLE_64(void) del_array;
+    uint32_t delarray_size;
+    XEN_GUEST_HANDLE_64(void) chg_array;
+    uint32_t chgarray_size;
+    /* OUT */
+    /* array with error code */
+    XEN_GUEST_HANDLE_64(void) err_array;
+    uint32_t errarray_size;
+};
+
+#define ACMOP_relabeldoms       7
+struct acm_relabel_doms {
+    /* IN */
+    XEN_GUEST_HANDLE_64(void) relabel_map;
+    uint32_t relabel_map_size;
+    /* OUT */
+    XEN_GUEST_HANDLE_64(void) err_array;
+    uint32_t errarray_size;
+};
+
+/* future interface to Xen */
+struct xen_acmctl {
+    uint32_t cmd;
+    uint32_t interface_version;
+    union {
+        struct acm_setpolicy     setpolicy;
+        struct acm_getpolicy     getpolicy;
+        struct acm_dumpstats     dumpstats;
+        struct acm_getssid       getssid;
+        struct acm_getdecision   getdecision;
+        struct acm_change_policy change_policy;
+        struct acm_relabel_doms  relabel_doms;
+    } u;
+};
+
+typedef struct xen_acmctl xen_acmctl_t;
+DEFINE_XEN_GUEST_HANDLE(xen_acmctl_t);
+
+#endif /* __XEN_PUBLIC_ACM_OPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/include/xen/interface/xsm/flask_op.h b/include/xen/interface/xsm/flask_op.h

new file mode 100644 (file)

index 0000000..1a251c9
--- /dev/null
+++ b/include/xen/interface/xsm/flask_op.h
@@ -0,0 +1,193 @@
+/*
+ *  This file contains the flask_op hypercall commands and definitions.
+ *
+ *  Author:  George Coker, <gscoker@alpha.ncsc.mil>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __FLASK_OP_H__
+#define __FLASK_OP_H__
+
+#define XEN_FLASK_INTERFACE_VERSION 1
+
+struct xen_flask_load {
+    XEN_GUEST_HANDLE(char) buffer;
+    uint32_t size;
+};
+
+struct xen_flask_setenforce {
+    uint32_t enforcing;
+};
+
+struct xen_flask_sid_context {
+    /* IN/OUT: sid to convert to/from string */
+    uint32_t sid;
+    /* IN: size of the context buffer
+     * OUT: actual size of the output context string
+     */
+    uint32_t size;
+    XEN_GUEST_HANDLE(char) context;
+};
+
+struct xen_flask_access {
+    /* IN: access request */
+    uint32_t ssid;
+    uint32_t tsid;
+    uint32_t tclass;
+    uint32_t req;
+    /* OUT: AVC data */
+    uint32_t allowed;
+    uint32_t audit_allow;
+    uint32_t audit_deny;
+    uint32_t seqno;
+};
+
+struct xen_flask_transition {
+    /* IN: transition SIDs and class */
+    uint32_t ssid;
+    uint32_t tsid;
+    uint32_t tclass;
+    /* OUT: new SID */
+    uint32_t newsid;
+};
+
+struct xen_flask_userlist {
+    /* IN: starting SID for list */
+    uint32_t start_sid;
+    /* IN: size of user string and output buffer
+     * OUT: number of SIDs returned */
+    uint32_t size;
+    union {
+        /* IN: user to enumerate SIDs */
+        XEN_GUEST_HANDLE(char) user;
+        /* OUT: SID list */
+        XEN_GUEST_HANDLE(uint32) sids;
+    } u;
+};
+
+struct xen_flask_boolean {
+    /* IN/OUT: numeric identifier for boolean [GET/SET]
+     * If -1, name will be used and bool_id will be filled in. */
+    uint32_t bool_id;
+    /* OUT: current enforcing value of boolean [GET/SET] */
+    uint8_t enforcing;
+    /* OUT: pending value of boolean [GET/SET] */
+    uint8_t pending;
+    /* IN: new value of boolean [SET] */
+    uint8_t new_value;
+    /* IN: commit new value instead of only setting pending [SET] */
+    uint8_t commit;
+    /* IN: size of boolean name buffer [GET/SET]
+     * OUT: actual size of name [GET only] */
+    uint32_t size;
+    /* IN: if bool_id is -1, used to find boolean [GET/SET]
+     * OUT: textual name of boolean [GET only]
+     */
+    XEN_GUEST_HANDLE(char) name;
+};
+
+struct xen_flask_setavc_threshold {
+    /* IN */
+    uint32_t threshold;
+};
+
+struct xen_flask_hash_stats {
+    /* OUT */
+    uint32_t entries;
+    uint32_t buckets_used;
+    uint32_t buckets_total;
+    uint32_t max_chain_len;
+};
+
+struct xen_flask_cache_stats {
+    /* IN */
+    uint32_t cpu;
+    /* OUT */
+    uint32_t lookups;
+    uint32_t hits;
+    uint32_t misses;
+    uint32_t allocations;
+    uint32_t reclaims;
+    uint32_t frees;
+};
+
+struct xen_flask_ocontext {
+    /* IN */
+    uint32_t ocon;
+    uint32_t sid;
+    uint64_t low, high;
+};
+
+struct xen_flask_peersid {
+    /* IN */
+    evtchn_port_t evtchn;
+    /* OUT */
+    uint32_t sid;
+};
+
+struct xen_flask_op {
+    uint32_t cmd;
+#define FLASK_LOAD              1
+#define FLASK_GETENFORCE        2
+#define FLASK_SETENFORCE        3
+#define FLASK_CONTEXT_TO_SID    4
+#define FLASK_SID_TO_CONTEXT    5
+#define FLASK_ACCESS            6
+#define FLASK_CREATE            7
+#define FLASK_RELABEL           8
+#define FLASK_USER              9
+#define FLASK_POLICYVERS        10
+#define FLASK_GETBOOL           11
+#define FLASK_SETBOOL           12
+#define FLASK_COMMITBOOLS       13
+#define FLASK_MLS               14
+#define FLASK_DISABLE           15
+#define FLASK_GETAVC_THRESHOLD  16
+#define FLASK_SETAVC_THRESHOLD  17
+#define FLASK_AVC_HASHSTATS     18
+#define FLASK_AVC_CACHESTATS    19
+#define FLASK_MEMBER            20
+#define FLASK_ADD_OCONTEXT      21
+#define FLASK_DEL_OCONTEXT      22
+#define FLASK_GET_PEER_SID      23
+    uint32_t interface_version; /* XEN_FLASK_INTERFACE_VERSION */
+    union {
+        struct xen_flask_load load;
+        struct xen_flask_setenforce enforce;
+        /* FLASK_CONTEXT_TO_SID and FLASK_SID_TO_CONTEXT */
+        struct xen_flask_sid_context sid_context;
+        struct xen_flask_access access;
+        /* FLASK_CREATE, FLASK_RELABEL, FLASK_MEMBER */
+        struct xen_flask_transition transition;
+        struct xen_flask_userlist userlist;
+        /* FLASK_GETBOOL, FLASK_SETBOOL */
+        struct xen_flask_boolean boolean;
+        struct xen_flask_setavc_threshold setavc_threshold;
+        struct xen_flask_hash_stats hash_stats;
+        struct xen_flask_cache_stats cache_stats;
+        /* FLASK_ADD_OCONTEXT, FLASK_DEL_OCONTEXT */
+        struct xen_flask_ocontext ocontext;
+        struct xen_flask_peersid peersid;
+    } u;
+};
+typedef struct xen_flask_op xen_flask_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_flask_op_t);
+
+#endif
diff --git a/include/xen/net-util.h b/include/xen/net-util.h

new file mode 100644 (file)

index 0000000..8561e2c
--- /dev/null
+++ b/include/xen/net-util.h
@@ -0,0 +1,75 @@
+#ifndef __XEN_NETUTIL_H__
+#define __XEN_NETUTIL_H__
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/ip.h>
+
+static inline int skb_checksum_setup(struct sk_buff *skb,
+                                    unsigned long *fixup_counter)
+{
+       struct iphdr *iph = (void *)skb->data;
+       unsigned char *th;
+       __be16 *csum = NULL;
+       int err = -EPROTO;
+
+       if (skb->ip_summed != CHECKSUM_PARTIAL) {
+               /* A non-CHECKSUM_PARTIAL SKB does not require setup. */
+               if (!skb_is_gso(skb))
+                       return 0;
+
+               /*
+                * A GSO SKB must be CHECKSUM_PARTIAL. However some buggy
+                * peers can fail to set NETRXF_csum_blank when sending a GSO
+                * frame. In this case force the SKB to CHECKSUM_PARTIAL and
+                * recalculate the partial checksum.
+                */
+               ++*fixup_counter;
+               --csum;
+       }
+
+       if (skb->protocol != htons(ETH_P_IP))
+               goto out;
+
+       th = skb->data + 4 * iph->ihl;
+       if (th >= skb_tail_pointer(skb))
+               goto out;
+
+       skb->csum_start = th - skb->head;
+       switch (iph->protocol) {
+       case IPPROTO_TCP:
+               skb->csum_offset = offsetof(struct tcphdr, check);
+               if (csum)
+                       csum = &((struct tcphdr *)th)->check;
+               break;
+       case IPPROTO_UDP:
+               skb->csum_offset = offsetof(struct udphdr, check);
+               if (csum)
+                       csum = &((struct udphdr *)th)->check;
+               break;
+       default:
+               if (net_ratelimit())
+                       pr_err("Attempting to checksum a non-"
+                              "TCP/UDP packet, dropping a protocol"
+                              " %d packet\n", iph->protocol);
+               goto out;
+       }
+
+       if ((th + skb->csum_offset + sizeof(*csum)) > skb_tail_pointer(skb))
+               goto out;
+
+       if (csum) {
+               *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+                                          skb->len - iph->ihl*4,
+                                          IPPROTO_TCP, 0);
+               skb->ip_summed = CHECKSUM_PARTIAL;
+       }
+
+       err = 0;
+out:
+       return err;
+}
+
+#endif /* __XEN_NETUTIL_H__ */
diff --git a/include/xen/pcifront.h b/include/xen/pcifront.h

new file mode 100644 (file)

index 0000000..2ff803f
--- /dev/null
+++ b/include/xen/pcifront.h
@@ -0,0 +1,61 @@
+/*
+ * PCI Frontend - arch-dependent declarations
+ *
+ *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ */
+#ifndef __XEN_ASM_PCIFRONT_H__
+#define __XEN_ASM_PCIFRONT_H__
+
+#include <linux/spinlock.h>
+
+#ifdef __KERNEL__
+
+#ifndef __ia64__
+
+#include <asm/pci.h>
+
+struct pcifront_device;
+struct pci_bus;
+#define pcifront_sd pci_sysdata
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+       return sd->pdev;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+                                                struct pcifront_sd *sd)
+{
+}
+
+#else /* __ia64__ */
+
+#include <linux/acpi.h>
+#include <asm/pci.h>
+#define pcifront_sd pci_controller
+
+extern void xen_add_resource(struct pci_controller *, unsigned int,
+                            unsigned int, struct acpi_resource *);
+extern void xen_pcibios_setup_root_windows(struct pci_bus *,
+                                          struct pci_controller *);
+
+static inline struct pcifront_device *
+pcifront_get_pdev(struct pcifront_sd *sd)
+{
+       return (struct pcifront_device *)sd->platform_data;
+}
+
+static inline void pcifront_setup_root_resources(struct pci_bus *bus,
+                                                struct pcifront_sd *sd)
+{
+       xen_pcibios_setup_root_windows(bus, sd);
+}
+
+#endif /* __ia64__ */
+
+extern struct rw_semaphore pci_bus_sem;
+
+#endif /* __KERNEL__ */
+
+#endif /* __XEN_ASM_PCIFRONT_H__ */
diff --git a/include/xen/pcpu.h b/include/xen/pcpu.h

new file mode 100644 (file)

index 0000000..47518e0
--- /dev/null
+++ b/include/xen/pcpu.h
@@ -0,0 +1,19 @@
+#ifndef _XEN_SYSCTL_H
+#define _XEN_SYSCTL_H
+
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+int register_pcpu_notifier(struct notifier_block *);
+void unregister_pcpu_notifier(struct notifier_block *);
+
+#ifdef CONFIG_X86
+int __must_check rdmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no,
+                                   u32 *l, u32 *h);
+int __must_check wrmsr_safe_on_pcpu(unsigned int pcpu, u32 msr_no,
+                                   u32 l, u32 h);
+int __must_check rdmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs);
+int __must_check wrmsr_safe_regs_on_pcpu(unsigned int pcpu, u32 *regs);
+#endif
+
+#endif /* _XEN_SYSCTL_H */
diff --git a/include/xen/privcmd.h b/include/xen/privcmd.h

index 17857fb..2a94439 100644 (file)
--- a/include/xen/privcmd.h
+++ b/include/xen/privcmd.h
@@ -1,77 +1,3 @@
-/******************************************************************************
- * privcmd.h
- *
- * Interface to /proc/xen/privcmd.
- *
- * Copyright (c) 2003-2005, K A Fraser
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __LINUX_PUBLIC_PRIVCMD_H__
-#define __LINUX_PUBLIC_PRIVCMD_H__
-
-#include <linux/types.h>
-#include <linux/compiler.h>
-
-typedef unsigned long xen_pfn_t;
-
-struct privcmd_hypercall {
-       __u64 op;
-       __u64 arg[5];
-};
-
-struct privcmd_mmap_entry {
-       __u64 va;
-       __u64 mfn;
-       __u64 npages;
-};
-
-struct privcmd_mmap {
-       int num;
-       domid_t dom; /* target domain */
-       struct privcmd_mmap_entry __user *entry;
-};
-
-struct privcmd_mmapbatch {
-       int num;     /* number of pages to populate */
-       domid_t dom; /* target domain */
-       __u64 addr;  /* virtual address */
-       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
-};
-
-/*
- * @cmd: IOCTL_PRIVCMD_HYPERCALL
- * @arg: &privcmd_hypercall_t
- * Return: Value returned from execution of the specified hypercall.
- */
-#define IOCTL_PRIVCMD_HYPERCALL                                        \
-       _IOC(_IOC_NONE, 'P', 0, sizeof(struct privcmd_hypercall))
-#define IOCTL_PRIVCMD_MMAP                                     \
-       _IOC(_IOC_NONE, 'P', 2, sizeof(struct privcmd_mmap))
-#define IOCTL_PRIVCMD_MMAPBATCH                                        \
-       _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch))
-
-#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
+#if defined(CONFIG_PARAVIRT_XEN) || !defined(__KERNEL__)
+#include "public/privcmd.h"
+#endif
diff --git a/include/xen/public/Kbuild b/include/xen/public/Kbuild

new file mode 100644 (file)

index 0000000..d4f1aa8
--- /dev/null
+++ b/include/xen/public/Kbuild
@@ -0,0 +1,5 @@
+header-y += evtchn.h
+header-y += gntdev.h
+header-y += iomulti.h
+header-y += privcmd.h
+header-y += xenbus.h
diff --git a/include/xen/public/evtchn.h b/include/xen/public/evtchn.h

new file mode 100644 (file)

index 0000000..938d4da
--- /dev/null
+++ b/include/xen/public/evtchn.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * evtchn.h
+ * 
+ * Interface to /dev/xen/evtchn.
+ * 
+ * Copyright (c) 2003-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ                         \
+       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+       unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
+       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+       unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
+       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+       unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND                            \
+       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+       unsigned int port;
+};
+
+/*
+ * Send a notification via previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY                            \
+       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+       unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET                             \
+       _IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
diff --git a/include/xen/public/gntdev.h b/include/xen/public/gntdev.h

new file mode 100644 (file)

index 0000000..5304bd3
--- /dev/null
+++ b/include/xen/public/gntdev.h
@@ -0,0 +1,150 @@
+/******************************************************************************
+ * gntdev.h
+ * 
+ * Interface to /dev/xen/gntdev.
+ * 
+ * Copyright (c) 2007, D G Murray
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_GNTDEV_H__
+#define __LINUX_PUBLIC_GNTDEV_H__
+
+struct ioctl_gntdev_grant_ref {
+       /* The domain ID of the grant to be mapped. */
+       uint32_t domid;
+       /* The grant reference of the grant to be mapped. */
+       uint32_t ref;
+};
+
+/*
+ * Inserts the grant references into the mapping table of an instance
+ * of gntdev. N.B. This does not perform the mapping, which is deferred
+ * until mmap() is called with @index as the offset.
+ */
+#define IOCTL_GNTDEV_MAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
+struct ioctl_gntdev_map_grant_ref {
+       /* IN parameters */
+       /* The number of grants to be mapped. */
+       uint32_t count;
+       uint32_t pad;
+       /* OUT parameters */
+       /* The offset to be used on a subsequent call to mmap(). */
+       uint64_t index;
+       /* Variable IN parameter. */
+       /* Array of grant references, of size @count. */
+       struct ioctl_gntdev_grant_ref refs[1];
+};
+
+/*
+ * Removes the grant references from the mapping table of an instance of
+ * gntdev. N.B. munmap() must be called on the relevant virtual address(es)
+ * before this ioctl is called, or an error will result.
+ */
+#define IOCTL_GNTDEV_UNMAP_GRANT_REF \
+_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
+struct ioctl_gntdev_unmap_grant_ref {
+       /* IN parameters */
+       /* The offset was returned by the corresponding map operation. */
+       uint64_t index;
+       /* The number of pages to be unmapped. */
+       uint32_t count;
+       uint32_t pad;
+};
+
+/*
+ * Returns the offset in the driver's address space that corresponds
+ * to @vaddr. This can be used to perform a munmap(), followed by an
+ * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by
+ * the caller. The number of pages that were allocated at the same time as
+ * @vaddr is returned in @count.
+ *
+ * N.B. Where more than one page has been mapped into a contiguous range, the
+ *      supplied @vaddr must correspond to the start of the range; otherwise
+ *      an error will result. It is only possible to munmap() the entire
+ *      contiguously-allocated range at once, and not any subrange thereof.
+ */
+#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \
+_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
+struct ioctl_gntdev_get_offset_for_vaddr {
+       /* IN parameters */
+       /* The virtual address of the first mapped page in a range. */
+       uint64_t vaddr;
+       /* OUT parameters */
+       /* The offset that was used in the initial mmap() operation. */
+       uint64_t offset;
+       /* The number of pages mapped in the VM area that begins at @vaddr. */
+       uint32_t count;
+       uint32_t pad;
+};
+
+/*
+ * Sets the maximum number of grants that may be mapped at once by this gntdev
+ * instance.
+ *
+ * N.B. This must be called before any other ioctl is performed on the device.
+ */
+#define IOCTL_GNTDEV_SET_MAX_GRANTS \
+_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
+struct ioctl_gntdev_set_max_grants {
+       /* IN parameter */
+       /* The maximum number of grants that may be mapped at once. */
+       uint32_t count;
+};
+
+/*
+ * Sets up an unmap notification within the page, so that the other side can do
+ * cleanup if this side crashes. Required to implement cross-domain robust
+ * mutexes or close notification on communication channels.
+ *
+ * Each mapped page only supports one notification; multiple calls referring to
+ * the same page overwrite the previous notification. You must clear the
+ * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it
+ * to occur.
+ */
+#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \
+_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify))
+struct ioctl_gntdev_unmap_notify {
+       /* IN parameters */
+       /* Offset in the file descriptor for a byte within the page (same as
+        * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to
+        * be cleared. Otherwise, it can be any byte in the page whose
+        * notification we are adjusting.
+        */
+       uint64_t index;
+       /* Action(s) to take on unmap */
+       uint32_t action;
+       /* Event channel to notify */
+       uint32_t event_channel_port;
+};
+
+/* Clear (set to zero) the byte specified by index */
+#define UNMAP_NOTIFY_CLEAR_BYTE 0x1
+/* Send an interrupt on the indicated event channel */
+#define UNMAP_NOTIFY_SEND_EVENT 0x2
+
+#endif /* __LINUX_PUBLIC_GNTDEV_H__ */
diff --git a/include/xen/public/iomulti.h b/include/xen/public/iomulti.h

new file mode 100644 (file)

index 0000000..ae973f6
--- /dev/null
+++ b/include/xen/public/iomulti.h
@@ -0,0 +1,50 @@
+#ifndef __LINUX_PUBLIC_IOMULTI_H__
+#define __LINUX_PUBLIC_IOMULTI_H__
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (c) 2009 Isaku Yamahata
+ *                    VA Linux Systems Japan K.K.
+ */
+
+struct pci_iomul_setup {
+       uint16_t        segment;
+       uint8_t         bus;
+       uint8_t         dev;
+       uint8_t         func;
+};
+
+struct pci_iomul_in {
+       uint8_t         bar;
+       uint64_t        offset;
+
+       uint8_t         size;
+       uint32_t        value;
+};
+
+struct pci_iomul_out {
+       uint8_t         bar;
+       uint64_t        offset;
+
+       uint8_t         size;
+       uint32_t        value;
+};
+
+#define PCI_IOMUL_SETUP                _IOW ('P', 0, struct pci_iomul_setup)
+#define PCI_IOMUL_DISABLE_IO   _IO  ('P', 1)
+#define PCI_IOMUL_IN           _IOWR('P', 2, struct pci_iomul_in)
+#define PCI_IOMUL_OUT          _IOW ('P', 3, struct pci_iomul_out)
+
+#endif /* __LINUX_PUBLIC_IOMULTI_H__ */
diff --git a/include/xen/public/privcmd.h b/include/xen/public/privcmd.h

new file mode 100644 (file)

index 0000000..dba4e2e
--- /dev/null
+++ b/include/xen/public/privcmd.h
@@ -0,0 +1,86 @@
+/******************************************************************************
+ * privcmd.h
+ * 
+ * Interface to /proc/xen/privcmd.
+ * 
+ * Copyright (c) 2003-2005, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_PRIVCMD_H__
+#define __LINUX_PUBLIC_PRIVCMD_H__
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+
+typedef struct privcmd_hypercall
+{
+       __u64 op;
+       __u64 arg[5];
+} privcmd_hypercall_t;
+
+typedef struct privcmd_mmap_entry {
+       __u64 va;
+       __u64 mfn;
+       __u64 npages;
+} privcmd_mmap_entry_t; 
+
+typedef struct privcmd_mmap {
+       int num;
+       domid_t dom; /* target domain */
+       privcmd_mmap_entry_t __user *entry;
+} privcmd_mmap_t; 
+
+typedef struct privcmd_mmapbatch {
+       int num;     /* number of pages to populate */
+       domid_t dom; /* target domain */
+       __u64 addr;  /* virtual address */
+       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
+} privcmd_mmapbatch_t; 
+
+typedef struct privcmd_mmapbatch_v2 {
+       unsigned int num; /* number of pages to populate */
+       domid_t dom;      /* target domain */
+       __u64 addr;       /* virtual address */
+       const xen_pfn_t __user *arr; /* array of mfns */
+       int __user *err;  /* array of error codes */
+} privcmd_mmapbatch_v2_t;
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL                                        \
+       _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
+#define IOCTL_PRIVCMD_MMAP                                     \
+       _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
+#define IOCTL_PRIVCMD_MMAPBATCH                                        \
+       _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
+#define IOCTL_PRIVCMD_MMAPBATCH_V2                             \
+       _IOC(_IOC_NONE, 'P', 4, sizeof(privcmd_mmapbatch_v2_t))
+
+#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
diff --git a/include/xen/public/xenbus.h b/include/xen/public/xenbus.h

new file mode 100644 (file)

index 0000000..fd61373
--- /dev/null
+++ b/include/xen/public/xenbus.h
@@ -0,0 +1,52 @@
+/******************************************************************************
+ * xenbus.h
+ * 
+ * Interface to /proc/xen/xenbus.
+ * 
+ * Copyright (c) 2008, Diego Ongaro <diego.ongaro@citrix.com>
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_XENBUS_H__
+#define __LINUX_PUBLIC_XENBUS_H__
+
+#include <linux/types.h>
+
+typedef struct xenbus_alloc {
+       domid_t dom;
+       __u32 port;
+       __u32 grant_ref;
+} xenbus_alloc_t;
+
+/*
+ * @cmd: IOCTL_XENBUS_ALLOC
+ * @arg: &xenbus_alloc_t
+ * Return: 0, or -1 for error
+ */
+#define IOCTL_XENBUS_ALLOC                                     \
+       _IOC(_IOC_NONE, 'X', 0, sizeof(xenbus_alloc_t))
+
+#endif /* __LINUX_PUBLIC_XENBUS_H__ */
diff --git a/include/xen/sysctl.h b/include/xen/sysctl.h

new file mode 100644 (file)

index 0000000..7fe9250
--- /dev/null
+++ b/include/xen/sysctl.h
@@ -0,0 +1,11 @@
+#ifndef _XEN_SYSCTL_H
+#define _XEN_SYSCTL_H
+
+/* CTL_XEN names: */
+enum
+{
+       CTL_XEN_INDEPENDENT_WALLCLOCK=1,
+       CTL_XEN_PERMITTED_CLOCK_JITTER=2,
+};
+
+#endif /* _XEN_SYSCTL_H */
diff --git a/include/xen/xen.h b/include/xen/xen.h

index a164024..edb2f5a 100644 (file)
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -7,8 +7,10 @@ enum xen_domain_type {
         XEN_HVM_DOMAIN,         /* running in a Xen hvm domain */
  };
  
-#ifdef CONFIG_XEN
+#if defined(CONFIG_PARAVIRT_XEN)
  extern enum xen_domain_type xen_domain_type;
+#elif defined(CONFIG_XEN)
+#define xen_domain_type                XEN_PV_DOMAIN
  #else
  #define xen_domain_type                XEN_NATIVE
  #endif
@@ -25,6 +27,8 @@ extern enum xen_domain_type xen_domain_type;
  
  #define xen_initial_domain()   (xen_pv_domain() && \
                                  xen_start_info->flags & SIF_INITDOMAIN)
+#elif defined(CONFIG_XEN)
+#define xen_initial_domain()   is_initial_xendomain()
  #else  /* !CONFIG_XEN_DOM0 */
  #define xen_initial_domain()   (0)
  #endif /* CONFIG_XEN_DOM0 */
diff --git a/include/xen/xen_proc.h b/include/xen/xen_proc.h

new file mode 100644 (file)

index 0000000..44af17c
--- /dev/null
+++ b/include/xen/xen_proc.h
@@ -0,0 +1,12 @@
+
+#ifndef __ASM_XEN_PROC_H__
+#define __ASM_XEN_PROC_H__
+
+#include <linux/proc_fs.h>
+
+extern struct proc_dir_entry *create_xen_proc_entry(
+       const char *name, mode_t mode);
+extern void remove_xen_proc_entry(
+       const char *name);
+
+#endif /* __ASM_XEN_PROC_H__ */
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h

index 0a7515c..a4ef9cd 100644 (file)
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -41,6 +41,8 @@
  #include <linux/completion.h>
  #include <linux/init.h>
  #include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/version.h>
  #include <xen/interface/xen.h>
  #include <xen/interface/grant_table.h>
  #include <xen/interface/io/xenbus.h>
@@ -57,8 +59,21 @@ struct xenbus_watch
         /* Callback (executed in a process context with no locks held). */
         void (*callback)(struct xenbus_watch *,
                          const char **vec, unsigned int len);
+
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+       /* See XBWF_ definitions below. */
+       unsigned long flags;
+#endif
  };
  
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/*
+ * Execute callback in its own kthread. Useful if the callback is long
+ * running or heavily serialised, to avoid taking out the main xenwatch thread
+ * for a long period of time (or even unwittingly causing a deadlock).
+ */
+#define XBWF_new_thread        1
+#endif
  
  /* A xenbus device. */
  struct xenbus_device {
@@ -92,6 +107,9 @@ struct xenbus_driver {
                                  enum xenbus_state backend_state);
         int (*remove)(struct xenbus_device *dev);
         int (*suspend)(struct xenbus_device *dev);
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+       int (*suspend_cancel)(struct xenbus_device *dev);
+#endif
         int (*resume)(struct xenbus_device *dev);
         int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
         struct device_driver driver;
@@ -99,10 +117,17 @@ struct xenbus_driver {
         int (*is_ready)(struct xenbus_device *dev);
  };
  
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
+# define XENBUS_DRIVER_SET_OWNER(mod) .driver.owner = mod,
+#else
+# define XENBUS_DRIVER_SET_OWNER(mod)
+#endif
+
  #define DEFINE_XENBUS_DRIVER(var, drvname, methods...)         \
  struct xenbus_driver var ## _driver = {                                \
         .driver.name = drvname + 0 ?: var ## _ids->devicetype,  \
-       .driver.owner = THIS_MODULE,                            \
+       .driver.mod_name = KBUILD_MODNAME,                      \
+       XENBUS_DRIVER_SET_OWNER(THIS_MODULE)                    \
         .ids = var ## _ids, ## methods                          \
  }
  
@@ -111,9 +136,8 @@ static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
         return container_of(drv, struct xenbus_driver, driver);
  }
  
-int __must_check xenbus_register_frontend(struct xenbus_driver *);
-int __must_check xenbus_register_backend(struct xenbus_driver *);
-
+int __must_check xenbus_register_frontend(struct xenbus_driver *drv);
+int __must_check xenbus_register_backend(struct xenbus_driver *drv);
  void xenbus_unregister_driver(struct xenbus_driver *drv);
  
  struct xenbus_transaction
@@ -153,7 +177,6 @@ int xenbus_printf(struct xenbus_transaction t,
  int xenbus_gather(struct xenbus_transaction t, const char *dir, ...);
  
  /* notifer routines for when the xenstore comes up */
-extern int xenstored_ready;
  int register_xenstore_notifier(struct notifier_block *nb);
  void unregister_xenstore_notifier(struct notifier_block *nb);
  
@@ -167,11 +190,11 @@ void xs_suspend_cancel(void);
  void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg);
  
  struct work_struct;
+void xenbus_probe(struct work_struct *);
  
  /* Prepare for domain suspend: then resume or cancel the suspend. */
  void xenbus_suspend(void);
  void xenbus_resume(void);
-void xenbus_probe(struct work_struct *);
  void xenbus_suspend_cancel(void);
  
  #define XENBUS_IS_ERR_READ(str) ({                     \
@@ -184,40 +207,135 @@ void xenbus_suspend_cancel(void);
  
  #define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE)
  
+
+/**
+ * Register a watch on the given path, using the given xenbus_watch structure
+ * for storage, and the given callback function as the callback.  Return 0 on
+ * success, or -errno on error.  On success, the given path will be saved as
+ * watch->node, and remains the caller's to free.  On error, watch->node will
+ * be NULL, the device will switch to XenbusStateClosing, and the error will
+ * be saved in the store.
+ */
  int xenbus_watch_path(struct xenbus_device *dev, const char *path,
                       struct xenbus_watch *watch,
                       void (*callback)(struct xenbus_watch *,
                                        const char **, unsigned int));
+
+
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+/**
+ * Register a watch on the given path/path2, using the given xenbus_watch
+ * structure for storage, and the given callback function as the callback.
+ * Return 0 on success, or -errno on error.  On success, the watched path
+ * (path/path2) will be saved as watch->node, and becomes the caller's to
+ * kfree().  On error, watch->node will be NULL, so the caller has nothing to
+ * free, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
+int xenbus_watch_path2(struct xenbus_device *dev, const char *path,
+                      const char *path2, struct xenbus_watch *watch,
+                      void (*callback)(struct xenbus_watch *,
+                                       const char **, unsigned int));
+#else
  __printf(4, 5)
  int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
                          void (*callback)(struct xenbus_watch *,
                                           const char **, unsigned int),
                          const char *pathfmt, ...);
+#endif
  
+/**
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error.  On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
  int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
+
+/**
+ * Grant access to the given ring_mfn to the peer of the given device.  Return
+ * 0 on success, or -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the error will be saved in the store.
+ */
  int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
+
+/**
+ * Map a page of memory into this domain from another domain's grant table.
+ * xenbus_map_ring_valloc allocates a page of virtual address space, maps the
+ * page to that address, and sets *vaddr to that address.
+ * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
+ * or -ENOMEM on error. If an error is returned, device will switch to
+ * XenbusStateClosing and the error message will be saved in XenStore.
+ */
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev,
+                                        grant_ref_t ref);
+#else
  int xenbus_map_ring_valloc(struct xenbus_device *dev,
-                          int gnt_ref, void **vaddr);
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
+                          grant_ref_t gnt_ref, void **vaddr);
+#endif
+int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t gnt_ref,
                            grant_handle_t *handle, void *vaddr);
  
+/**
+ * Unmap a page of memory in this domain that was imported from another domain
+ * and free the virtual address space.
+ * Returns 0 on success and returns GNTST_* on error
+ * (see xen/include/interface/grant_table.h).
+ */
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *);
+#else
  int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
+#endif
  int xenbus_unmap_ring(struct xenbus_device *dev,
                       grant_handle_t handle, void *vaddr);
  
+/**
+ * Allocate an event channel for the given xenbus_device, assigning the newly
+ * created local port to *port.  Return 0 on success, or -errno on error.  On
+ * error, the device will switch to XenbusStateClosing, and the error will be
+ * saved in the store.
+ */
  int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
-int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
+
+
+/**
+ * Free an existing event channel. Returns 0 on success or -errno on error.
+ */
  int xenbus_free_evtchn(struct xenbus_device *dev, int port);
  
+
+/**
+ * Return the state of the driver rooted at the given store path, or
+ * XenbusStateUnknown if no state can be read.
+ */
  enum xenbus_state xenbus_read_driver_state(const char *path);
  
-__printf(3, 4)
-void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...);
-__printf(3, 4)
-void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...);
+
+/***
+ * Report the given negative errno into the store, along with the given
+ * formatted message.
+ */
+void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
+                     ...) __printf(3, 4);
+
+/***
+ * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by
+ * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly
+ * closedown of this driver and its peer.
+ */
+void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
+                     ...) __printf(3, 4);
+
+#if defined(CONFIG_XEN) || defined(HAVE_XEN_PLATFORM_COMPAT_H)
+int xenbus_dev_init(void);
+#endif
  
  const char *xenbus_strstate(enum xenbus_state state);
  int xenbus_dev_is_online(struct xenbus_device *dev);
  int xenbus_frontend_closed(struct xenbus_device *dev);
  
+int xenbus_for_each_backend(void *arg, int (*fn)(struct device *, void *));
+int xenbus_for_each_frontend(void *arg, int (*fn)(struct device *, void *));
+
  #endif /* _XEN_XENBUS_H */
diff --git a/include/xen/xencons.h b/include/xen/xencons.h

new file mode 100644 (file)

index 0000000..f021516
--- /dev/null
+++ b/include/xen/xencons.h
@@ -0,0 +1,12 @@
+#ifndef __ASM_XENCONS_H__
+#define __ASM_XENCONS_H__
+
+int xprintk(const char *, ...) __attribute__ ((__format__(__printf__, 1, 2)));
+
+struct dom0_vga_console_info;
+void dom0_init_screen_info(const struct dom0_vga_console_info *, size_t);
+
+void xencons_force_flush(void);
+void xencons_resume(void);
+
+#endif /* __ASM_XENCONS_H__ */
diff --git a/include/xen/xenoprof.h b/include/xen/xenoprof.h

new file mode 100644 (file)

index 0000000..4c3ab0f
--- /dev/null
+++ b/include/xen/xenoprof.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+ * xen/xenoprof.h
+ *
+ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
+ *                    VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#ifndef __XEN_XENOPROF_H__
+#define __XEN_XENOPROF_H__
+#ifdef CONFIG_XEN
+
+#include <asm/xenoprof.h>
+
+struct oprofile_operations;
+int xenoprofile_init(struct oprofile_operations * ops);
+void xenoprofile_exit(void);
+
+struct xenoprof_shared_buffer {
+       char                                    *buffer;
+       struct xenoprof_arch_shared_buffer      arch;
+};
+#else
+#define xenoprofile_init(ops)  (-ENOSYS)
+#define xenoprofile_exit()     do { } while (0)
+
+#endif /* CONFIG_XEN */
+#endif /* __XEN_XENOPROF_H__ */
diff --git a/init/Kconfig b/init/Kconfig

index 6cfd71d..06860e8 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1,3 +1,41 @@
+config SUSE_KERNEL
+       def_bool y
+
+config ENTERPRISE_SUPPORT
+       bool "Enable enterprise support facility"
+       depends on SUSE_KERNEL
+       help
+         This feature enables the handling of the "supported" module flag.
+         This flag can be used to report unsupported module loads or even
+         refuse them entirely. It is useful when ensuring that the kernel
+         remains in a state that Novell Technical Services, or its
+         technical partners, is prepared to support.
+
+         Modules in the list of supported modules will be marked supported
+         on build. The default enforcement mode is to report, but not
+         deny, loading of unsupported modules.
+
+         If you aren't building a kernel for an enterprise distribution,
+         say n.
+
+config SPLIT_PACKAGE
+       bool "Split the kernel package into multiple RPMs"
+       depends on SUSE_KERNEL && MODULES
+       help
+         This is an option used by the kernel packaging infrastructure
+         to split kernel modules into different packages. It isn't used
+         by the kernel itself, but allows the packager to make
+         decisions on a per-config basis.
+
+         If you aren't packaging a kernel for distribution, it's safe to
+         say n.
+
+config KERNEL_DESKTOP
+       bool "Kernel to suit desktop workloads"
+       help
+         This is an option used to tune kernel parameters to better suit
+         desktop workloads.
+
  config ARCH
         string
         option env="ARCH"
@@ -579,6 +617,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK
  menuconfig CGROUPS
         boolean "Control Group support"
         depends on EVENTFD
+       default !KERNEL_DESKTOP
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
@@ -718,7 +757,7 @@ config CGROUP_PERF
  
  menuconfig CGROUP_SCHED
         bool "Group CPU scheduler"
-       default n
+       default !KERNEL_DESKTOP
         help
           This feature lets CPU scheduler recognize task groups and control CPU
           bandwidth allocation to such task groups. It uses cgroups to group
diff --git a/init/main.c b/init/main.c

index 44b2433..a45ad04 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -48,6 +48,7 @@
  #include <linux/rmap.h>
  #include <linux/mempolicy.h>
  #include <linux/key.h>
+#include <linux/unwind.h>
  #include <linux/buffer_head.h>
  #include <linux/page_cgroup.h>
  #include <linux/debug_locks.h>
@@ -471,6 +472,7 @@ asmlinkage void __init start_kernel(void)
          * Need to run as early as possible, to initialize the
          * lockdep hash:
          */
+       unwind_init();
         lockdep_init();
         smp_setup_processor_id();
         debug_objects_early_init();
@@ -497,6 +499,7 @@ asmlinkage void __init start_kernel(void)
         mm_init_owner(&init_mm, &init_task);
         mm_init_cpumask(&init_mm);
         setup_command_line(command_line);
+       unwind_setup();
         setup_nr_cpu_ids();
         setup_per_cpu_areas();
         smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz

index 94fabd5..98bf3bc 100644 (file)
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,6 +4,7 @@
  
  choice
         prompt "Timer frequency"
+       default HZ_1000 if KERNEL_DESKTOP
         default HZ_250
         help
          Allows the configuration of the timer frequency. It is customary
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt

index 3f9c974..2b92ad6 100644 (file)
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,6 +1,7 @@
  
  choice
         prompt "Preemption Model"
+       default PREEMPT if KERNEL_DESKTOP
         default PREEMPT_NONE
  
  config PREEMPT_NONE
@@ -35,6 +36,7 @@ config PREEMPT_VOLUNTARY
  
  config PREEMPT
         bool "Preemptible Kernel (Low-Latency Desktop)"
+       depends on !XEN
         select PREEMPT_COUNT
         select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
         help
diff --git a/kernel/Makefile b/kernel/Makefile

index cb41b95..589e844 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
  obj-$(CONFIG_UID16) += uid16.o
  obj-$(CONFIG_MODULES) += module.o
  obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
  obj-$(CONFIG_KEXEC) += kexec.o
  obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c

index 3914c1e..7707fb1 100644 (file)
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -89,6 +89,7 @@ int irq_set_handler_data(unsigned int irq, void *data)
  }
  EXPORT_SYMBOL(irq_set_handler_data);
  
+#ifndef CONFIG_XEN
  /**
   *     irq_set_msi_desc - set MSI descriptor data for an irq
   *     @irq:   Interrupt number
@@ -109,6 +110,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
         irq_put_desc_unlock(desc, flags);
         return 0;
  }
+#endif
  
  /**
   *     irq_set_chip_data - set irq chip data for an irq
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c

index 611cd60..4c8ce39 100644 (file)
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -290,7 +290,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
                  */
                 if (time_after(jiffies, desc->last_unhandled + HZ/10))
                         desc->irqs_unhandled = 1;
-               else
+               else if (!irq_ignore_unhandled(irq))
                         desc->irqs_unhandled++;
                 desc->last_unhandled = jiffies;
         }
diff --git a/kernel/kexec.c b/kernel/kexec.c

index 4e2e472..dc1c426 100644 (file)
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -39,12 +39,18 @@
  #include <asm/io.h>
  #include <asm/sections.h>
  
+#ifndef CONFIG_XEN
  /* Per cpu memory for storing cpu states in case of system crash. */
  note_buf_t __percpu *crash_notes;
+#endif
  
  /* vmcoreinfo stuff */
  static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+u32
+#if defined(CONFIG_XEN) && defined(CONFIG_X86)
+__page_aligned_bss
+#endif
+vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  size_t vmcoreinfo_size;
  size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  
@@ -355,13 +361,26 @@ static int kimage_is_destination_range(struct kimage *image,
         return 0;
  }
  
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order, unsigned long limit)
  {
         struct page *pages;
  
         pages = alloc_pages(gfp_mask, order);
         if (pages) {
                 unsigned int count, i;
+#ifdef CONFIG_XEN
+               int address_bits;
+
+               if (limit == ~0UL)
+                       address_bits = BITS_PER_LONG;
+               else
+                       address_bits = ilog2(limit);
+
+               if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
+                       __free_pages(pages, order);
+                       return NULL;
+               }
+#endif
                 pages->mapping = NULL;
                 set_page_private(pages, order);
                 count = 1 << order;
@@ -425,10 +444,10 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
         do {
                 unsigned long pfn, epfn, addr, eaddr;
  
-               pages = kimage_alloc_pages(GFP_KERNEL, order);
+               pages = kimage_alloc_pages(GFP_KERNEL, order, KEXEC_CONTROL_MEMORY_LIMIT);
                 if (!pages)
                         break;
-               pfn   = page_to_pfn(pages);
+               pfn   = kexec_page_to_pfn(pages);
                 epfn  = pfn + count;
                 addr  = pfn << PAGE_SHIFT;
                 eaddr = epfn << PAGE_SHIFT;
@@ -462,6 +481,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
         return pages;
  }
  
+#ifndef CONFIG_XEN
  static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                                                       unsigned int order)
  {
@@ -515,7 +535,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
                 }
                 /* If I don't overlap any segments I have found my hole! */
                 if (i == image->nr_segments) {
-                       pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                       pages = kexec_pfn_to_page(hole_start >> PAGE_SHIFT);
                         break;
                 }
         }
@@ -542,6 +562,13 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
  
         return pages;
  }
+#else /* !CONFIG_XEN */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                        unsigned int order)
+{
+       return kimage_alloc_normal_control_pages(image, order);
+}
+#endif
  
  static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
  {
@@ -557,7 +584,7 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
                         return -ENOMEM;
  
                 ind_page = page_address(page);
-               *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+               *image->entry = kexec_virt_to_phys(ind_page) | IND_INDIRECTION;
                 image->entry = ind_page;
                 image->last_entry = ind_page +
                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
@@ -616,13 +643,13 @@ static void kimage_terminate(struct kimage *image)
  #define for_each_kimage_entry(image, ptr, entry) \
         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
                 ptr = (entry & IND_INDIRECTION)? \
-                       phys_to_virt((entry & PAGE_MASK)): ptr +1)
+                       kexec_phys_to_virt((entry & PAGE_MASK)): ptr +1)
  
  static void kimage_free_entry(kimage_entry_t entry)
  {
         struct page *page;
  
-       page = pfn_to_page(entry >> PAGE_SHIFT);
+       page = kexec_pfn_to_page(entry >> PAGE_SHIFT);
         kimage_free_pages(page);
  }
  
@@ -634,6 +661,10 @@ static void kimage_free(struct kimage *image)
         if (!image)
                 return;
  
+#ifdef CONFIG_XEN
+       xen_machine_kexec_unload(image);
+#endif
+
         kimage_free_extra_pages(image);
         for_each_kimage_entry(image, ptr, entry) {
                 if (entry & IND_INDIRECTION) {
@@ -709,7 +740,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
          * have a match.
          */
         list_for_each_entry(page, &image->dest_pages, lru) {
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
                 if (addr == destination) {
                         list_del(&page->lru);
                         return page;
@@ -720,16 +751,16 @@ static struct page *kimage_alloc_page(struct kimage *image,
                 kimage_entry_t *old;
  
                 /* Allocate a page, if we run out of memory give up */
-               page = kimage_alloc_pages(gfp_mask, 0);
+               page = kimage_alloc_pages(gfp_mask, 0, KEXEC_SOURCE_MEMORY_LIMIT);
                 if (!page)
                         return NULL;
                 /* If the page cannot be used file it away */
-               if (page_to_pfn(page) >
+               if (kexec_page_to_pfn(page) >
                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
                         list_add(&page->lru, &image->unuseable_pages);
                         continue;
                 }
-               addr = page_to_pfn(page) << PAGE_SHIFT;
+               addr = kexec_page_to_pfn(page) << PAGE_SHIFT;
  
                 /* If it is the destination page we want use it */
                 if (addr == destination)
@@ -752,7 +783,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
                         struct page *old_page;
  
                         old_addr = *old & PAGE_MASK;
-                       old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                       old_page = kexec_pfn_to_page(old_addr >> PAGE_SHIFT);
                         copy_highpage(page, old_page);
                         *old = addr | (*old & ~PAGE_MASK);
  
@@ -808,7 +839,7 @@ static int kimage_load_normal_segment(struct kimage *image,
                         result  = -ENOMEM;
                         goto out;
                 }
-               result = kimage_add_page(image, page_to_pfn(page)
+               result = kimage_add_page(image, kexec_page_to_pfn(page)
                                                                 << PAGE_SHIFT);
                 if (result < 0)
                         goto out;
@@ -840,6 +871,7 @@ out:
         return result;
  }
  
+#ifndef CONFIG_XEN
  static int kimage_load_crash_segment(struct kimage *image,
                                         struct kexec_segment *segment)
  {
@@ -862,7 +894,7 @@ static int kimage_load_crash_segment(struct kimage *image,
                 char *ptr;
                 size_t uchunk, mchunk;
  
-               page = pfn_to_page(maddr >> PAGE_SHIFT);
+               page = kexec_pfn_to_page(maddr >> PAGE_SHIFT);
                 if (!page) {
                         result  = -ENOMEM;
                         goto out;
@@ -911,6 +943,13 @@ static int kimage_load_segment(struct kimage *image,
  
         return result;
  }
+#else /* CONFIG_XEN */
+static int kimage_load_segment(struct kimage *image,
+                               struct kexec_segment *segment)
+{
+       return kimage_load_normal_segment(image, segment);
+}
+#endif
  
  /*
   * Exec Kernel system call: for obvious reasons only root may call it.
@@ -1017,6 +1056,13 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
                 if (flags & KEXEC_ON_CRASH)
                         crash_unmap_reserved_pages();
         }
+#ifdef CONFIG_XEN
+       if (image) {
+               result = xen_machine_kexec_load(image);
+               if (result)
+                       goto out;
+       }
+#endif
         /* Install the new kernel, and  Uninstall the old */
         image = xchg(dest_image, image);
  
@@ -1111,6 +1157,7 @@ size_t crash_get_memory_size(void)
         return size;
  }
  
+#ifndef CONFIG_XEN
  void __weak crash_free_reserved_phys_range(unsigned long begin,
                                            unsigned long end)
  {
@@ -1174,6 +1221,7 @@ unlock:
         mutex_unlock(&kexec_mutex);
         return ret;
  }
+#endif /* !CONFIG_XEN */
  
  static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
                             size_t data_len)
@@ -1203,6 +1251,7 @@ static void final_note(u32 *buf)
         memcpy(buf, &note, sizeof(note));
  }
  
+#ifndef CONFIG_XEN
  void crash_save_cpu(struct pt_regs *regs, int cpu)
  {
         struct elf_prstatus prstatus;
@@ -1228,9 +1277,11 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
                               &prstatus, sizeof(prstatus));
         final_note(buf);
  }
+#endif
  
  static int __init crash_notes_memory_init(void)
  {
+#ifndef CONFIG_XEN
         /* Allocate memory for saving cpu registers. */
         crash_notes = alloc_percpu(note_buf_t);
         if (!crash_notes) {
@@ -1238,11 +1289,13 @@ static int __init crash_notes_memory_init(void)
                 " states failed\n");
                 return -ENOMEM;
         }
+#endif
         return 0;
  }
  module_init(crash_notes_memory_init)
  
  
+#ifndef CONFIG_XEN
  /*
   * parsing the "crashkernel" commandline
   *
@@ -1409,7 +1462,7 @@ int __init parse_crashkernel(char                  *cmdline,
  
         return 0;
  }
-
+#endif
  
  static void update_vmcoreinfo_note(void)
  {
@@ -1466,7 +1519,18 @@ static int __init crash_save_vmcoreinfo_init(void)
         VMCOREINFO_SYMBOL(init_uts_ns);
         VMCOREINFO_SYMBOL(node_online_map);
  #ifdef CONFIG_MMU
+# ifndef CONFIG_X86_XEN
+       VMCOREINFO_SYMBOL(swapper_pg_dir);
+# else
+/*
+ * Since for x86-32 Xen swapper_pg_dir is a pointer rather than an array,
+ * make the value stored consistent with native (i.e. the base address of
+ * the page directory).
+ */
+#  define swapper_pg_dir *swapper_pg_dir
         VMCOREINFO_SYMBOL(swapper_pg_dir);
+#  undef swapper_pg_dir
+# endif
  #endif
         VMCOREINFO_SYMBOL(_stext);
         VMCOREINFO_SYMBOL(vmlist);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c

index 4e316e1..cace878 100644 (file)
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -107,6 +107,7 @@ static ssize_t kexec_crash_size_show(struct kobject *kobj,
  {
         return sprintf(buf, "%zu\n", crash_get_memory_size());
  }
+#ifndef CONFIG_XEN
  static ssize_t kexec_crash_size_store(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    const char *buf, size_t count)
@@ -121,6 +122,9 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
         return ret < 0 ? ret : count;
  }
  KERNEL_ATTR_RW(kexec_crash_size);
+#else
+KERNEL_ATTR_RO(kexec_crash_size);
+#endif
  
  static ssize_t vmcoreinfo_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
@@ -167,6 +171,30 @@ static struct bin_attribute notes_attr = {
  struct kobject *kernel_kobj;
  EXPORT_SYMBOL_GPL(kernel_kobj);
  
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+/*
+ * supported_printable - describe the support status implied by a taint mask
+ * @taint: bitmask of taint flags, one bit per TAINT_* number
+ *
+ * The TAINT_* constants are bit numbers (other code shifts them with
+ * "1 << TAINT_..." or hands them to set_bit()), so each one has to be
+ * turned into a mask before it is tested against @taint.  Testing
+ * "taint & TAINT_..." directly would look at unrelated bits.
+ *
+ * Returns a pointer to a constant string; nothing is allocated.
+ */
+const char *supported_printable(int taint)
+{
+       const unsigned int proprietary = 1U << TAINT_PROPRIETARY_MODULE;
+       const unsigned int no_support = 1U << TAINT_NO_SUPPORT;
+       const unsigned int external = 1U << TAINT_EXTERNAL_SUPPORT;
+
+       /* Most specific verdict first: both "No" reasons apply. */
+       if ((taint & proprietary) && (taint & no_support))
+               return "No, Proprietary and Unsupported modules are loaded";
+       if (taint & proprietary)
+               return "No, Proprietary modules are loaded";
+       if (taint & no_support)
+               return "No, Unsupported modules are loaded";
+       if (taint & external)
+               return "Yes, External";
+       return "Yes";
+}
+
+/*
+ * Show handler for the "supported" kernel attribute: writes the support
+ * status for the current global taint mask, followed by a newline, to @buf.
+ */
+static ssize_t supported_show(struct kobject *kobj,
+                             struct kobj_attribute *attr, char *buf)
+{
+       const char *status = supported_printable(get_taint());
+
+       return sprintf(buf, "%s\n", status);
+}
+KERNEL_ATTR_RO(supported);
+#endif
+
  static struct attribute * kernel_attrs[] = {
         &fscaps_attr.attr,
  #if defined(CONFIG_HOTPLUG)
@@ -182,6 +210,9 @@ static struct attribute * kernel_attrs[] = {
         &kexec_crash_size_attr.attr,
         &vmcoreinfo_attr.attr,
  #endif
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       &supported_attr.attr,
+#endif
         NULL
  };
  
diff --git a/kernel/module.c b/kernel/module.c

index 78ac6ec..cda8912 100644 (file)
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -44,6 +44,7 @@
  #include <linux/device.h>
  #include <linux/string.h>
  #include <linux/mutex.h>
+#include <linux/unwind.h>
  #include <linux/rculist.h>
  #include <asm/uaccess.h>
  #include <asm/cacheflush.h>
@@ -89,6 +90,22 @@
  /* If this is set, the section belongs in the init part of the module */
  #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
  
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+/* Allow unsupported modules switch. */
+#ifdef UNSUPPORTED_MODULES
+int unsupported = UNSUPPORTED_MODULES;
+#else
+int unsupported = 2;  /* don't warn when loading unsupported modules. */
+#endif
+
+/*
+ * Handler for the "unsupported=" boot parameter: parses an integer from
+ * @str into the global 'unsupported'.  The result of get_option() is not
+ * checked, so 'unsupported' keeps its previous value if nothing was parsed
+ * (NOTE(review): presumably the intended fallback -- confirm).
+ */
+static int __init unsupported_setup(char *str)
+{
+       get_option(&str, &unsupported);
+       return 1;
+}
+__setup("unsupported=", unsupported_setup);
+#endif
+
  /*
   * Mutex protects:
   * 1) List of modules (also safely readable with preempt_disable),
@@ -137,7 +154,7 @@ struct load_info {
         struct _ddebug *debug;
         unsigned int num_debug;
         struct {
-               unsigned int sym, str, mod, vers, info, pcpu;
+               unsigned int sym, str, mod, vers, info, pcpu, unwind;
         } index;
  };
  
@@ -533,6 +550,27 @@ bool is_module_percpu_address(unsigned long addr)
  
  #endif /* CONFIG_SMP */
  
+/*
+ * Look up the architecture's unwind section, if the architecture names one.
+ *
+ * A section that is found gets SHF_ALLOC added to its flags.  Returns the
+ * section index, or 0 when ARCH_UNWIND_SECTION_NAME is not defined or
+ * find_sec() returned 0.
+ */
+static unsigned int find_unwind(struct load_info *info)
+{
+#ifdef ARCH_UNWIND_SECTION_NAME
+       int idx = find_sec(info, ARCH_UNWIND_SECTION_NAME);
+
+       if (idx != 0) {
+               info->sechdrs[idx].sh_flags |= SHF_ALLOC;
+               return idx;
+       }
+#endif
+       return 0;
+}
+
+/*
+ * Hand the section whose index is stored in info->index.unwind to
+ * unwind_add_table() and keep the returned handle in mod->unwind_info.
+ */
+static void add_unwind_table(struct module *mod, struct load_info *info)
+{
+       int sec = info->index.unwind;
+       void *table = (void *)info->sechdrs[sec].sh_addr;
+
+       /*
+        * No special case for "no unwind info": the index is then 0 and
+        * the size of section 0 is 0.
+        */
+       mod->unwind_info = unwind_add_table(mod, table,
+                                           info->sechdrs[sec].sh_size);
+}
+
  #define MODINFO_ATTR(field)    \
  static void setup_modinfo_##field(struct module *mod, const char *s)  \
  {                                                                     \
@@ -985,6 +1023,12 @@ static size_t module_flags_taint(struct module *mod, char *buf)
                 buf[l++] = 'F';
         if (mod->taints & (1 << TAINT_CRAP))
                 buf[l++] = 'C';
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       if (mod->taints & (1 << TAINT_NO_SUPPORT))
+               buf[l++] = 'N';
+       if (mod->taints & (1 << TAINT_EXTERNAL_SUPPORT))
+               buf[l++] = 'X';
+#endif
         /*
          * TAINT_FORCED_RMMOD: could be added.
          * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -1060,6 +1104,33 @@ static ssize_t show_taint(struct module_attribute *mattr,
  static struct module_attribute modinfo_taint =
         __ATTR(taint, 0444, show_taint, NULL);
  
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+/*
+ * Translate a module's "supported" modinfo value @s into a taint bit:
+ *   "yes"      - no taint bit is added
+ *   "external" - TAINT_EXTERNAL_SUPPORT
+ *   otherwise  - TAINT_NO_SUPPORT (this includes a missing value, @s == NULL)
+ */
+static void setup_modinfo_supported(struct module *mod, const char *s)
+{
+       if (s && !strcmp(s, "yes"))
+               return;
+
+       if (s && !strcmp(s, "external"))
+               mod->taints |= (1 << TAINT_EXTERNAL_SUPPORT);
+       else
+               mod->taints |= (1 << TAINT_NO_SUPPORT);
+}
+
+/*
+ * Show handler for a module's "supported" attribute: writes the support
+ * status implied by that module's own taint bits, plus a newline.
+ */
+static ssize_t show_modinfo_supported(struct module_attribute *mattr,
+                                     struct module_kobject *mk, char *buffer)
+{
+       const char *status = supported_printable(mk->mod->taints);
+
+       return sprintf(buffer, "%s\n", status);
+}
+
+static struct module_attribute modinfo_supported = {
+       .attr = { .name = "supported", .mode = 0444 },
+       .show = show_modinfo_supported,
+       .setup = setup_modinfo_supported,
+};
+#endif
+
  static struct module_attribute *modinfo_attrs[] = {
         &module_uevent,
         &modinfo_version,
@@ -1068,6 +1139,9 @@ static struct module_attribute *modinfo_attrs[] = {
         &modinfo_coresize,
         &modinfo_initsize,
         &modinfo_taint,
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       &modinfo_supported,
+#endif
  #ifdef CONFIG_MODULE_UNLOAD
         &modinfo_refcnt,
  #endif
@@ -1609,9 +1683,36 @@ static int mod_sysfs_setup(struct module *mod,
         add_sect_attrs(mod, info);
         add_notes_attrs(mod, info);
  
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       /* We don't use add_taint() here because it also disables lockdep. */
+       if (mod->taints & (1 << TAINT_EXTERNAL_SUPPORT))
+               add_nonfatal_taint(TAINT_EXTERNAL_SUPPORT);
+       else if (mod->taints == (1 << TAINT_NO_SUPPORT)) {
+               if (unsupported == 0) {
+                       printk(KERN_WARNING "%s: module not supported by "
+                              "Novell, refusing to load. To override, echo "
+                              "1 > /proc/sys/kernel/unsupported\n", mod->name);
+                       err = -ENOEXEC;
+                       goto out_remove_attrs;
+               }
+               add_nonfatal_taint(TAINT_NO_SUPPORT);
+               if (unsupported == 1) {
+                       printk(KERN_WARNING "%s: module is not supported by "
+                              "Novell. Novell Technical Services may decline "
+                              "your support request if it involves a kernel "
+                              "fault.\n", mod->name);
+               }
+       }
+#endif
+
         kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
         return 0;
  
+out_remove_attrs:
+       remove_notes_attrs(mod);
+       remove_sect_attrs(mod);
+       del_usage_links(mod);
+       module_remove_modinfo_attrs(mod);
  out_unreg_param:
         module_param_sysfs_remove(mod);
  out_unreg_holders:
@@ -1809,6 +1910,8 @@ static void free_module(struct module *mod)
         /* Remove dynamic debug info */
         ddebug_remove_module(mod->name);
  
+       unwind_remove_table(mod->unwind_info, 0);
+
         /* Arch-specific cleanup. */
         module_arch_cleanup(mod);
  
@@ -2533,6 +2636,8 @@ static struct module *setup_load_info(struct load_info *info)
  
         info->index.pcpu = find_pcpusec(info);
  
+       info->index.unwind = find_unwind(info);
+
         /* Check module struct version now, before we try to use module. */
         if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
                 return ERR_PTR(-ENOEXEC);
@@ -2962,6 +3067,9 @@ static struct module *load_module(void __user *umod,
         if (err < 0)
                 goto unlink;
  
+       /* Initialize unwind table */
+       add_unwind_table(mod, &info);
+
         /* Get rid of temporary copy. */
         free_copy(&info);
  
@@ -3074,6 +3182,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
         /* Drop initial reference. */
         module_put(mod);
         trim_init_extable(mod);
+       unwind_remove_table(mod->unwind_info, 1);
  #ifdef CONFIG_KALLSYMS
         mod->num_symtab = mod->core_num_syms;
         mod->symtab = mod->core_symtab;
@@ -3522,6 +3631,9 @@ void print_modules(void)
         if (last_unloaded_module[0])
                 printk(" [last unloaded: %s]", last_unloaded_module);
         printk("\n");
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       printk("Supported: %s\n", supported_printable(get_taint()));
+#endif
  }
  
  #ifdef CONFIG_MODVERSIONS
diff --git a/kernel/panic.c b/kernel/panic.c

index 8ed89a1..ca9baa9 100644 (file)
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -198,6 +198,10 @@ static const struct tnt tnts[] = {
         { TAINT_CRAP,                   'C', ' ' },
         { TAINT_FIRMWARE_WORKAROUND,    'I', ' ' },
         { TAINT_OOT_MODULE,             'O', ' ' },
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+       { TAINT_NO_SUPPORT,             'N', ' ' },
+       { TAINT_EXTERNAL_SUPPORT,       'X', ' ' },
+#endif
  };
  
  /**
@@ -216,6 +220,8 @@ static const struct tnt tnts[] = {
   *  'C' - modules from drivers/staging are loaded.
   *  'I' - Working around severe firmware bug.
   *  'O' - Out-of-tree module has been loaded.
+ *  'N' - Unsupported modules loaded.
+ *  'X' - Modules with external support loaded.
   *
   *     The string is overwritten by the next call to print_tainted().
   */
@@ -251,6 +257,11 @@ unsigned long get_taint(void)
         return tainted_mask;
  }
  
+/*
+ * Set taint bit number @flag in tainted_mask and do nothing else.
+ * @flag is a bit index (a TAINT_* value), not a mask.
+ */
+void add_nonfatal_taint(unsigned flag)
+{
+       set_bit(flag, &tainted_mask);
+}
+
  void add_taint(unsigned flag)
  {
         /*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig

index deb5461..ca97aa1 100644 (file)
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -139,7 +139,7 @@ config PM_ADVANCED_DEBUG
  
  config PM_TEST_SUSPEND
         bool "Test suspend/resume and wakealarm during bootup"
-       depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
+       depends on SUSPEND && PM_DEBUG && RTC_CLASS=y && !XEN_UNPRIVILEGED_GUEST
         ---help---
         This option will let you suspend your machine during bootup, and
         make it wake up a few seconds later using an RTC wakeup alarm.
@@ -170,7 +170,7 @@ config PM_TRACE
  config PM_TRACE_RTC
         bool "Suspend/resume event tracing"
         depends on CAN_PM_TRACE
-       depends on X86
+       depends on X86 && !XEN_UNPRIVILEGED_GUEST
         select PM_TRACE
         ---help---
         This enables some cheesy code to save the last PM event point in the
diff --git a/kernel/printk.c b/kernel/printk.c

index b663c2c..c3a8b77 100644 (file)
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -41,6 +41,8 @@
  #include <linux/cpu.h>
  #include <linux/notifier.h>
  #include <linux/rculist.h>
+#include <linux/jhash.h>
+#include <linux/device.h>
  
  #include <asm/uaccess.h>
  
@@ -491,7 +493,7 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
         return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
  }
  
-#ifdef CONFIG_KGDB_KDB
+#if defined(CONFIG_KGDB_KDB) || defined(CONFIG_DEBUG_KERNEL)
  /* kdb dmesg command needs access to the syslog buffer.  do_syslog()
   * uses locks so it cannot be used during debugging.  Just tell kdb
   * where the start and end of the physical and logical logs are.  This
@@ -1805,3 +1807,46 @@ void kmsg_dump(enum kmsg_dump_reason reason)
         rcu_read_unlock();
  }
  #endif
+
+#if defined CONFIG_PRINTK && defined CONFIG_KMSG_IDS
+
+/**
+ * printk_hash - print a kernel message prefixed with a hash of its format
+ * @prefix: message prefix including the ".%06x" for the hash
+ * @fmt: format string
+ *
+ * @prefix is itself used as a printk() format string and is given exactly
+ * one argument: the low 24 bits of jhash() computed over the @fmt string
+ * (the format, not the expanded message).  The message proper is emitted
+ * by a separate vprintk() call.
+ *
+ * Returns the sum of the return values of the two print calls.
+ */
+asmlinkage int printk_hash(const char *prefix, const char *fmt, ...)
+{
+       va_list args;
+       int r;
+
+       /* 0xffffff keeps 24 bits, matching the six hex digits of "%06x". */
+       r = printk(prefix, jhash(fmt, strlen(fmt), 0) & 0xffffff);
+       va_start(args, fmt);
+       r += vprintk(fmt, args);
+       va_end(args);
+
+       return r;
+}
+EXPORT_SYMBOL(printk_hash);
+
+/**
+ * printk_dev_hash - print a device message prefixed with a hash of its format
+ * @prefix: message prefix; used as a printk() format string that consumes
+ *          @driver_name first and then the hash (the ".%06x" part)
+ * @driver_name: string passed as the first argument for @prefix
+ *               (presumably the driver of the device the message is about)
+ * @fmt: format string
+ *
+ * The hash is the low 24 bits of jhash() computed over the @fmt string
+ * (the format, not the expanded message).  The message proper is emitted
+ * by a separate vprintk() call.
+ *
+ * Returns the sum of the return values of the two print calls.
+ */
+asmlinkage int printk_dev_hash(const char *prefix, const char *driver_name,
+                              const char *fmt, ...)
+{
+       va_list args;
+       int r;
+
+       /* 0xffffff keeps 24 bits, matching the six hex digits of "%06x". */
+       r = printk(prefix, driver_name, jhash(fmt, strlen(fmt), 0) & 0xffffff);
+       va_start(args, fmt);
+       r += vprintk(fmt, args);
+       va_end(args);
+
+       return r;
+}
+EXPORT_SYMBOL(printk_dev_hash);
+#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index e5212ae..9a7fe31 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2656,6 +2656,48 @@ static inline void task_group_account_field(struct task_struct *p, int index,
  }
  
  
+#if !defined(CONFIG_XEN) || defined(CONFIG_VIRT_CPU_ACCOUNTING)
+# define cputime_to_u64(t) ((__force u64)(t))
+#else
+# include <linux/syscore_ops.h>
+# define NS_PER_TICK (1000000000 / HZ)
+
+static DEFINE_PER_CPU(u64, steal_snapshot);
+static DEFINE_PER_CPU(unsigned int, steal_residual);
+
+/*
+ * Variant of cputime_to_u64() used when CONFIG_XEN is set and
+ * CONFIG_VIRT_CPU_ACCOUNTING is not.
+ *
+ * Converts @t to u64 after subtracting the whole ticks of
+ * RUNSTATE_runnable time that accumulated on this vCPU since the previous
+ * call.  Side effects: steal_snapshot is set to the current runnable time
+ * and steal_residual to the sub-tick remainder of the division.
+ *
+ * Returns 0 rather than wrapping when the adjustment is larger than @t.
+ *
+ * NOTE(review): the __this_cpu_*() / __get_cpu_var() accessors suggest
+ * callers run with preemption disabled -- confirm.  The runstate time is
+ * presumably in nanoseconds, given the division by NS_PER_TICK -- confirm.
+ */
+static u64 cputime_to_u64(cputime_t t)
+{
+       u64 s = this_vcpu_read(runstate.time[RUNSTATE_runnable]);
+       /*
+        * New runnable time plus the remainder left over from the previous
+        * call, split into whole ticks (adj) and a new remainder that is
+        * written back to steal_residual.
+        */
+       unsigned long adj = div_u64_rem(s - __this_cpu_read(steal_snapshot)
+                                         + __this_cpu_read(steal_residual),
+                                       NS_PER_TICK,
+                                       &__get_cpu_var(steal_residual));
+
+       /* The next call measures from this point. */
+       __this_cpu_write(steal_snapshot, s);
+       /* Clamp: never report a negative (wrapped) amount. */
+       if (t < jiffies_to_cputime(adj))
+               return 0;
+
+       return (__force u64)(t - jiffies_to_cputime(adj));
+}
+
+/*
+ * Resume hook: calls cputime_to_u64() only for its side effects and
+ * discards the result, so steal_snapshot and steal_residual are brought up
+ * to date without charging the elapsed runnable time to any cpustat field.
+ *
+ * The argument is a cputime_t with every bit except the top one set
+ * (2^(width - 1) - 1, where width is the size of cputime_t in bits).
+ * NOTE(review): presumably chosen large enough that the "t < adj" clamp in
+ * cputime_to_u64() is never taken -- confirm.
+ */
+static void steal_resume(void)
+{
+       cputime_to_u64(((cputime_t)1 << (BITS_PER_LONG * sizeof(cputime_t)
+                                        / sizeof(long) - 1)) - 1);
+}
+
+static struct syscore_ops steal_syscore_ops = {
+       .resume = steal_resume,
+};
+
+static int __init steal_register(void)
+{
+       register_syscore_ops(&steal_syscore_ops);
+       return 0;
+}
+core_initcall(steal_register);
+#endif
+
  /*
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
@@ -2675,7 +2717,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
         index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
  
         /* Add user time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
         /* Account for user time used */
         acct_update_integrals(p);
@@ -2725,7 +2767,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
         account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
-       task_group_account_field(p, index, (__force u64) cputime);
+       task_group_account_field(p, index, cputime_to_u64(cputime));
  
         /* Account for system time used */
         acct_update_integrals(p);
@@ -2779,9 +2821,9 @@ void account_idle_time(cputime_t cputime)
         struct rq *rq = this_rq();
  
         if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+               cpustat[CPUTIME_IOWAIT] += cputime_to_u64(cputime);
         else
-               cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+               cpustat[CPUTIME_IDLE] += cputime_to_u64(cputime);
  }
  
  static __always_inline bool steal_account_process_tick(void)
@@ -2837,9 +2879,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                 return;
  
         if (irqtime_account_hi_update()) {
-               cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_IRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (irqtime_account_si_update()) {
-               cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+               cpustat[CPUTIME_SOFTIRQ] += cputime_to_u64(cputime_one_jiffy);
         } else if (this_cpu_ksoftirqd() == p) {
                 /*
                  * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -3285,6 +3327,11 @@ void __sched schedule_preempt_disabled(void)
  }
  
  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+#include <asm/mutex.h>
+
+#ifndef arch_cpu_is_running
+#define arch_cpu_is_running(cpu) true
+#endif
  
  static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
  {
@@ -3299,7 +3346,8 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
          */
         barrier();
  
-       return owner->on_cpu;
+       return owner->on_cpu
+              && arch_cpu_is_running(task_thread_info(owner)->cpu);
  }
  
  /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 4ab1187..f49a10f 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -660,6 +660,15 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &pid_max_min,
                 .extra2         = &pid_max_max,
         },
+#if defined(CONFIG_MODULES) && defined(CONFIG_ENTERPRISE_SUPPORT)
+       {
+               .procname       = "unsupported",
+               .data           = &unsupported,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
         {
                 .procname       = "panic_on_oops",
                 .data           = &panic_on_oops,
@@ -840,6 +849,13 @@ static struct ctl_table kern_table[] = {
                 .proc_handler   = proc_dointvec,
         },
  #endif
+       {
+               .procname       = "suid_dumpable",
+               .data           = &suid_dumpable,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
  #if defined(CONFIG_S390) && defined(CONFIG_SMP)
         {
                 .procname       = "spin_retry",
@@ -849,7 +865,7 @@ static struct ctl_table kern_table[] = {
                 .proc_handler   = proc_dointvec,
         },
  #endif
-#if    defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
+#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) && !defined(CONFIG_ACPI_PV_SLEEP)
         {
                 .procname       = "acpi_video_flags",
                 .data           = &acpi_realmode_flags,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c

index a650694..06fe210 100644 (file)
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = {
         { CTL_INT,      KERN_COMPAT_LOG,                "compat-log" },
         { CTL_INT,      KERN_MAX_LOCK_DEPTH,            "max_lock_depth" },
         { CTL_INT,      KERN_PANIC_ON_NMI,              "panic_on_unrecovered_nmi" },
+       { CTL_INT,      KERN_SETUID_DUMPABLE,           "suid_dumpable" },
         {}
  };
  
@@ -872,6 +873,15 @@ static const struct bin_table bin_bus_table[] = {
  };
  
  
+#ifdef CONFIG_XEN
+#include <xen/sysctl.h>
+/*
+ * Binary sysctl name table for the Xen entries: maps the CTL_XEN_*
+ * identifiers to the names "independent_wallclock" and
+ * "permitted_clock_jitter".  Like the other bin_table arrays in this
+ * file it ends with an empty entry.  Only built with CONFIG_XEN.
+ */
+static const struct bin_table bin_xen_table[] = {
+       { CTL_INT,      CTL_XEN_INDEPENDENT_WALLCLOCK,  "independent_wallclock" },
+       { CTL_ULONG,    CTL_XEN_PERMITTED_CLOCK_JITTER, "permitted_clock_jitter" },
+       {}
+};
+#endif
+
  static const struct bin_table bin_s390dbf_table[] = {
         { CTL_INT,      5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
         { CTL_INT,      5679 /* CTL_S390DBF_ACTIVE */,    "debug_active" },
@@ -911,6 +921,9 @@ static const struct bin_table bin_root_table[] = {
         { CTL_DIR,      CTL_BUS,        "bus",          bin_bus_table },
         { CTL_DIR,      CTL_ABI,        "abi" },
         /* CTL_CPU not used */
+#ifdef CONFIG_XEN
+       { CTL_DIR,      CTL_XEN,        "xen",          bin_xen_table },
+#endif
         /* CTL_ARLAN "arlan" no longer used */
         { CTL_DIR,      CTL_S390DBF,    "s390dbf",      bin_s390dbf_table },
         { CTL_DIR,      CTL_SUNRPC,     "sunrpc",       bin_sunrpc_table },
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c

index f03fd83..53cb00a 100644 (file)
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -239,7 +239,10 @@ static inline void pps_fill_timex(struct timex *txc)
   * ntp_synced - Returns 1 if the NTP status is not UNSYNC
   *
   */
-static inline int ntp_synced(void)
+#ifndef CONFIG_XEN
+static
+#endif
+inline int ntp_synced(void)
  {
         return !(time_status & STA_UNSYNC);
  }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c

index d66b213..925943f 100644 (file)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -20,6 +20,9 @@
  #include <linux/time.h>
  #include <linux/tick.h>
  #include <linux/stop_machine.h>
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#include <asm/time.h>
+#endif
  
  /* Structure holding internal timekeeping values. */
  struct timekeeper {
@@ -384,6 +387,9 @@ int do_settimeofday(const struct timespec *tv)
  
         timekeeper.xtime = *tv;
         timekeeping_update(true);
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       xen_update_wallclock(tv);
+#endif
  
         write_sequnlock_irqrestore(&timekeeper.lock, flags);
  
diff --git a/kernel/unwind.c b/kernel/unwind.c

new file mode 100644 (file)

index 0000000..9528a78
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,1641 @@
+/*
+ * Copyright (C) 2002-2006 Novell, Inc.
+ *     Jan Beulich <jbeulich@novell.com>
+ * This code is released under version 2 of the GNU GPL.
+ *
+ * A simple API for unwinding kernel stacks.  This is used for
+ * debugging and error reporting purposes.  The kernel doesn't need
+ * full-blown stack unwinding with all the bells and whistles, so there
+ * is not much point in implementing the full Dwarf2 unwind API.
+ */
+
+#include <linux/unwind.h>
+#include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/sort.h>
+#include <linux/stop_machine.h>
+#include <linux/uaccess.h>
+#include <asm/sections.h>
+#include <asm/unaligned.h>
+#include <linux/slab.h>
+
+extern const char __start_unwind[], __end_unwind[];
+extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
+
+#define MAX_STACK_DEPTH 8
+
+#define EXTRA_INFO(f) { \
+               BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
+                                 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
+               + offsetof(struct unwind_frame_info, f) \
+                 / FIELD_SIZEOF(struct unwind_frame_info, f), \
+               FIELD_SIZEOF(struct unwind_frame_info, f) \
+       }
+#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
+
+static const struct {
+       unsigned offs:BITS_PER_LONG / 2;
+       unsigned width:BITS_PER_LONG / 2;
+} reg_info[] = {
+       UNW_REGISTER_INFO
+};
+
+#undef PTREGS_INFO
+#undef EXTRA_INFO
+
+#ifndef REG_INVALID
+#define REG_INVALID(r) (reg_info[r].width == 0)
+#endif
+
+#define DW_CFA_nop                          0x00
+#define DW_CFA_set_loc                      0x01
+#define DW_CFA_advance_loc1                 0x02
+#define DW_CFA_advance_loc2                 0x03
+#define DW_CFA_advance_loc4                 0x04
+#define DW_CFA_offset_extended              0x05
+#define DW_CFA_restore_extended             0x06
+#define DW_CFA_undefined                    0x07
+#define DW_CFA_same_value                   0x08
+#define DW_CFA_register                     0x09
+#define DW_CFA_remember_state               0x0a
+#define DW_CFA_restore_state                0x0b
+#define DW_CFA_def_cfa                      0x0c
+#define DW_CFA_def_cfa_register             0x0d
+#define DW_CFA_def_cfa_offset               0x0e
+#define DW_CFA_def_cfa_expression           0x0f
+#define DW_CFA_expression                   0x10
+#define DW_CFA_offset_extended_sf           0x11
+#define DW_CFA_def_cfa_sf                   0x12
+#define DW_CFA_def_cfa_offset_sf            0x13
+#define DW_CFA_val_offset                   0x14
+#define DW_CFA_val_offset_sf                0x15
+#define DW_CFA_val_expression               0x16
+#define DW_CFA_lo_user                      0x1c
+#define DW_CFA_GNU_window_save              0x2d
+#define DW_CFA_GNU_args_size                0x2e
+#define DW_CFA_GNU_negative_offset_extended 0x2f
+#define DW_CFA_hi_user                      0x3f
+
+#define DW_EH_PE_FORM     0x07
+#define DW_EH_PE_native   0x00
+#define DW_EH_PE_leb128   0x01
+#define DW_EH_PE_data2    0x02
+#define DW_EH_PE_data4    0x03
+#define DW_EH_PE_data8    0x04
+#define DW_EH_PE_signed   0x08
+#define DW_EH_PE_ADJUST   0x70
+#define DW_EH_PE_abs      0x00
+#define DW_EH_PE_pcrel    0x10
+#define DW_EH_PE_textrel  0x20
+#define DW_EH_PE_datarel  0x30
+#define DW_EH_PE_funcrel  0x40
+#define DW_EH_PE_aligned  0x50
+#define DW_EH_PE_indirect 0x80
+#define DW_EH_PE_omit     0xff
+
+#define DW_OP_addr        0x03
+#define DW_OP_deref       0x06
+#define DW_OP_const1u     0x08
+#define DW_OP_const1s     0x09
+#define DW_OP_const2u     0x0a
+#define DW_OP_const2s     0x0b
+#define DW_OP_const4u     0x0c
+#define DW_OP_const4s     0x0d
+#define DW_OP_const8u     0x0e
+#define DW_OP_const8s     0x0f
+#define DW_OP_constu      0x10
+#define DW_OP_consts      0x11
+#define DW_OP_dup         0x12
+#define DW_OP_drop        0x13
+#define DW_OP_over        0x14
+#define DW_OP_pick        0x15
+#define DW_OP_swap        0x16
+#define DW_OP_rot         0x17
+#define DW_OP_xderef      0x18
+#define DW_OP_abs         0x19
+#define DW_OP_and         0x1a
+#define DW_OP_div         0x1b
+#define DW_OP_minus       0x1c
+#define DW_OP_mod         0x1d
+#define DW_OP_mul         0x1e
+#define DW_OP_neg         0x1f
+#define DW_OP_not         0x20
+#define DW_OP_or          0x21
+#define DW_OP_plus        0x22
+#define DW_OP_plus_uconst 0x23
+#define DW_OP_shl         0x24
+#define DW_OP_shr         0x25
+#define DW_OP_shra        0x26
+#define DW_OP_xor         0x27
+#define DW_OP_bra         0x28
+#define DW_OP_eq          0x29
+#define DW_OP_ge          0x2a
+#define DW_OP_gt          0x2b
+#define DW_OP_le          0x2c
+#define DW_OP_lt          0x2d
+#define DW_OP_ne          0x2e
+#define DW_OP_skip        0x2f
+#define DW_OP_lit0        0x30
+#define DW_OP_lit31       0x4f
+#define DW_OP_reg0        0x50
+#define DW_OP_reg31       0x6f
+#define DW_OP_breg0       0x70
+#define DW_OP_breg31      0x8f
+#define DW_OP_regx        0x90
+#define DW_OP_fbreg       0x91
+#define DW_OP_bregx       0x92
+#define DW_OP_piece       0x93
+#define DW_OP_deref_size  0x94
+#define DW_OP_xderef_size 0x95
+#define DW_OP_nop         0x96
+
+typedef unsigned long uleb128_t;
+typedef   signed long sleb128_t;
+#define sleb128abs __builtin_labs
+
+/*
+ * One set of unwind information, either for the kernel image
+ * (root_table, set up by unwind_init()) or for a module (allocated by
+ * unwind_add_table()).  Tables form a singly linked list headed by
+ * root_table.
+ */
+static struct unwind_table {
+       /* start address and length of the code ranges matched by find_table() */
+       struct {
+               unsigned long pc;
+               unsigned long range;
+       } core, init;
+       /* start and size in bytes of the unwind (CIE/FDE) data */
+       const void *address;
+       unsigned long size;
+       /* lookup header and its size; header is NULL while no usable one exists */
+       const unsigned char *header;
+       unsigned long hdrsz;
+       /* next table in the list, NULL at the tail */
+       struct unwind_table *link;
+       /* name passed to init_unwind_table() ("kernel" or the module name) */
+       const char *name;
+} root_table;
+
+/*
+ * Per-register rule as recorded by set_rule(): a location kind plus an
+ * associated value (for instance the offset or register number operand
+ * of the CFA instruction that set the rule).
+ */
+struct unwind_item {
+       enum item_location {
+               Nowhere,
+               Memory,
+               Register,
+               Value
+       } where;
+       uleb128_t value;
+};
+
+/* Working state of processCFI() while it interprets CFA instructions. */
+struct unwind_state {
+       /* current location; reset to org before the CIE instructions are replayed */
+       uleb128_t loc, org;
+       /* bounds of the CIE instruction stream processCFI() runs first */
+       const u8 *cieStart, *cieEnd;
+       /* factor applied to deltas in advance_loc() */
+       uleb128_t codeAlign;
+       /* factor applied to signed offsets (e.g. DW_CFA_def_cfa_offset_sf) */
+       sleb128_t dataAlign;
+       /* CFA rule: register, offset, and expression length/pointer */
+       struct cfa {
+               uleb128_t reg, offs, elen;
+               const u8 *expr;
+       } cfa;
+       /* one rule per entry of reg_info[], written through set_rule() */
+       struct unwind_item regs[ARRAY_SIZE(reg_info)];
+       /* number of valid entries in stack[] */
+       unsigned stackDepth:8;
+       unsigned version:8;
+       /* DW_CFA_remember_state position at which a replay stops, or NULL */
+       const u8 *label;
+       /* positions saved by DW_CFA_remember_state, at most MAX_STACK_DEPTH */
+       const u8 *stack[MAX_STACK_DEPTH];
+};
+
+static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
+
+/* Verbosity threshold for dprintk(); 0 unless set on the command line. */
+static unsigned unwind_debug;
+/*
+ * Handler for the "unwind_debug=" boot parameter: parses the value with
+ * simple_strtoul() (base 0, so the prefix selects the radix) and stores
+ * it in unwind_debug.  Always returns 1.
+ */
+static int __init unwind_debug_setup(char *s)
+{
+       unwind_debug = simple_strtoul(s, NULL, 0);
+       return 1;
+}
+__setup("unwind_debug=", unwind_debug_setup);
+/*
+ * Emit a KERN_DEBUG message prefixed with "unwind: " and terminated by a
+ * newline, but only when lvl does not exceed unwind_debug.  The
+ * short-circuit "||" skips the printk() call otherwise.
+ */
+#define dprintk(lvl, fmt, args...) \
+       ((void)(lvl > unwind_debug \
+        || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
+
+/*
+ * Look up the unwind table responsible for pc.
+ *
+ * Walks the list starting at root_table and returns the first table whose
+ * core or init address range contains pc, or NULL if no table matches.
+ */
+static struct unwind_table *find_table(unsigned long pc)
+{
+       struct unwind_table *cand = &root_table;
+
+       while (cand) {
+               const int in_core = pc >= cand->core.pc
+                                   && pc < cand->core.pc + cand->core.range;
+               const int in_init = pc >= cand->init.pc
+                                   && pc < cand->init.pc + cand->init.range;
+
+               if (in_core || in_init)
+                       return cand;
+               cand = cand->link;
+       }
+
+       return NULL;
+}
+
+static unsigned long read_pointer(const u8 **pLoc,
+                                  const void *end,
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base);
+
+/*
+ * Fill in an unwind table descriptor.
+ *
+ * Records the core/init code ranges, the unwind data location and the
+ * name, and decides whether the supplied lookup header can be used.  If
+ * the header fails validation, table->header is set to NULL (hdrsz still
+ * records header_size).  The new table is not linked into the list here;
+ * table->link is set to NULL.
+ */
+static void init_unwind_table(struct unwind_table *table,
+                              const char *name,
+                              const void *core_start,
+                              unsigned long core_size,
+                              const void *init_start,
+                              unsigned long init_size,
+                              const void *table_start,
+                              unsigned long table_size,
+                              const u8 *header_start,
+                              unsigned long header_size)
+{
+       /* the first four header bytes are the version and three encoding bytes */
+       const u8 *ptr = header_start + 4;
+       const u8 *end = header_start + header_size;
+
+       table->core.pc = (unsigned long)core_start;
+       table->core.range = core_size;
+       table->init.pc = (unsigned long)init_start;
+       table->init.range = init_size;
+       table->address = table_start;
+       table->size = table_size;
+       /*
+        * See if the linker provided table looks valid: the version byte
+        * must be 1, the first encoded pointer must equal table_start, and
+        * the next three values (read with the encodings in bytes 2 and 3,
+        * the latter relative to header_start) must all be non-zero.
+        * The short-circuit order keeps header_start from being
+        * dereferenced when header_size <= 4.
+        */
+       if (header_size <= 4
+           || header_start[0] != 1
+           || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
+              != table_start
+           || !read_pointer(&ptr, end, header_start[2], 0, 0)
+           || !read_pointer(&ptr, end, header_start[3], 0,
+                            (unsigned long)header_start)
+           || !read_pointer(&ptr, end, header_start[3], 0,
+                            (unsigned long)header_start))
+               header_start = NULL;
+       table->hdrsz = header_size;
+       /* order the hdrsz store before the header pointer store */
+       smp_wmb();
+       table->header = header_start;
+       table->link = NULL;
+       table->name = name;
+}
+
+/*
+ * Initialize root_table for the kernel image: the core range is
+ * _text.._end, there is no init range, the unwind data is
+ * __start_unwind..__end_unwind and the candidate lookup header is
+ * __start_unwind_hdr..__end_unwind_hdr.
+ */
+void __init unwind_init(void)
+{
+       init_unwind_table(&root_table, "kernel",
+                         _text, _end - _text,
+                         NULL, 0,
+                         __start_unwind, __end_unwind - __start_unwind,
+                         __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr);
+}
+
+static const u32 bad_cie, not_fde;
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *);
+static signed fde_pointer_type(const u32 *cie);
+
+struct eh_frame_hdr_table_entry {
+       unsigned long start, fde;
+};
+
+/*
+ * sort() comparison callback: orders two eh_frame_hdr_table_entry
+ * objects by their start address.  Returns -1, 0 or 1.
+ */
+static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
+{
+       const unsigned long lhs =
+               ((const struct eh_frame_hdr_table_entry *)p1)->start;
+       const unsigned long rhs =
+               ((const struct eh_frame_hdr_table_entry *)p2)->start;
+
+       if (lhs < rhs)
+               return -1;
+
+       return lhs > rhs;
+}
+
+/*
+ * sort() swap callback: exchanges the start and fde members of two
+ * eh_frame_hdr_table_entry objects.  The size argument is not used.
+ */
+static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
+{
+       struct eh_frame_hdr_table_entry *a = p1;
+       struct eh_frame_hdr_table_entry *b = p2;
+       const unsigned long a_start = a->start;
+       const unsigned long a_fde = a->fde;
+
+       a->start = b->start;
+       b->start = a_start;
+       a->fde = b->fde;
+       b->fde = a_fde;
+}
+
+/*
+ * Build a sorted lookup header for a table that has no usable one.
+ *
+ * Does nothing if table->header is already set.  Otherwise the unwind
+ * data is walked twice: first to validate it and count the FDEs, then to
+ * fill one (start address, FDE address) entry per FDE into memory
+ * obtained from alloc.  The entries are sorted by start address before
+ * the header is published in table->header / table->hdrsz.
+ *
+ * alloc is called with the required size in bytes and may return NULL,
+ * in which case the table is left without a header.
+ */
+static void __init setup_unwind_table(struct unwind_table *table,
+                                       void *(*alloc)(unsigned long))
+{
+       const u8 *ptr;
+       unsigned long tableSize = table->size, hdrSize;
+       unsigned n;
+       const u32 *fde;
+       /* same layout that init_unwind_table() validates on a linker-made header */
+       struct {
+               u8 version;
+               u8 eh_frame_ptr_enc;
+               u8 fde_count_enc;
+               u8 table_enc;
+               unsigned long eh_frame_ptr;
+               unsigned int fde_count;
+               struct eh_frame_hdr_table_entry table[];
+       } __attribute__((__packed__)) *header;
+
+       if (table->header)
+               return;
+
+       /* a header was supplied but init_unwind_table() rejected it */
+       if (table->hdrsz)
+               printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n",
+                      table->name);
+
+       /* the data must consist of whole 32-bit words */
+       if (tableSize & (sizeof(*fde) - 1))
+               return;
+
+       /*
+        * Pass 1: step from entry to entry using each entry's length word,
+        * skipping CIEs, and give up on the first entry that cannot be
+        * validated or whose start address reads as 0.
+        */
+       for (fde = table->address, n = 0;
+            tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
+            tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+               const u32 *cie = cie_for_fde(fde, table);
+               signed ptrType;
+
+               if (cie == &not_fde)
+                       continue;
+               if (cie == NULL
+                   || cie == &bad_cie
+                   || (ptrType = fde_pointer_type(cie)) < 0)
+                       return;
+               ptr = (const u8 *)(fde + 2);
+               if (!read_pointer(&ptr,
+                                 (const u8 *)(fde + 1) + *fde,
+                                 ptrType, 0, 0))
+                       return;
+               ++n;
+       }
+
+       /* leftover bytes mean the walk stopped early; no FDEs means nothing to index */
+       if (tableSize || !n)
+               return;
+
+       /* four encoding bytes, eh_frame_ptr, fde_count, then n two-word entries */
+       hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
+               + 2 * n * sizeof(unsigned long);
+       dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
+       header = alloc(hdrSize);
+       if (!header)
+               return;
+       header->version          = 1;
+       header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native;
+       header->fde_count_enc    = DW_EH_PE_abs|DW_EH_PE_data4;
+       header->table_enc        = DW_EH_PE_abs|DW_EH_PE_native;
+       put_unaligned((unsigned long)table->address, &header->eh_frame_ptr);
+       BUILD_BUG_ON(offsetof(typeof(*header), fde_count)
+                    % __alignof(typeof(header->fde_count)));
+       header->fde_count        = n;
+
+       BUILD_BUG_ON(offsetof(typeof(*header), table)
+                    % __alignof(typeof(*header->table)));
+       /* Pass 2: same walk as above, now recording every FDE. */
+       for (fde = table->address, tableSize = table->size, n = 0;
+            tableSize;
+            tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) {
+               const u32 *cie = fde + 1 - fde[1] / sizeof(*fde);
+
+               if (!fde[1])
+                       continue; /* this is a CIE */
+               ptr = (const u8 *)(fde + 2);
+               header->table[n].start = read_pointer(&ptr,
+                                                     (const u8 *)(fde + 1) + *fde,
+                                                     fde_pointer_type(cie), 0, 0);
+               header->table[n].fde = (unsigned long)fde;
+               ++n;
+       }
+       WARN_ON(n != header->fde_count);
+
+       sort(header->table,
+            n,
+            sizeof(*header->table),
+            cmp_eh_frame_hdr_table_entries,
+            swap_eh_frame_hdr_table_entries);
+
+       table->hdrsz = hdrSize;
+       /* order the hdrsz store before the header pointer store */
+       smp_wmb();
+       table->header = (const void *)header;
+}
+
+/*
+ * Allocator passed to setup_unwind_table() at boot: requests sz bytes
+ * with sizeof(unsigned int) alignment from __alloc_bootmem_nopanic().
+ * NOTE(review): the third argument, __pa(MAX_DMA_ADDRESS), is presumably
+ * the preferred lowest address for the allocation - confirm against the
+ * bootmem API.  The caller checks the result for NULL.
+ */
+static void *__init balloc(unsigned long sz)
+{
+       return __alloc_bootmem_nopanic(sz,
+                                      sizeof(unsigned int),
+                                      __pa(MAX_DMA_ADDRESS));
+}
+
+/*
+ * Build the lookup header for root_table using balloc() as allocator.
+ * setup_unwind_table() returns immediately if root_table already has a
+ * header.
+ */
+void __init unwind_setup(void)
+{
+       setup_unwind_table(&root_table, balloc);
+}
+
+#ifdef CONFIG_MODULES
+
+static struct unwind_table *last_table;
+
+/* Must be called with module_mutex held. */
+/*
+ * Register the unwind data of a module.
+ *
+ * Allocates a table descriptor, initializes it from the module's core and
+ * init ranges and the given unwind data (without a lookup header), and
+ * appends it to the end of the table list.
+ *
+ * Returns the new descriptor as an opaque handle for
+ * unwind_remove_table(), or NULL if table_size is zero or the allocation
+ * fails.
+ */
+void *unwind_add_table(struct module *module,
+                       const void *table_start,
+                       unsigned long table_size)
+{
+       struct unwind_table *entry;
+       struct unwind_table *tail;
+
+       if (!table_size)
+               return NULL;
+
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (entry == NULL)
+               return NULL;
+
+       init_unwind_table(entry, module->name,
+                         module->module_core, module->core_size,
+                         module->module_init, module->init_size,
+                         table_start, table_size,
+                         NULL, 0);
+
+       /* append after the current tail, or directly after root_table */
+       tail = last_table ? last_table : &root_table;
+       tail->link = entry;
+       last_table = entry;
+
+       return entry;
+}
+
+struct unlink_table_info
+{
+       struct unwind_table *table;
+       int init_only;
+};
+
+/*
+ * stop_machine() callback for unwind_remove_table().
+ *
+ * arg points to a struct unlink_table_info.  If the table is on the list
+ * it is either stripped of its init range (init_only) or unlinked.
+ * info->table is cleared whenever the caller must not free the table:
+ * when it was not found, or when only the init range was dropped.
+ * Always returns 0.
+ */
+static int unlink_table(void *arg)
+{
+       struct unlink_table_info *info = arg;
+       struct unwind_table *victim = info->table;
+       struct unwind_table *pred = &root_table;
+
+       /* find the predecessor of victim, or stop at the tail */
+       while (pred->link && pred->link != victim)
+               pred = pred->link;
+
+       if (!pred->link) {
+               info->table = NULL;
+               return 0;
+       }
+
+       if (info->init_only) {
+               victim->init.pc = 0;
+               victim->init.range = 0;
+               info->table = NULL;
+               return 0;
+       }
+
+       pred->link = victim->link;
+       if (!pred->link)
+               last_table = pred;
+
+       return 0;
+}
+
+/* Must be called with module_mutex held. */
+/*
+ * Drop a module's unwind table, or only its init range.
+ *
+ * handle is the value returned by unwind_add_table(); NULL and the root
+ * table are ignored.  With init_only set the init range is cleared and
+ * the table stays registered.  Otherwise the table is unlinked under
+ * stop_machine() and freed.
+ */
+void unwind_remove_table(void *handle, int init_only)
+{
+       struct unwind_table *table = handle;
+       struct unlink_table_info info = {
+               .table = table,
+               .init_only = init_only,
+       };
+
+       if (table == NULL || table == &root_table)
+               return;
+
+       /* the tail entry can have its init range cleared without stop_machine() */
+       if (init_only && table == last_table) {
+               table->init.pc = 0;
+               table->init.range = 0;
+               return;
+       }
+
+       stop_machine(unlink_table, &info, NULL);
+
+       /* unlink_table() clears info.table when the table must be kept */
+       if (info.table)
+               kfree(table);
+}
+
+#endif /* CONFIG_MODULES */
+
+/*
+ * Decode an unsigned LEB128 value starting at *pcur, reading no byte at
+ * or beyond end.
+ *
+ * On return *pcur points just past the last byte consumed.  If the value
+ * does not fit into uleb128_t, *pcur is set to end + 1 so that callers
+ * comparing the cursor against end see an overrun.  If the input stops
+ * in the middle of a value, *pcur equals end and the bits decoded so far
+ * are returned.
+ */
+static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
+{
+       const unsigned bits = 8 * sizeof(uleb128_t);
+       const u8 *cur = *pcur;
+       uleb128_t value;
+       unsigned shift;
+
+       for (shift = 0, value = 0; cur < end; shift += 7) {
+               const unsigned payload = *cur & 0x7fU;
+
+               /*
+                * Once shift has reached the width of the result, only zero
+                * padding is acceptable.  Testing this first keeps both
+                * 1U << (bits - shift) and payload << shift from being
+                * evaluated with an out-of-range shift count.
+                */
+               if (shift >= bits
+                   ? payload != 0
+                   : (shift + 7 > bits
+                      && payload >= (1U << (bits - shift)))) {
+                       cur = end + 1;
+                       break;
+               }
+               if (shift < bits)
+                       value |= (uleb128_t)payload << shift;
+               if (!(*cur++ & 0x80))
+                       break;
+       }
+       *pcur = cur;
+
+       return value;
+}
+
+/*
+ * Decode a signed LEB128 value starting at *pcur, reading no byte at or
+ * beyond end.
+ *
+ * On return *pcur points just past the last byte consumed.  If the
+ * encoding carries more payload bits than sleb128_t can hold, *pcur is
+ * set to end + 1 so that callers comparing the cursor against end see an
+ * overrun.  If the input stops in the middle of a value, *pcur equals
+ * end and the bits decoded so far are returned.
+ */
+static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
+{
+       const unsigned bits = 8 * sizeof(sleb128_t);
+       const u8 *cur = *pcur;
+       sleb128_t value;
+       unsigned shift;
+
+       for (shift = 0, value = 0; cur < end; shift += 7) {
+               const u8 byte = *cur;
+               const unsigned payload = byte & 0x7fU;
+
+               /*
+                * Once shift has reached the width of the result, only zero
+                * padding is acceptable.  Testing this first keeps the
+                * shifts below from using an out-of-range count.
+                */
+               if (shift >= bits
+                   ? payload != 0
+                   : (shift + 7 > bits
+                      && payload >= (1U << (bits - shift)))) {
+                       cur = end + 1;
+                       break;
+               }
+               /* every byte is consumed, continuation bytes included */
+               ++cur;
+               if (shift < bits)
+                       value |= (sleb128_t)((uleb128_t)payload << shift);
+               if (!(byte & 0x80)) {
+                       /*
+                        * Bit 6 of the final byte is the sign: replicate it
+                        * into all higher bits.  The arithmetic is done at
+                        * full width; the range check above guarantees
+                        * shift + 6 < bits whenever that bit is set.
+                        */
+                       if (byte & 0x40)
+                               value |= (sleb128_t)(~(uleb128_t)0
+                                                    << (shift + 6));
+                       break;
+               }
+       }
+       *pcur = cur;
+
+       return value;
+}
+
+/*
+ * Classify the entry at fde and locate its CIE.
+ *
+ * Returns
+ *   &bad_cie  if the length word is zero or not a multiple of four,
+ *   &not_fde  if the second word is zero, i.e. the entry is itself a CIE,
+ *   NULL      if the CIE reference or the referenced CIE is implausible,
+ *   otherwise a pointer to the CIE.
+ */
+static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
+{
+       const u32 *cie;
+
+       if (!*fde || (*fde & (sizeof(*fde) - 1)))
+               return &bad_cie;
+       if (!fde[1])
+               return &not_fde; /* this is a CIE */
+       /*
+        * fde[1] is a byte distance counted backwards from fde + 1; it has
+        * to be word aligned and must not reach before the table start.
+        */
+       if ((fde[1] & (sizeof(*fde) - 1))
+           || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address)
+               return NULL; /* this is not a valid FDE */
+       cie = fde + 1 - fde[1] / sizeof(*fde);
+       /*
+        * The CIE needs a length above the minimum, must not extend into
+        * this FDE, must be word aligned, and its second word must be 0.
+        */
+       if (*cie <= sizeof(*cie) + 4
+           || *cie >= fde[1] - sizeof(*fde)
+           || (*cie & (sizeof(*cie) - 1))
+           || cie[1])
+               return NULL; /* this is not a (valid) CIE */
+       return cie;
+}
+
+/*
+ * Read one encoded pointer from *pLoc.
+ *
+ * ptrType is a DW_EH_PE_* encoding byte: the low bits (DW_EH_PE_FORM and
+ * DW_EH_PE_signed) select the storage format, the DW_EH_PE_ADJUST bits
+ * select what the stored value is relative to, and DW_EH_PE_indirect
+ * requests one extra dereference.  text_base and data_base are used for
+ * the textrel and datarel adjustments; passing 0 makes those encodings
+ * fail.
+ *
+ * Returns the decoded value and advances *pLoc past it.  Returns 0 and
+ * leaves *pLoc untouched on any failure (negative or omitted encoding,
+ * data running past end, unsupported format or adjustment, unreadable
+ * indirect address); a value that legitimately decodes to 0 therefore
+ * looks the same as a failure to the caller.
+ */
+static unsigned long read_pointer(const u8 **pLoc,
+                                  const void *end,
+                                  signed ptrType,
+                                  unsigned long text_base,
+                                  unsigned long data_base)
+{
+       unsigned long value = 0;
+       /* one cursor, viewed with whichever element width the format needs */
+       union {
+               const u8 *p8;
+               const u16 *p16u;
+               const s16 *p16s;
+               const u32 *p32u;
+               const s32 *p32s;
+               const unsigned long *pul;
+       } ptr;
+
+       if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
+               dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
+               return 0;
+       }
+       ptr.p8 = *pLoc;
+       switch (ptrType & DW_EH_PE_FORM) {
+       case DW_EH_PE_data2:
+               if (end < (const void *)(ptr.p16u + 1)) {
+                       dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
+                       return 0;
+               }
+               if (ptrType & DW_EH_PE_signed)
+                       value = get_unaligned(ptr.p16s++);
+               else
+                       value = get_unaligned(ptr.p16u++);
+               break;
+       /*
+        * With CONFIG_64BIT, data4 is decoded on its own and data8 falls
+        * through to the native case.  Without it, data4 falls through to
+        * the native case and data8 ends up in the default branch.
+        */
+       case DW_EH_PE_data4:
+#ifdef CONFIG_64BIT
+               if (end < (const void *)(ptr.p32u + 1)) {
+                       dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
+                       return 0;
+               }
+               if (ptrType & DW_EH_PE_signed)
+                       value = get_unaligned(ptr.p32s++);
+               else
+                       value = get_unaligned(ptr.p32u++);
+               break;
+       case DW_EH_PE_data8:
+               BUILD_BUG_ON(sizeof(u64) != sizeof(value));
+#else
+               BUILD_BUG_ON(sizeof(u32) != sizeof(value));
+#endif
+       case DW_EH_PE_native:
+               if (end < (const void *)(ptr.pul + 1)) {
+                       dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
+                       return 0;
+               }
+               value = get_unaligned(ptr.pul++);
+               break;
+       case DW_EH_PE_leb128:
+               BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
+               value = ptrType & DW_EH_PE_signed
+                       ? get_sleb128(&ptr.p8, end)
+                       : get_uleb128(&ptr.p8, end);
+               /* the LEB128 readers signal an overrun by moving past end */
+               if ((const void *)ptr.p8 > end) {
+                       dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
+                       return 0;
+               }
+               break;
+       default:
+               dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
+                       ptrType, ptr.p8, end);
+               return 0;
+       }
+       switch (ptrType & DW_EH_PE_ADJUST) {
+       case DW_EH_PE_abs:
+               break;
+       case DW_EH_PE_pcrel:
+               /* relative to where the value was read from; *pLoc is still unchanged */
+               value += (unsigned long)*pLoc;
+               break;
+       case DW_EH_PE_textrel:
+               if (likely(text_base)) {
+                       value += text_base;
+                       break;
+               }
+               dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
+                       ptrType, *pLoc, end);
+               return 0;
+       case DW_EH_PE_datarel:
+               if (likely(data_base)) {
+                       value += data_base;
+                       break;
+               }
+               dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
+                       ptrType, *pLoc, end);
+               return 0;
+       default:
+               dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
+                       ptrType, *pLoc, end);
+               return 0;
+       }
+       /* indirect: value is an address; replace it with the word stored there */
+       if ((ptrType & DW_EH_PE_indirect)
+           && probe_kernel_address(value, value)) {
+               dprintk(1, "Cannot read indirect value %lx (%p,%p).",
+                       value, *pLoc, end);
+               return 0;
+       }
+       /* commit the advanced cursor only now that decoding has succeeded */
+       *pLoc = ptr.p8;
+
+       return value;
+}
+
+/*
+ * Determine the pointer encoding used by the FDEs that refer to cie.
+ *
+ * Returns the encoding byte following an 'R' in the augmentation string,
+ * or DW_EH_PE_native|DW_EH_PE_abs if the CIE has no augmentation or its
+ * augmentation contains no 'R'.  Returns -1 if the CIE version is not 1,
+ * the augmentation string does not start with 'z' or is not terminated
+ * within the CIE, the augmentation data is inconsistent, or an
+ * augmentation character other than 'L', 'P' or 'R' is found.
+ */
+static signed fde_pointer_type(const u32 *cie)
+{
+       /* skip the length and id words; the next byte is the version */
+       const u8 *ptr = (const u8 *)(cie + 2);
+       unsigned version = *ptr;
+
+       if (version != 1)
+               return -1; /* unsupported */
+       /* a non-empty augmentation string follows the version byte */
+       if (*++ptr) {
+               const char *aug;
+               const u8 *end = (const u8 *)(cie + 1) + *cie;
+               uleb128_t len;
+
+               /* check if augmentation size is first (and thus present) */
+               if (*ptr != 'z')
+                       return -1;
+               /* check if augmentation string is nul-terminated */
+               if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
+                       return -1;
+               ++ptr; /* skip terminator */
+               get_uleb128(&ptr, end); /* skip code alignment */
+               get_sleb128(&ptr, end); /* skip data alignment */
+               /* skip return address column */
+               version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
+               len = get_uleb128(&ptr, end); /* augmentation length */
+               /* reject wrap-around and data running past the CIE */
+               if (ptr + len < ptr || ptr + len > end)
+                       return -1;
+               /* from here on, end bounds the augmentation data only */
+               end = ptr + len;
+               /* walk the characters after 'z' alongside their data */
+               while (*++aug) {
+                       if (ptr >= end)
+                               return -1;
+                       switch (*aug) {
+                       case 'L':
+                               /* one byte of data, skipped */
+                               ++ptr;
+                               break;
+                       case 'P': {
+                                       /* encoding byte followed by a pointer in that encoding */
+                                       signed ptrType = *ptr++;
+
+                                       if (!read_pointer(&ptr, end, ptrType, 0, 0)
+                                           || ptr > end)
+                                               return -1;
+                               }
+                               break;
+                       case 'R':
+                               /* the byte here is the FDE pointer encoding */
+                               return *ptr;
+                       default:
+                               return -1;
+                       }
+               }
+       }
+       return DW_EH_PE_native|DW_EH_PE_abs;
+}
+
+/*
+ * Advance state->loc by delta scaled with the code alignment factor.
+ * Returns 1 if delta is non-zero, 0 otherwise.
+ */
+static int advance_loc(unsigned long delta, struct unwind_state *state)
+{
+       const uleb128_t step = delta * state->codeAlign;
+
+       state->loc += step;
+
+       return delta != 0;
+}
+
+/*
+ * Record the rule (where, value) for register number reg in state.
+ * Register numbers outside state->regs are silently ignored.
+ */
+static void set_rule(uleb128_t reg,
+                     enum item_location where,
+                     uleb128_t value,
+                     struct unwind_state *state)
+{
+       struct unwind_item *item;
+
+       if (reg >= ARRAY_SIZE(state->regs))
+               return;
+
+       item = &state->regs[reg];
+       item->where = where;
+       item->value = value;
+}
+
+static int processCFI(const u8 *start,
+                      const u8 *end,
+                      unsigned long targetLoc,
+                      signed ptrType,
+                      struct unwind_state *state)
+{
+       union {
+               const u8 *p8;
+               const u16 *p16;
+               const u32 *p32;
+       } ptr;
+       int result = 1;
+
+       if (start != state->cieStart) {
+               state->loc = state->org;
+               result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
+               if (targetLoc == 0 && state->label == NULL)
+                       return result;
+       }
+       for (ptr.p8 = start; result && ptr.p8 < end; ) {
+               switch (*ptr.p8 >> 6) {
+                       uleb128_t value;
+
+               case 0:
+                       switch (*ptr.p8++) {
+                       case DW_CFA_nop:
+                               break;
+                       case DW_CFA_set_loc:
+                               state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
+                               if (state->loc == 0)
+                                       result = 0;
+                               break;
+                       case DW_CFA_advance_loc1:
+                               result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
+                               break;
+                       case DW_CFA_advance_loc2:
+                               result = ptr.p8 <= end + 2
+                                        && advance_loc(*ptr.p16++, state);
+                               break;
+                       case DW_CFA_advance_loc4:
+                               result = ptr.p8 <= end + 4
+                                        && advance_loc(*ptr.p32++, state);
+                               break;
+                       case DW_CFA_offset_extended:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+                               break;
+                       case DW_CFA_val_offset:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
+                               break;
+                       case DW_CFA_offset_extended_sf:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
+                               break;
+                       case DW_CFA_val_offset_sf:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
+                               break;
+                       /*todo case DW_CFA_expression: */
+                       /*todo case DW_CFA_val_expression: */
+                       case DW_CFA_restore_extended:
+                       case DW_CFA_undefined:
+                       case DW_CFA_same_value:
+                               set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
+                               break;
+                       case DW_CFA_register:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value,
+                                        Register,
+                                        get_uleb128(&ptr.p8, end), state);
+                               break;
+                       case DW_CFA_remember_state:
+                               if (ptr.p8 == state->label) {
+                                       state->label = NULL;
+                                       return 1;
+                               }
+                               if (state->stackDepth >= MAX_STACK_DEPTH) {
+                                       dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
+                                       return 0;
+                               }
+                               state->stack[state->stackDepth++] = ptr.p8;
+                               break;
+                       case DW_CFA_restore_state:
+                               if (state->stackDepth) {
+                                       const uleb128_t loc = state->loc;
+                                       const u8 *label = state->label;
+
+                                       state->label = state->stack[state->stackDepth - 1];
+                                       memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
+                                       memset(state->regs, 0, sizeof(state->regs));
+                                       state->stackDepth = 0;
+                                       result = processCFI(start, end, 0, ptrType, state);
+                                       state->loc = loc;
+                                       state->label = label;
+                               } else {
+                                       dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
+                                       return 0;
+                               }
+                               break;
+                       case DW_CFA_def_cfa:
+                               state->cfa.reg = get_uleb128(&ptr.p8, end);
+                               state->cfa.elen = 0;
+                               /*nobreak*/
+                       case DW_CFA_def_cfa_offset:
+                               state->cfa.offs = get_uleb128(&ptr.p8, end);
+                               break;
+                       case DW_CFA_def_cfa_sf:
+                               state->cfa.reg = get_uleb128(&ptr.p8, end);
+                               state->cfa.elen = 0;
+                               /*nobreak*/
+                       case DW_CFA_def_cfa_offset_sf:
+                               state->cfa.offs = get_sleb128(&ptr.p8, end)
+                                                 * state->dataAlign;
+                               break;
+                       case DW_CFA_def_cfa_register:
+                               state->cfa.reg = get_uleb128(&ptr.p8, end);
+                               state->cfa.elen = 0;
+                               break;
+                       case DW_CFA_def_cfa_expression:
+                               state->cfa.elen = get_uleb128(&ptr.p8, end);
+                               if (!state->cfa.elen) {
+                                       dprintk(1, "Zero-length CFA expression.");
+                                       return 0;
+                               }
+                               state->cfa.expr = ptr.p8;
+                               ptr.p8 += state->cfa.elen;
+                               break;
+                       case DW_CFA_GNU_args_size:
+                               get_uleb128(&ptr.p8, end);
+                               break;
+                       case DW_CFA_GNU_negative_offset_extended:
+                               value = get_uleb128(&ptr.p8, end);
+                               set_rule(value,
+                                        Memory,
+                                        (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
+                               break;
+                       case DW_CFA_GNU_window_save:
+                       default:
+                               dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
+                               result = 0;
+                               break;
+                       }
+                       break;
+               case 1:
+                       result = advance_loc(*ptr.p8++ & 0x3f, state);
+                       break;
+               case 2:
+                       value = *ptr.p8++ & 0x3f;
+                       set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
+                       break;
+               case 3:
+                       set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
+                       break;
+               }
+               if (ptr.p8 > end) {
+                       dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
+                       result = 0;
+               }
+               if (result && targetLoc != 0 && targetLoc < state->loc)
+                       return 1;
+       }
+
+       if (result && ptr.p8 < end)
+               dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
+
+       return result
+              && ptr.p8 == end
+              && (targetLoc == 0
+                  || (/*todo While in theory this should apply, gcc in practice omits
+                        everything past the function prolog, and hence the location
+                        never reaches the end of the function.
+                      targetLoc < state->loc &&*/ state->label == NULL));
+}
+
+static unsigned long evaluate(const u8 *expr, const u8 *end,
+                             const struct unwind_frame_info *frame)
+{
+       union {
+               const u8 *pu8;
+               const s8 *ps8;
+               const u16 *pu16;
+               const s16 *ps16;
+               const u32 *pu32;
+               const s32 *ps32;
+               const u64 *pu64;
+               const s64 *ps64;
+       } ptr = { expr };
+       unsigned long stack[8], val1, val2;
+       unsigned int stidx = 0;
+#define PUSH(v) ({ unsigned long v__ = (v); if (stidx >= ARRAY_SIZE(stack)) return 0; stack[stidx++] = v__; })
+#define POP() ({ if (!stidx) return 0; stack[--stidx]; })
+
+       while (ptr.pu8 < end) {
+               switch (*ptr.pu8++) {
+               /*todo case DW_OP_addr: */
+               case DW_OP_deref:
+                       val1 = POP();
+                       if (probe_kernel_address(val1, val2)) {
+                               dprintk(1, "Cannot de-reference %lx (%p,%p).", val1, ptr.pu8 - 1, end);
+                               return 0;
+                       }
+                       PUSH(val2);
+                       break;
+               /*todo? case DW_OP_xderef: */
+               /*todo case DW_OP_deref_size: */
+               /*todo? case DW_OP_xderef_size: */
+               case DW_OP_const1u:
+                       if (ptr.pu8 < end)
+                               PUSH(*ptr.pu8);
+                       ++ptr.pu8;
+                       break;
+               case DW_OP_const1s:
+                       if (ptr.pu8 < end)
+                               PUSH(*ptr.ps8);
+                       ++ptr.ps8;
+                       break;
+               case DW_OP_const2u:
+                       if (ptr.pu8 + 1 < end)
+                               PUSH(*ptr.pu16);
+                       ++ptr.pu16;
+                       break;
+               case DW_OP_const2s:
+                       if (ptr.pu8 + 1 < end)
+                               PUSH(*ptr.ps16);
+                       ++ptr.ps16;
+                       break;
+               case DW_OP_const4u:
+                       if (ptr.pu8 + 3 < end)
+                               PUSH(*ptr.pu32);
+                       ++ptr.pu32;
+                       break;
+               case DW_OP_const4s:
+                       if (ptr.pu8 + 3 < end)
+                               PUSH(*ptr.ps32);
+                       ++ptr.ps32;
+                       break;
+               case DW_OP_const8u:
+                       if (ptr.pu8 + 7 < end)
+                               PUSH(*ptr.pu64);
+                       ++ptr.pu64;
+                       break;
+               case DW_OP_const8s:
+                       if (ptr.pu8 + 7 < end)
+                               PUSH(*ptr.ps64);
+                       ++ptr.ps64;
+                       break;
+               case DW_OP_constu:
+                       PUSH(get_uleb128(&ptr.pu8, end));
+                       break;
+               case DW_OP_consts:
+                       PUSH(get_sleb128(&ptr.pu8, end));
+                       break;
+               case DW_OP_dup:
+                       if (!stidx)
+                               return 0;
+                       PUSH(stack[stidx - 1]);
+                       break;
+               case DW_OP_drop:
+                       (void)POP();
+                       break;
+               case DW_OP_over:
+                       if (stidx <= 1)
+                               return 0;
+                       PUSH(stack[stidx - 2]);
+                       break;
+               case DW_OP_pick:
+                       if (ptr.pu8 < end) {
+                               if (stidx <= *ptr.pu8)
+                                       return 0;
+                               PUSH(stack[stidx - *ptr.pu8 - 1]);
+                       }
+                       ++ptr.pu8;
+                       break;
+               case DW_OP_swap:
+                       if (stidx <= 1)
+                               return 0;
+                       val1 = stack[stidx - 1];
+                       stack[stidx - 1] = stack[stidx - 2];
+                       stack[stidx - 2] = val1;
+                       break;
+               case DW_OP_rot:
+                       if (stidx <= 2)
+                               return 0;
+                       val1 = stack[stidx - 1];
+                       stack[stidx - 1] = stack[stidx - 2];
+                       stack[stidx - 2] = stack[stidx - 3];
+                       stack[stidx - 3] = val1;
+                       break;
+               case DW_OP_abs:
+                       PUSH(__builtin_labs(POP()));
+                       break;
+               case DW_OP_and:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 & val1);
+                       break;
+               case DW_OP_div:
+                       val1 = POP();
+                       if (!val1)
+                               return 0;
+                       val2 = POP();
+                       PUSH(val2 / val1);
+                       break;
+               case DW_OP_minus:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 - val1);
+                       break;
+               case DW_OP_mod:
+                       val1 = POP();
+                       if (!val1)
+                               return 0;
+                       val2 = POP();
+                       PUSH(val2 % val1);
+                       break;
+               case DW_OP_mul:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 * val1);
+                       break;
+               case DW_OP_neg:
+                       PUSH(-(long)POP());
+                       break;
+               case DW_OP_not:
+                       PUSH(~POP());
+                       break;
+               case DW_OP_or:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 | val1);
+                       break;
+               case DW_OP_plus:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 + val1);
+                       break;
+               case DW_OP_plus_uconst:
+                       PUSH(POP() + get_uleb128(&ptr.pu8, end));
+                       break;
+               case DW_OP_shl:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val1 < BITS_PER_LONG ? val2 << val1 : 0);
+                       break;
+               case DW_OP_shr:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val1 < BITS_PER_LONG ? val2 >> val1 : 0);
+                       break;
+               case DW_OP_shra:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val1 < BITS_PER_LONG ? (long)val2 >> val1 : (val2 < 0 ? -1 : 0));
+                       break;
+               case DW_OP_xor:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 ^ val1);
+                       break;
+               case DW_OP_bra:
+                       if (!POP()) {
+                               ++ptr.ps16;
+                               break;
+                       }
+                       /*nobreak*/
+               case DW_OP_skip:
+                       if (ptr.pu8 + 1 < end) {
+                               ptr.pu8 += *ptr.ps16;
+                               if (ptr.pu8 < expr)
+                                       return 0;
+                       } else
+                               ++ptr.ps16;
+                       break;
+               case DW_OP_eq:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 == val1);
+                       break;
+               case DW_OP_ne:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 != val1);
+                       break;
+               case DW_OP_lt:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 < val1);
+                       break;
+               case DW_OP_le:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 <= val1);
+               case DW_OP_ge:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 >= val1);
+                       break;
+               case DW_OP_gt:
+                       val1 = POP();
+                       val2 = POP();
+                       PUSH(val2 > val1);
+                       break;
+               case DW_OP_lit0 ... DW_OP_lit31:
+                       PUSH(ptr.pu8[-1] - DW_OP_lit0);
+                       break;
+               case DW_OP_breg0 ... DW_OP_breg31:
+                       val1 = ptr.pu8[-1] - DW_OP_breg0;
+                       if (0)
+               case DW_OP_bregx:
+                               val1 = get_uleb128(&ptr.pu8, end);
+                       if (val1 >= ARRAY_SIZE(reg_info)
+                           || reg_info[val1].width != sizeof(unsigned long))
+                               return 0;
+                       PUSH(((const unsigned long *)frame)[reg_info[val1].offs]
+                            + get_sleb128(&ptr.pu8, end));
+                       break;
+               /*todo? case DW_OP_fbreg: */
+               /*todo? case DW_OP_piece: */
+               case DW_OP_nop:
+                       break;
+               default:
+                       dprintk(1, "Unsupported expression op %02x (%p,%p).", ptr.pu8[-1], ptr.pu8 - 1, end);
+                       return 0;
+               }
+       }
+       if (ptr.pu8 > end)
+               return 0;
+       val1 = POP();
+#undef POP
+#undef PUSH
+       return val1;
+}
+
+/* Unwind to the previous frame.  Returns 0 if successful, a negative
+ * number in case of an error. */
+int unwind(struct unwind_frame_info *frame)
+{
+#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
+       const u32 *fde = NULL, *cie = NULL;
+       const u8 *ptr = NULL, *end = NULL;
+       unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
+       unsigned long startLoc = 0, endLoc = 0, cfa;
+       unsigned i;
+       signed ptrType = -1;
+       uleb128_t retAddrReg = 0;
+       const struct unwind_table *table;
+       struct unwind_state state;
+
+       if (UNW_PC(frame) == 0)
+               return -EINVAL;
+       if ((table = find_table(pc)) != NULL
+           && !(table->size & (sizeof(*fde) - 1))) {
+               const u8 *hdr = table->header;
+               unsigned long tableSize;
+
+               smp_rmb();
+               if (hdr && hdr[0] == 1) {
+                       switch (hdr[3] & DW_EH_PE_FORM) {
+                       case DW_EH_PE_native: tableSize = sizeof(unsigned long); break;
+                       case DW_EH_PE_data2: tableSize = 2; break;
+                       case DW_EH_PE_data4: tableSize = 4; break;
+                       case DW_EH_PE_data8: tableSize = 8; break;
+                       default: tableSize = 0; break;
+                       }
+                       ptr = hdr + 4;
+                       end = hdr + table->hdrsz;
+                       if (tableSize
+                           && read_pointer(&ptr, end, hdr[1], 0, 0)
+                              == (unsigned long)table->address
+                           && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
+                           && i == (end - ptr) / (2 * tableSize)
+                           && !((end - ptr) % (2 * tableSize))) {
+                               do {
+                                       const u8 *cur = ptr + (i / 2) * (2 * tableSize);
+
+                                       startLoc = read_pointer(&cur,
+                                                               cur + tableSize,
+                                                               hdr[3], 0,
+                                                               (unsigned long)hdr);
+                                       if (pc < startLoc)
+                                               i /= 2;
+                                       else {
+                                               ptr = cur - tableSize;
+                                               i = (i + 1) / 2;
+                                       }
+                               } while (startLoc && i > 1);
+                               if (i == 1
+                                   && (startLoc = read_pointer(&ptr,
+                                                               ptr + tableSize,
+                                                               hdr[3], 0,
+                                                               (unsigned long)hdr)) != 0
+                                   && pc >= startLoc)
+                                       fde = (void *)read_pointer(&ptr,
+                                                                  ptr + tableSize,
+                                                                  hdr[3], 0,
+                                                                  (unsigned long)hdr);
+                       }
+               }
+               if (hdr && !fde)
+                       dprintk(3, "Binary lookup for %lx failed.", pc);
+
+               if (fde != NULL) {
+                       cie = cie_for_fde(fde, table);
+                       ptr = (const u8 *)(fde + 2);
+                       if (cie != NULL
+                           && cie != &bad_cie
+                           && cie != &not_fde
+                           && (ptrType = fde_pointer_type(cie)) >= 0
+                           && read_pointer(&ptr,
+                                           (const u8 *)(fde + 1) + *fde,
+                                           ptrType, 0, 0) == startLoc) {
+                               if (!(ptrType & DW_EH_PE_indirect))
+                                       ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+                               endLoc = startLoc
+                                        + read_pointer(&ptr,
+                                                       (const u8 *)(fde + 1) + *fde,
+                                                       ptrType, 0, 0);
+                               if (pc >= endLoc)
+                                       fde = NULL;
+                       } else
+                               fde = NULL;
+                       if (!fde)
+                               dprintk(1, "Binary lookup result for %lx discarded.", pc);
+               }
+               if (fde == NULL) {
+                       for (fde = table->address, tableSize = table->size;
+                            cie = NULL, tableSize > sizeof(*fde)
+                            && tableSize - sizeof(*fde) >= *fde;
+                            tableSize -= sizeof(*fde) + *fde,
+                            fde += 1 + *fde / sizeof(*fde)) {
+                               cie = cie_for_fde(fde, table);
+                               if (cie == &bad_cie) {
+                                       cie = NULL;
+                                       break;
+                               }
+                               if (cie == NULL
+                                   || cie == &not_fde
+                                   || (ptrType = fde_pointer_type(cie)) < 0)
+                                       continue;
+                               ptr = (const u8 *)(fde + 2);
+                               startLoc = read_pointer(&ptr,
+                                                       (const u8 *)(fde + 1) + *fde,
+                                                       ptrType, 0, 0);
+                               if (!startLoc)
+                                       continue;
+                               if (!(ptrType & DW_EH_PE_indirect))
+                                       ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
+                               endLoc = startLoc
+                                        + read_pointer(&ptr,
+                                                       (const u8 *)(fde + 1) + *fde,
+                                                       ptrType, 0, 0);
+                               if (pc >= startLoc && pc < endLoc)
+                                       break;
+                       }
+                       if (!fde)
+                               dprintk(3, "Linear lookup for %lx failed.", pc);
+               }
+       }
+       if (cie != NULL) {
+               memset(&state, 0, sizeof(state));
+               state.cieEnd = ptr; /* keep here temporarily */
+               ptr = (const u8 *)(cie + 2);
+               end = (const u8 *)(cie + 1) + *cie;
+               frame->call_frame = 1;
+               if ((state.version = *ptr) != 1)
+                       cie = NULL; /* unsupported version */
+               else if (*++ptr) {
+                       /* check if augmentation size is first (and thus present) */
+                       if (*ptr == 'z') {
+                               while (++ptr < end && *ptr) {
+                                       switch (*ptr) {
+                                       /* check for ignorable (or already handled)
+                                        * nul-terminated augmentation string */
+                                       case 'L':
+                                       case 'P':
+                                       case 'R':
+                                               continue;
+                                       case 'S':
+                                               frame->call_frame = 0;
+                                               continue;
+                                       default:
+                                               break;
+                                       }
+                                       break;
+                               }
+                       }
+                       if (ptr >= end || *ptr)
+                               cie = NULL;
+               }
+               if (!cie)
+                       dprintk(1, "CIE unusable (%p,%p).", ptr, end);
+               ++ptr;
+       }
+       if (cie != NULL) {
+               /* get code aligment factor */
+               state.codeAlign = get_uleb128(&ptr, end);
+               /* get data aligment factor */
+               state.dataAlign = get_sleb128(&ptr, end);
+               if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
+                       cie = NULL;
+               else if (UNW_PC(frame) % state.codeAlign
+                        || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+                       dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
+                               UNW_PC(frame), UNW_SP(frame));
+                       return -EPERM;
+               } else {
+                       retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
+                       /* skip augmentation */
+                       if (((const char *)(cie + 2))[1] == 'z') {
+                               uleb128_t augSize = get_uleb128(&ptr, end);
+
+                               ptr += augSize;
+                       }
+                       if (ptr > end
+                           || retAddrReg >= ARRAY_SIZE(reg_info)
+                           || REG_INVALID(retAddrReg)
+                           || reg_info[retAddrReg].width != sizeof(unsigned long))
+                               cie = NULL;
+               }
+               if (!cie)
+                       dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
+       }
+       if (cie != NULL) {
+               state.cieStart = ptr;
+               ptr = state.cieEnd;
+               state.cieEnd = end;
+               end = (const u8 *)(fde + 1) + *fde;
+               /* skip augmentation */
+               if (((const char *)(cie + 2))[1] == 'z') {
+                       uleb128_t augSize = get_uleb128(&ptr, end);
+
+                       if ((ptr += augSize) > end)
+                               fde = NULL;
+               }
+               if (!fde)
+                       dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
+       }
+       if (cie == NULL || fde == NULL) {
+#ifdef CONFIG_FRAME_POINTER
+               unsigned long top = TSK_STACK_TOP(frame->task);
+               unsigned long bottom = STACK_BOTTOM(frame->task);
+               unsigned long fp = UNW_FP(frame);
+               unsigned long sp = UNW_SP(frame);
+               unsigned long link;
+
+               if ((sp | fp) & (sizeof(unsigned long) - 1))
+                       return -EPERM;
+
+# if FRAME_RETADDR_OFFSET < 0
+               if (!(sp < top && fp <= sp && bottom < fp))
+# else
+               if (!(sp > top && fp >= sp && bottom > fp))
+# endif
+                       return -ENXIO;
+
+               if (probe_kernel_address(fp + FRAME_LINK_OFFSET, link))
+                       return -ENXIO;
+
+# if FRAME_RETADDR_OFFSET < 0
+               if (!(link > bottom && link < fp))
+# else
+               if (!(link < bottom && link > fp))
+# endif
+                       return -ENXIO;
+
+               if (link & (sizeof(link) - 1))
+                       return -ENXIO;
+
+               fp += FRAME_RETADDR_OFFSET;
+               if (probe_kernel_address(fp, UNW_PC(frame)))
+                       return -ENXIO;
+
+               /* Ok, we can use it */
+# if FRAME_RETADDR_OFFSET < 0
+               UNW_SP(frame) = fp - sizeof(UNW_PC(frame));
+# else
+               UNW_SP(frame) = fp + sizeof(UNW_PC(frame));
+# endif
+               UNW_FP(frame) = link;
+               return 0;
+#else
+               return -ENXIO;
+#endif
+       }
+       state.org = startLoc;
+       memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
+       /* process instructions */
+       if (!processCFI(ptr, end, pc, ptrType, &state)
+           || state.loc > endLoc
+           || state.regs[retAddrReg].where == Nowhere) {
+               dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
+               return -EIO;
+       }
+       if (state.cfa.elen) {
+               cfa = evaluate(state.cfa.expr, state.cfa.expr + state.cfa.elen, frame);
+               if (!cfa) {
+                       dprintk(1, "Bad CFA expr (%p:%lu).", state.cfa.expr, state.cfa.elen);
+                       return -EIO;
+               }
+       } else if (state.cfa.reg >= ARRAY_SIZE(reg_info)
+                  || reg_info[state.cfa.reg].width != sizeof(unsigned long)
+                  || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
+                  || state.cfa.offs % sizeof(unsigned long)) {
+               dprintk(1, "Bad CFA (%lu,%lx).", state.cfa.reg, state.cfa.offs);
+               return -EIO;
+       } else
+               cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
+       /* update frame */
+#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
+       if (frame->call_frame
+           && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
+               frame->call_frame = 0;
+#endif
+       startLoc = min((unsigned long)UNW_SP(frame), cfa);
+       endLoc = max((unsigned long)UNW_SP(frame), cfa);
+       if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
+               startLoc = min(STACK_LIMIT(cfa), cfa);
+               endLoc = max(STACK_LIMIT(cfa), cfa);
+       }
+#ifndef CONFIG_64BIT
+# define CASES CASE(8); CASE(16); CASE(32)
+#else
+# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
+#endif
+       pc = UNW_PC(frame);
+       sp = UNW_SP(frame);
+       for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+               if (REG_INVALID(i)) {
+                       if (state.regs[i].where == Nowhere)
+                               continue;
+                       dprintk(1, "Cannot restore register %u (%d).",
+                               i, state.regs[i].where);
+                       return -EIO;
+               }
+               switch (state.regs[i].where) {
+               default:
+                       break;
+               case Register:
+                       if (state.regs[i].value >= ARRAY_SIZE(reg_info)
+                           || REG_INVALID(state.regs[i].value)
+                           || reg_info[i].width > reg_info[state.regs[i].value].width) {
+                               dprintk(1, "Cannot restore register %u from register %lu.",
+                                       i, state.regs[i].value);
+                               return -EIO;
+                       }
+                       switch (reg_info[state.regs[i].value].width) {
+#define CASE(n) \
+                       case sizeof(u##n): \
+                               state.regs[i].value = FRAME_REG(state.regs[i].value, \
+                                                               const u##n); \
+                               break
+                       CASES;
+#undef CASE
+                       default:
+                               dprintk(1, "Unsupported register size %u (%lu).",
+                                       reg_info[state.regs[i].value].width,
+                                       state.regs[i].value);
+                               return -EIO;
+                       }
+                       break;
+               }
+       }
+       for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
+               if (REG_INVALID(i))
+                       continue;
+               switch (state.regs[i].where) {
+               case Nowhere:
+                       if (reg_info[i].width != sizeof(UNW_SP(frame))
+                           || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
+                              != &UNW_SP(frame))
+                               continue;
+                       UNW_SP(frame) = cfa;
+                       break;
+               case Register:
+                       switch (reg_info[i].width) {
+#define CASE(n) case sizeof(u##n): \
+                               FRAME_REG(i, u##n) = state.regs[i].value; \
+                               break
+                       CASES;
+#undef CASE
+                       default:
+                               dprintk(1, "Unsupported register size %u (%u).",
+                                       reg_info[i].width, i);
+                               return -EIO;
+                       }
+                       break;
+               case Value:
+                       if (reg_info[i].width != sizeof(unsigned long)) {
+                               dprintk(1, "Unsupported value size %u (%u).",
+                                       reg_info[i].width, i);
+                               return -EIO;
+                       }
+                       FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
+                                                           * state.dataAlign;
+                       break;
+               case Memory: {
+                               unsigned long addr = cfa + state.regs[i].value
+                                                          * state.dataAlign;
+
+                               if ((state.regs[i].value * state.dataAlign)
+                                   % sizeof(unsigned long)
+                                   || addr < startLoc
+                                   || addr + sizeof(unsigned long) < addr
+                                   || addr + sizeof(unsigned long) > endLoc) {
+                                       dprintk(1, "Bad memory location %lx (%lx).",
+                                               addr, state.regs[i].value);
+                                       return -EIO;
+                               }
+                               switch (reg_info[i].width) {
+#define CASE(n)                        case sizeof(u##n): \
+                                       if (probe_kernel_address(addr, \
+                                                                FRAME_REG(i, u##n))) \
+                                               return -EFAULT; \
+                                       break
+                               CASES;
+#undef CASE
+                               default:
+                                       dprintk(1, "Unsupported memory size %u (%u).",
+                                               reg_info[i].width, i);
+                                       return -EIO;
+                               }
+                       }
+                       break;
+               }
+       }
+
+       if (UNW_PC(frame) % state.codeAlign
+           || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
+               dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
+                       UNW_PC(frame), UNW_SP(frame));
+               return -EIO;
+       }
+       if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
+               dprintk(1, "No progress (%lx,%lx).", pc, sp);
+               return -EIO;
+       }
+
+       return 0;
+#undef CASES
+#undef FRAME_REG
+}
+EXPORT_SYMBOL_GPL(unwind);
+
+/*
+ * Prepare @info for unwinding on behalf of @tsk: record the task, zero
+ * call_frame and pass @info together with @regs to
+ * arch_unw_init_frame_info().  Always returns 0.
+ */
+int unwind_init_frame_info(struct unwind_frame_info *info,
+                           struct task_struct *tsk,
+                           /*const*/ struct pt_regs *regs)
+{
+       /* generic fields first, then the arch-specific setup */
+       info->call_frame = 0;
+       info->task = tsk;
+
+       arch_unw_init_frame_info(info, regs);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(unwind_init_frame_info);
+
+/*
+ * Prepare to unwind a blocked task: record @tsk in @info, zero call_frame
+ * and let arch_unw_init_blocked() complete the setup.  Always returns 0.
+ */
+int unwind_init_blocked(struct unwind_frame_info *info,
+                        struct task_struct *tsk)
+{
+       /* generic fields first, then the arch-specific setup */
+       info->call_frame = 0;
+       info->task = tsk;
+
+       arch_unw_init_blocked(info);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(unwind_init_blocked);
+
+/*
+ * Prepare to unwind the currently running thread.  @info is bound to
+ * 'current' with call_frame zeroed; @callback, @ops and @data are handed
+ * unchanged to arch_unwind_init_running(), whose return value is returned.
+ */
+int unwind_init_running(struct unwind_frame_info *info,
+                       asmlinkage unwind_callback_fn callback,
+                       const struct stacktrace_ops *ops, void *data)
+{
+       info->call_frame = 0;
+       info->task = current;
+
+       return arch_unwind_init_running(info, callback, ops, data);
+}
+EXPORT_SYMBOL_GPL(unwind_init_running);
+
+/*
+ * Keep calling unwind() on @info until arch_unw_user_mode() reports a
+ * user-mode frame, or until unwind() fails.  Returns 0 on success,
+ * otherwise the negative value returned by the failing unwind() call.
+ */
+int unwind_to_user(struct unwind_frame_info *info)
+{
+       int err = 0;
+
+       /* err only becomes negative after a failed unwind() step */
+       while (err >= 0 && !arch_unw_user_mode(info))
+               err = unwind(info);
+
+       return err < 0 ? err : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_to_user);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index 6777153..edff85c 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -860,6 +860,24 @@ config FRAME_POINTER
           larger and slower, but it gives very useful debugging information
           in case of kernel bugs. (precise oopses/stacktraces/warnings)
  
+config UNWIND_INFO
+       bool "Compile the kernel with frame unwind information"
+       depends on !IA64 && !PARISC && !ARM
+       depends on !MODULES || !(MIPS || PPC || SUPERH || V850)
+       help
+         If you say Y here the resulting kernel image will be slightly larger
+         but not slower, and it will give very useful debugging information.
+         If you don't debug the kernel, you can say N, but we may not be able
+         to solve problems without frame unwind information or frame pointers.
+
+config STACK_UNWIND
+       bool "Stack unwind support"
+       depends on UNWIND_INFO
+       depends on X86
+       help
+         This enables more precise stack traces, omitting all unrelated
+         occurrences of pointers into kernel code from the dump.
+
  config BOOT_PRINTK_DELAY
         bool "Delay each boot printk message by N milliseconds"
         depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -1122,7 +1140,8 @@ config FAULT_INJECTION_STACKTRACE_FILTER
         depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
         depends on !X86_64
         select STACKTRACE
-       select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
+       select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
+       select UNWIND_INFO if X86 && !FRAME_POINTER
         help
           Provide stacktrace filter for fault-injection capabilities
  
@@ -1132,7 +1151,8 @@ config LATENCYTOP
         depends on DEBUG_KERNEL
         depends on STACKTRACE_SUPPORT
         depends on PROC_FS
-       select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
+       select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
+       select UNWIND_INFO if X86 && !FRAME_POINTER
         select KALLSYMS
         select KALLSYMS_ALL
         select STACKTRACE
diff --git a/lib/swiotlb-xen.c b/lib/swiotlb-xen.c

new file mode 100644 (file)

index 0000000..0150a40
--- /dev/null
+++ b/lib/swiotlb-xen.c
@@ -0,0 +1,807 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * This implementation is a fallback for platforms that do not support
+ * I/O TLBs (aka DMA address translation hardware).
+ * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com>
+ * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com>
+ * Copyright (C) 2000, 2003 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2005 Keir Fraser <keir@xensource.com>
+ * 08/12/11 beckyb     Add highmem support
+ */
+
+#include <linux/cache.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/swiotlb.h>
+#include <linux/pfn.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/iommu-helper.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+
+#include <asm/io.h>
+#include <asm/pci.h>
+#include <asm/dma.h>
+#include <asm/uaccess.h>
+#include <xen/gnttab.h>
+#include <xen/interface/memory.h>
+#include <asm/gnttab_dma.h>
+
+/*
+ * Offset of val inside an align-sized window.  The mask form only gives a
+ * meaningful result when align is a power of two.
+ */
+#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1)))
+
+/* Set to 1 by swiotlb_init() when the software IO TLB is to be used. */
+int swiotlb;
+/* From "swiotlb=": 1 for "force", -1 for "off" (see setup_io_tlb_npages()). */
+int swiotlb_force;
+
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+static char *io_tlb_start, *io_tlb_end;
+
+/*
+ * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
+ * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
+ */
+static unsigned long io_tlb_nslabs;
+
+/*
+ * When the IOMMU overflows we return a fallback buffer. This sets the size.
+ */
+static unsigned long io_tlb_overflow = 32*1024;
+
+static void *io_tlb_overflow_buffer;
+
+/*
+ * This is a free list describing the number of free entries available from
+ * each index
+ */
+static unsigned int *io_tlb_list;
+static unsigned int io_tlb_index;
+
+/*
+ * We need to save away the original address corresponding to a mapped entry
+ * for the sync operations.
+ */
+static phys_addr_t *io_tlb_orig_addr;
+
+/*
+ * Protect the above data structures in the map and unmap calls
+ */
+static DEFINE_SPINLOCK(io_tlb_lock);
+
+/*
+ * Address width, in bits, that the bounce buffers are constrained to.  It
+ * is computed in swiotlb_init_with_tbl() and reported by
+ * swiotlb_print_info().
+ */
+static unsigned int dma_bits;
+/* Upper limit for dma_bits while initialising; set with "dma_bits=". */
+static unsigned int __initdata max_dma_bits = 32;
+
+/*
+ * Parse the "dma_bits=" boot parameter into max_dma_bits.
+ *
+ * Returns 1 so that the option counts as consumed, like the "swiotlb="
+ * handler does.  A __setup() handler that returns 0 reports the option as
+ * not handled, and it is then passed on towards init.
+ */
+static int __init
+setup_dma_bits(char *str)
+{
+       max_dma_bits = simple_strtoul(str, NULL, 0);
+       return 1;
+}
+__setup("dma_bits=", setup_dma_bits);
+
+/*
+ * Parse "swiotlb=[<megabytes>][,][force|off]".  A leading number sets
+ * io_tlb_nslabs (rounded up to a multiple of IO_TLB_SEGSIZE); the keyword,
+ * if any, sets swiotlb_force.  Always returns 1.
+ */
+static int __init
+setup_io_tlb_npages(char *str)
+{
+       /* Unlike ia64, the number is the aperture in megabytes, not 'slabs'! */
+       if (isdigit(*str)) {
+               unsigned long mbytes = simple_strtoul(str, &str, 0);
+
+               io_tlb_nslabs = ALIGN(mbytes << (20 - IO_TLB_SHIFT),
+                                     IO_TLB_SEGSIZE);
+       }
+
+       /* skip the separator between the size and the keyword */
+       if (*str == ',')
+               ++str;
+
+       /*
+        * NB. 'force' enables the swiotlb, but doesn't force its use for
+        * every DMA like it does on native Linux. 'off' forcibly disables
+        * use of the swiotlb.
+        */
+       if (strcmp(str, "force") == 0)
+               swiotlb_force = 1;
+       else if (strcmp(str, "off") == 0)
+               swiotlb_force = -1;
+
+       return 1;
+}
+__setup("swiotlb=", setup_io_tlb_npages);
+/* make io_tlb_overflow tunable too? */
+
+/* Return the current value of io_tlb_nslabs (number of IO TLB slabs). */
+unsigned long swiotlb_nr_tbl(void)
+{
+       return io_tlb_nslabs;
+}
+EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
+/*
+ * Convert the kernel virtual @address to a DMA address for @hwdev by
+ * chaining virt_to_phys() and phys_to_dma().
+ * Note that this doesn't work with a highmem page.
+ */
+static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
+                                     volatile void *address)
+{
+       return phys_to_dma(hwdev, virt_to_phys(address));
+}
+
+/*
+ * Log the aperture size in megabytes, the current dma_bits value and the
+ * kernel virtual addresses io_tlb_start and io_tlb_end.
+ */
+void swiotlb_print_info(void)
+{
+       unsigned long aperture_mb = (io_tlb_nslabs << IO_TLB_SHIFT) >> 20;
+
+       printk(KERN_INFO "Software IO TLB enabled: \n"
+              " Aperture:     %lu megabytes\n"
+              " Address size: %u bits\n"
+              " Kernel range: %p - %p\n",
+              aperture_mb, dma_bits,
+              io_tlb_start, io_tlb_end);
+}
+
+/*
+ * Set up the software IO TLB on top of the preallocated buffer @tlb, which
+ * holds @nslabs slabs of (1 << IO_TLB_SHIFT) bytes each.
+ *
+ * Each IO_TLB_SEGSIZE-slab segment is passed to
+ * xen_create_contiguous_region(); afterwards the free list, the original
+ * address table and the overflow buffer are allocated.  If @verbose is
+ * non-zero the resulting configuration is printed.
+ */
+void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
+       unsigned long i, bytes;
+       int rc;
+
+       bytes = nslabs << IO_TLB_SHIFT;
+
+       io_tlb_nslabs = nslabs;
+       io_tlb_start = tlb;
+       /*
+        * Initial address width: the page order of one segment plus
+        * PAGE_SHIFT.  It is only ever widened below.
+        */
+       dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+       /* nslabs is reused from here on as the index of the current segment */
+       for (nslabs = 0; nslabs < io_tlb_nslabs; nslabs += IO_TLB_SEGSIZE) {
+               /*
+                * Retry with one more address bit after each failure, until
+                * the call succeeds or dma_bits has passed max_dma_bits.
+                */
+               do {
+                       rc = xen_create_contiguous_region(
+                               (unsigned long)io_tlb_start + (nslabs << IO_TLB_SHIFT),
+                               get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT),
+                               dma_bits);
+               } while (rc && dma_bits++ < max_dma_bits);
+               if (rc) {
+                       /* not even the first segment could be set up */
+                       if (nslabs == 0)
+                               panic("No suitable physical memory available for SWIOTLB buffer!\n"
+                                     "Use dom0_mem Xen boot parameter to reserve\n"
+                                     "some DMA memory (e.g., dom0_mem=-128M).\n");
+                       /*
+                        * Shrink the table to the segments already set up and
+                        * give the unused tail back to bootmem.
+                        */
+                       io_tlb_nslabs = nslabs;
+                       i = nslabs << IO_TLB_SHIFT;
+                       free_bootmem(__pa(io_tlb_start + i), bytes - i);
+                       bytes = i;
+                       /*
+                        * Recompute dma_bits as the widest bus address of the
+                        * last byte of any remaining segment.
+                        */
+                       for (dma_bits = 0; i > 0; i -= IO_TLB_SEGSIZE << IO_TLB_SHIFT) {
+                               unsigned int bits = fls64(virt_to_bus(io_tlb_start + i - 1));
+
+                               if (bits > dma_bits)
+                                       dma_bits = bits;
+                       }
+                       break;
+               }
+       }
+       io_tlb_end = io_tlb_start + bytes;
+
+       /*
+        * Allocate and initialize the free list array.  This array is used
+        * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE.
+        * Entry i starts out as the number of slabs from i to the end of its
+        * segment.
+        */
+       io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+       for (i = 0; i < io_tlb_nslabs; i++)
+               io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
+       io_tlb_index = 0;
+       io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+
+       /*
+        * Get the overflow emergency buffer
+        */
+       io_tlb_overflow_buffer = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_overflow));
+       if (!io_tlb_overflow_buffer)
+               panic("Cannot allocate SWIOTLB overflow buffer!\n");
+
+       /* same widening retry as for the segments above */
+       do {
+               rc = xen_create_contiguous_region(
+                       (unsigned long)io_tlb_overflow_buffer,
+                       get_order(io_tlb_overflow),
+                       dma_bits);
+       } while (rc && dma_bits++ < max_dma_bits);
+       if (rc)
+               panic("No suitable physical memory available for SWIOTLB overflow buffer!\n");
+       if (verbose)
+               swiotlb_print_info();
+}
+
+/*
+ * Reserve the bounce buffer from bootmem and initialise the software IO
+ * TLB on it.  @default_size (in bytes) is only used when io_tlb_nslabs has
+ * not been set already; @verbose is passed on to swiotlb_init_with_tbl().
+ */
+void __init
+swiotlb_init_with_default_size(size_t default_size, int verbose)
+{
+       unsigned long nslabs = io_tlb_nslabs;
+
+       if (nslabs == 0) {
+               nslabs = default_size >> IO_TLB_SHIFT;
+               nslabs = ALIGN(nslabs, IO_TLB_SEGSIZE);
+               io_tlb_nslabs = nslabs;
+       }
+
+       /*
+        * Get IO TLB memory from the low pages
+        */
+       io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(nslabs << IO_TLB_SHIFT));
+       if (io_tlb_start == NULL)
+               panic("Cannot allocate SWIOTLB buffer");
+
+       swiotlb_init_with_tbl(io_tlb_start, nslabs, verbose);
+}
+
+/*
+ * Decide whether the software IO TLB is used and, if so, initialise it.
+ * "swiotlb=force" always enables it.  Otherwise, unless "swiotlb=off" was
+ * given, it is enabled for the initial Xen domain with a default size that
+ * depends on the highest RAM page reported by the hypervisor.
+ */
+void __init
+swiotlb_init(int verbose)
+{
+       size_t defsz = 64 << 20; /* 64MB default size */
+       int dom0_auto;
+
+       /* no explicit force/off: only the initial Xen domain qualifies */
+       dom0_auto = swiotlb_force != 1 && swiotlb_force != -1 &&
+                   is_running_on_xen() && is_initial_xendomain();
+
+       if (dom0_auto) {
+               /* Domain 0 always has a swiotlb. */
+               unsigned long ram_end =
+                       HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
+
+               if (ram_end <= 0x1ffff)
+                       defsz = 2 << 20; /* 2MB on <512MB systems. */
+               else if (ram_end <= 0x3ffff)
+                       defsz = 4 << 20; /* 4MB on <1GB systems. */
+               else if (ram_end <= 0x7ffff)
+                       defsz = 8 << 20; /* 8MB on <2GB systems. */
+       }
+
+       if (swiotlb_force == 1 || dom0_auto)
+               swiotlb = 1;
+
+       if (!swiotlb) {
+               printk(KERN_INFO "Software IO TLB disabled\n");
+               return;
+       }
+
+       swiotlb_init_with_default_size(defsz, verbose);
+}
+
+/*
+ * A range has to be bounced when range_straddles_page_boundary() says so
+ * for @size bytes starting at @pa.
+ */
+static inline int range_needs_mapping(phys_addr_t pa, size_t size)
+{
+       return range_straddles_page_boundary(pa, size);
+}
+
+/*
+ * Return non-zero when the page containing the DMA address @addr, once
+ * translated by mfn_to_local_pfn(), lies within the physical range that
+ * backs io_tlb_start .. io_tlb_end.
+ */
+static int is_swiotlb_buffer(dma_addr_t addr)
+{
+       unsigned long local_pfn = mfn_to_local_pfn(PFN_DOWN(addr));
+       phys_addr_t paddr = (phys_addr_t)local_pfn << PAGE_SHIFT;
+
+       if (paddr < virt_to_phys(io_tlb_start))
+               return 0;
+
+       return paddr < virt_to_phys(io_tlb_end);
+}
+
+/*
+ * Bounce: copy @size bytes between the original buffer at physical address
+ * @phys and the swiotlb buffer at kernel virtual address @dma_addr.
+ *
+ * With @dir == DMA_TO_DEVICE the data goes from the original buffer into
+ * the swiotlb buffer; for any other direction it is copied from the swiotlb
+ * buffer back to the original buffer.
+ *
+ * We use __copy_to_user_inatomic to transfer to the host buffer because the
+ * buffer may be mapped read-only (e.g., in the blkback driver) while
+ * lower-level drivers map the buffer for DMA_BIDIRECTIONAL access. This
+ * causes an unnecessary copy from the aperture to the host buffer, and a
+ * page fault.  A failing copy is deliberately ignored.
+ */
+void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
+                   enum dma_data_direction dir)
+{
+       unsigned long pfn = PFN_DOWN(phys);
+
+       if (PageHighMem(pfn_to_page(pfn))) {
+               /* The buffer does not have a mapping.  Map it in and copy */
+               unsigned int offset = phys & ~PAGE_MASK;
+               char *buffer;
+               unsigned int sz = 0;
+               unsigned long flags;
+
+               /* one page at a time; only the first page has an offset */
+               while (size) {
+                       sz = min_t(size_t, PAGE_SIZE - offset, size);
+
+                       /* interrupts stay off while the atomic mapping is held */
+                       local_irq_save(flags);
+                       buffer = kmap_atomic(pfn_to_page(pfn));
+                       if (dir == DMA_TO_DEVICE)
+                               memcpy(dma_addr, buffer + offset, sz);
+                       else if (__copy_to_user_inatomic(buffer + offset,
+                                                        dma_addr, sz))
+                               /* inaccessible */;
+                       kunmap_atomic(buffer);
+                       local_irq_restore(flags);
+
+                       size -= sz;
+                       pfn++;
+                       dma_addr += sz;
+                       offset = 0;
+               }
+       } else {
+               /* lowmem: the original buffer is reachable via phys_to_virt() */
+               if (dir == DMA_TO_DEVICE)
+                       memcpy(dma_addr, phys_to_virt(phys), size);
+               else if (__copy_to_user_inatomic(phys_to_virt(phys),
+                                                dma_addr, size))
+                       /* inaccessible */;
+       }
+}
+EXPORT_SYMBOL_GPL(swiotlb_bounce);
+
+/*
+ * Allocate a bounce buffer of @size bytes for the original buffer at @phys.
+ *
+ * Searches io_tlb_list, starting at io_tlb_index, for enough contiguous
+ * free slots that do not cross the segment boundary of @hwdev, marks them
+ * used and records @phys for each slot in io_tlb_orig_addr.  For
+ * DMA_TO_DEVICE and DMA_BIDIRECTIONAL the original data is copied into the
+ * bounce buffer.
+ *
+ * Returns the kernel virtual address of the bounce buffer, or NULL when no
+ * suitable run of slots was found.  @tbl_dma_addr is not referenced in
+ * this function body.
+ */
+void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
+                            phys_addr_t phys, size_t size,
+                            enum dma_data_direction dir)
+{
+       unsigned long flags;
+       char *dma_addr;
+       unsigned int nslots, stride, index, wrap;
+       int i;
+       unsigned long mask;
+       unsigned long offset_slots;
+       unsigned long max_slots;
+
+       mask = dma_get_seg_boundary(hwdev);
+       offset_slots = -IO_TLB_SEGSIZE;
+
+       /*
+        * Carefully handle integer overflow which can occur when mask == ~0UL.
+        */
+       max_slots = mask + 1
+                   ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
+                   : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
+
+       /*
+        * For mappings greater than a page, we limit the stride (and
+        * hence alignment) to a page size.
+        */
+       nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+       if (size > PAGE_SIZE)
+               stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
+       else
+               stride = 1;
+
+       /* a zero-sized request would need no slots at all */
+       BUG_ON(!nslots);
+
+       /*
+        * Find a run of IO TLB entries large enough for this request and
+        * allocate the buffer from it.
+        */
+       spin_lock_irqsave(&io_tlb_lock, flags);
+       index = ALIGN(io_tlb_index, stride);
+       if (index >= io_tlb_nslabs)
+               index = 0;
+       /* remember the starting point so a full circle can be detected */
+       wrap = index;
+
+       do {
+               /* skip candidates whose span would cross a boundary */
+               while (iommu_is_span_boundary(index, nslots, offset_slots,
+                                             max_slots)) {
+                       index += stride;
+                       if (index >= io_tlb_nslabs)
+                               index = 0;
+                       if (index == wrap)
+                               goto not_found;
+               }
+
+               /*
+                * If we find a slot that indicates we have 'nslots' number of
+                * contiguous buffers, we allocate the buffers from that slot
+                * and mark the entries as '0' indicating unavailable.
+                */
+               if (io_tlb_list[index] >= nslots) {
+                       int count = 0;
+
+                       for (i = index; i < (int) (index + nslots); i++)
+                               io_tlb_list[i] = 0;
+                       /*
+                        * Renumber the free entries preceding the allocation
+                        * (within the same segment) so they count up to it.
+                        */
+                       for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
+                               io_tlb_list[i] = ++count;
+                       dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+
+                       /*
+                        * Update the indices to avoid searching in the next
+                        * round.
+                        */
+                       io_tlb_index = ((index + nslots) < io_tlb_nslabs
+                                       ? (index + nslots) : 0);
+
+                       goto found;
+               }
+               index += stride;
+               if (index >= io_tlb_nslabs)
+                       index = 0;
+       } while (index != wrap);
+
+not_found:
+       spin_unlock_irqrestore(&io_tlb_lock, flags);
+       return NULL;
+found:
+       spin_unlock_irqrestore(&io_tlb_lock, flags);
+
+       /*
+        * Save away the mapping from the original address to the DMA address.
+        * This is needed when we sync the memory.  Then we sync the buffer if
+        * needed.
+        */
+       for (i = 0; i < nslots; i++)
+               io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+       if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+               swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+
+       return dma_addr;
+}
+EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
+
+/*
+ * Allocate a bounce buffer for @size bytes at @phys and return its kernel
+ * virtual address (NULL when swiotlb_tbl_map_single() finds no room).  The
+ * bus address of io_tlb_start is passed as the table address.
+ */
+static void *
+map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+          enum dma_data_direction dir)
+{
+       return swiotlb_tbl_map_single(hwdev,
+                                     swiotlb_virt_to_bus(hwdev, io_tlb_start),
+                                     phys, size, dir);
+}
+
+/*
+ * Release the bounce buffer at kernel virtual address @dma_addr, which was
+ * allocated for @size bytes.
+ *
+ * For DMA_FROM_DEVICE and DMA_BIDIRECTIONAL the buffer contents are first
+ * copied back to the original address recorded in io_tlb_orig_addr (when
+ * that address is non-zero); then the slots are returned to io_tlb_list.
+ */
+void
+swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
+                       enum dma_data_direction dir)
+{
+       unsigned long flags;
+       int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+       int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+       phys_addr_t phys = io_tlb_orig_addr[index];
+
+       /*
+        * First, sync the memory before unmapping the entry
+        */
+       if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+               swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+
+       /*
+        * Return the buffer to the free list by setting the corresponding
+        * entries to indicate the number of contiguous entries available.
+        * While returning the entries to the free list, we merge the entries
+        * with slots below and above the pool being returned.
+        */
+       spin_lock_irqsave(&io_tlb_lock, flags);
+       {
+               /*
+                * Free count of the slot right after the buffer, or 0 when
+                * the buffer ends at its segment boundary.
+                */
+               count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ?
+                        io_tlb_list[index + nslots] : 0);
+               /*
+                * Step 1: return the slots to the free list, merging the
+                * slots with succeeding slots
+                */
+               for (i = index + nslots - 1; i >= index; i--)
+                       io_tlb_list[i] = ++count;
+               /*
+                * Step 2: merge the returned slots with the preceding slots,
+                * if available (non zero)
+                */
+               for (i = index - 1;
+                    (OFFSET(i, IO_TLB_SEGSIZE) !=
+                     IO_TLB_SEGSIZE -1) && io_tlb_list[i];
+                    i--)
+                       io_tlb_list[i] = ++count;
+       }
+       spin_unlock_irqrestore(&io_tlb_lock, flags);
+}
+EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single);
+
+/*
+ * Synchronise @size bytes of the bounce buffer at @dma_addr with the
+ * original buffer recorded for its slot.
+ *
+ * SYNC_FOR_CPU copies bounce -> original for DMA_FROM_DEVICE and
+ * DMA_BIDIRECTIONAL; SYNC_FOR_DEVICE copies original -> bounce for
+ * DMA_TO_DEVICE and DMA_BIDIRECTIONAL.  The opposite single direction is a
+ * no-op; any other direction or target is a BUG.
+ */
+void
+swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
+                       enum dma_data_direction dir,
+                       enum dma_sync_target target)
+{
+       int slot = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
+       phys_addr_t orig = io_tlb_orig_addr[slot];
+
+       /* add the offset of dma_addr inside its slot */
+       orig += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+
+       if (target == SYNC_FOR_CPU) {
+               if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+                       swiotlb_bounce(orig, dma_addr, size, DMA_FROM_DEVICE);
+               else
+                       BUG_ON(dir != DMA_TO_DEVICE);
+       } else if (target == SYNC_FOR_DEVICE) {
+               if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+                       swiotlb_bounce(orig, dma_addr, size, DMA_TO_DEVICE);
+               else
+                       BUG_ON(dir != DMA_FROM_DEVICE);
+       } else {
+               BUG();
+       }
+}
+EXPORT_SYMBOL_GPL(swiotlb_tbl_sync_single);
+
+/*
+ * Report that no bounce buffer could be allocated for @size bytes.
+ *
+ * Ran out of IOMMU space for this operation. This is very bad.
+ * Unfortunately the drivers cannot handle this properly unless they check
+ * for pci_dma_mapping_error (most don't).  The function simply returns
+ * when the request fits io_tlb_overflow or @do_panic is zero; otherwise it
+ * panics with a message that depends on @dir.
+ */
+static void
+swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
+            int do_panic)
+{
+       printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %zu bytes at "
+              "device %s\n", size, dev ? dev_name(dev) : "?");
+
+       if (!do_panic || size <= io_tlb_overflow)
+               return;
+
+       switch (dir) {
+       case DMA_BIDIRECTIONAL:
+               panic("DMA: Random memory could be DMA accessed\n");
+       case DMA_FROM_DEVICE:
+               panic("DMA: Random memory could be DMA written\n");
+       case DMA_TO_DEVICE:
+               panic("DMA: Random memory could be DMA read\n");
+       default:
+               break;
+       }
+}
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode.  The
+ * PCI address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either swiotlb_unmap_page or swiotlb_sync_single_for_cpu is performed.
+ *
+ * When no usable bounce buffer can be obtained, the bus address of
+ * io_tlb_overflow_buffer is returned; swiotlb_dma_mapping_error()
+ * recognises that value.
+ */
+dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
+                           unsigned long offset, size_t size,
+                           enum dma_data_direction dir,
+                           struct dma_attrs *attrs)
+{
+       phys_addr_t phys = page_to_pseudophys(page) + offset;
+       dma_addr_t dev_addr = gnttab_dma_map_page(page) + offset;
+       void *map;
+
+       BUG_ON(dir == DMA_NONE);
+
+       /*
+        * If the address happens to be in the device's DMA window,
+        * we can safely return the device addr and not worry about bounce
+        * buffering it.
+        */
+       if (dma_capable(dev, dev_addr, size) &&
+           !range_needs_mapping(phys, size))
+               return dev_addr;
+
+       /*
+        * Oh well, have to allocate and map a bounce buffer.
+        */
+       gnttab_dma_unmap_page(dev_addr);
+       map = map_single(dev, phys, size, dir);
+       if (!map) {
+               /*
+                * Hand out the overflow buffer directly.  It is not a slot of
+                * the IO TLB table, so it must never reach
+                * swiotlb_tbl_unmap_single(), which would derive an index
+                * outside io_tlb_list / io_tlb_orig_addr from it.
+                */
+               swiotlb_full(dev, size, dir, 1);
+               return swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer);
+       }
+
+       dev_addr = swiotlb_virt_to_bus(dev, map);
+
+       /*
+        * Ensure that the address returned is DMA'ble
+        */
+       if (!dma_capable(dev, dev_addr, size)) {
+               swiotlb_tbl_unmap_single(dev, map, size, dir);
+               dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer);
+       }
+
+       return dev_addr;
+}
+EXPORT_SYMBOL_GPL(swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation.  The dma_addr and size must
+ * match what was provided for in a previous swiotlb_map_page call.  All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ *
+ * Addresses inside the swiotlb are released through
+ * swiotlb_tbl_unmap_single(); anything else only goes through
+ * gnttab_dma_unmap_page().
+ */
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+                        size_t size, enum dma_data_direction dir)
+{
+       phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
+
+       BUG_ON(dir == DMA_NONE);
+
+       if (!is_swiotlb_buffer(dev_addr)) {
+               /* mapped directly, no bounce buffer to release */
+               gnttab_dma_unmap_page(dev_addr);
+               return;
+       }
+
+       swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+}
+
+/*
+ * Exported unmap entry point: forwards to unmap_single().  @attrs is not
+ * used.
+ */
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+                       size_t size, enum dma_data_direction dir,
+                       struct dma_attrs *attrs)
+{
+       unmap_single(hwdev, dev_addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to teardown the PCI dma mapping, you must
+ * call this function before doing so.  At the next point you give the PCI dma
+ * address back to the card, you must first perform a
+ * swiotlb_sync_single_for_device, and then the device again owns the buffer.
+ *
+ * Nothing is done for addresses that are not inside the swiotlb.
+ */
+static void
+swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+                   size_t size, enum dma_data_direction dir,
+                   enum dma_sync_target target)
+{
+       phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
+
+       BUG_ON(dir == DMA_NONE);
+
+       if (!is_swiotlb_buffer(dev_addr))
+               return;
+
+       swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, target);
+}
+
+/* Sync one mapping with target SYNC_FOR_CPU; see swiotlb_sync_single(). */
+void
+swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+                           size_t size, enum dma_data_direction dir)
+{
+       swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL(swiotlb_sync_single_for_cpu);
+
+/* Sync one mapping with target SYNC_FOR_DEVICE; see swiotlb_sync_single(). */
+void
+swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+                              size_t size, enum dma_data_direction dir)
+{
+       swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL(swiotlb_sync_single_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above swiotlb_map_page
+ * interface.  Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length.  They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * Elements that straddle a page boundary or are not DMA capable for @hwdev
+ * are bounced.  Returns @nelems on success.  If a bounce buffer cannot be
+ * obtained, the elements mapped so far are unmapped, sgl[0].dma_length is
+ * set to 0 and 0 is returned.
+ *
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
+ * same here.
+ */
+int
+swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
+                    enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+       struct scatterlist *sg;
+       int mapped;
+
+       BUG_ON(dir == DMA_NONE);
+
+       for_each_sg(sgl, sg, nelems, mapped) {
+               dma_addr_t dev_addr = gnttab_dma_map_page(sg_page(sg))
+                                     + sg->offset;
+               phys_addr_t paddr = page_to_pseudophys(sg_page(sg))
+                                  + sg->offset;
+               void *bounce;
+
+               /* fast path: usable as is, no bounce buffer required */
+               if (!range_needs_mapping(paddr, sg->length) &&
+                   dma_capable(hwdev, dev_addr, sg->length)) {
+                       sg->dma_address = dev_addr;
+                       sg->dma_length = sg->length;
+                       continue;
+               }
+
+               gnttab_dma_unmap_page(dev_addr);
+               bounce = map_single(hwdev, paddr, sg->length, dir);
+               if (bounce == NULL) {
+                       /* Don't panic here, we expect map_sg users
+                          to do proper error handling. */
+                       swiotlb_full(hwdev, sg->length, dir, 0);
+                       swiotlb_unmap_sg_attrs(hwdev, sgl, mapped, dir,
+                                              attrs);
+                       sgl[0].dma_length = 0;
+                       return 0;
+               }
+
+               sg->dma_address = swiotlb_virt_to_bus(hwdev, bounce);
+               sg->dma_length = sg->length;
+       }
+
+       return nelems;
+}
+EXPORT_SYMBOL(swiotlb_map_sg_attrs);
+
+/* Same as swiotlb_map_sg_attrs() with a NULL attrs argument. */
+int
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+              enum dma_data_direction dir)
+{
+       return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL(swiotlb_map_sg);
+
+/*
+ * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
+ * Each of the first @nelems entries is released with its recorded
+ * dma_address and dma_length; @attrs is not used.
+ */
+void
+swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+                      int nelems, enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+       struct scatterlist *entry;
+       int n;
+
+       BUG_ON(dir == DMA_NONE);
+
+       for_each_sg(sgl, entry, nelems, n) {
+               unmap_single(hwdev, entry->dma_address, entry->dma_length,
+                            dir);
+       }
+}
+EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
+
+/*
+ * Same as swiotlb_unmap_sg_attrs() with a NULL attrs argument.  The callee
+ * is invoked as a plain statement: this function returns void, so it must
+ * not use "return <expression>;".
+ */
+void
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+                enum dma_data_direction dir)
+{
+       swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL(swiotlb_unmap_sg);
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+/*
+ * Apply swiotlb_sync_single() with the given @target to the first @nelems
+ * entries of @sgl, using each entry's dma_address and dma_length.
+ */
+static void
+swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+               int nelems, enum dma_data_direction dir,
+               enum dma_sync_target target)
+{
+       struct scatterlist *entry;
+       int n;
+
+       for_each_sg(sgl, entry, nelems, n) {
+               swiotlb_sync_single(hwdev, entry->dma_address,
+                                   entry->dma_length, dir, target);
+       }
+}
+
+/* Sync a scatterlist with target SYNC_FOR_CPU; see swiotlb_sync_sg(). */
+void
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+                       int nelems, enum dma_data_direction dir)
+{
+       swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu);
+
+/* Sync a scatterlist with target SYNC_FOR_DEVICE; see swiotlb_sync_sg(). */
+void
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+                          int nelems, enum dma_data_direction dir)
+{
+       swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
+
+/*
+ * A mapping counts as failed when @dma_addr equals the bus address of
+ * io_tlb_overflow_buffer.
+ */
+int
+swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+       return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer));
+}
+EXPORT_SYMBOL(swiotlb_dma_mapping_error);
+
+/*
+ * Return whether the given PCI device DMA address mask can be supported
+ * properly.  For example, if your device can only drive the low 24-bits
+ * during PCI bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ *
+ * The device is supported when @mask covers every address that is dma_bits
+ * wide.  The required mask is built as a u64 and saturates at 64 bits:
+ * dma_bits can reach BITS_PER_LONG on 32-bit kernels, where shifting 1UL
+ * by that amount is undefined.
+ */
+int
+swiotlb_dma_supported (struct device *hwdev, u64 mask)
+{
+       u64 required;
+
+       if (dma_bits >= 64)
+               required = ~(u64)0;
+       else
+               required = ((u64)1 << dma_bits) - 1;
+
+       return mask >= required;
+}
+EXPORT_SYMBOL(swiotlb_dma_supported);
diff --git a/mm/Kconfig b/mm/Kconfig

index e338407..c882b8e 100644 (file)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -313,7 +313,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
  
  config TRANSPARENT_HUGEPAGE
         bool "Transparent Hugepage Support"
-       depends on X86 && MMU
+       depends on X86 && !XEN && MMU
         select COMPACTION
         help
           Transparent Hugepages allows the kernel to use huge pages and
@@ -379,3 +379,20 @@ config CLEANCACHE
           in a negligible performance hit.
  
           If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+       bool "Enable frontswap to cache swap pages if tmem is present"
+       depends on SWAP
+       default n
+       help
+         Frontswap is so named because it can be thought of as the opposite
+         of a "backing" store for a swap device.  The data is stored into
+         "transcendent memory", memory that is not directly accessible or
+         addressable by the kernel and is of unknown and possibly
+         time-varying size.  When space in transcendent memory is available,
+         a significant swap I/O reduction may be achieved.  When none is
+         available, all frontswap calls are reduced to a single pointer-
+         compare-against-NULL resulting in a negligible performance hit
+         and swap data is stored as normal on the matching swap device.
+
+         If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile

index 50ec00e..306742a 100644 (file)
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
  
  obj-$(CONFIG_BOUNCE)   += bounce.o
  obj-$(CONFIG_SWAP)     += page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_FRONTSWAP)        += frontswap.o
  obj-$(CONFIG_HAS_DMA)  += dmapool.o
  obj-$(CONFIG_HUGETLBFS)        += hugetlb.o
  obj-$(CONFIG_NUMA)     += mempolicy.o
diff --git a/mm/frontswap.c b/mm/frontswap.c

new file mode 100644 (file)

index 0000000..d98c13e
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,272 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap.  See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2010 Oracle Corp.  All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+int frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured).  These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_gets;
+static u64 frontswap_succ_puts;
+static u64 frontswap_failed_puts;
+static u64 frontswap_invalidates;
+
+/*
+ * Install new frontswap backend ops and set frontswap_enabled; the previous
+ * ops are returned, allowing detection of multiple backends and nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+       struct frontswap_ops prev = frontswap_ops;
+
+       frontswap_ops = *ops;
+       frontswap_enabled = 1;
+       return prev;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/* Called when a swap device is swapon'd: run the backend's init hook */
+void __frontswap_init(unsigned type)
+{
+       struct swap_info_struct *si = swap_info[type];
+
+       BUG_ON(!si);
+
+       /* nothing to do without a frontswap_map or a registered backend */
+       if (si->frontswap_map && frontswap_enabled)
+               frontswap_ops.init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+/*
+ * "Put" data from a page to frontswap, keyed by the page's swaptype and
+ * offset.  Page must be locked and in the swap cache.  If frontswap already
+ * holds a page with matching swaptype and offset, the implementation may
+ * either overwrite the data and return success or invalidate the page and
+ * return failure.  Returns the backend's put_page result (0 on success).
+ */
+int __frontswap_put_page(struct page *page)
+{
+       int ret = -1, dup = 0;
+       swp_entry_t entry = { .val = page_private(page), };
+       int type = swp_type(entry);
+       struct swap_info_struct *sis = swap_info[type];
+       pgoff_t offset = swp_offset(entry);
+
+       BUG_ON(!PageLocked(page));
+       BUG_ON(sis == NULL);
+       if (frontswap_test(sis, offset))
+               dup = 1;
+       ret = (*frontswap_ops.put_page)(type, offset, page);
+       if (ret == 0) {
+               frontswap_set(sis, offset);
+               frontswap_succ_puts++;
+               if (!dup)
+                       atomic_inc(&sis->frontswap_pages);
+       } else if (dup) {
+               /*
+                * A failed dup always results in automatic invalidate of
+                * the (older) page from frontswap, so drop its bit and count.
+                */
+               frontswap_clear(sis, offset);
+               atomic_dec(&sis->frontswap_pages);
+               frontswap_failed_puts++;
+       } else
+               frontswap_failed_puts++;
+       return ret;
+}
+EXPORT_SYMBOL(__frontswap_put_page);
+
+/*
+ * "Get" data that was put to frontswap under this page's swaptype and offset
+ * and fill the page with it.  Page must be locked and in the swap cache.
+ */
+int __frontswap_get_page(struct page *page)
+{
+       swp_entry_t swp = { .val = page_private(page), };
+       int swaptype = swp_type(swp);
+       struct swap_info_struct *si = swap_info[swaptype];
+       pgoff_t off = swp_offset(swp);
+       int rc;
+
+       BUG_ON(!PageLocked(page));
+       BUG_ON(!si);
+       if (!frontswap_test(si, off))
+               return -1;
+       rc = frontswap_ops.get_page(swaptype, off, page);
+       if (!rc)
+               frontswap_gets++;
+       return rc;
+}
+EXPORT_SYMBOL(__frontswap_get_page);
+
+/*
+ * Drop whatever frontswap holds for the given swaptype and offset, so that
+ * a subsequent "get" will fail.  No-op when the frontswap_map bit is clear.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+       struct swap_info_struct *si = swap_info[type];
+
+       BUG_ON(!si);
+       if (!frontswap_test(si, offset))
+               return;
+       frontswap_ops.invalidate_page(type, offset);
+       atomic_dec(&si->frontswap_pages);
+       frontswap_clear(si, offset);
+       frontswap_invalidates++;
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype, then zero the page count and sis->max / sizeof(long)
+ * bytes of frontswap_map.  NOTE(review): size must match the swapon alloc.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+       struct swap_info_struct *sis = swap_info[type];
+
+       BUG_ON(sis == NULL);
+       if (sis->frontswap_map == NULL)
+               return;
+       (*frontswap_ops.invalidate_area)(type);
+       atomic_set(&sis->frontswap_pages, 0);
+       memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff": it calls try_to_unuse on one swap device to try
+ * -- subject to memory constraints -- to bring the number of pages in
+ * frontswap down towards the number given in target_pages.  Only the
+ * first device that passes the memory check is unused per call.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+       struct swap_info_struct *si = NULL;
+       int si_frontswap_pages;
+       unsigned long total_pages = 0, total_pages_to_unuse;
+       unsigned long pages = 0, pages_to_unuse = 0;
+       int type;
+       bool locked = false;
+
+       /*
+        * we don't want to hold swap_lock while doing a very
+        * lengthy try_to_unuse, but swap_list may change
+        * so restart scan from swap_list.head each time
+        */
+       spin_lock(&swap_lock);
+       locked = true;
+       total_pages = 0;
+       for (type = swap_list.head; type >= 0; type = si->next) {
+               si = swap_info[type];
+               total_pages += atomic_read(&si->frontswap_pages);
+       }
+       if (total_pages <= target_pages)
+               goto out;
+       total_pages_to_unuse = total_pages - target_pages;
+       for (type = swap_list.head; type >= 0; type = si->next) {
+               si = swap_info[type];
+               si_frontswap_pages = atomic_read(&si->frontswap_pages);
+               if (total_pages_to_unuse < si_frontswap_pages)
+                       pages = pages_to_unuse = total_pages_to_unuse;
+               else {
+                       pages = si_frontswap_pages;
+                       pages_to_unuse = 0; /* 0 tells try_to_unuse: unuse all */
+               }
+               /* ensure there is enough RAM to fetch pages from frontswap */
+               if (security_vm_enough_memory_mm(current->mm, pages))
+                       continue;
+               vm_unacct_memory(pages);
+               break;
+       }
+       if (type < 0)
+               goto out;
+       locked = false;
+       spin_unlock(&swap_lock);
+       try_to_unuse(type, true, pages_to_unuse);
+out:
+       if (locked)
+               spin_unlock(&swap_lock);
+       return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Sum frontswap_pages over every swap device on swap_list while holding
+ * swap_lock, and return the total.  This is exported so that backend
+ * drivers can determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+       struct swap_info_struct *dev = NULL;
+       unsigned long sum = 0;
+       int idx;
+
+       spin_lock(&swap_lock);
+       for (idx = swap_list.head; idx >= 0; idx = dev->next) {
+               dev = swap_info[idx];
+               sum += atomic_read(&dev->frontswap_pages);
+       }
+       spin_unlock(&swap_lock);
+       return sum;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+/* Expose the informational counters in the "frontswap" debugfs directory. */
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *root = debugfs_create_dir("frontswap", NULL);
+       if (root == NULL)
+               return -ENXIO;
+       debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets);
+       debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts);
+       debugfs_create_u64("failed_puts", S_IRUGO,
+                               root, &frontswap_failed_puts);
+       debugfs_create_u64("invalidates", S_IRUGO,
+                               root, &frontswap_invalidates);
+#endif
+       return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/init-mm.c b/mm/init-mm.c

index a56a851..9517a72 100644 (file)
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -13,6 +13,10 @@
  #define INIT_MM_CONTEXT(name)
  #endif
  
+#ifdef CONFIG_X86_XEN
+#define swapper_pg_dir ((pgd_t *)NULL)
+#endif
+
  struct mm_struct init_mm = {
         .mm_rb          = RB_ROOT,
         .pgd            = swapper_pg_dir,
diff --git a/mm/memory.c b/mm/memory.c

index 6105f47..734d255 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -779,6 +779,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
  {
         unsigned long pfn = pte_pfn(pte);
  
+#if defined(CONFIG_XEN) && defined(CONFIG_X86)
+       /* XEN: Covers user-space grant mappings (even of local pages). */
+       if (unlikely(vma->vm_flags & VM_FOREIGN))
+               return NULL;
+#endif
+
         if (HAVE_PTE_SPECIAL) {
                 if (likely(!pte_special(pte)))
                         goto check_pfn;
@@ -810,6 +816,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                 return NULL;
  check_pfn:
         if (unlikely(pfn > highest_memmap_pfn)) {
+#ifdef CONFIG_XEN
+               if (!(vma->vm_flags & VM_RESERVED))
+#endif
                 print_bad_pte(vma, addr, pte, NULL);
                 return NULL;
         }
@@ -1135,8 +1144,14 @@ again:
                                      page->index > details->last_index))
                                         continue;
                         }
-                       ptent = ptep_get_and_clear_full(mm, addr, pte,
-                                                       tlb->fullmm);
+#ifdef CONFIG_XEN
+                       if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte))
+                               ptent = vma->vm_ops->zap_pte(vma, addr, pte,
+                                                            tlb->fullmm);
+                       else
+#endif
+                               ptent = ptep_get_and_clear_full(mm, addr, pte,
+                                                               tlb->fullmm);
                         tlb_remove_tlb_entry(tlb, pte, addr);
                         if (unlikely(!page))
                                 continue;
@@ -1390,6 +1405,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
         unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
         tlb_finish_mmu(&tlb, address, end);
  }
+EXPORT_SYMBOL(zap_page_range);
  
  /**
   * zap_page_range_single - remove user pages in a given range
@@ -1726,6 +1742,28 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         goto next_page;
                 }
  
+#ifdef CONFIG_XEN
+               if (vma && (vma->vm_flags & VM_FOREIGN)) {
+                       struct vm_foreign_map *foreign_map =
+                               vma->vm_private_data;
+                       struct page **map = foreign_map->map;
+                       int offset = (start - vma->vm_start) >> PAGE_SHIFT;
+                       if (map[offset] != NULL) {
+                               if (pages) {
+                                       struct page *page = map[offset];
+
+                                       pages[i] = page;
+                                       get_page(page);
+                               }
+                               if (vmas)
+                                       vmas[i] = vma;
+                               i++;
+                               start += PAGE_SIZE;
+                               nr_pages--;
+                               continue;
+                       }
+               }
+#endif
                 if (!vma ||
                     (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
                     !(vm_flags & vma->vm_flags))
@@ -2412,6 +2450,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
         unsigned long end = addr + size;
         int err;
  
+#ifdef CONFIG_XEN
+       if (!mm)
+               mm = &init_mm;
+#endif
         BUG_ON(addr >= end);
         pgd = pgd_offset(mm, addr);
         do {
diff --git a/mm/mmap.c b/mm/mmap.c

index 848ef52..62c1157 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1924,6 +1924,14 @@ static void unmap_region(struct mm_struct *mm,
         tlb_finish_mmu(&tlb, start, end);
  }
  
+static inline void unmap_vma(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_XEN      /* without CONFIG_XEN this function is a no-op */
+       if (unlikely(vma->vm_ops && vma->vm_ops->unmap))
+               vma->vm_ops->unmap(vma);        /* optional per-vma hook */
+#endif /* CONFIG_XEN */
+}
+
  /*
   * Create a list of vma's touched by the unmap, removing them from the mm's
   * vma list as we go..
@@ -1940,6 +1948,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
         vma->vm_prev = NULL;
         do {
                 rb_erase(&vma->vm_rb, &mm->mm_rb);
+               unmap_vma(vma);
                 mm->map_count--;
                 tail_vma = vma;
                 vma = vma->vm_next;
@@ -2296,6 +2305,11 @@ void exit_mmap(struct mm_struct *mm)
  
         arch_exit_mmap(mm);
  
+#ifdef CONFIG_XEN
+       for (vma = mm->mmap; vma; vma = vma->vm_next)
+               unmap_vma(vma);
+#endif
+
         vma = mm->mmap;
         if (!vma)       /* Can happen if dup_mmap() received an OOM */
                 return;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 918330f..e5a3966 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -692,6 +692,13 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
         int i;
         int bad = 0;
  
+#ifdef CONFIG_XEN
+       if (PageForeign(page)) {
+               PageForeignDestructor(page, order);
+               return false;
+       }
+#endif
+
         trace_mm_page_free(page, order);
         kmemcheck_free_shadow(page, order);
  
@@ -718,6 +725,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
         unsigned long flags;
         int wasMlocked = __TestClearPageMlocked(page);
  
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
         if (!free_pages_prepare(page, order))
                 return;
  
@@ -1252,6 +1262,9 @@ void free_hot_cold_page(struct page *page, int cold)
         int migratetype;
         int wasMlocked = __TestClearPageMlocked(page);
  
+#ifdef CONFIG_XEN
+       WARN_ON(PageForeign(page) && wasMlocked);
+#endif
         if (!free_pages_prepare(page, 0))
                 return;
  
@@ -1910,7 +1923,13 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                 va_end(args);
         }
  
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       if (!(gfp_mask & __GFP_WAIT)) {
+               pr_info("The following is only an harmless informational message.\n");
+               pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
+               pr_info("everything is working fine. Allocations from irqs cannot be\n");
+               pr_info("perfectly reliable and the kernel is designed to handle that.\n");
+       }
+       pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
                 current->comm, order, gfp_mask);
  
         dump_stack();
@@ -5034,6 +5053,22 @@ void setup_per_zone_wmarks(void)
                 spin_unlock_irqrestore(&zone->lock, flags);
         }
  
+#ifdef CONFIG_XEN
+       for_each_populated_zone(zone) {
+               unsigned int cpu;
+
+               for_each_online_cpu(cpu) {
+                       unsigned long high;
+
+                       high = percpu_pagelist_fraction
+                              ? zone->present_pages / percpu_pagelist_fraction
+                              : 5 * zone_batchsize(zone);
+                       setup_pagelist_highmark(
+                               per_cpu_ptr(zone->pageset, cpu), high);
+               }
+       }
+#endif
+
         /* update totalreserve_pages */
         calculate_totalreserve_pages();
  }
diff --git a/mm/page_io.c b/mm/page_io.c

index dc76b4d..651a912 100644 (file)
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
  #include <linux/bio.h>
  #include <linux/swapops.h>
  #include <linux/writeback.h>
+#include <linux/frontswap.h>
  #include <asm/pgtable.h>
  
  static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
                 unlock_page(page);
                 goto out;
         }
+       if (frontswap_put_page(page) == 0) {
+               set_page_writeback(page);
+               unlock_page(page);
+               end_page_writeback(page);
+               goto out;
+       }
         bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
         if (bio == NULL) {
                 set_page_dirty(page);
@@ -122,6 +129,11 @@ int swap_readpage(struct page *page)
  
         VM_BUG_ON(!PageLocked(page));
         VM_BUG_ON(PageUptodate(page));
+       if (frontswap_get_page(page) == 0) {
+               SetPageUptodate(page);
+               unlock_page(page);
+               goto out;
+       }
         bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
         if (bio == NULL) {
                 unlock_page(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c

index fafc26d..9c7be87 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,8 @@
  #include <linux/memcontrol.h>
  #include <linux/poll.h>
  #include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
  
  #include <asm/pgtable.h>
  #include <asm/tlbflush.h>
@@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
  static void free_swap_count_continuations(struct swap_info_struct *);
  static sector_t map_swap_entry(swp_entry_t, struct block_device**);
  
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
  static unsigned int nr_swapfiles;
  long nr_swap_pages;
  long total_swap_pages;
@@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry ";
  static const char Bad_offset[] = "Bad swap offset entry ";
  static const char Unused_offset[] = "Unused swap offset entry ";
  
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
  
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
  
  static DEFINE_MUTEX(swapon_mutex);
  
@@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
                         swap_list.next = p->type;
                 nr_swap_pages++;
                 p->inuse_pages--;
+               frontswap_invalidate_page(p->type, offset);
                 if ((p->flags & SWP_BLKDEV) &&
                                 disk->fops->swap_slot_free_notify)
                         disk->fops->swap_slot_free_notify(p->bdev, offset);
@@ -1016,11 +1019,12 @@ static int unuse_mm(struct mm_struct *mm,
  }
  
  /*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
   * Recycle to start on reaching the end, returning 0 when empty.
   */
  static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-                                       unsigned int prev)
+                                       unsigned int prev, bool frontswap)
  {
         unsigned int max = si->max;
         unsigned int i = prev;
@@ -1046,6 +1050,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                         prev = 0;
                         i = 1;
                 }
+               if (frontswap) {
+                       if (frontswap_test(si, i))
+                               break;
+                       else
+                               continue;
+               }
                 count = si->swap_map[i];
                 if (count && swap_count(count) != SWAP_MAP_BAD)
                         break;
@@ -1057,8 +1067,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
   * We completely avoid races by reading each swap page in advance,
   * and then search for the process using it.  All the necessary
   * page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
   */
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+                unsigned long pages_to_unuse)
  {
         struct swap_info_struct *si = swap_info[type];
         struct mm_struct *start_mm;
@@ -1091,7 +1105,7 @@ static int try_to_unuse(unsigned int type)
          * one pass through swap_map is enough, but not necessarily:
          * there are races when an instance of an entry might be missed.
          */
-       while ((i = find_next_to_unuse(si, i)) != 0) {
+       while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
                 if (signal_pending(current)) {
                         retval = -EINTR;
                         break;
@@ -1258,6 +1272,10 @@ static int try_to_unuse(unsigned int type)
                  * interactive performance.
                  */
                 cond_resched();
+               if (frontswap && pages_to_unuse > 0) {
+                       if (!--pages_to_unuse)
+                               break;
+               }
         }
  
         mmput(start_mm);
@@ -1517,7 +1535,8 @@ bad_bmap:
  }
  
  static void enable_swap_info(struct swap_info_struct *p, int prio,
-                               unsigned char *swap_map)
+                               unsigned char *swap_map,
+                               unsigned long *frontswap_map)
  {
         int i, prev;
  
@@ -1527,6 +1546,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
         else
                 p->prio = --least_priority;
         p->swap_map = swap_map;
+       frontswap_map_set(p, frontswap_map);
         p->flags |= SWP_WRITEOK;
         nr_swap_pages += p->pages;
         total_swap_pages += p->pages;
@@ -1543,6 +1563,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
                 swap_list.head = swap_list.next = p->type;
         else
                 swap_info[prev]->next = p->type;
+       frontswap_init(p->type);
         spin_unlock(&swap_lock);
  }
  
@@ -1616,7 +1637,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         spin_unlock(&swap_lock);
  
         oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
-       err = try_to_unuse(type);
+       err = try_to_unuse(type, false, 0); /* force all pages to be unused */
         compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
  
         if (err) {
@@ -1627,7 +1648,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                  * sys_swapoff for this swap_info_struct at this point.
                  */
                 /* re-insert swap space back into swap_list */
-               enable_swap_info(p, p->prio, p->swap_map);
+               enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
                 goto out_dput;
         }
  
@@ -1653,9 +1674,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         swap_map = p->swap_map;
         p->swap_map = NULL;
         p->flags = 0;
+       frontswap_invalidate_area(type);
         spin_unlock(&swap_lock);
         mutex_unlock(&swapon_mutex);
         vfree(swap_map);
+       vfree(frontswap_map_get(p));
         /* Destroy swap account informatin */
         swap_cgroup_swapoff(type);
  
@@ -2019,6 +2042,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         sector_t span;
         unsigned long maxpages;
         unsigned char *swap_map = NULL;
+       unsigned long *frontswap_map = NULL;
         struct page *page = NULL;
         struct inode *inode = NULL;
  
@@ -2102,6 +2126,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                 error = nr_extents;
                 goto bad_swap;
         }
+       /* frontswap enabled? bit-per-page map, rounded up to whole longs */
+       if (frontswap_enabled)
+               frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
  
         if (p->bdev) {
                 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2117,14 +2144,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
         if (swap_flags & SWAP_FLAG_PREFER)
                 prio =
                   (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
-       enable_swap_info(p, prio, swap_map);
+       enable_swap_info(p, prio, swap_map, frontswap_map);
  
         printk(KERN_INFO "Adding %uk swap on %s.  "
-                       "Priority:%d extents:%d across:%lluk %s%s\n",
+                       "Priority:%d extents:%d across:%lluk %s%s%s\n",
                 p->pages<<(PAGE_SHIFT-10), name, p->prio,
                 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
                 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
-               (p->flags & SWP_DISCARDABLE) ? "D" : "");
+               (p->flags & SWP_DISCARDABLE) ? "D" : "",
+               (frontswap_map) ? "FS" : "");
  
         mutex_unlock(&swapon_mutex);
         atomic_inc(&proc_poll_event);
diff --git a/mm/thrash.c b/mm/thrash.c

index 57ad495..c64791a 100644 (file)
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -52,12 +52,15 @@ static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
  void grab_swap_token(struct mm_struct *mm)
  {
         int current_interval;
-       unsigned int old_prio = mm->token_priority;
+       unsigned int old_prio;
         static unsigned int global_faults;
         static unsigned int last_aging;
  
         global_faults++;
+       if (mm == NULL)
+               return;
  
+       old_prio = mm->token_priority;
         current_interval = global_faults - mm->faultstamp;
  
         if (!spin_trylock(&swap_token_lock))
diff --git a/mm/tmem-xen.c b/mm/tmem-xen.c

new file mode 100644 (file)

index 0000000..d79398a
--- /dev/null
+++ b/mm/tmem-xen.c
@@ -0,0 +1,56 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Dan Magenheimer <dan.magenheimer@oracle.com> 2009
+ */
+
+#include <linux/types.h>
+#include <xen/interface/xen.h>
+#include <asm/hypervisor.h>
+#include "tmem.h"
+
+/* Fill the generic part of a tmem_op from the arguments and issue it. */
+int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, u32 index,
+       unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+       struct tmem_op op;
+
+       BUILD_BUG_ON(sizeof(op.u.gen.oid) != sizeof(oid.oid));
+
+       op.cmd = tmem_cmd;
+       op.pool_id = tmem_pool;
+       memcpy(op.u.gen.oid, oid.oid, sizeof(op.u.gen.oid));
+       op.u.gen.index = index;
+       op.u.gen.tmem_offset = tmem_offset;
+       op.u.gen.pfn_offset = pfn_offset;
+       op.u.gen.len = len;
+       op.u.gen.cmfn = gmfn;
+       return HYPERVISOR_tmem_op(&op);
+}
+
+/* Issue a tmem hypercall carrying the creat payload (uuid and flags). */
+int xen_tmem_new_pool(uint32_t tmem_cmd, struct tmem_pool_uuid uuid,
+       uint32_t flags)
+{
+       struct tmem_op op;
+
+       op.cmd = tmem_cmd;
+       op.u.creat.uuid[0] = uuid.lo;
+       op.u.creat.uuid[1] = uuid.hi;
+#ifdef TMEM_SPEC_VERSION
+       /* version 0 means "use TMEM_SPEC_VERSION"; anything else must match */
+       switch (flags >> TMEM_POOL_VERSION_SHIFT) {
+       case TMEM_SPEC_VERSION:
+               break;
+       case 0:
+               flags |= TMEM_SPEC_VERSION << TMEM_POOL_VERSION_SHIFT;
+               break;
+       default:
+               WARN(1, "TMEM: Bogus version %u, expecting %u\n",
+                    flags >> TMEM_POOL_VERSION_SHIFT, TMEM_SPEC_VERSION);
+               return -ENOSYS;
+       }
+#endif
+       op.u.creat.flags = flags;
+       return HYPERVISOR_tmem_op(&op);
+}
diff --git a/mm/truncate.c b/mm/truncate.c

index 61a183b..3e5a9b3 100644 (file)
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -293,6 +293,13 @@ void truncate_inode_pages_range(struct address_space *mapping,
                 index++;
         }
         cleancache_invalidate_inode(mapping);
+       /*
+        * Cycle the tree_lock to make sure all __delete_from_page_cache()
+        * calls run from page reclaim have finished as well (this handles the
+        * case when page reclaim took the last page from our range).
+        */
+       spin_lock_irq(&mapping->tree_lock);
+       spin_unlock_irq(&mapping->tree_lock);
  }
  EXPORT_SYMBOL(truncate_inode_pages_range);
  
diff --git a/mm/vmalloc.c b/mm/vmalloc.c

index 94dff88..d75f965 100644 (file)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1575,6 +1575,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
         struct page **pages;
         unsigned int nr_pages, array_size, i;
         gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+#ifdef CONFIG_XEN
+       gfp_t dma_mask = gfp_mask & (__GFP_DMA | __GFP_DMA32);
+
+       BUILD_BUG_ON((__GFP_DMA | __GFP_DMA32) != (__GFP_DMA + __GFP_DMA32));
+       if (dma_mask == (__GFP_DMA | __GFP_DMA32))
+               gfp_mask &= ~(__GFP_DMA | __GFP_DMA32);
+#endif
  
         nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
         array_size = (nr_pages * sizeof(struct page *));
@@ -1611,6 +1618,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                         goto fail;
                 }
                 area->pages[i] = page;
+#ifdef CONFIG_XEN
+               if (dma_mask) {
+                       if (xen_limit_pages_to_max_mfn(page, 0, 32)) {
+                               area->nr_pages = i + 1;
+                               goto fail;
+                       }
+                       if (gfp_mask & __GFP_ZERO)
+                               clear_highpage(page);
+               }
+#endif
         }
  
         if (map_vm_area(area, prot, &pages))
@@ -1836,6 +1853,8 @@ void *vmalloc_exec(unsigned long size)
  #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
  #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
  #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
+#elif defined(CONFIG_XEN)
+#define GFP_VMALLOC32 GFP_DMA | GFP_DMA32 | GFP_KERNEL
  #else
  #define GFP_VMALLOC32 GFP_KERNEL
  #endif
@@ -2210,6 +2229,17 @@ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
                 return NULL;
         }
  
+#ifdef CONFIG_XEN
+       /*
+        * If the allocated address space is passed to a hypercall before
+        * being used then we cannot rely on a page fault to trigger an update
+        * of the page tables.  So sync all the page tables here unless the
+        * caller is going to have the affected PTEs updated directly.
+        */
+       if (!ptes)
+               vmalloc_sync_all();
+#endif
+
         return area;
  }
  EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c

index 0a942fb..b4698d2 100644 (file)
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -239,11 +239,18 @@ int br_add_bridge(struct net *net, const char *name)
         if (!dev)
                 return -ENOMEM;
  
+       if (!try_module_get(THIS_MODULE)) {
+               free_netdev(dev);
+               return -ENOENT;
+       }
+
         dev_net_set(dev, net);
  
         res = register_netdev(dev);
-       if (res)
+       if (res) {
                 free_netdev(dev);
+               module_put(THIS_MODULE);
+       }
         return res;
  }
  
@@ -271,6 +278,8 @@ int br_del_bridge(struct net *net, const char *name)
                 br_dev_delete(dev, NULL);
  
         rtnl_unlock();
+       if (ret == 0)
+               module_put(THIS_MODULE);
         return ret;
  }
  
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c

index 7d5cb97..0c5f312 100644 (file)
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2933,6 +2933,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags)
                 goto out;
  
         if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+           !(dev->flags&IFF_MULTICAST) ||
             idev->cnf.accept_dad < 1 ||
             !(ifp->flags&IFA_F_TENTATIVE) ||
             ifp->flags & IFA_F_NODAD) {
@@ -3036,6 +3037,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
              ifp->idev->cnf.accept_ra == 2) &&
             ifp->idev->cnf.rtr_solicits > 0 &&
             (dev->flags&IFF_LOOPBACK) == 0 &&
+           (dev->flags&IFF_MULTICAST) &&
             (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
                 /*
                  *      If a host as already performed a random delay
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig

index 0c6f67e..57080f2 100644 (file)
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -317,6 +317,21 @@ config NF_CONNTRACK_TFTP
  
           To compile it as a module, choose M here.  If unsure, say N.
  
+config NF_CONNTRACK_SLP
+       tristate "SLP protocol support"
+       depends on NF_CONNTRACK
+       depends on NETFILTER_ADVANCED
+       help
+         SLP queries are sometimes sent as broadcast messages from an
+         unprivileged port and responded to with unicast messages to the
+         same port. This makes them hard to firewall properly because connection
+         tracking doesn't deal with broadcasts. This helper tracks locally
+         originating broadcast SLP queries and the corresponding
+         responses. It relies on correct IP address configuration, specifically
+         netmask and broadcast address.
+
+         To compile it as a module, choose M here.  If unsure, say N.
+
  config NF_CT_NETLINK
         tristate 'Connection tracking netlink interface'
         select NETFILTER_NETLINK
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile

index ca36765..546dbf1 100644 (file)
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
  obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
  obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
  obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
+obj-$(CONFIG_NF_CONNTRACK_SLP) += nf_conntrack_slp.o
  
  # transparent proxy support
  obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c

index 8c5c95c..c4b89f7 100644 (file)
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -53,10 +53,14 @@ unsigned int (*nf_nat_ftp_hook)(struct sk_buff *skb,
                                 struct nf_conntrack_expect *exp);
  EXPORT_SYMBOL_GPL(nf_nat_ftp_hook);
  
-static int try_rfc959(const char *, size_t, struct nf_conntrack_man *, char);
-static int try_eprt(const char *, size_t, struct nf_conntrack_man *, char);
+static int try_rfc959(const char *, size_t, struct nf_conntrack_man *,
+                     char, unsigned int *);
+static int try_rfc1123(const char *, size_t, struct nf_conntrack_man *,
+                      char, unsigned int *);
+static int try_eprt(const char *, size_t, struct nf_conntrack_man *,
+                   char, unsigned int *);
  static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *,
-                            char);
+                            char, unsigned int *);
  
  static struct ftp_search {
         const char *pattern;
@@ -64,7 +68,7 @@ static struct ftp_search {
         char skip;
         char term;
         enum nf_ct_ftp_type ftptype;
-       int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char);
+       int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char, unsigned int *);
  } search[IP_CT_DIR_MAX][2] = {
         [IP_CT_DIR_ORIGINAL] = {
                 {
@@ -88,10 +92,8 @@ static struct ftp_search {
                 {
                         .pattern        = "227 ",
                         .plen           = sizeof("227 ") - 1,
-                       .skip           = '(',
-                       .term           = ')',
                         .ftptype        = NF_CT_FTP_PASV,
-                       .getnum         = try_rfc959,
+                       .getnum         = try_rfc1123,
                 },
                 {
                         .pattern        = "229 ",
@@ -130,8 +132,9 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
                         i++;
                 else {
                         /* Unexpected character; true if it's the
-                          terminator and we're finished. */
-                       if (*data == term && i == array_size - 1)
+                          terminator (or we don't care about one)
+                          and we're finished. */
+                       if ((*data == term || !term) && i == array_size - 1)
                                 return len;
  
                         pr_debug("Char %u (got %u nums) `%u' unexpected\n",
@@ -146,7 +149,8 @@ static int try_number(const char *data, size_t dlen, u_int32_t array[],
  
  /* Returns 0, or length of numbers: 192,168,1,1,5,6 */
  static int try_rfc959(const char *data, size_t dlen,
-                     struct nf_conntrack_man *cmd, char term)
+                     struct nf_conntrack_man *cmd, char term,
+                     unsigned int *offset)
  {
         int length;
         u_int32_t array[6];
@@ -161,6 +165,33 @@ static int try_rfc959(const char *data, size_t dlen,
         return length;
  }
  
+/*
+ * From RFC 1123:
+ * The format of the 227 reply to a PASV command is not
+ * well standardized.  In particular, an FTP client cannot
+ * assume that the parentheses shown on page 40 of RFC-959
+ * will be present (and in fact, Figure 3 on page 43 omits
+ * them).  Therefore, a User-FTP program that interprets
+ * the PASV reply must scan the reply for the first digit
+ * of the host and port numbers.
+ */
+static int try_rfc1123(const char *data, size_t dlen,
+                      struct nf_conntrack_man *cmd, char term,
+                      unsigned int *offset)
+{
+       int i;
+       for (i = 0; i < dlen; i++)
+               if (isdigit(data[i]))
+                       break;
+
+       if (i == dlen)
+               return 0;
+
+       *offset += i;
+
+       return try_rfc959(data + i, dlen - i, cmd, 0, offset);
+}
+
  /* Grab port: number up to delimiter */
  static int get_port(const char *data, int start, size_t dlen, char delim,
                     __be16 *port)
@@ -189,7 +220,7 @@ static int get_port(const char *data, int start, size_t dlen, char delim,
  
  /* Returns 0, or length of numbers: |1|132.235.1.2|6275| or |2|3ffe::1|6275| */
  static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
-                   char term)
+                   char term, unsigned int *offset)
  {
         char delim;
         int length;
@@ -237,7 +268,8 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd,
  
  /* Returns 0, or length of numbers: |||6446| */
  static int try_epsv_response(const char *data, size_t dlen,
-                            struct nf_conntrack_man *cmd, char term)
+                            struct nf_conntrack_man *cmd, char term,
+                            unsigned int *offset)
  {
         char delim;
  
@@ -259,9 +291,10 @@ static int find_pattern(const char *data, size_t dlen,
                         unsigned int *numlen,
                         struct nf_conntrack_man *cmd,
                         int (*getnum)(const char *, size_t,
-                                     struct nf_conntrack_man *, char))
+                                     struct nf_conntrack_man *, char,
+                                     unsigned int *))
  {
-       size_t i;
+       size_t i = plen;
  
         pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen);
         if (dlen == 0)
@@ -291,16 +324,18 @@ static int find_pattern(const char *data, size_t dlen,
         pr_debug("Pattern matches!\n");
         /* Now we've found the constant string, try to skip
            to the 'skip' character */
-       for (i = plen; data[i] != skip; i++)
-               if (i == dlen - 1) return -1;
+       if (skip) {
+               for (i = plen; data[i] != skip; i++)
+                       if (i == dlen - 1) return -1;
  
-       /* Skip over the last character */
-       i++;
+               /* Skip over the last character */
+               i++;
+       }
  
         pr_debug("Skipped up to `%c'!\n", skip);
  
         *numoff = i;
-       *numlen = getnum(data + i, dlen - i, cmd, term);
+       *numlen = getnum(data + i, dlen - i, cmd, term, numoff);
         if (!*numlen)
                 return -1;
  
diff --git a/net/netfilter/nf_conntrack_slp.c b/net/netfilter/nf_conntrack_slp.c

new file mode 100644 (file)

index 0000000..0174dd0
--- /dev/null
+++ b/net/netfilter/nf_conntrack_slp.c
@@ -0,0 +1,131 @@
+/*
+ *      SLP broadcast connection tracking helper
+ *
+ *      (c) 2007 Jiri Bohac <jbohac@suse.cz>
+ *      (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+/*
+ *      This helper tracks locally originating SLP
+ *      requests by issuing permanent expectations (valid until
+ *      timing out) matching all reply connections from the
+ *      destination network. The only SLP-specific thing is
+ *      actually the port number.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/if_addr.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/route.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define SLP_PORT       427
+
+MODULE_AUTHOR("Jiri Bohac <jbohac@suse.cz>");
+MODULE_DESCRIPTION("SLP broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ip_conntrack_slp");
+
+static unsigned int timeout __read_mostly = 3;
+module_param(timeout, uint, 0400);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+static int help(struct sk_buff *skb, unsigned int protoff,
+               struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+       struct nf_conntrack_expect *exp;
+       struct rtable *rt = skb_rtable(skb);
+       struct in_device *in_dev;
+       __be32 mask = 0;
+       __be32 src = 0;
+
+       /* we're only interested in locally generated packets */
+       if (skb->sk == NULL)
+               goto out;
+       if (rt == NULL || !(rt->rt_flags & (RTCF_MULTICAST|RTCF_BROADCAST)))
+               goto out;
+       if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+               goto out;
+
+       rcu_read_lock();
+       in_dev = __in_dev_get_rcu(rt->dst.dev);
+       if (in_dev != NULL) {
+               for_primary_ifa(in_dev) {
+                       /* this is a hack as slp uses multicast we can't match
+                        * the destination address to some broadcast address. So
+                        * just take the first one. Better would be to install
+                        * expectations for all addresses */
+                       mask = ifa->ifa_mask;
+                       src = ifa->ifa_broadcast;
+                       break;
+               } endfor_ifa(in_dev);
+       }
+       rcu_read_unlock();
+
+       if (mask == 0 || src == 0)
+               goto out;
+
+       exp = nf_ct_expect_alloc(ct);
+       if (exp == NULL)
+               goto out;
+
+       exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+       exp->tuple.src.u3.ip      = src;
+       exp->tuple.src.u.udp.port = htons(SLP_PORT);
+
+       exp->mask.src.u3.ip       = mask;
+       exp->mask.src.u.udp.port  = htons(0xFFFF);
+
+       exp->expectfn             = NULL;
+       exp->flags                = NF_CT_EXPECT_PERMANENT;
+       exp->class                = NF_CT_EXPECT_CLASS_DEFAULT;
+       exp->helper               = NULL;
+
+       nf_ct_expect_related(exp);
+       nf_ct_expect_put(exp);
+
+       nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+       return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+       .max_expected   = 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+       .name                   = "slp",
+       .tuple.src.l3num        = AF_INET,
+       .tuple.src.u.udp.port   = __constant_htons(SLP_PORT),
+       .tuple.dst.protonum     = IPPROTO_UDP,
+       .me                     = THIS_MODULE,
+       .help                   = help,
+       .expect_policy          = &exp_policy,
+};
+
+static int __init nf_conntrack_slp_init(void)
+{
+       exp_policy.timeout = timeout;
+       return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_slp_fini(void)
+{
+       nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_slp_init);
+module_exit(nf_conntrack_slp_fini);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c

index 994cfea..d23b8aa 100644 (file)
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -584,6 +584,35 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
  }
  EXPORT_SYMBOL_GPL(rpc_wake_up_status);
  
+/**
+ * rpc_wake_up_softconn_status - wake up all SOFTCONN rpc_tasks and set their
+ * status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ *
+ * Grabs queue->lock
+ */
+void rpc_wake_up_softconn_status(struct rpc_wait_queue *queue, int status)
+{
+       struct rpc_task *task, *next;
+       struct list_head *head;
+
+       spin_lock_bh(&queue->lock);
+       head = &queue->tasks[queue->maxpriority];
+       for (;;) {
+               list_for_each_entry_safe(task, next, head, u.tk_wait.list)
+                       if (RPC_IS_SOFTCONN(task)) {
+                               task->tk_status = status;
+                               rpc_wake_up_task_queue_locked(queue, task);
+                       }
+               if (head == &queue->tasks[0])
+                       break;
+               head--;
+       }
+       spin_unlock_bh(&queue->lock);
+}
+EXPORT_SYMBOL_GPL(rpc_wake_up_softconn_status);
+
  static void __rpc_queue_timer_fn(unsigned long ptr)
  {
         struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c

index 890b03f..678456a 100644 (file)
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2157,7 +2157,11 @@ static void xs_tcp_setup_socket(struct work_struct *work)
         case -ECONNREFUSED:
         case -ECONNRESET:
         case -ENETUNREACH:
-               /* retry with existing socket, after a delay */
+               /* Retry with existing socket after a delay, except
+                * for SOFTCONN tasks which fail. */
+               xprt_clear_connecting(xprt);
+               rpc_wake_up_softconn_status(&xprt->pending, status);
+               return;
         case 0:
         case -EINPROGRESS:
         case -EALREADY:
diff --git a/scripts/Makefile.build b/scripts/Makefile.build

index ff1720d..d355919 100644 (file)
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -128,6 +128,21 @@ ifndef obj
  $(warning kbuild: Makefile.build is included improperly)
  endif
  
+ifeq ($(CONFIG_XEN),y)
+Makefile.xen := $(if $(KBUILD_EXTMOD),$(KBUILD_EXTMOD),$(objtree)/scripts)/Makefile.xen
+$(Makefile.xen): $(srctree)/scripts/Makefile.xen.awk $(srctree)/scripts/Makefile.build
+       @echo '  Updating $@'
+       $(if $(shell echo a | $(AWK) '{ print gensub(/a/, "AA", "g"); }'),\
+        ,$(error 'Your awk program does not define gensub.  Use gawk or another awk with gensub'))
+       @$(AWK) -f $< $(filter-out $<,$^) >$@
+
+xen-src-single-used-m  := $(patsubst $(srctree)/%,%,$(wildcard $(addprefix $(srctree)/,$(single-used-m:.o=-xen.c))))
+xen-single-used-m      := $(xen-src-single-used-m:-xen.c=.o)
+single-used-m          := $(filter-out $(xen-single-used-m),$(single-used-m))
+
+-include $(Makefile.xen)
+endif
+
  # ===========================================================================
  
  ifneq ($(strip $(lib-y) $(lib-m) $(lib-n) $(lib-)),)
@@ -213,6 +228,7 @@ cmd_gensymtypes =                                                           \
      $(CPP) -D__GENKSYMS__ $(c_flags) $< |                                   \
      $(GENKSYMS) $(if $(1), -T $(2)) -a $(ARCH)                              \
       $(if $(KBUILD_PRESERVE),-p)                                            \
+     $(if $(KBUILD_OVERRIDE),-o)                                            \
       -r $(firstword $(wildcard $(2:.symtypes=.symref) /dev/null))
  
  quiet_cmd_cc_symtypes_c = SYM $(quiet_modtag) $@
@@ -305,12 +321,14 @@ endef
  # Built-in and composite module parts
  $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
         $(call cmd,force_checksrc)
+       $(call cmd,force_check_kmsg)
         $(call if_changed_rule,cc_o_c)
  
  # Single-part modules are special since we need to mark them in $(MODVERDIR)
  
  $(single-used-m): $(obj)/%.o: $(src)/%.c $(recordmcount_source) FORCE
         $(call cmd,force_checksrc)
+       $(call cmd,force_check_kmsg)
         $(call if_changed_rule,cc_o_c)
         @{ echo $(@:.o=.ko); echo $@; } > $(MODVERDIR)/$(@F:.o=.mod)
  
@@ -434,6 +452,18 @@ $(multi-used-m) : %.o: $(multi-objs-m) FORCE
  
  targets += $(multi-used-y) $(multi-used-m)
  
+# kmsg check tool
+ifneq ($(KBUILD_KMSG_CHECK),0)
+  ifeq ($(KBUILD_KMSG_CHECK),2)
+    kmsg_cmd := print
+    quiet_cmd_force_check_kmsg = KMSG_PRINT $<
+    $(shell [ -d $(objtree)/man ] || mkdir -p $(objtree)/man)
+  else
+    kmsg_cmd := check
+    quiet_cmd_force_check_kmsg = KMSG_CHECK $<
+  endif
+  cmd_force_check_kmsg = $(KMSG_CHECK) $(kmsg_cmd) $(CC) $(c_flags) $< ;
+endif
  
  # Descending
  # ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib

index 0be6f11..e3dc19d 100644 (file)
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -22,6 +22,12 @@ obj-m := $(filter-out $(obj-y),$(obj-m))
  
  lib-y := $(filter-out $(obj-y), $(sort $(lib-y) $(lib-m)))
  
+# Remove objects forcibly disabled
+
+obj-y := $(filter-out $(disabled-obj-y),$(obj-y))
+obj-m := $(filter-out $(disabled-obj-y),$(obj-m))
+lib-y := $(filter-out $(disabled-obj-y),$(lib-y))
+
  
  # Handle objects in subdirs
  # ---------------------------------------------------------------------------
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost

index 08dce14..8f8f1d2 100644 (file)
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -81,7 +81,11 @@ modpost = scripts/mod/modpost                    \
   $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
   $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
   $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w) \
- $(if $(cross_build),-c)
+ $(if $(cross_build),-c)                         \
+ $(if $(CONFIG_ENTERPRISE_SUPPORT),              \
+      -N $(firstword $(wildcard $(dir $(MODVERDIR))/Module.supported \
+                               $(objtree)/Module.supported \
+                               $(srctree)/Module.supported /dev/null)))
  
  quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
        cmd_modpost = $(modpost) -s
diff --git a/scripts/Makefile.xen.awk b/scripts/Makefile.xen.awk

new file mode 100644 (file)

index 0000000..1f7cf1e
--- /dev/null
+++ b/scripts/Makefile.xen.awk
@@ -0,0 +1,34 @@
+BEGIN {
+       is_rule = 0
+}
+
+/^[[:space:]]*#/ {
+       next
+}
+
+/^[[:space:]]*$/ {
+       if (is_rule)
+               print("")
+       is_rule = 0
+       next
+}
+
+/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
+       line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
+       line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
+       print line
+       is_rule = 1
+       next
+}
+
+/^[^\t]$/ {
+       if (is_rule)
+               print("")
+       is_rule = 0
+       next
+}
+
+is_rule {
+       print $0
+       next
+}
diff --git a/scripts/genksyms/genksyms.c b/scripts/genksyms/genksyms.c

index 8a10649..fcb87e3 100644 (file)
--- a/scripts/genksyms/genksyms.c
+++ b/scripts/genksyms/genksyms.c
@@ -44,7 +44,7 @@ char *cur_filename, *source_file;
  int in_source_file;
  
  static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types,
-          flag_preserve, flag_warnings;
+          flag_override, flag_preserve, flag_warnings;
  static const char *arch = "";
  static const char *mod_prefix = "";
  
@@ -256,7 +256,7 @@ static struct symbol *__add_symbol(const char *name, enum symbol_type type,
                                 sym->is_declared = 1;
                                 return sym;
                         } else if (!sym->is_declared) {
-                               if (sym->is_override && flag_preserve) {
+                               if (sym->is_override && flag_override) {
                                         print_location();
                                         fprintf(stderr, "ignoring ");
                                         print_type_name(type, name);
@@ -667,11 +667,13 @@ void export_symbol(const char *name)
                         struct symbol *n = sym->expansion_trail;
  
                         if (sym->status != STATUS_UNCHANGED) {
+                               int fail = sym->is_override && flag_preserve;
+
                                 if (!has_changed) {
                                         print_location();
                                         fprintf(stderr, "%s: %s: modversion "
                                                 "changed because of changes "
-                                               "in ", flag_preserve ? "error" :
+                                               "in ", fail ? "error" :
                                                        "warning", name);
                                 } else
                                         fprintf(stderr, ", ");
@@ -679,7 +681,7 @@ void export_symbol(const char *name)
                                 if (sym->status == STATUS_DEFINED)
                                         fprintf(stderr, " (became defined)");
                                 has_changed = 1;
-                               if (flag_preserve)
+                               if (fail)
                                         errors++;
                         }
                         sym->expansion_trail = 0;
@@ -736,6 +738,7 @@ static void genksyms_usage(void)
               "  -D, --dump            Dump expanded symbol defs (for debugging only)\n"
               "  -r, --reference file  Read reference symbols from a file\n"
               "  -T, --dump-types file Dump expanded types into file\n"
+             "  -o, --override        Allow to override reference modversions\n"
               "  -p, --preserve        Preserve reference modversions or fail\n"
               "  -w, --warnings        Enable warnings\n"
               "  -q, --quiet           Disable warnings (default)\n"
@@ -747,6 +750,7 @@ static void genksyms_usage(void)
               "  -D                    Dump expanded symbol defs (for debugging only)\n"
               "  -r file               Read reference symbols from a file\n"
               "  -T file               Dump expanded types into file\n"
+             "  -o                    Allow to override reference modversions\n"
               "  -p                    Preserve reference modversions or fail\n"
               "  -w                    Enable warnings\n"
               "  -q                    Disable warnings (default)\n"
@@ -771,15 +775,16 @@ int main(int argc, char **argv)
                 {"reference", 1, 0, 'r'},
                 {"dump-types", 1, 0, 'T'},
                 {"preserve", 0, 0, 'p'},
+               {"override", 0, 0, 'o'},
                 {"version", 0, 0, 'V'},
                 {"help", 0, 0, 'h'},
                 {0, 0, 0, 0}
         };
  
-       while ((o = getopt_long(argc, argv, "a:dwqVDr:T:ph",
+       while ((o = getopt_long(argc, argv, "a:dwqVDr:T:oph",
                                 &long_opts[0], NULL)) != EOF)
  #else                          /* __GNU_LIBRARY__ */
-       while ((o = getopt(argc, argv, "a:dwqVDr:T:ph")) != EOF)
+       while ((o = getopt(argc, argv, "a:dwqVDr:T:oph")) != EOF)
  #endif                         /* __GNU_LIBRARY__ */
                 switch (o) {
                 case 'a':
@@ -816,7 +821,11 @@ int main(int argc, char **argv)
                                 return 1;
                         }
                         break;
+               case 'o':
+                       flag_override = 1;
+                       break;
                 case 'p':
+                       flag_override = 1;
                         flag_preserve = 1;
                         break;
                 case 'h':
diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile

index 7966265..9829863 100644 (file)
--- a/scripts/kconfig/Makefile
+++ b/scripts/kconfig/Makefile
@@ -75,6 +75,23 @@ PHONY += allnoconfig allyesconfig allmodconfig alldefconfig randconfig
  
  allnoconfig allyesconfig allmodconfig alldefconfig randconfig: $(obj)/conf
         $< --$@ $(Kconfig)
+ 
+UNAME_RELEASE := $(shell uname -r)
+CLONECONFIG := $(firstword $(wildcard /proc/config.gz \
+                                     /lib/modules/$(UNAME_RELEASE)/.config \
+                                     /etc/kernel-config \
+                                     /boot/config-$(UNAME_RELEASE)))
+cloneconfig: $(obj)/conf
+       $(Q)case "$(CLONECONFIG)" in                            \
+       '')     echo -e "The configuration of the running"      \
+                       "kernel could not be determined\n";     \
+               false ;;                                        \
+       *.gz)   gzip -cd $(CLONECONFIG) > .config.running ;;    \
+       *)      cat $(CLONECONFIG) > .config.running ;;         \
+       esac &&                                                 \
+       echo -e "Cloning configuration file $(CLONECONFIG)\n"
+       $(Q)$< --defconfig=.config.running arch/$(SRCARCH)/Kconfig
+
  
  PHONY += listnewconfig oldnoconfig savedefconfig defconfig
  
diff --git a/scripts/kmsg-doc b/scripts/kmsg-doc

new file mode 100644 (file)

index 0000000..e0f64ed
--- /dev/null
+++ b/scripts/kmsg-doc
@@ -0,0 +1,478 @@
+#!/usr/bin/perl -w
+#
+# kmsg kernel messages check and print tool.
+#
+# To check the source code for missing messages the script is called
+# with check, the name of the compiler and the compile parameters
+#      kmsg-doc check $(CC) $(c_flags) $<
+# To create man pages for the messages the script is called with
+#      kmsg-doc print $(CC) $(c_flags) $<
+#
+# Copyright IBM Corp. 2008
+# Author(s):  Martin Schwidefsky <schwidefsky@de.ibm.com>
+#            Michael Holzheu <holzheu@linux.vnet.ibm.com>
+#
+
+use Cwd;
+use bigint;
+
+my $errors = 0;
+my $warnings = 0;
+my $srctree = "";
+my $objtree = "";
+my $kmsg_count = 0;
+
+sub remove_quotes($)
+{
+    my ($string) = @_;
+    my $inside = 0;
+    my $slash = 0;
+    my $result = "";
+
+    foreach my $str (split(/([\\"])/, $string)) {
+        if ($inside && ($str ne "\"" || $slash)) {
+            $result .= $str;
+        }
+        # Check for backslash before quote
+        if ($str eq "\"") {
+            if (!$slash) {
+                $inside = !$inside;
+            }
+            $slash = 0;
+        } elsif ($str eq "\\") {
+            $slash = !$slash;
+        } elsif ($str ne "") {
+            $slash = 0;
+        }
+    }
+    return $result;
+}
+
+sub string_to_bytes($)
+{
+    my ($string) = @_;
+    my %is_escape = ('"', 0x22, '\'', 0x27, 'n', 0x0a, 'r', 0x0d, 'b', 0x08,
+                    't', 0x09, 'f', 0x0c, 'a', 0x07, 'v', 0x0b, '?', 0x3f);
+    my (@ar, $slash, $len);
+
+    # scan string, interpret backslash escapes and write bytes to @ar
+    $len = 0;
+    foreach my $ch (split(//, $string)) {
+       if ($ch eq '\\') {
+           $slash = !$slash;
+           if (!$slash) {
+               $ar[$len] = ord('\\');
+               $len++;
+           }
+       } elsif ($slash && defined $is_escape{$ch}) {
+           # C99 backslash escapes: \\ \" \' \n \r \b \t \f \a \v \?
+           $ar[$len] = $is_escape{$ch};
+           $len++;
+           $slash = 0;
+       } elsif ($slash) {
+           # FIXME: C99 backslash escapes \nnn \xhh
+           die("Unknown backslash escape in message $string.");
+       } else {
+           # normal character
+           $ar[$len] = ord($ch);
+           $len++;
+       }
+    }
+    return @ar;
+}
+
+sub calc_jhash($)
+{
+    my ($string) = @_;
+    my @ar;
+    my ($a, $b, $c, $i, $length, $len);
+
+    @ar = string_to_bytes($string);
+    $length = @ar;
+    # add dummy elements to @ar to avoid if then else hell
+    push @ar, (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    $a = 0x9e3779b9;
+    $b = 0x9e3779b9;
+    $c = 0;
+    $i = 0;
+    for ($len = $length + 12; $len >= 12; $len -= 12) {
+       if ($len < 24) {
+           # add length for last round
+           $c += $length;
+       }
+       $a += $ar[$i] + ($ar[$i+1]<<8) + ($ar[$i+2]<<16) + ($ar[$i+3]<<24);
+       $b += $ar[$i+4] + ($ar[$i+5]<<8) + ($ar[$i+6]<<16) + ($ar[$i+7]<<24);
+       if ($len >= 24) {
+           $c += $ar[$i+8] + ($ar[$i+9]<<8) + ($ar[$i+10]<<16) + ($ar[$i+11]<<24);
+       } else {
+           $c += ($ar[$i+8]<<8) + ($ar[$i+9]<<16) + ($ar[$i+10]<<24);
+       }
+       $a &= 0xffffffff; $b &= 0xffffffff; $c &= 0xffffffff;
+       $a -= $b; $a -= $c; $a ^= ($c >> 13); $a &= 0xffffffff;
+       $b -= $c; $b -= $a; $b ^= ($a << 8); $b &= 0xffffffff;
+       $c -= $a; $c -= $b; $c ^= ($b >> 13); $c &= 0xffffffff;
+       $a -= $b; $a -= $c; $a ^= ($c >> 12); $a &= 0xffffffff;
+       $b -= $c; $b -= $a; $b ^= ($a << 16); $b &= 0xffffffff;
+       $c -= $a; $c -= $b; $c ^= ($b >> 5); $c &= 0xffffffff;
+       $a -= $b; $a -= $c; $a ^= ($c >> 3); $a &= 0xffffffff;
+       $b -= $c; $b -= $a; $b ^= ($a << 10); $b &= 0xffffffff;
+       $c -= $a; $c -= $b; $c ^= ($b >> 15); $c &= 0xffffffff;
+       $i += 12;
+    }
+    return $c;
+}
+
+sub add_kmsg_desc($$$$$$)
+{
+    my ($component, $text, $sev, $argv, $desc, $user) = @_;
+    my ($hash, $tag);
+
+    $text = remove_quotes($text);
+    $hash = substr(sprintf("%08x", calc_jhash($text)), 2, 6);
+    $tag = $component . "." . $hash;
+
+    if ($kmsg_desc{$tag}) {
+       if ($text ne $kmsg_desc{$tag}->{'TEXT'}) {
+           warn "Duplicate message with tag $tag\n";
+           warn "  --- $kmsg_desc{$tag}->{'TEXT'}\n";
+           warn "  +++ $text\n";
+       } else {
+           warn "Duplicate message description for \"$text\"\n";
+       }
+       $errors++;
+       return;
+    }
+    $kmsg_desc{$tag}->{'TEXT'} = $text;
+    $kmsg_desc{$tag}->{'SEV'} = $sev;
+    $kmsg_desc{$tag}->{'ARGV'} = $argv;
+    $kmsg_desc{$tag}->{'DESC'} = $desc;
+    $kmsg_desc{$tag}->{'USER'} = $user;
+}
+
+sub add_kmsg_print($$$$)
+{
+    my ($component, $sev, $text, $argv) = @_;
+    my ($hash, $tag, $count, $parm);
+
+    $text = remove_quotes($text);
+    $hash = substr(sprintf("%08x", calc_jhash($text)), 2, 6);
+    $tag = $component . "." . $hash;
+
+    # Pretty print severity
+    $sev =~ s/"<0>"/Emerg/;
+    $sev =~ s/"<1>"/Alert/;
+    $sev =~ s/"<2>"/Critical/;
+    $sev =~ s/"<3>"/Error/;
+    $sev =~ s/"<4>"/Warning/;
+    $sev =~ s/"<5>"/Notice/;
+    $sev =~ s/"<6>"/Informational/;
+    $sev =~ s/"<7>"/Debug/;
+    $kmsg_print{$kmsg_count}->{'TAG'} = $tag;
+    $kmsg_print{$kmsg_count}->{'TEXT'} = $text;
+    $kmsg_print{$kmsg_count}->{'SEV'} = $sev;
+    $kmsg_print{$kmsg_count}->{'ARGV'} = $argv;
+    $kmsg_count += 1;
+}
+
+sub process_source_file($$)
+{
+    my ($component, $file) = @_;
+    my $state;  # 0: outside a kmsg comment, 1: header lines, 2: parameters, 3: description, 4: user action
+    my ($text, $sev, $argv, $desc, $user);
+    # Scan $file for KMSG_COMPONENT and "/*?" description comments; returns the component ("" if $file cannot be opened).
+    if (!open(FD, "$file")) {
+       return "";
+    }
+
+    $state = 0;
+    while (<FD>) {
+       chomp;
+       # kmsg message component: #define KMSG_COMPONENT "<component>"
+       if (/^#define\s+KMSG_COMPONENT\s+\"(.*)\"[^\"]*$/o) {
+           $component = $1;
+       }
+       if ($state == 0) {
+           # single line kmsg for undocumented messages, format:
+           # /*? Text: "<message>" */
+           if (/^\s*\/\*\?\s*Text:\s*(\".*\")\s*\*\/\s*$/o) {
+               add_kmsg_desc($component, $1, "", "", "", "");
+           }
+           # kmsg message start: '/*?'
+           if (/^\s*\/\*\?\s*$/o) {
+               $state = 1;
+               ($text, $sev, $argv, $desc, $user) = ( "", "", "", "", "" );
+           }
+       } elsif ($state == 1) {
+           # kmsg message end: ' */'
+           if (/^\s*\*\/\s*/o) {
+               add_kmsg_desc($component, $text, $sev, $argv, $desc, $user);
+               $state = 0;
+           }
+           # kmsg message text: ' * Text: "<message>"'
+           elsif (/^\s*\*\s*Text:\s*(\".*\")\s*$/o) {
+               $text = $1;
+           }
+           # kmsg message severity: ' * Severity: <sev>'
+           elsif (/^\s*\*\s*Severity:\s*(\S*)\s*$/o) {
+               $sev = $1;
+           }
+           # kmsg message parameter: ' * Parameter: <argv>'
+           elsif (/^\s*\*\s*Parameter:\s*(\S*)\s*$/o) {
+               if (!defined($1)) {
+                   $argv = "";
+               } else {
+                   $argv = $1;
+               }
+               $state = 2;
+           }
+           # kmsg message description start: ' * Description:'
+           elsif (/^\s*\*\s*Description:\s*(\S*)\s*$/o) {
+               if (!defined($1)) {
+                   $desc = "";
+               } else {
+                   $desc = $1;
+               }
+               $state = 3;
+           }
+           # kmsg has unrecognizable lines
+           else {
+               warn "Warning(${file}:$.): Cannot understand $_";
+               $warnings++;
+               $state = 0;
+           }
+       } elsif ($state == 2) {
+           # kmsg message end: ' */'
+           if (/^\s*\*\//o) {
+               warn "Warning(${file}:$.): Missing description, skipping message";
+               $warnings++;
+               $state = 0;
+           }
+           # kmsg message description start: ' * Description:'
+           elsif (/^\s*\*\s*Description:\s*$/o) {
+               $desc = "";  # pattern above has no capture group, so $1 is unset here
+               $state = 3;
+           }
+           # kmsg message parameter line: ' * <argv>'
+           elsif (/^\s*\*(.*)$/o) {
+               $argv .= "\n" . $1;
+           } else {
+               warn "Warning(${file}:$.): Cannot understand $_";
+               $warnings++;
+               $state = 0;
+           }
+       } elsif ($state == 3) {
+           # kmsg message end: ' */'
+           if (/^\s*\*\/\s*/o) {
+               add_kmsg_desc($component, $text, $sev, $argv, $desc, $user);
+               $state = 0;
+           }
+           # kmsg message user action start: ' * User action:'
+           elsif (/^\s*\*\s*User action:\s*$/o) {
+               $user = "";  # pattern above has no capture group, so $1 is unset here
+               $state = 4;
+           }
+           # kmsg message description line: ' * <text>'
+           elsif (/^\s*\*\s*(.*)$/o) {
+               $desc .= "\n" . $1;
+           } else {
+               warn "Warning(${file}:$.): Cannot understand $_";
+               $warnings++;
+               $state = 0;
+           }
+       } elsif ($state == 4) {
+           # kmsg message end: ' */'
+           if (/^\s*\*\/\s*/o) {
+               add_kmsg_desc($component, $text, $sev, $argv, $desc, $user);
+               $state = 0;
+           }
+           # kmsg message user action line: ' * <text>'
+           elsif (/^\s*\*\s*(.*)$/o) {
+               $user .= "\n" . $1;
+           } else {
+               warn "Warning(${file}:$.): Cannot understand $_";
+               $warnings++;
+               $state = 0;
+           }
+       }
+    }
+    return $component;
+}
+
+sub process_cpp_file($$$$)
+{
+    my ($cc, $options, $file, $component) = @_;
+    # Run "$cc $options" and record every __KMSG_PRINT/__KMSG_DEV marker found in its output.
+    open(FD, "$cc $options|") or die ("Preprocessing failed.");  # use the parameter, not the global $gcc_options
+
+    while (<FD>) {
+       chomp;
+       if (/.*__KMSG_PRINT\(\s*(\S*)\s*_FMT_(.*)_ARGS_\s*"(.*)"\s*_END_\s*\)/o) {
+           if ($component ne "") {
+               add_kmsg_print($component, $1, $2, $3);
+           } else {
+               warn "Error(${file}:$.): kmsg without component\n";
+               $errors++;
+           }
+       } elsif (/.*__KMSG_DEV\(\s*(\S*)\s*_FMT_(.*)_ARGS_\s*(.*)?_END_\s*\)/o) {
+           if ($component ne "") {
+               add_kmsg_print($component, $1, "\"%s: \"" . $2, $3);  # device messages get a "%s: " prefix
+           } else {
+               warn "Error(${file}:$.): kmsg without component\n";
+               $errors++;
+           }
+       }
+    }
+}
+
+sub check_messages($)
+{
+    my $component = "@_";
+    my $failed = 0;
+
+    for ($i = 0; $i < $kmsg_count; $i++) {
+       $tag = $kmsg_print{$i}->{'TAG'};
+       if (!defined($kmsg_desc{$tag})) {
+           add_kmsg_desc($component,
+                         "\"" . $kmsg_print{$i}->{'TEXT'} . "\"",
+                         $kmsg_print{$i}->{'SEV'},
+                         $kmsg_print{$i}->{'ARGV'},
+                         "Please insert description here",
+                         "What is the user supposed to do");
+           $kmsg_desc{$tag}->{'CHECK'} = 1;
+           $failed = 1;
+           warn "$component: Missing description for: ".
+                $kmsg_print{$i}->{'TEXT'}."\n";
+           $errors++;
+           next;
+       }
+       if ($kmsg_desc{$tag}->{'SEV'} ne "" &&
+           $kmsg_desc{$tag}->{'SEV'} ne $kmsg_print{$i}->{'SEV'}) {
+           warn "Message severity mismatch for \"$kmsg_print{$i}->{'TEXT'}\"\n";
+           warn "  --- $kmsg_desc{$tag}->{'SEV'}\n";
+           warn "  +++ $kmsg_print{$i}->{'SEV'}\n";
+       }
+    }
+    return $failed;
+}
+
+sub print_templates()
+{
+    print "Templates for missing messages:\n";
+    foreach $tag ( sort { $kmsg_desc{$a} <=> $kmsg_desc{$b} } keys %kmsg_desc ) {
+       if (!defined($kmsg_desc{$tag}->{'CHECK'})) {
+           next;
+       }
+       print "/*?\n";
+       print " * Text: \"$kmsg_desc{$tag}->{'TEXT'}\"\n";
+       print " * Severity: $kmsg_desc{$tag}->{'SEV'}\n";
+       $argv = $kmsg_desc{$tag}->{'ARGV'};
+       if ($argv ne "") {
+           print " * Parameter:\n";
+           @parms = split(/\s*,\s*/,$kmsg_desc{$tag}->{'ARGV'});
+           $count = 0;
+           foreach $parm (@parms) {
+               $count += 1;
+               if (!($parm eq "")) {
+                   print " *   \@$count: $parm\n";
+               }
+           }
+       }
+       print " * Description:\n";
+       print " * $kmsg_desc{$tag}->{'DESC'}\n";
+       print " * User action:\n";
+       print " * $kmsg_desc{$tag}->{'USER'}\n";
+       print " */\n\n";
+    }
+}
+
+sub write_man_pages()
+{
+    my ($i, $file);
+
+    for ($i = 0; $i < $kmsg_count; $i++) {
+       $tag = $kmsg_print{$i}->{'TAG'};
+       if (!defined($kmsg_desc{$tag}) ||
+           defined($kmsg_desc{$tag}->{'CHECK'}) ||
+           $kmsg_desc{$tag}->{'DESC'} eq "") {
+           next;
+       }
+       $file = $objtree . "man/" . $tag . ".9";
+       if (!open(WR, ">$file")) {
+           warn "Error: Cannot open file $file\n";
+           $errors++;
+           return;
+       }
+       print WR ".TH \"$tag\" 9 \"Linux Messages\" LINUX\n";
+       print WR ".SH Message\n";
+       print WR $tag . ": " . $kmsg_desc{$tag}->{'TEXT'} . "\n";
+       print WR ".SH Severity\n";
+       print WR "$kmsg_desc{$tag}->{'SEV'}\n";
+       $argv = $kmsg_desc{$tag}->{'ARGV'};
+       if ($argv ne "") {
+           print WR ".SH Parameters\n";
+           @parms = split(/\s*\n\s*/,$kmsg_desc{$tag}->{'ARGV'});
+           foreach $parm (@parms) {
+               $parm =~ s/^\s*(.*)\s*$/$1/;
+               if (!($parm eq "")) {
+                   print WR "$parm\n\n";
+               }
+           }
+       }
+       print WR ".SH Description";
+       print WR "$kmsg_desc{$tag}->{'DESC'}\n";
+       $user = $kmsg_desc{$tag}->{'USER'};
+       if ($user ne "") {
+           print WR ".SH User action";
+           print WR "$user\n";
+       }
+    }
+}
+
+if (defined($ENV{'srctree'})) {
+    $srctree = "$ENV{'srctree'}" . "/";
+} else {
+    $srctree = getcwd() . "/";  # trailing slash needed: joined directly with "Documentation/kmsg/" below
+}
+
+if (defined($ENV{'objtree'})) {
+    $objtree = "$ENV{'objtree'}" . "/";
+} else {
+    $objtree = getcwd() . "/";  # trailing slash needed: joined directly with "man/" in write_man_pages
+}
+
+if (defined($ENV{'SRCARCH'})) {
+    $srcarch = "$ENV{'SRCARCH'}" . "/";
+} else {
+    print "kmsg-doc called without a valid \$SRCARCH\n";
+    exit 1;
+}
+
+$option = shift;  # first argument: "check" or "print"
+
+$cc = shift;  # second argument: the compiler command
+$gcc_options = "-E -D __KMSG_CHECKER ";
+foreach $tmp (@ARGV) {
+    $tmp =~ s/\(/\\\(/g;  # escape every parenthesis for the shell, not only the first one
+    $tmp =~ s/\)/\\\)/g;
+    $gcc_options .= " $tmp";
+    $filename = $tmp;  # the last argument is taken as the source file name
+}
+
+$component = process_source_file("", $filename);
+if ($component ne "") {
+    process_source_file($component, $srctree . "Documentation/kmsg/" .
+                       $srcarch . $component);
+    process_source_file($component, $srctree . "Documentation/kmsg/" .
+                       $component);
+}
+
+process_cpp_file($cc, $gcc_options, $filename, $component);
+if ($option eq "check") {
+    if (check_messages($component)) {
+       print_templates();
+    }
+} elsif ($option eq "print") {
+    write_man_pages();
+}
+
+exit($errors);
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c

index c4e7d15..566fbe4 100644 (file)
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1682,6 +1682,48 @@ static void check_sec_ref(struct module *mod, const char *modname,
         }
  }
  
+void *supported_file;
+unsigned long supported_size;
+
+static const char *supported(struct module *mod)
+{
+       unsigned long pos = 0;
+       char *line;
+
+       /* In a first shot, do a simple linear scan. */
+       while ((line = get_next_line(&pos, supported_file,
+                                    supported_size))) {
+               const char *basename, *how = "yes";
+               char *l = line;
+
+               /* optional type-of-support flag */
+               for (l = line; *l != '\0'; l++) {
+                       if (*l == ' ' || *l == '\t') {
+                               *l = '\0';
+                               how = l + 1;
+                               break;
+                       }
+               }
+
+               /* skip directory components */
+               if ((l = strrchr(line, '/')))
+                       line = l + 1;
+               /* strip .ko extension */
+               l = line + strlen(line);
+               if (l - line > 3 && !strcmp(l-3, ".ko"))
+                       *(l-3) = '\0';
+
+               /* skip directory components */
+               if ((basename = strrchr(mod->name, '/')))
+                       basename++;
+               else
+                       basename = mod->name;
+               if (!strcmp(basename, line))
+                       return how;
+       }
+       return NULL;
+}
+
  static void read_symbols(char *modname)
  {
         const char *symname;
@@ -1875,6 +1917,13 @@ static void add_staging_flag(struct buffer *b, const char *name)
                 buf_printf(b, "\nMODULE_INFO(staging, \"Y\");\n");
  }
  
+static void add_supported_flag(struct buffer *b, struct module *mod)
+{
+       const char *how = supported(mod);
+       if (how)
+               buf_printf(b, "\nMODULE_INFO(supported, \"%s\");\n", how);
+}
+
  /**
   * Record CRCs for unresolved symbols
   **/
@@ -2015,6 +2064,13 @@ static void write_if_changed(struct buffer *b, const char *fname)
         fclose(file);
  }
  
+static void read_supported(const char *fname)
+{
+       supported_file = grab_file(fname, &supported_size);
+       if (!supported_file)
+               ; /* ignore error */
+}
+
  /* parse Module.symvers file. line format:
   * 0x12345678<tab>symbol<tab>module[[<tab>export]<tab>something]
   **/
@@ -2108,12 +2164,13 @@ int main(int argc, char **argv)
         struct buffer buf = { };
         char *kernel_read = NULL, *module_read = NULL;
         char *dump_write = NULL;
+       const char *supported = NULL;
         int opt;
         int err;
         struct ext_sym_list *extsym_iter;
         struct ext_sym_list *extsym_start = NULL;
  
-       while ((opt = getopt(argc, argv, "i:I:e:cmsSo:awM:K:")) != -1) {
+       while ((opt = getopt(argc, argv, "i:I:e:cmsSo:awM:K:N:")) != -1) {
                 switch (opt) {
                 case 'i':
                         kernel_read = optarg;
@@ -2151,11 +2208,16 @@ int main(int argc, char **argv)
                 case 'w':
                         warn_unresolved = 1;
                         break;
+               case 'N':
+                       supported = optarg;
+                       break;
                 default:
                         exit(1);
                 }
         }
  
+       if (supported)
+               read_supported(supported);
         if (kernel_read)
                 read_dump(kernel_read, 1);
         if (module_read)
@@ -2189,6 +2251,7 @@ int main(int argc, char **argv)
                 add_header(&buf, mod);
                 add_intree_flag(&buf, !external_module);
                 add_staging_flag(&buf, mod->name);
+               add_supported_flag(&buf, mod);
                 err |= add_versions(&buf, mod);
                 add_depends(&buf, mod, modules);
                 add_moddevtable(&buf, mod);
diff --git a/security/apparmor/Makefile b/security/apparmor/Makefile

index 806bd19..d78730c 100644 (file)
--- a/security/apparmor/Makefile
+++ b/security/apparmor/Makefile
@@ -4,9 +4,9 @@ obj-$(CONFIG_SECURITY_APPARMOR) += apparmor.o
  
  apparmor-y := apparmorfs.o audit.o capability.o context.o ipc.o lib.o match.o \
                path.o domain.o policy.o policy_unpack.o procattr.o lsm.o \
-              resource.o sid.o file.o
+              resource.o sid.o file.o net.o
  
-clean-files := capability_names.h rlim_names.h
+clean-files := capability_names.h rlim_names.h af_names.h
  
  
  # Build a lower case string table of capability names
@@ -20,6 +20,9 @@ cmd_make-caps = echo "static const char *const capability_names[] = {" > $@ ;\
         -e 's/^\#define[ \t]+CAP_([A-Z0-9_]+)[ \t]+([0-9]+)/[\2] = "\L\1",/p';\
         echo "};" >> $@
  
+quiet_cmd_make-af = GEN     $@
+cmd_make-af = echo "static const char *address_family_names[] = {" > $@ ; sed -n -e "/AF_MAX/d" -e "/AF_LOCAL/d" -e "s/^\#define[ \\t]\\+AF_\\([A-Z0-9_]\\+\\)[ \\t]\\+\\([0-9]\\+\\)\\(.*\\)\$$/[\\2]  = \"\\1\",/p" $< | tr A-Z a-z >> $@ ; echo "};" >> $@
+
  
  # Build a lower case string table of rlimit names.
  # Transforms lines from
@@ -56,6 +59,7 @@ cmd_make-rlim = echo "static const char *const rlim_names[RLIM_NLIMITS] = {" \
             tr '\n' ' ' | sed -e 's/ $$/"\n/' >> $@
  
  $(obj)/capability.o : $(obj)/capability_names.h
+$(obj)/net.o : $(obj)/af_names.h
  $(obj)/resource.o : $(obj)/rlim_names.h
  $(obj)/capability_names.h : $(srctree)/include/linux/capability.h \
                             $(src)/Makefile
@@ -63,3 +67,5 @@ $(obj)/capability_names.h : $(srctree)/include/linux/capability.h \
  $(obj)/rlim_names.h : $(srctree)/include/asm-generic/resource.h \
                       $(src)/Makefile
         $(call cmd,make-rlim)
+$(obj)/af_names.h : $(srctree)/include/linux/socket.h
+       $(call cmd,make-af)
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c

index 16c15ec..08522a7 100644 (file)
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -144,6 +144,232 @@ static const struct file_operations aa_fs_profile_remove = {
         .llseek = default_llseek,
  };
  
+/**
+ * __next_namespace - find the next namespace to list
+ * @root: root namespace to stop search at (NOT NULL)
+ * @ns: current ns position (NOT NULL)
+ *
+ * Find the next namespace from @ns under @root and handle all locking needed
+ * while switching current namespace.
+ *
+ * Returns: next namespace or NULL if at last namespace under @root
+ * NOTE: will not unlock root->lock
+ */
+static struct aa_namespace *__next_namespace(struct aa_namespace *root,
+                                            struct aa_namespace *ns)
+{
+       struct aa_namespace *parent;
+
+       /* is next namespace a child */
+       if (!list_empty(&ns->sub_ns)) {
+               struct aa_namespace *next;
+               next = list_first_entry(&ns->sub_ns, typeof(*ns), base.list);
+               read_lock(&next->lock);
+               return next;
+       }
+
+       /* check if the next ns is a sibling, parent, gp, .. */
+       parent = ns->parent;
+       while (parent) {
+               read_unlock(&ns->lock);
+               list_for_each_entry_continue(ns, &parent->sub_ns, base.list) {
+                       read_lock(&ns->lock);
+                       return ns;
+               }
+               if (parent == root)
+                       return NULL;
+               ns = parent;
+               parent = parent->parent;
+       }
+
+       return NULL;
+}
+
+/**
+ * __first_profile - find the first profile in a namespace
+ * @root: namespace that is root of profiles being displayed (NOT NULL)
+ * @ns: namespace to start in   (NOT NULL)
+ *
+ * Returns: unrefcounted profile or NULL if no profile
+ */
+static struct aa_profile *__first_profile(struct aa_namespace *root,
+                                         struct aa_namespace *ns)
+{
+       for ( ; ns; ns = __next_namespace(root, ns)) {
+               if (!list_empty(&ns->base.profiles))
+                       return list_first_entry(&ns->base.profiles,
+                                               struct aa_profile, base.list);
+       }
+       return NULL;
+}
+
+/**
+ * __next_profile - step to the next profile in a profile tree
+ * @p: current profile in tree (NOT NULL)
+ *
+ * Perform a depth first traversal on the profile tree in a namespace
+ *
+ * Returns: next profile or NULL if done
+ * Requires: profile->ns.lock to be held
+ */
+static struct aa_profile *__next_profile(struct aa_profile *p)
+{
+       struct aa_profile *parent;
+       struct aa_namespace *ns = p->ns;
+
+       /* is next profile a child */
+       if (!list_empty(&p->base.profiles))
+               return list_first_entry(&p->base.profiles, typeof(*p),
+                                       base.list);
+
+       /* is next profile a sibling, parent sibling, gp, sibling, .. */
+       parent = p->parent;
+       while (parent) {
+               list_for_each_entry_continue(p, &parent->base.profiles,
+                                            base.list)
+                               return p;
+               p = parent;
+               parent = parent->parent;
+       }
+
+       /* is next another profile in the namespace */
+       list_for_each_entry_continue(p, &ns->base.profiles, base.list)
+               return p;
+
+       return NULL;
+}
+
+/**
+ * next_profile - step to the next profile wherever it may be
+ * @root: root namespace  (NOT NULL)
+ * @profile: current profile  (NOT NULL)
+ *
+ * Returns: next profile or NULL if there isn't one
+ */
+static struct aa_profile *next_profile(struct aa_namespace *root,
+                                      struct aa_profile *profile)
+{
+       struct aa_profile *next = __next_profile(profile);
+       if (next)
+               return next;
+
+       /* finished all profiles in namespace move to next namespace */
+       return __first_profile(root, __next_namespace(root, profile->ns));
+}
+
+/**
+ * p_start - start a depth first traversal of profile tree
+ * @f: seq_file to fill
+ * @pos: current position
+ *
+ * Returns: first profile under current namespace or NULL if none found
+ *
+ * acquires first ns->lock
+ */
+static void *p_start(struct seq_file *f, loff_t *pos)
+       __acquires(root->lock)
+{
+       struct aa_profile *profile = NULL;
+       struct aa_namespace *root = aa_current_profile()->ns;
+       loff_t l = *pos;
+       f->private = aa_get_namespace(root);
+
+
+       /* find the first profile */
+       read_lock(&root->lock);
+       profile = __first_profile(root, root);
+
+       /* skip to position */
+       for (; profile && l > 0; l--)
+               profile = next_profile(root, profile);
+
+       return profile;
+}
+
+/**
+ * p_next - read the next profile entry
+ * @f: seq_file to fill
+ * @p: profile previously returned
+ * @pos: current position
+ *
+ * Returns: next profile after @p or NULL if none
+ *
+ * may acquire/release locks in namespace tree as necessary
+ */
+static void *p_next(struct seq_file *f, void *p, loff_t *pos)
+{
+       struct aa_profile *profile = p;
+       struct aa_namespace *root = f->private;
+       (*pos)++;
+
+       return next_profile(root, profile);
+}
+
+/**
+ * p_stop - stop depth first traversal
+ * @f: seq_file we are filling
+ * @p: the last profile written
+ *
+ * Release all locking done by p_start/p_next on namespace tree
+ */
+static void p_stop(struct seq_file *f, void *p)
+       __releases(root->lock)
+{
+       struct aa_profile *profile = p;
+       struct aa_namespace *root = f->private, *ns;
+
+       if (profile) {
+               for (ns = profile->ns; ns && ns != root; ns = ns->parent)
+                       read_unlock(&ns->lock);
+       }
+       read_unlock(&root->lock);
+       aa_put_namespace(root);
+}
+
+/**
+ * seq_show_profile - show a profile entry
+ * @f: seq_file to fill
+ * @p: current position (profile)    (NOT NULL)
+ *
+ * Returns: error on failure
+ */
+static int seq_show_profile(struct seq_file *f, void *p)
+{
+       struct aa_profile *profile = (struct aa_profile *)p;
+       struct aa_namespace *root = f->private;
+
+       if (profile->ns != root)
+               seq_printf(f, ":%s://", aa_ns_name(root, profile->ns));
+       seq_printf(f, "%s (%s)\n", profile->base.hname,
+                  COMPLAIN_MODE(profile) ? "complain" : "enforce");
+
+       return 0;
+}
+
+static const struct seq_operations aa_fs_profiles_op = {
+       .start = p_start,
+       .next = p_next,
+       .stop = p_stop,
+       .show = seq_show_profile,
+};
+
+static int profiles_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &aa_fs_profiles_op);
+}
+
+static int profiles_release(struct inode *inode, struct file *file)
+{
+       return seq_release(inode, file);
+}
+
+static const struct file_operations aa_fs_profiles_fops = {
+       .open = profiles_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = profiles_release,
+};
+
  static int aa_fs_seq_show(struct seq_file *seq, void *v)
  {
         struct aa_fs_entry *fs_file = seq->private;
@@ -203,6 +429,7 @@ static struct aa_fs_entry aa_fs_entry_features[] = {
         AA_FS_DIR("file",                       aa_fs_entry_file),
         AA_FS_FILE_U64("capability",            VFS_CAP_FLAGS_MASK),
         AA_FS_DIR("rlimit",                     aa_fs_entry_rlimit),
+       AA_FS_FILE_BOOLEAN("network",           1),
         { }
  };
  
@@ -211,6 +438,9 @@ static struct aa_fs_entry aa_fs_entry_apparmor[] = {
         AA_FS_FILE_FOPS(".replace", 0640, &aa_fs_profile_replace),
         AA_FS_FILE_FOPS(".remove", 0640, &aa_fs_profile_remove),
         AA_FS_DIR("features", aa_fs_entry_features),
+       AA_FS_FILE_STRING("matching", "pattern=aadfa audit perms=crwxamlk/ "
+                         "user::other"),
+       AA_FS_FILE_FOPS("profiles", 0440, &aa_fs_profiles_fops),
         { }
  };
  
diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h

index 3868b1e..c1ff09c 100644 (file)
--- a/security/apparmor/include/audit.h
+++ b/security/apparmor/include/audit.h
@@ -126,6 +126,10 @@ struct apparmor_audit_data {
                         u32 denied;
                         uid_t ouid;
                 } fs;
+               struct {
+                       int type, protocol;
+                       struct sock *sk;
+               } net;
         };
  };
  
diff --git a/security/apparmor/include/net.h b/security/apparmor/include/net.h

new file mode 100644 (file)

index 0000000..3c7d599
--- /dev/null
+++ b/security/apparmor/include/net.h
@@ -0,0 +1,40 @@
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor network mediation definitions.
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009-2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#ifndef __AA_NET_H
+#define __AA_NET_H
+
+#include <net/sock.h>
+
+/* struct aa_net - network confinement data
+ * @allow: basic network families permissions
+ * @audit: which network permissions to force audit
+ * @quiet: which network permissions to quiet rejects
+ */
+struct aa_net {
+       u16 allow[AF_MAX];
+       u16 audit[AF_MAX];
+       u16 quiet[AF_MAX];
+};
+
+extern int aa_net_perm(int op, struct aa_profile *profile, u16 family,
+                      int type, int protocol, struct sock *sk);
+extern int aa_revalidate_sk(int op, struct sock *sk);
+
+static inline void aa_free_net_rules(struct aa_net *new)
+{
+       /* NOP */
+}
+
+#endif /* __AA_NET_H */
diff --git a/security/apparmor/include/policy.h b/security/apparmor/include/policy.h

index bda4569..eb13a73 100644 (file)
--- a/security/apparmor/include/policy.h
+++ b/security/apparmor/include/policy.h
@@ -27,6 +27,7 @@
  #include "capability.h"
  #include "domain.h"
  #include "file.h"
+#include "net.h"
  #include "resource.h"
  
  extern const char *const profile_mode_names[];
@@ -157,6 +158,7 @@ struct aa_policydb {
   * @policy: general match rules governing policy
   * @file: The set of rules governing basic file access and domain transitions
   * @caps: capabilities for the profile
+ * @net: network controls for the profile
   * @rlimits: rlimits for the profile
   *
   * The AppArmor profile contains the basic confinement data.  Each profile
@@ -194,6 +196,7 @@ struct aa_profile {
         struct aa_policydb policy;
         struct aa_file_rules file;
         struct aa_caps caps;
+       struct aa_net net;
         struct aa_rlimit rlimits;
  };
  
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c

index ad05d39..3cde194 100644 (file)
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -32,6 +32,7 @@
  #include "include/context.h"
  #include "include/file.h"
  #include "include/ipc.h"
+#include "include/net.h"
  #include "include/path.h"
  #include "include/policy.h"
  #include "include/procattr.h"
@@ -622,6 +623,104 @@ static int apparmor_task_setrlimit(struct task_struct *task,
         return error;
  }
  
+/* socket_create hook: check the profile returned by __aa_current_profile()
+ * for permission to create a socket of @family/@type.  Requests with @kern
+ * set, and unconfined profiles, are not mediated.
+ */
+static int apparmor_socket_create(int family, int type, int protocol, int kern)
+{
+       struct aa_profile *profile;
+
+       if (kern)
+               return 0;
+
+       profile = __aa_current_profile();
+       if (unconfined(profile))
+               return 0;
+
+       return aa_net_perm(OP_CREATE, profile, family, type, protocol, NULL);
+}
+
+/* socket_bind hook: revalidate sock->sk for OP_BIND */
+static int apparmor_socket_bind(struct socket *sock,
+                               struct sockaddr *address, int addrlen)
+{
+       return aa_revalidate_sk(OP_BIND, sock->sk);
+}
+
+/* socket_connect hook: revalidate sock->sk for OP_CONNECT */
+static int apparmor_socket_connect(struct socket *sock,
+                                  struct sockaddr *address, int addrlen)
+{
+       return aa_revalidate_sk(OP_CONNECT, sock->sk);
+}
+
+/* socket_listen hook: revalidate sock->sk for OP_LISTEN */
+static int apparmor_socket_listen(struct socket *sock, int backlog)
+{
+       return aa_revalidate_sk(OP_LISTEN, sock->sk);
+}
+
+/* socket_accept hook: revalidate the listening sock->sk for OP_ACCEPT;
+ * @newsock is not examined here
+ */
+static int apparmor_socket_accept(struct socket *sock, struct socket *newsock)
+{
+       return aa_revalidate_sk(OP_ACCEPT, sock->sk);
+}
+
+/* socket_sendmsg hook: revalidate sock->sk for OP_SENDMSG */
+static int apparmor_socket_sendmsg(struct socket *sock,
+                                  struct msghdr *msg, int size)
+{
+       return aa_revalidate_sk(OP_SENDMSG, sock->sk);
+}
+
+/* socket_recvmsg hook: revalidate sock->sk for OP_RECVMSG */
+static int apparmor_socket_recvmsg(struct socket *sock,
+                                  struct msghdr *msg, int size, int flags)
+{
+       return aa_revalidate_sk(OP_RECVMSG, sock->sk);
+}
+
+/* socket_getsockname hook: revalidate sock->sk for OP_GETSOCKNAME */
+static int apparmor_socket_getsockname(struct socket *sock)
+{
+       return aa_revalidate_sk(OP_GETSOCKNAME, sock->sk);
+}
+
+/* socket_getpeername hook: revalidate sock->sk for OP_GETPEERNAME */
+static int apparmor_socket_getpeername(struct socket *sock)
+{
+       return aa_revalidate_sk(OP_GETPEERNAME, sock->sk);
+}
+
+/* socket_getsockopt hook: revalidate sock->sk for OP_GETSOCKOPT;
+ * @level and @optname are not examined
+ */
+static int apparmor_socket_getsockopt(struct socket *sock, int level,
+                                     int optname)
+{
+       return aa_revalidate_sk(OP_GETSOCKOPT, sock->sk);
+}
+
+/* socket_setsockopt hook: revalidate sock->sk for OP_SETSOCKOPT;
+ * @level and @optname are not examined
+ */
+static int apparmor_socket_setsockopt(struct socket *sock, int level,
+                                     int optname)
+{
+       return aa_revalidate_sk(OP_SETSOCKOPT, sock->sk);
+}
+
+/* socket_shutdown hook: revalidate sock->sk for OP_SOCK_SHUTDOWN;
+ * @how is not examined
+ */
+static int apparmor_socket_shutdown(struct socket *sock, int how)
+{
+       return aa_revalidate_sk(OP_SOCK_SHUTDOWN, sock->sk);
+}
+
  static struct security_operations apparmor_ops = {
         .name =                         "apparmor",
  
@@ -653,6 +752,19 @@ static struct security_operations apparmor_ops = {
         .getprocattr =                  apparmor_getprocattr,
         .setprocattr =                  apparmor_setprocattr,
  
+       .socket_create =                apparmor_socket_create,
+       .socket_bind =                  apparmor_socket_bind,
+       .socket_connect =               apparmor_socket_connect,
+       .socket_listen =                apparmor_socket_listen,
+       .socket_accept =                apparmor_socket_accept,
+       .socket_sendmsg =               apparmor_socket_sendmsg,
+       .socket_recvmsg =               apparmor_socket_recvmsg,
+       .socket_getsockname =           apparmor_socket_getsockname,
+       .socket_getpeername =           apparmor_socket_getpeername,
+       .socket_getsockopt =            apparmor_socket_getsockopt,
+       .socket_setsockopt =            apparmor_socket_setsockopt,
+       .socket_shutdown =              apparmor_socket_shutdown,
+
         .cred_alloc_blank =             apparmor_cred_alloc_blank,
         .cred_free =                    apparmor_cred_free,
         .cred_prepare =                 apparmor_cred_prepare,
diff --git a/security/apparmor/net.c b/security/apparmor/net.c

new file mode 100644 (file)

index 0000000..8de679a
--- /dev/null
+++ b/security/apparmor/net.c
@@ -0,0 +1,183 @@
+/*
+ * AppArmor security module
+ *
+ * This file contains AppArmor network mediation
+ *
+ * Copyright (C) 1998-2008 Novell/SUSE
+ * Copyright 2009-2010 Canonical Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include "include/apparmor.h"
+#include "include/audit.h"
+#include "include/context.h"
+#include "include/net.h"
+#include "include/policy.h"
+
+#include "af_names.h"
+
+/* Audit names for socket types, indexed by the numeric socket type that
+ * audit_cb() reads from aad->net.type.  Slots without a name of their own
+ * carry an "unknown(N)" placeholder so every listed slot is non-NULL.
+ * The table is read-only, hence const pointers as well as const strings.
+ */
+static const char *const sock_type_names[] = {
+       "unknown(0)",
+       "stream",
+       "dgram",
+       "raw",
+       "rdm",
+       "seqpacket",
+       "dccp",
+       "unknown(7)",
+       "unknown(8)",
+       "unknown(9)",
+       "packet",
+};
+
+/* audit callback for net specific fields
+ * @ab: audit buffer the fields are appended to
+ * @va: the struct common_audit_data set up by audit_net()
+ *
+ * Appends " family=", " sock_type=" and " protocol=" to the record.
+ */
+static void audit_cb(struct audit_buffer *ab, void *va)
+{
+       struct common_audit_data *sa = va;
+       struct apparmor_audit_data *aad = sa->apparmor_audit_data;
+
+       audit_log_format(ab, " family=");
+       if (address_family_names[sa->u.net->family]) {
+               audit_log_string(ab, address_family_names[sa->u.net->family]);
+       } else {
+               /* no space after '=': keep the value attached to its key */
+               audit_log_format(ab, "\"unknown(%d)\"", sa->u.net->family);
+       }
+
+       audit_log_format(ab, " sock_type=");
+       if (sock_type_names[aad->net.type]) {
+               audit_log_string(ab, sock_type_names[aad->net.type]);
+       } else {
+               audit_log_format(ab, "\"unknown(%d)\"", aad->net.type);
+       }
+
+       audit_log_format(ab, " protocol=%d", aad->net.protocol);
+}
+
+/**
+ * audit_net - audit network access
+ * @profile: profile being enforced  (NOT NULL)
+ * @op: operation being checked
+ * @family: network family
+ * @type:   network type
+ * @protocol: network protocol
+ * @sk: socket auditing is being applied to (may be NULL)
+ * @error: error code for failure else 0
+ *
+ * Successful requests are only logged when the profile's audit mask (or
+ * AUDIT_ALL mode) asks for it.  Rejected requests whose type is in the
+ * profile's quiet mask are not logged unless the audit mode is
+ * AUDIT_NOQUIET or AUDIT_ALL.
+ *
+ * Returns: %0 or sa->error else other errorcode on failure
+ */
+static int audit_net(struct aa_profile *profile, int op, u16 family, int type,
+                    int protocol, struct sock *sk, int error)
+{
+       int audit_type = AUDIT_APPARMOR_AUTO;
+       struct common_audit_data sa;
+
+       struct apparmor_audit_data aad = {
+               .op = op,
+               .net = {
+                       .type = type,
+                       .protocol = protocol,
+               },
+               .error = error
+       };
+
+       struct lsm_network_audit net = {
+               .family = family,
+               .sk = sk,
+       };
+
+
+       if (sk) {
+               COMMON_AUDIT_DATA_INIT(&sa, NET);
+       } else {
+               COMMON_AUDIT_DATA_INIT(&sa, NONE);
+       }
+       /* todo fill in socket addr info */
+
+       sa.apparmor_audit_data = &aad;
+       sa.u.net = &net;
+
+       if (likely(!aad.error)) {
+               u16 audit_mask = profile->net.audit[net.family];
+
+               if (likely((AUDIT_MODE(profile) != AUDIT_ALL) &&
+                          !((1 << aad.net.type) & audit_mask)))
+                       return 0;
+               audit_type = AUDIT_APPARMOR_AUDIT;
+       } else {
+               u16 quiet_mask = profile->net.quiet[net.family];
+               u16 kill_mask = 0;
+               /* the single requested type bit is what was denied; it must
+                * not be pre-masked with ~quiet_mask or the quiet test below
+                * can never match
+                */
+               u16 denied = 1 << aad.net.type;
+
+               if (denied & kill_mask)
+                       audit_type = AUDIT_APPARMOR_KILL;
+
+               /* quiet known rejects unless the audit mode overrides it */
+               if ((denied & quiet_mask) &&
+                   AUDIT_MODE(profile) != AUDIT_NOQUIET &&
+                   AUDIT_MODE(profile) != AUDIT_ALL)
+                       return COMPLAIN_MODE(profile) ? 0 : aad.error;
+       }
+
+       return aa_audit(audit_type, profile, GFP_KERNEL, &sa, audit_cb);
+}
+
+/**
+ * aa_net_perm - very coarse network access check
+ * @op: operation being checked
+ * @profile: profile being enforced  (NOT NULL)
+ * @family: network family
+ * @type:   network type
+ * @protocol: network protocol
+ * @sk: sock being checked, passed on to auditing (may be NULL)
+ *
+ * Returns: %0 else error if permission denied, -EINVAL if @family or
+ *          @type is out of range
+ */
+int aa_net_perm(int op, struct aa_profile *profile, u16 family, int type,
+               int protocol, struct sock *sk)
+{
+       u16 family_mask;
+       int error;
+
+       /* family is unsigned, so only the upper bound needs checking */
+       if (family >= AF_MAX)
+               return -EINVAL;
+
+       if ((type < 0) || (type >= SOCK_MAX))
+               return -EINVAL;
+
+       /* unix domain and netlink sockets are handled by ipc */
+       if (family == AF_UNIX || family == AF_NETLINK)
+               return 0;
+
+       family_mask = profile->net.allow[family];
+
+       /* one bit per socket type within the family's allow mask */
+       error = (family_mask & (1 << type)) ? 0 : -EACCES;
+
+       return audit_net(profile, op, family, type, protocol, sk, error);
+}
+
+/**
+ * aa_revalidate_sk - Revalidate access to a sock
+ * @op: operation being checked
+ * @sk: sock being revalidated  (NOT NULL)
+ *
+ * The sock's family, type and protocol are checked against the profile
+ * returned by __aa_current_profile(); unconfined profiles are not checked.
+ *
+ * Returns: %0 else error if permission denied
+ */
+int aa_revalidate_sk(int op, struct sock *sk)
+{
+       struct aa_profile *profile;
+
+       /* aa_revalidate_sk should not be called from interrupt context
+        * don't mediate these calls as they are not task related
+        */
+       if (in_interrupt())
+               return 0;
+
+       profile = __aa_current_profile();
+       if (unconfined(profile))
+               return 0;
+
+       return aa_net_perm(op, profile, sk->sk_family, sk->sk_type,
+                          sk->sk_protocol, sk);
+}
diff --git a/security/apparmor/policy.c b/security/apparmor/policy.c

index f1f7506..b8100a7 100644 (file)
--- a/security/apparmor/policy.c
+++ b/security/apparmor/policy.c
@@ -745,6 +745,7 @@ static void free_profile(struct aa_profile *profile)
  
         aa_free_file_rules(&profile->file);
         aa_free_cap_rules(&profile->caps);
+       aa_free_net_rules(&profile->net);
         aa_free_rlimit_rules(&profile->rlimits);
  
         aa_free_sid(profile->sid);
diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c

index deab7c7..5444c61 100644 (file)
--- a/security/apparmor/policy_unpack.c
+++ b/security/apparmor/policy_unpack.c
@@ -193,6 +193,19 @@ fail:
         return 0;
  }
  
+/* unpack_u16 - read a little-endian u16 tagged AA_U16 from the stream
+ * @e: serialized data read head
+ * @data: where to store the value, or NULL to just skip over it
+ * @name: name to match, forwarded to unpack_nameX()
+ *
+ * Returns: 1 and advances e->pos past the value on success, else 0
+ */
+static bool unpack_u16(struct aa_ext *e, u16 *data, const char *name)
+{
+       if (!unpack_nameX(e, AA_U16, name) || !inbounds(e, sizeof(u16)))
+               return 0;
+
+       if (data)
+               *data = le16_to_cpu(get_unaligned((u16 *) e->pos));
+       e->pos += sizeof(u16);
+
+       return 1;
+}
+
  static bool unpack_u32(struct aa_ext *e, u32 *data, const char *name)
  {
         if (unpack_nameX(e, AA_U32, name)) {
@@ -471,6 +484,7 @@ static struct aa_profile *unpack_profile(struct aa_ext *e)
  {
         struct aa_profile *profile = NULL;
         const char *name = NULL;
+       size_t size = 0;
         int i, error = -EPROTO;
         kernel_cap_t tmpcap;
         u32 tmp;
@@ -564,6 +578,38 @@ static struct aa_profile *unpack_profile(struct aa_ext *e)
         if (!unpack_rlimits(e, profile))
                 goto fail;
  
+       /* network rules: an array of (allow, audit, quiet) u16 triples, one
+        * triple per address family; nothing is read when size is zero
+        */
+       size = unpack_array(e, "net_allowed_af");
+       if (size) {
+               for (i = 0; i < size; i++) {
+                       /* discard extraneous rules that this kernel will
+                        * never request; the tables hold AF_MAX entries so
+                        * the last valid index is AF_MAX - 1
+                        */
+                       if (i >= AF_MAX) {
+                               u16 skipped;
+
+                               if (!unpack_u16(e, &skipped, NULL) ||
+                                   !unpack_u16(e, &skipped, NULL) ||
+                                   !unpack_u16(e, &skipped, NULL))
+                                       goto fail;
+                               continue;
+                       }
+                       if (!unpack_u16(e, &profile->net.allow[i], NULL))
+                               goto fail;
+                       if (!unpack_u16(e, &profile->net.audit[i], NULL))
+                               goto fail;
+                       if (!unpack_u16(e, &profile->net.quiet[i], NULL))
+                               goto fail;
+               }
+               if (!unpack_nameX(e, AA_ARRAYEND, NULL))
+                       goto fail;
+       }
+       /*
+        * allow unix domain and netlink sockets they are handled
+        * by IPC
+        */
+       profile->net.allow[AF_UNIX] = 0xffff;
+       profile->net.allow[AF_NETLINK] = 0xffff;
+
         if (unpack_nameX(e, AA_STRUCT, "policydb")) {
                 /* generic policy dfa - optional and may be NULL */
                 profile->policy.dfa = unpack_dfa(e);
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c

index 3b75b2e..b8efcea 100644 (file)
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -675,8 +675,15 @@ static void draw_c_p_states(void)
          * two pass drawing so that the P state bars are on top of the C state blocks
          */
         while (pwr) {
-               if (pwr->type == CSTATE)
+               if (pwr->type == CSTATE) {
+                       /* If the first event is an _end event, start timestamp is zero
+                          -> ignore these */
+                       if (pwr->start_time == 0 || pwr->end_time == 0) {
+                               pwr = pwr->next;
+                               continue;
+                       }
                         svg_cstate(pwr->cpu, pwr->start_time, pwr->end_time, pwr->state);
+               }
                 pwr = pwr->next;
         }
  
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c

index dcaf272..769dfb3 100644 (file)
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -202,7 +202,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
         old_irr = ioapic->irr;
         if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
                 entry = ioapic->redirtbl[irq];
-               level ^= entry.fields.polarity;
+// polarity is always active high in qemu
+//             level ^= entry.fields.polarity;
                 if (!level)
                         ioapic->irr &= ~mask;
                 else {
author	Jeff Mahoney <jeffm@suse.com>
	Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)
committer	Jeff Mahoney <jeffm@suse.com>
	Mon, 21 May 2012 12:49:11 +0000 (08:49 -0400)