Implement IBM features (not only) on powerpc: 304346: Relocatable kernel for ppc64...
authorTorsten Duwe <duwe@suse.de>
Thu, 19 Nov 2009 17:29:54 +0000 (18:29 +0100)
committerTorsten Duwe <duwe@suse.de>
Tue, 24 Nov 2009 12:53:02 +0000 (13:53 +0100)
suse-commit: 8d86162cb9c9ac76efbc3b5d3b56bff6e44d8298

39 files changed:
Documentation/cpuidle/core.txt
arch/powerpc/Kconfig
arch/powerpc/include/asm/lppaca.h
arch/powerpc/include/asm/pSeries_reconfig.h
arch/powerpc/include/asm/processor.h
arch/powerpc/include/asm/prom.h
arch/powerpc/include/asm/system.h
arch/powerpc/kernel/idle.c
arch/powerpc/kernel/sysfs.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/numa.c
arch/powerpc/platforms/pseries/Makefile
arch/powerpc/platforms/pseries/dlpar.c [new file with mode: 0644]
arch/powerpc/platforms/pseries/hotplug-cpu.c
arch/powerpc/platforms/pseries/offline_states.h [new file with mode: 0644]
arch/powerpc/platforms/pseries/plpar_wrappers.h
arch/powerpc/platforms/pseries/processor_idle.c [new file with mode: 0644]
arch/powerpc/platforms/pseries/pseries.h
arch/powerpc/platforms/pseries/reconfig.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/platforms/pseries/smp.c
arch/powerpc/xmon/xmon.c
arch/x86/kernel/apm_32.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/xen/setup.c
drivers/acpi/processor_core.c
drivers/acpi/processor_idle.c
drivers/base/cpu.c
drivers/base/memory.c
drivers/cpuidle/cpuidle.c
drivers/cpuidle/cpuidle.h
drivers/cpuidle/driver.c
drivers/cpuidle/governor.c
drivers/cpuidle/sysfs.c
include/linux/cpu.h
include/linux/cpuidle.h
include/linux/memory_hotplug.h

index 63ecc5d..483bc31 100644 (file)
@@ -21,3 +21,38 @@ which can be used to switch governors at run time. This boot option
 is meant for developer testing only. In normal usage, kernel picks the
 best governor based on governor ratings.
 SEE ALSO: sysfs.txt in this directory.
+
+Design:
+
+Cpuidle allows multiple sets of idle routines to be registered.
+The most recently registered set is the one the cpuidle governors use
+as the current active set when choosing an idle state. The sets are
+kept on a list; each newly registered set is added to the head of the
+list and becomes the current active set.
+
+An example of how this would work on x86 is shown below.
+
+-----------------                                      -----------------
+|              |                                       |               |
+| choose b/w   |       mwait is chosen                 |    mwait      |
+| mwait, poll, |-------------------------------------> |(current active|
+| default, c1e |       register to cpuidle             |    set)       |
+|              |       with mwait as the idle routine  |               |
+-----------------                                      -----------------
+
+
+-----------------                                      -----------------
+|              |                                       |  c1, c2, c3   |
+|     ACPI     |       register to cpuidle             |   (current)   |
+|   discovery  |-------------------------------------> |---------------|
+|              |       with c1, c2, c3                 |     mwait     |
+|              |       as set of idle routines         |               |
+-----------------                                      -----------------
+
+With this mechanism, a module can register and unregister its set of
+idle routines at run time in a clean manner.
+
+The main idle routine called inside every arch's cpu_idle() is defined
+in drivers/cpuidle/cpuidle.c, which in turn calls the idle routine
+selected by the governor. If CONFIG_CPU_IDLE is disabled, the arch must
+provide an alternate definition of cpuidle_idle_call().
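
A minimal sketch of the registration flow described above; the calls are
the cpuidle entry points this series uses (cpuidle_register_driver() and
friends), while the driver, device, and state-setup names are
illustrative only:

        static struct cpuidle_driver my_idle_driver = {
                .name  = "my_idle",
                .owner = THIS_MODULE,
        };
        static DEFINE_PER_CPU(struct cpuidle_device, my_idle_dev);

        static int __init my_idle_init(void)
        {
                int cpu, rc;

                /* this set becomes the list head, i.e. the active set */
                rc = cpuidle_register_driver(&my_idle_driver);
                if (rc)
                        return rc;
                for_each_online_cpu(cpu) {
                        /* dev->states[] and state_count set up first (omitted) */
                        cpuidle_register_device(&per_cpu(my_idle_dev, cpu));
                }
                return 0;
        }

        static void __exit my_idle_exit(void)
        {
                int cpu;

                for_each_online_cpu(cpu)
                        cpuidle_unregister_device(&per_cpu(my_idle_dev, cpu));
                /* the previously registered set becomes active again */
                cpuidle_unregister_driver(&my_idle_driver);
        }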
index 2ba14e7..08df576 100644 (file)
@@ -91,6 +91,12 @@ config ARCH_HAS_ILOG2_U64
        bool
        default y if 64BIT
 
+config ARCH_HAS_CPU_IDLE_WAIT
+       def_bool y
+
+config ARCH_HAS_DEFAULT_IDLE
+       def_bool y
+
 config GENERIC_HWEIGHT
        bool
        default y
@@ -247,6 +253,12 @@ source "kernel/Kconfig.freezer"
 source "arch/powerpc/sysdev/Kconfig"
 source "arch/powerpc/platforms/Kconfig"
 
+menu "Power management options"
+
+source "drivers/cpuidle/Kconfig"
+
+endmenu
+
 menu "Kernel options"
 
 config HIGHMEM
index f78f65c..14b592d 100644 (file)
@@ -100,7 +100,14 @@ struct lppaca {
        // Used to pass parms from the OS to PLIC for SetAsrAndRfid
        u64     saved_gpr3;             // Saved GPR3                   x20-x27
        u64     saved_gpr4;             // Saved GPR4                   x28-x2F
-       u64     saved_gpr5;             // Saved GPR5                   x30-x37
+       union {
+               u64     saved_gpr5;     /* Saved GPR5               x30-x37 */
+               struct {
+                       u8      cede_latency_hint;  /*                  x30 */
+                       u8      reserved[7];        /*              x31-x36 */
+               } fields;
+       } gpr5_dword;
+
 
        u8      dtl_enable_mask;        // Dispatch Trace Log mask      x38-x38
        u8      donate_dedicated_cpu;   // Donate dedicated CPU cycles  x39-x39
index e482e53..d4b4bfa 100644 (file)
@@ -17,6 +17,7 @@
 #ifdef CONFIG_PPC_PSERIES
 extern int pSeries_reconfig_notifier_register(struct notifier_block *);
 extern void pSeries_reconfig_notifier_unregister(struct notifier_block *);
+extern struct blocking_notifier_head pSeries_reconfig_chain;
 #else /* !CONFIG_PPC_PSERIES */
 static inline int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
index 9eed29e..987bf8d 100644 (file)
@@ -332,6 +332,8 @@ static inline unsigned long get_clean_sp(struct pt_regs *regs, int is_32)
 }
 #endif
 
+extern int boot_option_idle_override;
+
 #endif /* __KERNEL__ */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PROCESSOR_H */
index 6ff0418..c1a582b 100644 (file)
@@ -349,6 +349,18 @@ extern int of_irq_to_resource(struct device_node *dev, int index,
  */
 extern void __iomem *of_iomap(struct device_node *device, int index);
 
+struct of_drconf_cell {
+       u64     base_addr;
+       u32     drc_index;
+       u32     reserved;
+       u32     aa_index;
+       u32     flags;
+};
+
+#define DRCONF_MEM_ASSIGNED    0x00000008
+#define DRCONF_MEM_AI_INVALID  0x00000040
+#define DRCONF_MEM_RESERVED    0x00000080
+
 /*
  * NB:  This is here while we transition from using asm/prom.h
  * to linux/of.h
index bb8e006..03fb0a7 100644 (file)
@@ -218,6 +218,7 @@ extern unsigned long klimit;
 extern void *alloc_maybe_bootmem(size_t size, gfp_t mask);
 extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
 
+extern void default_idle(void);
 extern int powersave_nap;      /* set if nap mode can be used in idle loop */
 
 /*
@@ -546,5 +547,13 @@ extern void account_system_vtime(struct task_struct *);
 
 extern struct dentry *powerpc_debugfs_root;
 
+void cpu_idle_wait(void);
+
+#ifdef CONFIG_CPU_IDLE
+extern void update_smt_snooze_delay(int snooze);
+#else
+static inline void update_smt_snooze_delay(int snooze) {}
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_SYSTEM_H */
index 88d9c1d..a9aea17 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/sysctl.h>
 #include <linux/tick.h>
+#include <linux/cpuidle.h>
 
 #include <asm/system.h>
 #include <asm/processor.h>
 #define cpu_should_die()       0
 #endif
 
+int boot_option_idle_override = 0;
+
 static int __init powersave_off(char *arg)
 {
-       ppc_md.power_save = NULL;
+       boot_option_idle_override = 1;
        return 0;
 }
 __setup("powersave=off", powersave_off);
 
+#ifndef CONFIG_CPU_IDLE
+void cpuidle_idle_call(void)
+{
+       local_irq_enable();
+       cpu_relax();
+}
+#endif
+
 /*
  * The body of the idle task.
  */
@@ -60,35 +71,26 @@ void cpu_idle(void)
                while (!need_resched() && !cpu_should_die()) {
                        ppc64_runlatch_off();
 
-                       if (ppc_md.power_save) {
-                               clear_thread_flag(TIF_POLLING_NRFLAG);
-                               /*
-                                * smp_mb is so clearing of TIF_POLLING_NRFLAG
-                                * is ordered w.r.t. need_resched() test.
-                                */
-                               smp_mb();
-                               local_irq_disable();
-
-                               /* Don't trace irqs off for idle */
-                               stop_critical_timings();
-
-                               /* check again after disabling irqs */
-                               if (!need_resched() && !cpu_should_die())
-                                       ppc_md.power_save();
-
-                               start_critical_timings();
-
-                               local_irq_enable();
-                               set_thread_flag(TIF_POLLING_NRFLAG);
-
-                       } else {
-                               /*
-                                * Go into low thread priority and possibly
-                                * low power mode.
-                                */
-                               HMT_low();
-                               HMT_very_low();
-                       }
+                       clear_thread_flag(TIF_POLLING_NRFLAG);
+                       /*
+                        * smp_mb is so clearing of TIF_POLLING_NRFLAG
+                        * is ordered w.r.t. need_resched() test.
+                        */
+                       smp_mb();
+                       local_irq_disable();
+
+                       /* Don't trace irqs off for idle */
+                       stop_critical_timings();
+
+                       /* check again after disabling irqs */
+                       if (!need_resched() && !cpu_should_die())
+                               cpuidle_idle_call();
+
+                       start_critical_timings();
+
+                       local_irq_enable();
+                       set_thread_flag(TIF_POLLING_NRFLAG);
+
                }
 
                HMT_medium();
@@ -102,6 +104,31 @@ void cpu_idle(void)
        }
 }
 
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
+ * idle loop and start using the new idle loop.
+ * Required while changing idle handler on SMP systems.
+ * Caller must have changed idle handler to the new value before the call.
+ */
+void cpu_idle_wait(void)
+{
+       /* Ensure that new value of idle is set */
+       smp_mb();
+       /* kick all the CPUs so that they exit out of old idle routine */
+       smp_call_function(do_nothing, NULL, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+void default_idle(void)
+{
+       HMT_low();
+       HMT_very_low();
+}
+
 int powersave_nap;
 
 #ifdef CONFIG_SYSCTL
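
A hedged sketch of the calling pattern cpu_idle_wait() exists for:
publish the new idle routine first, then call cpu_idle_wait() so that no
CPU can still be executing the old one. The hook below is hypothetical;
in this series the idle path actually goes through cpuidle_idle_call():

        static void (*my_idle_hook)(void);      /* hypothetical module hook */

        static void switch_idle(void (*new_idle)(void))
        {
                my_idle_hook = new_idle;  /* new value visible before the kick */
                cpu_idle_wait();          /* IPI CPUs out of the old routine */
        }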
index 956ab33..8aecf5e 100644 (file)
@@ -18,6 +18,7 @@
 #include <asm/machdep.h>
 #include <asm/smp.h>
 #include <asm/pmc.h>
+#include <asm/system.h>
 
 #include "cacheinfo.h"
 
@@ -51,6 +52,7 @@ static ssize_t store_smt_snooze_delay(struct sys_device *dev,
                return -EINVAL;
 
        per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+       update_smt_snooze_delay(snooze);
 
        return count;
 }
index 5973631..639c936 100644 (file)
@@ -111,8 +111,19 @@ EXPORT_SYMBOL(phys_mem_access_prot);
 #ifdef CONFIG_MEMORY_HOTPLUG
 
 #ifdef CONFIG_NUMA
+int __attribute__((weak)) platform_probe_memory(u64 start)
+{
+       return 0;
+}
+
 int memory_add_physaddr_to_nid(u64 start)
 {
+       int rc;
+
+       rc = platform_probe_memory(start);
+       if (rc)
+               return rc;
+
        return hot_add_scn_to_nid(start);
 }
 #endif
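
The weak stub above is a link-time hook: a platform that supplies a
non-weak platform_probe_memory() (as the pseries dlpar.c added below
does) overrides it transparently. The pattern in isolation, with a
hypothetical hook name:

        /* generic code: weak default, used when no one overrides it */
        int __attribute__((weak)) platform_hook(u64 addr)
        {
                return 0;
        }

        /* platform code, separate object file: the strong definition wins */
        int platform_hook(u64 addr)
        {
                return platform_do_work(addr);  /* hypothetical */
        }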
index b037d95..c4de810 100644 (file)
@@ -296,18 +296,6 @@ static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
        return result;
 }
 
-struct of_drconf_cell {
-       u64     base_addr;
-       u32     drc_index;
-       u32     reserved;
-       u32     aa_index;
-       u32     flags;
-};
-
-#define DRCONF_MEM_ASSIGNED    0x00000008
-#define DRCONF_MEM_AI_INVALID  0x00000040
-#define DRCONF_MEM_RESERVED    0x00000080
-
 /*
  * Read the next lmb list entry from the ibm,dynamic-memory property
  * and return the information in the provided of_drconf_cell structure.
index 790c0b8..77ce538 100644 (file)
@@ -8,7 +8,7 @@ endif
 
 obj-y                  := lpar.o hvCall.o nvram.o reconfig.o \
                           setup.o iommu.o ras.o rtasd.o \
-                          firmware.o power.o
+                          firmware.o power.o dlpar.o
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_XICS)     += xics.o
 obj-$(CONFIG_SCANLOG)  += scanlog.o
@@ -26,3 +26,4 @@ obj-$(CONFIG_HCALL_STATS)     += hvCall_inst.o
 obj-$(CONFIG_PHYP_DUMP)        += phyp_dump.o
 obj-$(CONFIG_CMM)              += cmm.o
 obj-$(CONFIG_DTL)              += dtl.o
+obj-$(CONFIG_CPU_IDLE)         += processor_idle.o
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
new file mode 100644 (file)
index 0000000..b6fc6ab
--- /dev/null
@@ -0,0 +1,816 @@
+/*
+ * Support for dynamic reconfiguration (including PCI, Memory, and CPU
+ * Hotplug and Dynamic Logical Partitioning on PAPR platforms).
+ *
+ * Copyright (C) 2009 Nathan Fontenot
+ * Copyright (C) 2009 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/memory_hotplug.h>
+#include <linux/sysdev.h>
+#include <linux/sysfs.h>
+#include <linux/cpu.h>
+#include "offline_states.h"
+
+#include <asm/prom.h>
+#include <asm/machdep.h>
+#include <asm/uaccess.h>
+#include <asm/rtas.h>
+#include <asm/pSeries_reconfig.h>
+
+#define CFG_CONN_WORK_SIZE     4096
+static char workarea[CFG_CONN_WORK_SIZE];
+static DEFINE_SPINLOCK(workarea_lock);
+
+struct cc_workarea {
+       u32     drc_index;
+       u32     zero;
+       u32     name_offset;
+       u32     prop_length;
+       u32     prop_offset;
+};
+
+static struct property *parse_cc_property(char *workarea)
+{
+       struct property *prop;
+       struct cc_workarea *ccwa;
+       char *name;
+       char *value;
+
+       prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+       if (!prop)
+               return NULL;
+
+       ccwa = (struct cc_workarea *)workarea;
+       name = workarea + ccwa->name_offset;
+       prop->name = kzalloc(strlen(name) + 1, GFP_KERNEL);
+       if (!prop->name) {
+               kfree(prop);
+               return NULL;
+       }
+
+       strcpy(prop->name, name);
+
+       prop->length = ccwa->prop_length;
+       value = workarea + ccwa->prop_offset;
+       prop->value = kzalloc(prop->length, GFP_KERNEL);
+       if (!prop->value) {
+               kfree(prop->name);
+               kfree(prop);
+               return NULL;
+       }
+
+       memcpy(prop->value, value, prop->length);
+       return prop;
+}
+
+static void free_property(struct property *prop)
+{
+       kfree(prop->name);
+       kfree(prop->value);
+       kfree(prop);
+}
+
+static struct device_node *parse_cc_node(char *work_area)
+{
+       struct device_node *dn;
+       struct cc_workarea *ccwa;
+       char *name;
+
+       dn = kzalloc(sizeof(*dn), GFP_KERNEL);
+       if (!dn)
+               return NULL;
+
+       ccwa = (struct cc_workarea *)work_area;
+       name = work_area + ccwa->name_offset;
+       dn->full_name = kzalloc(strlen(name) + 1, GFP_KERNEL);
+       if (!dn->full_name) {
+               kfree(dn);
+               return NULL;
+       }
+
+       strcpy(dn->full_name, name);
+       return dn;
+}
+
+static void free_one_cc_node(struct device_node *dn)
+{
+       struct property *prop;
+
+       while (dn->properties) {
+               prop = dn->properties;
+               dn->properties = prop->next;
+               free_property(prop);
+       }
+
+       kfree(dn->full_name);
+       kfree(dn);
+}
+
+static void free_cc_nodes(struct device_node *dn)
+{
+       if (dn->child)
+               free_cc_nodes(dn->child);
+
+       if (dn->sibling)
+               free_cc_nodes(dn->sibling);
+
+       free_one_cc_node(dn);
+}
+
+#define NEXT_SIBLING    1
+#define NEXT_CHILD      2
+#define NEXT_PROPERTY   3
+#define PREV_PARENT     4
+#define MORE_MEMORY     5
+#define CALL_AGAIN     -2
+#define ERR_CFG_USE     -9003
+
+struct device_node *configure_connector(u32 drc_index)
+{
+       struct device_node *dn;
+       struct device_node *first_dn = NULL;
+       struct device_node *last_dn = NULL;
+       struct property *property;
+       struct property *last_property = NULL;
+       struct cc_workarea *ccwa;
+       int cc_token;
+       int rc;
+
+       cc_token = rtas_token("ibm,configure-connector");
+       if (cc_token == RTAS_UNKNOWN_SERVICE)
+               return NULL;
+
+       spin_lock(&workarea_lock);
+
+       ccwa = (struct cc_workarea *)&workarea[0];
+       ccwa->drc_index = drc_index;
+       ccwa->zero = 0;
+
+       rc = rtas_call(cc_token, 2, 1, NULL, workarea, NULL);
+       while (rc) {
+               switch (rc) {
+               case NEXT_SIBLING:
+                       dn = parse_cc_node(workarea);
+                       if (!dn)
+                               goto cc_error;
+
+                       dn->parent = last_dn->parent;
+                       last_dn->sibling = dn;
+                       last_dn = dn;
+                       break;
+
+               case NEXT_CHILD:
+                       dn = parse_cc_node(workarea);
+                       if (!dn)
+                               goto cc_error;
+
+                       if (!first_dn)
+                               first_dn = dn;
+                       else {
+                               dn->parent = last_dn;
+                               if (last_dn)
+                                       last_dn->child = dn;
+                       }
+
+                       last_dn = dn;
+                       break;
+
+               case NEXT_PROPERTY:
+                       property = parse_cc_property(workarea);
+                       if (!property)
+                               goto cc_error;
+
+                       if (!last_dn->properties)
+                               last_dn->properties = property;
+                       else
+                               last_property->next = property;
+
+                       last_property = property;
+                       break;
+
+               case PREV_PARENT:
+                       last_dn = last_dn->parent;
+                       break;
+
+               case CALL_AGAIN:
+                       break;
+
+               case MORE_MEMORY:
+               case ERR_CFG_USE:
+               default:
+                       printk(KERN_ERR "Unexpected Error (%d) "
+                              "returned from configure-connector\n", rc);
+                       goto cc_error;
+               }
+
+               rc = rtas_call(cc_token, 2, 1, NULL, workarea, NULL);
+       }
+
+       spin_unlock(&workarea_lock);
+       return first_dn;
+
+cc_error:
+       spin_unlock(&workarea_lock);
+
+       if (first_dn)
+               free_cc_nodes(first_dn);
+
+       return NULL;
+}
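
For orientation, a sketch of how a caller consumes the subtree that
configure_connector() returns; this mirrors cpu_probe_store() further
down, with error handling trimmed and drc_index assumed valid:

        struct device_node *dn;

        dn = configure_connector(drc_index);
        if (!dn)
                return -EINVAL;
        if (add_device_tree_nodes(dn)) {        /* graft into the live tree */
                free_cc_nodes(dn);              /* unwind the parsed subtree */
                return -EINVAL;
        }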
+
+static struct device_node *derive_parent(const char *path)
+{
+       struct device_node *parent;
+       char parent_path[128];
+       int parent_path_len;
+
+       parent_path_len = strrchr(path, '/') - path + 1;
+       strlcpy(parent_path, path, min_t(size_t, parent_path_len, sizeof(parent_path)));
+
+       parent = of_find_node_by_path(parent_path);
+
+       return parent;
+}
+
+static int add_one_node(struct device_node *dn)
+{
+       struct proc_dir_entry *ent;
+       int rc;
+
+       of_node_set_flag(dn, OF_DYNAMIC);
+       kref_init(&dn->kref);
+       dn->parent = derive_parent(dn->full_name);
+
+       rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+                                         PSERIES_RECONFIG_ADD, dn);
+       if (rc == NOTIFY_BAD) {
+               printk(KERN_ERR "Failed to add device node %s\n",
+                      dn->full_name);
+               return -ENOMEM; /* For now, safe to assume kmalloc failure */
+       }
+
+       of_attach_node(dn);
+
+#ifdef CONFIG_PROC_DEVICETREE
+       ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde);
+       if (ent)
+               proc_device_tree_add_node(dn, ent);
+#endif
+
+       of_node_put(dn->parent);
+       return 0;
+}
+
+int add_device_tree_nodes(struct device_node *dn)
+{
+       struct device_node *child = dn->child;
+       struct device_node *sibling = dn->sibling;
+       int rc;
+
+       dn->child = NULL;
+       dn->sibling = NULL;
+       dn->parent = NULL;
+
+       rc = add_one_node(dn);
+       if (rc)
+               return rc;
+
+       if (child) {
+               rc = add_device_tree_nodes(child);
+               if (rc)
+                       return rc;
+       }
+
+       if (sibling)
+               rc = add_device_tree_nodes(sibling);
+
+       return rc;
+}
+
+static int remove_one_node(struct device_node *dn)
+{
+       struct device_node *parent = dn->parent;
+       struct property *prop = dn->properties;
+
+#ifdef CONFIG_PROC_DEVICETREE
+       while (prop) {
+               remove_proc_entry(prop->name, dn->pde);
+               prop = prop->next;
+       }
+
+       if (dn->pde)
+               remove_proc_entry(dn->pde->name, parent->pde);
+#endif
+
+       blocking_notifier_call_chain(&pSeries_reconfig_chain,
+                           PSERIES_RECONFIG_REMOVE, dn);
+       of_detach_node(dn);
+       of_node_put(dn); /* Must decrement the refcount */
+
+       return 0;
+}
+
+static int _remove_device_tree_nodes(struct device_node *dn)
+{
+       int rc;
+
+       if (dn->child) {
+               rc = _remove_device_tree_nodes(dn->child);
+               if (rc)
+                       return rc;
+       }
+
+       if (dn->sibling) {
+               rc = _remove_device_tree_nodes(dn->sibling);
+               if (rc)
+                       return rc;
+       }
+
+       rc = remove_one_node(dn);
+       return rc;
+}
+
+int remove_device_tree_nodes(struct device_node *dn)
+{
+       int rc;
+
+       if (dn->child) {
+               rc = _remove_device_tree_nodes(dn->child);
+               if (rc)
+                       return rc;
+       }
+
+       rc = remove_one_node(dn);
+       return rc;
+}
+
+int online_node_cpus(struct device_node *dn)
+{
+       int rc = 0;
+       unsigned int cpu;
+       int len, nthreads, i;
+       const u32 *intserv;
+
+       intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+       if (!intserv)
+               return -EINVAL;
+
+       nthreads = len / sizeof(u32);
+
+       cpu_maps_update_begin();
+       for (i = 0; i < nthreads; i++) {
+               for_each_present_cpu(cpu) {
+                       if (get_hard_smp_processor_id(cpu) != intserv[i])
+                               continue;
+                       BUG_ON(get_cpu_current_state(cpu)
+                                       != CPU_STATE_OFFLINE);
+                       cpu_maps_update_done();
+                       rc = cpu_up(cpu);
+                       if (rc)
+                               goto out;
+                       cpu_maps_update_begin();
+
+                       break;
+               }
+               if (cpu == num_possible_cpus())
+                       printk(KERN_WARNING "Could not find cpu to online "
+                              "with physical id 0x%x\n", intserv[i]);
+       }
+       cpu_maps_update_done();
+
+out:
+       return rc;
+
+}
+
+int offline_node_cpus(struct device_node *dn)
+{
+       int rc = 0;
+       unsigned int cpu;
+       int len, nthreads, i;
+       const u32 *intserv;
+
+       intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len);
+       if (!intserv)
+               return -EINVAL;
+
+       nthreads = len / sizeof(u32);
+
+       cpu_maps_update_begin();
+       for (i = 0; i < nthreads; i++) {
+               for_each_present_cpu(cpu) {
+                       if (get_hard_smp_processor_id(cpu) != intserv[i])
+                               continue;
+
+                       if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE)
+                               break;
+
+                       if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) {
+                               cpu_maps_update_done();
+                               rc = cpu_down(cpu);
+                               if (rc)
+                                       goto out;
+                               cpu_maps_update_begin();
+                               break;
+
+                       }
+
+                       /*
+                        * The cpu is in CPU_STATE_INACTIVE.
+                        * Upgrade its state to CPU_STATE_OFFLINE.
+                        */
+                       set_preferred_offline_state(cpu, CPU_STATE_OFFLINE);
+                       BUG_ON(plpar_hcall_norets(H_PROD, intserv[i])
+                                                               != H_SUCCESS);
+                       __cpu_die(cpu);
+                       break;
+               }
+               if (cpu == num_possible_cpus())
+                       printk(KERN_WARNING "Could not find cpu to offline "
+                              "with physical id 0x%x\n", intserv[i]);
+       }
+       cpu_maps_update_done();
+
+out:
+       return rc;
+
+}
+
+#define DR_ENTITY_SENSE                9003
+#define DR_ENTITY_PRESENT      1
+#define DR_ENTITY_UNUSABLE     2
+#define ALLOCATION_STATE       9003
+#define ALLOC_UNUSABLE         0
+#define ALLOC_USABLE           1
+#define ISOLATION_STATE                9001
+#define ISOLATE                        0
+#define UNISOLATE              1
+
+int acquire_drc(u32 drc_index)
+{
+       int dr_status, rc;
+
+       rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+                      DR_ENTITY_SENSE, drc_index);
+       if (rc || dr_status != DR_ENTITY_UNUSABLE)
+               return -1;
+
+       rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE);
+       if (rc)
+               return rc;
+
+       rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+       if (rc) {
+               rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+               return rc;
+       }
+
+       return 0;
+}
+
+int release_drc(u32 drc_index)
+{
+       int dr_status, rc;
+
+       rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status,
+                      DR_ENTITY_SENSE, drc_index);
+       if (rc || dr_status != DR_ENTITY_PRESENT)
+               return -1;
+
+       rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE);
+       if (rc)
+               return rc;
+
+       rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE);
+       if (rc) {
+               rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE);
+               return rc;
+       }
+
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_MUTEX(pseries_cpu_hotplug_mutex);
+
+void cpu_hotplug_driver_lock(void)
+{
+       mutex_lock(&pseries_cpu_hotplug_mutex);
+}
+
+void cpu_hotplug_driver_unlock(void)
+{
+       mutex_unlock(&pseries_cpu_hotplug_mutex);
+}
+
+static ssize_t cpu_probe_store(struct class *class, const char *buf,
+                              size_t count)
+{
+       struct device_node *dn;
+       unsigned long drc_index;
+       char *cpu_name;
+       int rc;
+
+       cpu_hotplug_driver_lock();
+       rc = strict_strtoul(buf, 0, &drc_index);
+       if (rc)
+               goto out;
+
+       rc = acquire_drc(drc_index);
+       if (rc)
+               goto out;
+
+       dn = configure_connector(drc_index);
+       if (!dn) {
+               release_drc(drc_index);
+               goto out;
+       }
+
+       /* fixup dn name */
+       cpu_name = kzalloc(strlen(dn->full_name) + strlen("/cpus/") + 1,
+                          GFP_KERNEL);
+       if (!cpu_name) {
+               free_cc_nodes(dn);
+               release_drc(drc_index);
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       sprintf(cpu_name, "/cpus/%s", dn->full_name);
+       kfree(dn->full_name);
+       dn->full_name = cpu_name;
+
+       rc = add_device_tree_nodes(dn);
+       if (rc) {
+               release_drc(drc_index);
+               goto out;
+       }
+       rc = online_node_cpus(dn);
+out:
+       cpu_hotplug_driver_unlock();
+
+       return rc ? -EINVAL : count;
+}
+
+static ssize_t cpu_release_store(struct class *class, const char *buf,
+                                size_t count)
+{
+       struct device_node *dn;
+       const u32 *drc_index;
+       int rc;
+
+       dn = of_find_node_by_path(buf);
+       if (!dn)
+               return -EINVAL;
+
+       drc_index = of_get_property(dn, "ibm,my-drc-index", NULL);
+       if (!drc_index) {
+               of_node_put(dn);
+               return -EINVAL;
+       }
+
+       cpu_hotplug_driver_lock();
+       rc = offline_node_cpus(dn);
+
+       if (rc)
+               goto out;
+
+       rc = release_drc(*drc_index);
+       if (rc) {
+               of_node_put(dn);
+               goto out;
+       }
+
+       rc = remove_device_tree_nodes(dn);
+       if (rc)
+               acquire_drc(*drc_index);
+
+       of_node_put(dn);
+out:
+       cpu_hotplug_driver_unlock();
+       return rc ? -EINVAL : count;
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
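
With these attributes registered by pseries_dlpar_init() at the bottom
of this file, the expected usage is to write a drc-index to
/sys/devices/system/cpu/probe to add a processor, and the node's
device-tree path to /sys/devices/system/cpu/release to remove one, e.g.
(values illustrative) "echo 0x10000000 > probe" and
"echo /cpus/PowerPC,POWER6@4 > release".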
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+static struct property *clone_property(struct property *old_prop)
+{
+       struct property *new_prop;
+
+       new_prop = kzalloc((sizeof *new_prop), GFP_KERNEL);
+       if (!new_prop)
+               return NULL;
+
+       new_prop->name = kstrdup(old_prop->name, GFP_KERNEL);
+       new_prop->value = kzalloc(old_prop->length + 1, GFP_KERNEL);
+       if (!new_prop->name || !new_prop->value) {
+               free_property(new_prop);
+               return NULL;
+       }
+
+       memcpy(new_prop->value, old_prop->value, old_prop->length);
+       new_prop->length = old_prop->length;
+
+       return new_prop;
+}
+
+int platform_probe_memory(u64 phys_addr)
+{
+       struct device_node *dn = NULL;
+       struct property *new_prop;
+       struct property *old_prop;
+       struct of_drconf_cell *drmem;
+       const u64 *lmb_size;
+       int num_entries, i;
+       int rc = -EINVAL;
+
+       if (!phys_addr)
+               goto memory_probe_exit;
+
+       dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+       if (!dn)
+               goto memory_probe_exit;
+
+       lmb_size = of_get_property(dn, "ibm,lmb-size", NULL);
+       if (!lmb_size)
+               goto memory_probe_exit;
+
+       old_prop = of_find_property(dn, "ibm,dynamic-memory", NULL);
+       if (!old_prop)
+               goto memory_probe_exit;
+
+       num_entries = *(u32 *)old_prop->value;
+       drmem = (struct of_drconf_cell *)
+                               ((char *)old_prop->value + sizeof(u32));
+
+       for (i = 0; i < num_entries; i++) {
+               u64 lmb_end_addr = drmem[i].base_addr + *lmb_size;
+               if (phys_addr >= drmem[i].base_addr
+                   && phys_addr < lmb_end_addr)
+                       break;
+       }
+
+       if (i >= num_entries)
+               goto memory_probe_exit;
+
+       if (drmem[i].flags & DRCONF_MEM_ASSIGNED) {
+               /* This lmb is already assigned to the system, nothing to do */
+               rc = 0;
+               goto memory_probe_exit;
+       }
+
+       rc = acquire_drc(drmem[i].drc_index);
+       if (rc) {
+               rc = -EINVAL;
+               goto memory_probe_exit;
+       }
+
+       new_prop = clone_property(old_prop);
+       if (!new_prop) {
+               release_drc(drmem[i].drc_index);
+               rc = -ENOMEM;
+               goto memory_probe_exit;
+       }
+       drmem = (struct of_drconf_cell *)
+                               ((char *)new_prop->value + sizeof(u32));
+
+       drmem[i].flags |= DRCONF_MEM_ASSIGNED;
+       rc = prom_update_property(dn, new_prop, old_prop);
+       if (rc) {
+               free_property(new_prop);
+               rc = -EINVAL;
+               goto memory_probe_exit;
+       }
+
+       rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+                                         PSERIES_DRCONF_MEM_ADD,
+                                         &drmem[i].base_addr);
+       if (rc == NOTIFY_BAD) {
+               prom_update_property(dn, old_prop, new_prop);
+               release_drc(drmem[i].drc_index);
+               rc = -EINVAL;
+       } else
+               rc = 0;
+
+memory_probe_exit:
+       of_node_put(dn);
+       return rc;
+}
+
+static ssize_t memory_release_store(struct class *class, const char *buf,
+                                   size_t count)
+{
+       unsigned long drc_index;
+       struct device_node *dn;
+       struct property *new_prop, *old_prop;
+       struct of_drconf_cell *drmem;
+       int num_entries;
+       int i;
+       int rc = -EINVAL;
+
+       rc = strict_strtoul(buf, 0, &drc_index);
+       if (rc)
+               return rc;
+
+       dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+       if (!dn)
+               return rc;
+
+       old_prop = of_find_property(dn, "ibm,dynamic-memory", NULL);
+       if (!old_prop)
+               goto memory_release_exit;
+
+       num_entries = *(u32 *)old_prop->value;
+       drmem = (struct of_drconf_cell *)
+                               ((char *)old_prop->value + sizeof(u32));
+
+       for (i = 0; i < num_entries; i++) {
+               if (drmem[i].drc_index == drc_index)
+                       break;
+       }
+
+       if (i >= num_entries)
+               goto memory_release_exit;
+
+       new_prop = clone_property(old_prop);
+       if (!new_prop) {
+               rc = -ENOMEM;
+               goto memory_release_exit;
+       }
+       drmem = (struct of_drconf_cell *)
+                               ((char *)new_prop->value + sizeof(u32));
+
+       drmem[i].flags &= ~DRCONF_MEM_ASSIGNED;
+       rc = prom_update_property(dn, new_prop, old_prop);
+       if (rc) {
+               free_property(new_prop);
+               rc = -EINVAL;
+               goto memory_release_exit;
+       }
+
+       rc = blocking_notifier_call_chain(&pSeries_reconfig_chain,
+                                         PSERIES_DRCONF_MEM_REMOVE,
+                                         &drmem[i].base_addr);
+       if (rc != NOTIFY_BAD)
+               rc = release_drc(drc_index);
+
+       if (rc) {
+               prom_update_property(dn, old_prop, new_prop);
+               rc = -EINVAL;
+       }
+
+memory_release_exit:
+       of_node_put(dn);
+       return rc ? rc : count;
+}
+
+static struct class_attribute class_attr_mem_release =
+                       __ATTR(release, S_IWUSR, NULL, memory_release_store);
+#endif /* CONFIG_MEMORY_HOTPLUG */
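
Analogously, memory is given back by writing the LMB's drc-index to
/sys/devices/system/memory/release, registered by pseries_dlpar_init()
below. Additions arrive through the generic memory probe interface,
which now reaches platform_probe_memory() via the
memory_add_physaddr_to_nid() hook patched into arch/powerpc/mm/mem.c
above.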
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct class_attribute class_attr_cpu_probe =
+                       __ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static struct class_attribute class_attr_cpu_release =
+                       __ATTR(release, S_IWUSR, NULL, cpu_release_store);
+#endif
+
+static int pseries_dlpar_init(void)
+{
+       if (!machine_is(pseries))
+               return 0;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+       if (sysfs_create_file(&memory_sysdev_class.kset.kobj,
+                             &class_attr_mem_release.attr))
+               printk(KERN_INFO "DLPAR: Could not create sysfs memory "
+                      "release file\n");
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+       if (sysfs_create_file(&cpu_sysdev_class.kset.kobj,
+                             &class_attr_cpu_probe.attr))
+               printk(KERN_INFO "DLPAR: Could not create sysfs cpu "
+                      "probe file\n");
+
+       if (sysfs_create_file(&cpu_sysdev_class.kset.kobj,
+                             &class_attr_cpu_release.attr))
+               printk(KERN_INFO "DLPAR: Could not create sysfs cpu "
+                      "release file\n");
+#endif
+
+       return 0;
+}
+device_initcall(pseries_dlpar_init);
index ebff6d9..8555ce3 100644 (file)
@@ -30,6 +30,7 @@
 #include <asm/pSeries_reconfig.h>
 #include "xics.h"
 #include "plpar_wrappers.h"
+#include "offline_states.h"
 
 /* This version can't take the spinlock, because it never returns */
 static struct rtas_args rtas_stop_self_args = {
@@ -39,6 +40,55 @@ static struct rtas_args rtas_stop_self_args = {
        .rets = &rtas_stop_self_args.args[0],
 };
 
+static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) =
+                                                       CPU_STATE_OFFLINE;
+static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE;
+
+static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE;
+
+static int cede_offline_enabled __read_mostly = 1;
+
+/*
+ * Enable/disable cede_offline when available.
+ */
+static int __init setup_cede_offline(char *str)
+{
+       if (!strcmp(str, "off"))
+               cede_offline_enabled = 0;
+       else if (!strcmp(str, "on"))
+               cede_offline_enabled = 1;
+       else
+               return 0;
+       return 1;
+}
+
+__setup("cede_offline=", setup_cede_offline);
+
+enum cpu_state_vals get_cpu_current_state(int cpu)
+{
+       return per_cpu(current_state, cpu);
+}
+
+void set_cpu_current_state(int cpu, enum cpu_state_vals state)
+{
+       per_cpu(current_state, cpu) = state;
+}
+
+enum cpu_state_vals get_preferred_offline_state(int cpu)
+{
+       return per_cpu(preferred_offline_state, cpu);
+}
+
+void set_preferred_offline_state(int cpu, enum cpu_state_vals state)
+{
+       per_cpu(preferred_offline_state, cpu) = state;
+}
+
+void set_default_offline_state(int cpu)
+{
+       per_cpu(preferred_offline_state, cpu) = default_offline_state;
+}
+
 static void rtas_stop_self(void)
 {
        struct rtas_args *args = &rtas_stop_self_args;
@@ -56,11 +106,64 @@ static void rtas_stop_self(void)
 
 static void pseries_mach_cpu_die(void)
 {
+       unsigned int cpu = smp_processor_id();
+       unsigned int hwcpu = hard_smp_processor_id();
+       u8 cede_latency_hint = 0;
+
        local_irq_disable();
        idle_task_exit();
        xics_teardown_cpu();
-       unregister_slb_shadow(hard_smp_processor_id(), __pa(get_slb_shadow()));
-       rtas_stop_self();
+
+       if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+               set_cpu_current_state(cpu, CPU_STATE_INACTIVE);
+               cede_latency_hint = 2;
+
+               get_lppaca()->idle = 1;
+               if (!get_lppaca()->shared_proc)
+                       get_lppaca()->donate_dedicated_cpu = 1;
+
+               printk(KERN_INFO
+                       "cpu %u (hwid %u) ceding for offline with hint %d\n",
+                       cpu, hwcpu, cede_latency_hint);
+               while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+                       extended_cede_processor(cede_latency_hint);
+                       printk(KERN_INFO "cpu %u (hwid %u) returned from cede.\n",
+                               cpu, hwcpu);
+                       printk(KERN_INFO
+                       "Decrementer value = %x Timebase value = %llx\n",
+                       get_dec(), get_tb());
+               }
+
+               printk(KERN_INFO "cpu %u (hwid %u) got prodded to go online\n",
+                       cpu, hwcpu);
+
+               if (!get_lppaca()->shared_proc)
+                       get_lppaca()->donate_dedicated_cpu = 0;
+               get_lppaca()->idle = 0;
+       }
+
+       if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) {
+               unregister_slb_shadow(hwcpu, __pa(get_slb_shadow()));
+
+               /*
+                * NOTE: Calling start_secondary() here for now to
+                * start new context.
+                * However, need to do it cleanly by resetting the
+                * stack pointer.
+                */
+               start_secondary();
+               goto out_bug;
+
+       } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+               set_cpu_current_state(cpu, CPU_STATE_OFFLINE);
+               unregister_slb_shadow(hard_smp_processor_id(),
+                                       __pa(get_slb_shadow()));
+               rtas_stop_self();
+               goto out_bug;
+       }
+
+out_bug:
        /* Should never get here... */
        BUG();
        for(;;);
@@ -109,15 +212,28 @@ static int pseries_cpu_disable(void)
 static void pseries_cpu_die(unsigned int cpu)
 {
        int tries;
-       int cpu_status;
+       int cpu_status = 1;
        unsigned int pcpu = get_hard_smp_processor_id(cpu);
 
-       for (tries = 0; tries < 25; tries++) {
-               cpu_status = query_cpu_stopped(pcpu);
-               if (cpu_status == 0 || cpu_status == -1)
-                       break;
-               cpu_relax();
+       if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) {
+               cpu_status = 1;
+               for (tries = 0; tries < 1000; tries++) {
+                       if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) {
+                               cpu_status = 0;
+                               break;
+                       }
+                       cpu_relax();
+               }
+       } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) {
+
+               for (tries = 0; tries < 25; tries++) {
+                       cpu_status = query_cpu_stopped(pcpu);
+                       if (cpu_status == 0 || cpu_status == -1)
+                               break;
+                       cpu_relax();
+               }
        }
+
        if (cpu_status != 0) {
                printk("Querying DEAD? cpu %i (%i) shows %i\n",
                       cpu, pcpu, cpu_status);
@@ -252,10 +368,41 @@ static struct notifier_block pseries_smp_nb = {
        .notifier_call = pseries_smp_notifier,
 };
 
+#define MAX_CEDE_LATENCY_LEVELS                4
+#define        CEDE_LATENCY_PARAM_LENGTH       10
+#define CEDE_LATENCY_PARAM_MAX_LENGTH  \
+       (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char))
+#define CEDE_LATENCY_TOKEN             45
+
+static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+       int call_status;
+
+       memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH);
+       call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+                               NULL,
+                               CEDE_LATENCY_TOKEN,
+                               __pa(cede_parameters),
+                               CEDE_LATENCY_PARAM_MAX_LENGTH);
+
+       if (call_status != 0)
+               printk(KERN_INFO "CEDE_LATENCY: \
+                       %s %s Error calling get-system-parameter(0x%x)\n",
+                       __FILE__, __func__, call_status);
+       else
+               printk(KERN_INFO "CEDE_LATENCY: \
+                       get-system-parameter successful.\n");
+
+       return call_status;
+}
+
 static int __init pseries_cpu_hotplug_init(void)
 {
        struct device_node *np;
        const char *typep;
+       int cpu;
 
        for_each_node_by_name(np, "interrupt-controller") {
                typep = of_get_property(np, "compatible", NULL);
@@ -283,8 +430,16 @@ static int __init pseries_cpu_hotplug_init(void)
        smp_ops->cpu_die = pseries_cpu_die;
 
        /* Processors can be added/removed only on LPAR */
-       if (firmware_has_feature(FW_FEATURE_LPAR))
+       if (firmware_has_feature(FW_FEATURE_LPAR)) {
                pSeries_reconfig_notifier_register(&pseries_smp_nb);
+               cpu_maps_update_begin();
+               if (cede_offline_enabled && parse_cede_parameters() == 0) {
+                       default_offline_state = CPU_STATE_INACTIVE;
+                       for_each_online_cpu(cpu)
+                               set_default_offline_state(cpu);
+               }
+               cpu_maps_update_done();
+       }
 
        return 0;
 }
diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h
new file mode 100644 (file)
index 0000000..22574e0
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef _OFFLINE_STATES_H_
+#define _OFFLINE_STATES_H_
+
+/* Cpu offline states go here */
+enum cpu_state_vals {
+       CPU_STATE_OFFLINE,
+       CPU_STATE_INACTIVE,
+       CPU_STATE_ONLINE,
+       CPU_MAX_OFFLINE_STATES
+};
+
+extern enum cpu_state_vals get_cpu_current_state(int cpu);
+extern void set_cpu_current_state(int cpu, enum cpu_state_vals state);
+extern enum cpu_state_vals get_preferred_offline_state(int cpu);
+extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state);
+extern void set_default_offline_state(int cpu);
+extern int start_secondary(void);
+#endif
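
For orientation, how the two per-cpu state variables defined in
hotplug-cpu.c cooperate, as inferred from the hunks in this patch:

        /*
         * preferred_offline_state tells pseries_mach_cpu_die() what to do:
         *   CPU_STATE_OFFLINE  -> rtas_stop_self()          (thread stops)
         *   CPU_STATE_INACTIVE -> extended_cede_processor() (thread cedes,
         *                                                    H_PROD wakes it)
         *   CPU_STATE_ONLINE   -> start_secondary()         (thread rejoins)
         * current_state records where the thread actually is, so the waker
         * (smp_pSeries_kick_cpu) knows whether an H_PROD is required.
         */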
index a24a6b2..0603c91 100644 (file)
@@ -9,11 +9,33 @@ static inline long poll_pending(void)
        return plpar_hcall_norets(H_POLL_PENDING);
 }
 
+static inline u8 get_cede_latency_hint(void)
+{
+       return get_lppaca()->gpr5_dword.fields.cede_latency_hint;
+}
+
+static inline void set_cede_latency_hint(u8 latency_hint)
+{
+       get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint;
+}
+
 static inline long cede_processor(void)
 {
        return plpar_hcall_norets(H_CEDE);
 }
 
+static inline long extended_cede_processor(unsigned long latency_hint)
+{
+       long rc;
+       u8 old_latency_hint = get_cede_latency_hint();
+
+       set_cede_latency_hint(latency_hint);
+       rc = cede_processor();
+       set_cede_latency_hint(old_latency_hint);
+
+       return rc;
+}
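
Usage note: the latency hint is latched in the lppaca (the gpr5_dword
union added to lppaca.h above), consumed by the hypervisor on H_CEDE,
and restored afterwards so callers do not perturb one another;
pseries_mach_cpu_die() uses extended_cede_processor(2) to park an
inactive thread until an H_PROD wakes it.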
+
 static inline long vpa_call(unsigned long flags, unsigned long cpu,
                unsigned long vpa)
 {
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
new file mode 100644 (file)
index 0000000..be07c55
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ *  processor_idle - idle state cpuidle driver.
+ *  Adapted from drivers/acpi/processor_idle.c
+ *
+ *  Arun R Bharadwaj <arun@linux.vnet.ibm.com>
+ *
+ *  Copyright (C) 2009 IBM Corporation.
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/system.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+
+#include "plpar_wrappers.h"
+#include "pseries.h"
+
+MODULE_AUTHOR("Arun R Bharadwaj");
+MODULE_DESCRIPTION("pSeries Idle State Driver");
+MODULE_LICENSE("GPL");
+
+struct cpuidle_driver pseries_idle_driver = {
+       .name =         "pseries_idle",
+       .owner =        THIS_MODULE,
+};
+
+DEFINE_PER_CPU(struct cpuidle_device, pseries_dev);
+
+#define IDLE_STATE_COUNT       2
+
+/* pSeries idle state flags: distinct bits, since they are tested with '&' */
+#define PSERIES_DEDICATED_SNOOZE       (0x01)
+#define PSERIES_DEDICATED_CEDE         (0x02)
+#define PSERIES_SHARED_CEDE            (0x04)
+
+static int pseries_idle_init(struct cpuidle_device *dev)
+{
+       return cpuidle_register_device(dev);
+}
+
+static void shared_cede_loop(void)
+{
+       get_lppaca()->idle = 1;
+       cede_processor();
+       get_lppaca()->idle = 0;
+}
+
+static void dedicated_snooze_loop(void)
+{
+       local_irq_enable();
+       set_thread_flag(TIF_POLLING_NRFLAG);
+       while (!need_resched()) {
+               ppc64_runlatch_off();
+               HMT_low();
+               HMT_very_low();
+       }
+       HMT_medium();
+       clear_thread_flag(TIF_POLLING_NRFLAG);
+       smp_mb();
+       local_irq_disable();
+}
+
+static void dedicated_cede_loop(void)
+{
+       ppc64_runlatch_off();
+       HMT_medium();
+       cede_processor();
+}
+
+static void pseries_cpuidle_loop(struct cpuidle_device *dev,
+                               struct cpuidle_state *st)
+{
+       unsigned long in_purr, out_purr;
+
+       get_lppaca()->idle = 1;
+       get_lppaca()->donate_dedicated_cpu = 1;
+       in_purr = mfspr(SPRN_PURR);
+
+       if (st->flags & PSERIES_SHARED_CEDE)
+               shared_cede_loop();
+       else if (st->flags & PSERIES_DEDICATED_SNOOZE)
+               dedicated_snooze_loop();
+       else
+               dedicated_cede_loop();
+
+       out_purr = mfspr(SPRN_PURR);
+       get_lppaca()->wait_state_cycles += out_purr - in_purr;
+       get_lppaca()->donate_dedicated_cpu = 0;
+       get_lppaca()->idle = 0;
+}
+
+static int pseries_setup_cpuidle(struct cpuidle_device *dev, int cpu)
+{
+       int i;
+       struct cpuidle_state *state;
+
+       dev->cpu = cpu;
+
+       if (get_lppaca()->shared_proc) {
+               state = &dev->states[0];
+               snprintf(state->name, CPUIDLE_NAME_LEN, "IDLE");
+               state->enter = pseries_cpuidle_loop;
+               strncpy(state->desc, "shared_cede", CPUIDLE_DESC_LEN);
+               state->flags = PSERIES_SHARED_CEDE;
+               state->exit_latency = 0;
+               state->target_residency = 0;
+               return 0;
+       }
+
+       for (i = 0; i < IDLE_STATE_COUNT; i++) {
+               state = &dev->states[i];
+
+               snprintf(state->name, CPUIDLE_NAME_LEN, "CEDE%d", i);
+               state->enter = pseries_cpuidle_loop;
+
+               switch (i) {
+               case 0:
+                       strncpy(state->desc, "snooze", CPUIDLE_DESC_LEN);
+                       state->flags = PSERIES_DEDICATED_SNOOZE;
+                       state->exit_latency = 0;
+                       state->target_residency = 0;
+                       break;
+
+               case 1:
+                       strncpy(state->desc, "cede", CPUIDLE_DESC_LEN);
+                       state->flags = PSERIES_DEDICATED_CEDE;
+                       state->exit_latency = 1;
+                       state->target_residency =
+                                       __get_cpu_var(smt_snooze_delay);
+                       break;
+               }
+       }
+       dev->state_count = IDLE_STATE_COUNT;
+
+       return 0;
+}
+
+void update_smt_snooze_delay(int snooze)
+{
+       int cpu;
+       for_each_online_cpu(cpu)
+               per_cpu(pseries_dev, cpu).states[1].target_residency = snooze;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+       int cpu;
+       int result;
+
+       if (boot_option_idle_override) {
+               printk(KERN_DEBUG "Using default idle\n");
+               return 0;
+       }
+
+       result = cpuidle_register_driver(&pseries_idle_driver);
+
+       if (result < 0)
+               return result;
+
+       printk(KERN_DEBUG "pSeries idle driver registered\n");
+
+       if (!firmware_has_feature(FW_FEATURE_SPLPAR)) {
+               printk(KERN_DEBUG "Using default idle\n");
+               return 0;
+       }
+
+       for_each_online_cpu(cpu) {
+               pseries_setup_cpuidle(&per_cpu(pseries_dev, cpu), cpu);
+               pseries_idle_init(&per_cpu(pseries_dev, cpu));
+       }
+
+       printk(KERN_DEBUG "Using cpuidle idle loop\n");
+
+       return 0;
+}
+
+device_initcall(pseries_processor_idle_init);
index 9e17c0d..92648fd 100644 (file)
@@ -10,6 +10,8 @@
 #ifndef _PSERIES_PSERIES_H
 #define _PSERIES_PSERIES_H
 
+#include <linux/cpuidle.h>
+
 extern void __init fw_feature_init(const char *hypertas, unsigned long len);
 
 struct pt_regs;
@@ -40,4 +42,8 @@ extern unsigned long rtas_poweron_auto;
 
 extern void find_udbg_vterm(void);
 
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
+extern struct cpuidle_driver pseries_idle_driver;
+
 #endif /* _PSERIES_PSERIES_H */
index 2e2bbe1..fbb31b4 100644 (file)
@@ -96,7 +96,7 @@ static struct device_node *derive_parent(const char *path)
        return parent;
 }
 
-static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
+BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain);
 
 int pSeries_reconfig_notifier_register(struct notifier_block *nb)
 {
index 1565b86..533af7d 100644 (file)
@@ -75,9 +75,6 @@ EXPORT_SYMBOL(CMO_PageSize);
 
 int fwnmi_active;  /* TRUE if an FWNMI handler is present */
 
-static void pseries_shared_idle_sleep(void);
-static void pseries_dedicated_idle_sleep(void);
-
 static struct device_node *pSeries_mpic_node;
 
 static void pSeries_show_cpuinfo(struct seq_file *m)
@@ -297,18 +294,8 @@ static void __init pSeries_setup_arch(void)
        pSeries_nvram_init();
 
        /* Choose an idle loop */
-       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+       if (firmware_has_feature(FW_FEATURE_SPLPAR))
                vpa_init(boot_cpuid);
-               if (get_lppaca()->shared_proc) {
-                       printk(KERN_DEBUG "Using shared processor idle loop\n");
-                       ppc_md.power_save = pseries_shared_idle_sleep;
-               } else {
-                       printk(KERN_DEBUG "Using dedicated idle loop\n");
-                       ppc_md.power_save = pseries_dedicated_idle_sleep;
-               }
-       } else {
-               printk(KERN_DEBUG "Using default idle loop\n");
-       }
 
        if (firmware_has_feature(FW_FEATURE_LPAR))
                ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
@@ -500,80 +487,6 @@ static int __init pSeries_probe(void)
        return 1;
 }
 
-
-DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
-
-static void pseries_dedicated_idle_sleep(void)
-{ 
-       unsigned int cpu = smp_processor_id();
-       unsigned long start_snooze;
-       unsigned long in_purr, out_purr;
-
-       /*
-        * Indicate to the HV that we are idle. Now would be
-        * a good time to find other work to dispatch.
-        */
-       get_lppaca()->idle = 1;
-       get_lppaca()->donate_dedicated_cpu = 1;
-       in_purr = mfspr(SPRN_PURR);
-
-       /*
-        * We come in with interrupts disabled, and need_resched()
-        * has been checked recently.  If we should poll for a little
-        * while, do so.
-        */
-       if (__get_cpu_var(smt_snooze_delay)) {
-               start_snooze = get_tb() +
-                       __get_cpu_var(smt_snooze_delay) * tb_ticks_per_usec;
-               local_irq_enable();
-               set_thread_flag(TIF_POLLING_NRFLAG);
-
-               while (get_tb() < start_snooze) {
-                       if (need_resched() || cpu_is_offline(cpu))
-                               goto out;
-                       ppc64_runlatch_off();
-                       HMT_low();
-                       HMT_very_low();
-               }
-
-               HMT_medium();
-               clear_thread_flag(TIF_POLLING_NRFLAG);
-               smp_mb();
-               local_irq_disable();
-               if (need_resched() || cpu_is_offline(cpu))
-                       goto out;
-       }
-
-       cede_processor();
-
-out:
-       HMT_medium();
-       out_purr = mfspr(SPRN_PURR);
-       get_lppaca()->wait_state_cycles += out_purr - in_purr;
-       get_lppaca()->donate_dedicated_cpu = 0;
-       get_lppaca()->idle = 0;
-}
-
-static void pseries_shared_idle_sleep(void)
-{
-       /*
-        * Indicate to the HV that we are idle. Now would be
-        * a good time to find other work to dispatch.
-        */
-       get_lppaca()->idle = 1;
-
-       /*
-        * Yield the processor to the hypervisor.  We return if
-        * an external interrupt occurs (which are driven prior
-        * to returning here) or if a prod occurs from another
-        * processor. When returning here, external interrupts
-        * are enabled.
-        */
-       cede_processor();
-
-       get_lppaca()->idle = 0;
-}
-
 static int pSeries_pci_probe_mode(struct pci_bus *bus)
 {
        if (firmware_has_feature(FW_FEATURE_LPAR))
index 440000c..8868c01 100644 (file)
@@ -48,6 +48,7 @@
 #include "plpar_wrappers.h"
 #include "pseries.h"
 #include "xics.h"
+#include "offline_states.h"
 
 
 /*
@@ -84,6 +85,9 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu)
        /* Fixup atomic count: it exited inside IRQ handler. */
        task_thread_info(paca[lcpu].__current)->preempt_count   = 0;
 
+       if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE)
+               goto out;
+
        /* 
         * If the RTAS start-cpu token does not exist then presume the
         * cpu is already spinning.
@@ -98,6 +102,7 @@ static inline int __devinit smp_startup_cpu(unsigned int lcpu)
                return 0;
        }
 
+out:
        return 1;
 }
 
@@ -111,12 +116,16 @@ static void __devinit smp_xics_setup_cpu(int cpu)
                vpa_init(cpu);
 
        cpu_clear(cpu, of_spin_map);
+       set_cpu_current_state(cpu, CPU_STATE_ONLINE);
+       set_default_offline_state(cpu);
 
 }
 #endif /* CONFIG_XICS */
 
 static void __devinit smp_pSeries_kick_cpu(int nr)
 {
+       long rc;
+       unsigned long hcpuid;
        BUG_ON(nr < 0 || nr >= NR_CPUS);
 
        if (!smp_startup_cpu(nr))
@@ -128,6 +137,16 @@ static void __devinit smp_pSeries_kick_cpu(int nr)
         * the processor will continue on to secondary_start
         */
        paca[nr].cpu_start = 1;
+
+       set_preferred_offline_state(nr, CPU_STATE_ONLINE);
+
+       if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
+               hcpuid = get_hard_smp_processor_id(nr);
+               rc = plpar_hcall_norets(H_PROD, hcpuid);
+               if (rc != H_SUCCESS)
+                       panic("Error: Prod to wake up processor %d failed, rc = %ld\n",
+                               nr, rc);
+       }
 }
 
 static int smp_pSeries_cpu_bootable(unsigned int nr)
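The H_PROD above pairs with the new offline path in this series: a CPU
taken offline in CPU_STATE_INACTIVE cedes itself to the hypervisor
instead of spinning, so it must be prodded before it can run again (the
CPU_STATE_INACTIVE early exit in smp_startup_cpu() above skips the RTAS
start-cpu call for the same reason).  A simplified sketch of the offline
side, with helper names taken from the offline_states.h and
plpar_wrappers.h changes in this series; treat the loop as an
approximation, not the exact hotplug-cpu.c code:

	/* Runs on the dying CPU; simplified from the hotplug-cpu.c path. */
	static void inactive_offline_loop(unsigned int cpu)
	{
		set_cpu_current_state(cpu, CPU_STATE_INACTIVE);

		/* Sleep in the hypervisor until H_PROD or an interrupt. */
		while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE)
			extended_cede_processor(0);	/* 0: default latency hint */

		/* smp_pSeries_kick_cpu() set the preferred state to
		 * CPU_STATE_ONLINE and prodded us; continue to secondary
		 * start. */
		set_cpu_current_state(cpu, CPU_STATE_ONLINE);
	}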
index 14cdd4a..c9cec37 100644 (file)
@@ -1646,7 +1646,8 @@ static void super_regs(void)
                               ptrLpPaca->saved_srr0, ptrLpPaca->saved_srr1);
                        printf("    Saved Gpr3=%.16lx  Saved Gpr4=%.16lx \n",
                               ptrLpPaca->saved_gpr3, ptrLpPaca->saved_gpr4);
-                       printf("    Saved Gpr5=%.16lx \n", ptrLpPaca->saved_gpr5);
+                       printf("    Saved Gpr5=%.16lx \n",
+                               ptrLpPaca->gpr5_dword.saved_gpr5);
                }
 #endif
 
index 59b0e01..b86b96d 100644 (file)
@@ -2270,6 +2270,46 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
        { }
 };
 
+DEFINE_PER_CPU(struct cpuidle_device, apm_idle_devices);
+
+struct cpuidle_driver cpuidle_apm_driver = {
+       .name =         "cpuidle_apm",
+};
+
+static void apm_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st)
+{
+       apm_cpu_idle();
+}
+
+static void setup_cpuidle_apm(void)
+{
+       struct cpuidle_device *dev;
+       int cpu;
+
+       if (!cpuidle_curr_driver)
+               cpuidle_register_driver(&cpuidle_apm_driver);
+
+       for_each_online_cpu(cpu) {
+               dev = &per_cpu(apm_idle_devices, cpu);
+               dev->cpu = cpu;
+               dev->states[0].enter = apm_idle_loop;
+               dev->state_count = 1;
+               cpuidle_register_device(dev);
+       }
+}
+
+void exit_cpuidle_apm(void)
+{
+       struct cpuidle_device *dev;
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               dev = &per_cpu(apm_idle_devices, cpu);
+               cpuidle_unregister_device(dev);
+       }
+}
+
 /*
  * Just start the APM thread. We do NOT want to do APM BIOS
  * calls from anything but the APM thread, if for no other reason
@@ -2407,8 +2447,7 @@ static int __init apm_init(void)
        if (HZ != 100)
                idle_period = (idle_period * HZ) / 100;
        if (idle_threshold < 100) {
-               original_pm_idle = pm_idle;
-               pm_idle  = apm_cpu_idle;
+               setup_cpuidle_apm();
                set_pm_idle = 1;
        }
 
@@ -2420,7 +2459,7 @@ static void __exit apm_exit(void)
        int error;
 
        if (set_pm_idle) {
-               pm_idle = original_pm_idle;
+               exit_cpuidle_apm();
                /*
                 * We are about to unload the current idle thread pm callback
                 * (pm_idle), Wait for all processors to update cached/local
index 3ec3668..7b46ade 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
+#include <linux/cpuidle.h>
 #include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -256,12 +257,6 @@ int sys_vfork(struct pt_regs *regs)
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
 #ifdef CONFIG_X86_32
 /*
  * This halt magic was a workaround for ancient floppy DMA
@@ -341,17 +336,15 @@ static void do_nothing(void *unused)
 }
 
 /*
- * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
- * pm_idle and update to new pm_idle value. Required while changing pm_idle
- * handler on SMP systems.
+ * cpu_idle_wait - Required when changing the idle routine on SMP systems.
  *
- * Caller must have changed pm_idle to the new value before the call. Old
- * pm_idle value will not be used by any CPU after the return of this function.
+ * Caller must have changed the idle routine to the new value before the call.
+ * The old value will not be used by any CPU after this function returns.
  */
 void cpu_idle_wait(void)
 {
        smp_mb();
-       /* kick all the CPUs so that they exit out of pm_idle */
+       /* kick all the CPUs so that they exit out of the idle loop */
        smp_call_function(do_nothing, NULL, 1);
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -530,15 +523,58 @@ static void c1e_idle(void)
                default_idle();
 }
 
+static void (*local_idle)(void);
+
+#ifndef CONFIG_CPU_IDLE
+void cpuidle_idle_call(void)
+{
+       if (local_idle)
+               local_idle();
+       else
+               default_idle();
+}
+#endif
+
+DEFINE_PER_CPU(struct cpuidle_device, idle_devices);
+
+struct cpuidle_driver cpuidle_default_driver = {
+       .name =         "cpuidle_default",
+};
+
+static void local_idle_loop(struct cpuidle_device *dev,
+                               struct cpuidle_state *st)
+{
+       local_idle();
+}
+
+static int setup_cpuidle_simple(void)
+{
+       struct cpuidle_device *dev;
+       int cpu;
+
+       if (!cpuidle_curr_driver)
+               cpuidle_register_driver(&cpuidle_default_driver);
+
+       for_each_online_cpu(cpu) {
+               dev = &per_cpu(idle_devices, cpu);
+               dev->cpu = cpu;
+               dev->states[0].enter = local_idle_loop;
+               dev->state_count = 1;
+               cpuidle_register_device(dev);
+       }
+       return 0;
+}
+device_initcall(setup_cpuidle_simple);
+
 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-       if (pm_idle == poll_idle && smp_num_siblings > 1) {
+       if (local_idle == poll_idle && smp_num_siblings > 1) {
                printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
                        " performance may degrade.\n");
        }
 #endif
-       if (pm_idle)
+       if (local_idle)
                return;
 
        if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
@@ -546,18 +582,20 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
                 * One CPU supports mwait => All CPUs supports mwait
                 */
                printk(KERN_INFO "using mwait in idle threads.\n");
-               pm_idle = mwait_idle;
+               local_idle = mwait_idle;
        } else if (check_c1e_idle(c)) {
                printk(KERN_INFO "using C1E aware idle routine\n");
-               pm_idle = c1e_idle;
+               local_idle = c1e_idle;
        } else
-               pm_idle = default_idle;
+               local_idle = default_idle;
 }
 
 void __init init_c1e_mask(void)
 {
        /* If we're using c1e_idle, we need to allocate c1e_mask. */
-       if (pm_idle == c1e_idle)
+       if (local_idle == c1e_idle)
                zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
 }
 
@@ -568,7 +606,7 @@ static int __init idle_setup(char *str)
 
        if (!strcmp(str, "poll")) {
                printk("using polling idle threads.\n");
-               pm_idle = poll_idle;
+               local_idle = poll_idle;
        } else if (!strcmp(str, "mwait"))
                force_mwait = 1;
        else if (!strcmp(str, "halt")) {
@@ -579,7 +617,7 @@ static int __init idle_setup(char *str)
                 * To continue to load the CPU idle driver, don't touch
                 * the boot_option_idle_override.
                 */
-               pm_idle = default_idle;
+               local_idle = default_idle;
                idle_halt = 1;
                return 0;
        } else if (!strcmp(str, "nomwait")) {
index 4cf7956..cc750f1 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -111,7 +112,7 @@ void cpu_idle(void)
                        local_irq_disable();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
-                       pm_idle();
+                       cpuidle_idle_call();
                        start_critical_timings();
                }
                tick_nohz_restart_sched_tick();
index eb62cbc..29c0705 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/io.h>
 #include <linux/ftrace.h>
 #include <linux/dmi.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -140,7 +141,7 @@ void cpu_idle(void)
                        enter_idle();
                        /* Don't trace irqs off for idle */
                        stop_critical_timings();
-                       pm_idle();
+                       cpuidle_idle_call();
                        start_critical_timings();
                        /* In many cases the interrupt that ended idle
                           has already called exit_idle. But some idle
index ad0047f..1484f1f 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/pm.h>
+#include <linux/cpuidle.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -151,6 +152,33 @@ void __cpuinit xen_enable_syscall(void)
 #endif /* CONFIG_X86_64 */
 }
 
+DEFINE_PER_CPU(struct cpuidle_device, xen_idle_devices);
+struct cpuidle_driver cpuidle_xen_driver = {
+       .name =         "cpuidle_xen",
+};
+
+static void xen_idle_loop(struct cpuidle_device *dev, struct cpuidle_state *st)
+{
+       xen_idle();
+}
+
+static void setup_cpuidle_xen(void)
+{
+       struct cpuidle_device *dev;
+       int cpu;
+
+       if (!cpuidle_curr_driver)
+               cpuidle_register_driver(&cpuidle_xen_driver);
+
+       for_each_online_cpu(cpu) {
+               dev = &per_cpu(xen_idle_devices, cpu);
+               dev->cpu = cpu;
+               dev->states[0].enter = xen_idle_loop;
+               dev->state_count = 1;
+               cpuidle_register_device(dev);
+       }
+}
+
 void __init xen_arch_setup(void)
 {
        struct physdev_set_iopl set_iopl;
@@ -186,7 +214,7 @@ void __init xen_arch_setup(void)
               MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
               COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
 
-       pm_idle = xen_idle;
+       setup_cpuidle_xen();
 
        paravirt_disable_iospace();
 
index 0e8b75d..e343079 100644 (file)
@@ -1215,9 +1215,12 @@ static int __init acpi_processor_init(void)
         * should not use mwait for CPU-states.
         */
        dmi_check_system(processor_idle_dmi_table);
-       result = cpuidle_register_driver(&acpi_idle_driver);
-       if (result < 0)
-               goto out_proc;
+
+       if (!boot_option_idle_override) {
+               result = cpuidle_register_driver(&acpi_idle_driver);
+               if (result < 0)
+                       goto out_proc;
+       }
 
        result = acpi_bus_register_driver(&acpi_processor_driver);
        if (result < 0)
index 52bc748..b86c7a1 100644 (file)
@@ -854,18 +854,16 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
  *
  * This is equivalent to the HALT instruction.
  */
-static int acpi_idle_enter_c1(struct cpuidle_device *dev,
+static void acpi_idle_enter_c1(struct cpuidle_device *dev,
                              struct cpuidle_state *state)
 {
-       ktime_t  kt1, kt2;
-       s64 idle_time;
        struct acpi_processor *pr;
        struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
 
        pr = __get_cpu_var(processors);
 
        if (unlikely(!pr))
-               return 0;
+               return;
 
        local_irq_disable();
 
@@ -873,20 +871,15 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
        if (acpi_idle_suspend) {
                local_irq_enable();
                cpu_relax();
-               return 0;
+               return;
        }
 
        lapic_timer_state_broadcast(pr, cx, 1);
-       kt1 = ktime_get_real();
        acpi_idle_do_entry(cx);
-       kt2 = ktime_get_real();
-       idle_time =  ktime_to_us(ktime_sub(kt2, kt1));
 
        local_irq_enable();
        cx->usage++;
        lapic_timer_state_broadcast(pr, cx, 0);
-
-       return idle_time;
 }
 
 /**
@@ -894,7 +887,7 @@ static int acpi_idle_enter_c1(struct cpuidle_device *dev,
  * @dev: the target CPU
  * @state: the state data
  */
-static int acpi_idle_enter_simple(struct cpuidle_device *dev,
+static void acpi_idle_enter_simple(struct cpuidle_device *dev,
                                  struct cpuidle_state *state)
 {
        struct acpi_processor *pr;
@@ -906,10 +899,12 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
        pr = __get_cpu_var(processors);
 
        if (unlikely(!pr))
-               return 0;
+               return;
 
-       if (acpi_idle_suspend)
-               return(acpi_idle_enter_c1(dev, state));
+       if (acpi_idle_suspend) {
+               acpi_idle_enter_c1(dev, state);
+               return;
+       }
 
        local_irq_disable();
        current_thread_info()->status &= ~TS_POLLING;
@@ -922,7 +917,7 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
        if (unlikely(need_resched())) {
                current_thread_info()->status |= TS_POLLING;
                local_irq_enable();
-               return 0;
+               return;
        }
 
        /*
@@ -953,7 +948,6 @@ static int acpi_idle_enter_simple(struct cpuidle_device *dev,
 
        lapic_timer_state_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return idle_time;
 }
 
 static int c3_cpu_count;
@@ -966,7 +960,7 @@ static DEFINE_SPINLOCK(c3_lock);
  *
  * If BM is detected, the deepest non-C3 idle state is entered instead.
  */
-static int acpi_idle_enter_bm(struct cpuidle_device *dev,
+static void acpi_idle_enter_bm(struct cpuidle_device *dev,
                              struct cpuidle_state *state)
 {
        struct acpi_processor *pr;
@@ -979,20 +973,23 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
        pr = __get_cpu_var(processors);
 
        if (unlikely(!pr))
-               return 0;
+               return;
 
-       if (acpi_idle_suspend)
-               return(acpi_idle_enter_c1(dev, state));
+       if (acpi_idle_suspend) {
+               acpi_idle_enter_c1(dev, state);
+               return;
+       }
 
        if (acpi_idle_bm_check()) {
                if (dev->safe_state) {
                        dev->last_state = dev->safe_state;
-                       return dev->safe_state->enter(dev, dev->safe_state);
+                       dev->safe_state->enter(dev, dev->safe_state);
+                       return;
                } else {
                        local_irq_disable();
                        acpi_safe_halt();
                        local_irq_enable();
-                       return 0;
+                       return;
                }
        }
 
@@ -1007,7 +1004,7 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
        if (unlikely(need_resched())) {
                current_thread_info()->status |= TS_POLLING;
                local_irq_enable();
-               return 0;
+               return;
        }
 
        acpi_unlazy_tlb(smp_processor_id());
@@ -1065,7 +1062,6 @@ static int acpi_idle_enter_bm(struct cpuidle_device *dev,
 
        lapic_timer_state_broadcast(pr, cx, 0);
        cx->time += sleep_ticks;
-       return idle_time;
 }
 
 struct cpuidle_driver acpi_idle_driver = {
index f7d730f..b593d5a 100644 (file)
@@ -35,6 +35,7 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut
        struct cpu *cpu = container_of(dev, struct cpu, sysdev);
        ssize_t ret;
 
+       cpu_hotplug_driver_lock();
        switch (buf[0]) {
        case '0':
                ret = cpu_down(cpu->sysdev.id);
@@ -49,6 +50,7 @@ static ssize_t __ref store_online(struct sys_device *dev, struct sysdev_attribut
        default:
                ret = -EINVAL;
        }
+       cpu_hotplug_driver_unlock();
 
        if (ret >= 0)
                ret = count;
index 989429c..facefe4 100644 (file)
 
 #define MEMORY_CLASS_NAME      "memory"
 
-static struct sysdev_class memory_sysdev_class = {
+struct sysdev_class memory_sysdev_class = {
        .name = MEMORY_CLASS_NAME,
 };
+EXPORT_SYMBOL(memory_sysdev_class);
 
 static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
 {
index 12fdd39..57a5895 100644 (file)
 #include "cpuidle.h"
 
 DEFINE_PER_CPU(struct cpuidle_device *, cpuidle_devices);
+DEFINE_PER_CPU(struct list_head, cpuidle_devices_list);
 
 DEFINE_MUTEX(cpuidle_lock);
-LIST_HEAD(cpuidle_detected_devices);
-static void (*pm_idle_old)(void);
-
-static int enabled_devices;
 
 #if defined(CONFIG_ARCH_HAS_CPU_IDLE_WAIT)
 static void cpuidle_kick_cpus(void)
@@ -47,21 +44,20 @@ static int __cpuidle_register_device(struct cpuidle_device *dev);
  *
  * NOTE: no locks or semaphores should be used here
  */
-static void cpuidle_idle_call(void)
+void cpuidle_idle_call(void)
 {
        struct cpuidle_device *dev = __get_cpu_var(cpuidle_devices);
        struct cpuidle_state *target_state;
        int next_state;
+       ktime_t t1, t2;
+       s64 diff;
 
        /* check if the device is ready */
        if (!dev || !dev->enabled) {
-               if (pm_idle_old)
-                       pm_idle_old();
-               else
 #if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE)
-                       default_idle();
+               default_idle();
 #else
-                       local_irq_enable();
+               local_irq_enable();
 #endif
                return;
        }
@@ -75,7 +71,11 @@ static void cpuidle_idle_call(void)
        hrtimer_peek_ahead_timers();
 #endif
        /* ask the governor for the next state */
-       next_state = cpuidle_curr_governor->select(dev);
+       if (dev->state_count > 1)
+               next_state = cpuidle_curr_governor->select(dev);
+       else
+               next_state = 0;
+
        if (need_resched()) {
                local_irq_enable();
                return;
@@ -85,7 +85,18 @@ static void cpuidle_idle_call(void)
 
        /* enter the state and update stats */
        dev->last_state = target_state;
-       dev->last_residency = target_state->enter(dev, target_state);
+
+       t1 = ktime_get();
+
+       target_state->enter(dev, target_state);
+
+       t2 = ktime_get();
+       diff = ktime_to_us(ktime_sub(t2, t1));
+       if (diff > INT_MAX)
+               diff = INT_MAX;
+
+       dev->last_residency = (int) diff;
+
        if (dev->last_state)
                target_state = dev->last_state;
 
@@ -99,35 +110,12 @@ static void cpuidle_idle_call(void)
 }
 
 /**
- * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
- */
-void cpuidle_install_idle_handler(void)
-{
-       if (enabled_devices && (pm_idle != cpuidle_idle_call)) {
-               /* Make sure all changes finished before we switch to new idle */
-               smp_wmb();
-               pm_idle = cpuidle_idle_call;
-       }
-}
-
-/**
- * cpuidle_uninstall_idle_handler - uninstalls the cpuidle idle loop handler
- */
-void cpuidle_uninstall_idle_handler(void)
-{
-       if (enabled_devices && pm_idle_old && (pm_idle != pm_idle_old)) {
-               pm_idle = pm_idle_old;
-               cpuidle_kick_cpus();
-       }
-}
-
-/**
  * cpuidle_pause_and_lock - temporarily disables CPUIDLE
  */
 void cpuidle_pause_and_lock(void)
 {
        mutex_lock(&cpuidle_lock);
-       cpuidle_uninstall_idle_handler();
+       cpuidle_kick_cpus();
 }
 
 EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
@@ -137,12 +125,50 @@ EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
  */
 void cpuidle_resume_and_unlock(void)
 {
-       cpuidle_install_idle_handler();
        mutex_unlock(&cpuidle_lock);
 }
 
 EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
 
+int cpuidle_add_to_list(struct cpuidle_device *dev)
+{
+       int ret, cpu = dev->cpu;
+       struct cpuidle_device *old_dev;
+
+       if (!list_empty(&per_cpu(cpuidle_devices_list, cpu))) {
+               old_dev = list_first_entry(&per_cpu(cpuidle_devices_list, cpu),
+                               struct cpuidle_device, idle_list);
+               cpuidle_remove_state_sysfs(old_dev);
+       }
+
+       list_add(&dev->idle_list, &per_cpu(cpuidle_devices_list, cpu));
+       ret = cpuidle_add_state_sysfs(dev);
+       return ret;
+}
+
+void cpuidle_remove_from_list(struct cpuidle_device *dev)
+{
+       struct cpuidle_device *temp_dev;
+       struct list_head *pos;
+       int cpu = dev->cpu;
+
+       list_for_each(pos, &per_cpu(cpuidle_devices_list, cpu)) {
+               temp_dev = container_of(pos, struct cpuidle_device, idle_list);
+               if (dev == temp_dev) {
+                       list_del(&temp_dev->idle_list);
+                       cpuidle_remove_state_sysfs(temp_dev);
+                       break;
+               }
+       }
+
+       if (!list_empty(&per_cpu(cpuidle_devices_list, cpu))) {
+               temp_dev = list_first_entry(&per_cpu(cpuidle_devices_list, cpu),
+                                       struct cpuidle_device, idle_list);
+               cpuidle_add_state_sysfs(temp_dev);
+       }
+       cpuidle_kick_cpus();
+}
+
 /**
  * cpuidle_enable_device - enables idle PM for a CPU
  * @dev: the CPU
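cpuidle_add_to_list() and cpuidle_remove_from_list() above give each CPU
a stack of idle sets: list_add() puts the newest set at the head and
exposes its states in sysfs, and removing the head re-exposes the set
underneath.  A usage sketch with hypothetical names (my_*), using only
interfaces visible in this patch plus x86's safe_halt():

	static void my_idle_loop(struct cpuidle_device *dev,
				struct cpuidle_state *st)
	{
		safe_halt();		/* x86: sti; hlt */
	}

	static struct cpuidle_device my_dev = { .cpu = 0 };

	static int __init my_idle_init(void)
	{
		my_dev.states[0].enter = my_idle_loop;
		my_dev.state_count = 1;
		return cpuidle_register_device(&my_dev); /* becomes the head */
	}

	static void __exit my_idle_exit(void)
	{
		/* pops this set; the previous head is reactivated */
		cpuidle_unregister_device(&my_dev);
	}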
@@ -167,9 +193,6 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
                        return ret;
        }
 
-       if ((ret = cpuidle_add_state_sysfs(dev)))
-               return ret;
-
        if (cpuidle_curr_governor->enable &&
            (ret = cpuidle_curr_governor->enable(dev)))
                goto fail_sysfs;
@@ -185,11 +208,10 @@ int cpuidle_enable_device(struct cpuidle_device *dev)
 
        dev->enabled = 1;
 
-       enabled_devices++;
        return 0;
 
 fail_sysfs:
-       cpuidle_remove_state_sysfs(dev);
+       cpuidle_remove_from_list(dev);
 
        return ret;
 }
@@ -214,32 +236,16 @@ void cpuidle_disable_device(struct cpuidle_device *dev)
 
        if (cpuidle_curr_governor->disable)
                cpuidle_curr_governor->disable(dev);
-
-       cpuidle_remove_state_sysfs(dev);
-       enabled_devices--;
 }
 
 EXPORT_SYMBOL_GPL(cpuidle_disable_device);
 
 #ifdef CONFIG_ARCH_HAS_CPU_RELAX
-static int poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st)
+static void poll_idle(struct cpuidle_device *dev, struct cpuidle_state *st)
 {
-       ktime_t t1, t2;
-       s64 diff;
-       int ret;
-
-       t1 = ktime_get();
        local_irq_enable();
        while (!need_resched())
                cpu_relax();
-
-       t2 = ktime_get();
-       diff = ktime_to_us(ktime_sub(t2, t1));
-       if (diff > INT_MAX)
-               diff = INT_MAX;
-
-       ret = (int) diff;
-       return ret;
 }
 
 static void poll_idle_init(struct cpuidle_device *dev)
@@ -269,7 +275,6 @@ static void poll_idle_init(struct cpuidle_device *dev) {}
  */
 static int __cpuidle_register_device(struct cpuidle_device *dev)
 {
-       int ret;
        struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
 
        if (!sys_dev)
@@ -277,16 +282,9 @@ static int __cpuidle_register_device(struct cpuidle_device *dev)
        if (!try_module_get(cpuidle_curr_driver->owner))
                return -EINVAL;
 
-       init_completion(&dev->kobj_unregister);
-
        poll_idle_init(dev);
 
        per_cpu(cpuidle_devices, dev->cpu) = dev;
-       list_add(&dev->device_list, &cpuidle_detected_devices);
-       if ((ret = cpuidle_add_sysfs(sys_dev))) {
-               module_put(cpuidle_curr_driver->owner);
-               return ret;
-       }
 
        dev->registered = 1;
        return 0;
@@ -308,7 +306,7 @@ int cpuidle_register_device(struct cpuidle_device *dev)
        }
 
        cpuidle_enable_device(dev);
-       cpuidle_install_idle_handler();
+       cpuidle_add_to_list(dev);
 
        mutex_unlock(&cpuidle_lock);
 
@@ -324,18 +322,14 @@ EXPORT_SYMBOL_GPL(cpuidle_register_device);
  */
 void cpuidle_unregister_device(struct cpuidle_device *dev)
 {
-       struct sys_device *sys_dev = get_cpu_sysdev((unsigned long)dev->cpu);
-
        if (dev->registered == 0)
                return;
 
        cpuidle_pause_and_lock();
 
        cpuidle_disable_device(dev);
+       cpuidle_remove_from_list(dev);
 
-       cpuidle_remove_sysfs(sys_dev);
-       list_del(&dev->device_list);
-       wait_for_completion(&dev->kobj_unregister);
        per_cpu(cpuidle_devices, dev->cpu) = NULL;
 
        cpuidle_resume_and_unlock();
@@ -385,14 +379,15 @@ static inline void latency_notifier_init(struct notifier_block *n)
  */
 static int __init cpuidle_init(void)
 {
-       int ret;
-
-       pm_idle_old = pm_idle;
+       int ret, cpu;
 
        ret = cpuidle_add_class_sysfs(&cpu_sysdev_class);
        if (ret)
                return ret;
 
+       for_each_possible_cpu(cpu)
+               INIT_LIST_HEAD(&per_cpu(cpuidle_devices_list, cpu));
+
        latency_notifier_init(&cpuidle_latency_notifier);
 
        return 0;
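Because cpuidle_idle_call() now brackets target_state->enter() with
ktime_get() itself, enter() callbacks are void and consumers read the
measured residency back from the device.  A minimal sketch of the
consumer side; update_stats() is hypothetical:

	static void my_governor_reflect(struct cpuidle_device *dev)
	{
		/* residency measured centrally around the last enter() */
		int residency_us = cpuidle_get_last_residency(dev);

		update_stats(dev->cpu, residency_us);	/* hypothetical */
	}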
index 9476ba3..d163df3 100644 (file)
@@ -9,9 +9,7 @@
 
 /* For internal use only */
 extern struct cpuidle_governor *cpuidle_curr_governor;
-extern struct cpuidle_driver *cpuidle_curr_driver;
 extern struct list_head cpuidle_governors;
-extern struct list_head cpuidle_detected_devices;
 extern struct mutex cpuidle_lock;
 extern spinlock_t cpuidle_driver_lock;
 
@@ -27,7 +25,7 @@ extern int cpuidle_add_class_sysfs(struct sysdev_class *cls);
 extern void cpuidle_remove_class_sysfs(struct sysdev_class *cls);
 extern int cpuidle_add_state_sysfs(struct cpuidle_device *device);
 extern void cpuidle_remove_state_sysfs(struct cpuidle_device *device);
-extern int cpuidle_add_sysfs(struct sys_device *sysdev);
-extern void cpuidle_remove_sysfs(struct sys_device *sysdev);
+extern int cpuidle_add_sysfs(struct cpuidle_device *device);
+extern void cpuidle_remove_sysfs(struct cpuidle_device *device);
 
 #endif /* __DRIVER_CPUIDLE_H */
index 2257004..8daa6d8 100644 (file)
@@ -27,10 +27,6 @@ int cpuidle_register_driver(struct cpuidle_driver *drv)
                return -EINVAL;
 
        spin_lock(&cpuidle_driver_lock);
-       if (cpuidle_curr_driver) {
-               spin_unlock(&cpuidle_driver_lock);
-               return -EBUSY;
-       }
        cpuidle_curr_driver = drv;
        spin_unlock(&cpuidle_driver_lock);
 
index 70b5964..2cadc57 100644 (file)
@@ -43,16 +43,14 @@ static struct cpuidle_governor * __cpuidle_find_governor(const char *str)
  */
 int cpuidle_switch_governor(struct cpuidle_governor *gov)
 {
-       struct cpuidle_device *dev;
+       int cpu;
 
        if (gov == cpuidle_curr_governor)
                return 0;
 
-       cpuidle_uninstall_idle_handler();
-
        if (cpuidle_curr_governor) {
-               list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
-                       cpuidle_disable_device(dev);
+               for_each_online_cpu(cpu)
+                       cpuidle_disable_device(per_cpu(cpuidle_devices, cpu));
                module_put(cpuidle_curr_governor->owner);
        }
 
@@ -61,9 +59,8 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
        if (gov) {
                if (!try_module_get(cpuidle_curr_governor->owner))
                        return -EINVAL;
-               list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
-                       cpuidle_enable_device(dev);
-               cpuidle_install_idle_handler();
+               for_each_online_cpu(cpu)
+                       cpuidle_enable_device(per_cpu(cpuidle_devices, cpu));
                printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
        }
 
index 97b0038..07383d4 100644 (file)
@@ -311,6 +311,13 @@ int cpuidle_add_state_sysfs(struct cpuidle_device *device)
        int i, ret = -ENOMEM;
        struct cpuidle_state_kobj *kobj;
 
+       init_completion(&device->kobj_unregister);
+
+       ret = cpuidle_add_sysfs(device);
+       if (ret) {
+               module_put(cpuidle_curr_driver->owner);
+               return ret;
+       }
        /* state statistics */
        for (i = 0; i < device->state_count; i++) {
                kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
@@ -347,35 +354,32 @@ void cpuidle_remove_state_sysfs(struct cpuidle_device *device)
 
        for (i = 0; i < device->state_count; i++)
                cpuidle_free_state_kobj(device, i);
+
+       cpuidle_remove_sysfs(device);
 }
 
 /**
  * cpuidle_add_sysfs - creates a sysfs instance for the target device
- * @sysdev: the target device
+ * @device: the target device
  */
-int cpuidle_add_sysfs(struct sys_device *sysdev)
+int cpuidle_add_sysfs(struct cpuidle_device *device)
 {
-       int cpu = sysdev->id;
-       struct cpuidle_device *dev;
        int error;
+       struct sys_device *sysdev = get_cpu_sysdev((unsigned long)device->cpu);
 
-       dev = per_cpu(cpuidle_devices, cpu);
-       error = kobject_init_and_add(&dev->kobj, &ktype_cpuidle, &sysdev->kobj,
-                                    "cpuidle");
+       error = kobject_init_and_add(&device->kobj, &ktype_cpuidle,
+                               &sysdev->kobj, "cpuidle");
        if (!error)
-               kobject_uevent(&dev->kobj, KOBJ_ADD);
+               kobject_uevent(&device->kobj, KOBJ_ADD);
        return error;
 }
 
 /**
  * cpuidle_remove_sysfs - deletes a sysfs instance on the target device
- * @sysdev: the target device
+ * @device: the target device
  */
-void cpuidle_remove_sysfs(struct sys_device *sysdev)
+void cpuidle_remove_sysfs(struct cpuidle_device *device)
 {
-       int cpu = sysdev->id;
-       struct cpuidle_device *dev;
-
-       dev = per_cpu(cpuidle_devices, cpu);
-       kobject_put(&dev->kobj);
+       kobject_put(&device->kobj);
+       wait_for_completion(&device->kobj_unregister);
 }
index 4753619..b0ad4e1 100644 (file)
@@ -115,6 +115,19 @@ extern void put_online_cpus(void);
 #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
 
+#ifdef CONFIG_PPC_PSERIES
+extern void cpu_hotplug_driver_lock(void);
+extern void cpu_hotplug_driver_unlock(void);
+#else
+static inline void cpu_hotplug_driver_lock(void)
+{
+}
+
+static inline void cpu_hotplug_driver_unlock(void)
+{
+}
+#endif
+
 #else          /* CONFIG_HOTPLUG_CPU */
 
 #define get_online_cpus()      do { } while (0)
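store_online() in drivers/base/cpu.c (above) wraps cpu_up()/cpu_down()
in this lock so an arch hotplug driver can serialize sysfs-initiated
transitions against its own operations; on pSeries that is the DLPAR
code added by this series.  The pSeries definitions are outside this
excerpt; a minimal sketch, assuming a plain mutex:

	/* Assumed shape of the pSeries implementation (dlpar.c). */
	static DEFINE_MUTEX(pseries_cpu_hotplug_mutex);

	void cpu_hotplug_driver_lock(void)
	{
		mutex_lock(&pseries_cpu_hotplug_mutex);
	}

	void cpu_hotplug_driver_unlock(void)
	{
		mutex_unlock(&pseries_cpu_hotplug_mutex);
	}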
index dcf77fa..d57c22b 100644 (file)
@@ -41,7 +41,7 @@ struct cpuidle_state {
        unsigned long long      usage;
        unsigned long long      time; /* in US */
 
-       int (*enter)    (struct cpuidle_device *dev,
+       void (*enter)   (struct cpuidle_device *dev,
                         struct cpuidle_state *state);
 };
 
@@ -92,7 +92,7 @@ struct cpuidle_device {
        struct cpuidle_state_kobj *kobjs[CPUIDLE_STATE_MAX];
        struct cpuidle_state    *last_state;
 
-       struct list_head        device_list;
+       struct list_head        idle_list;
        struct kobject          kobj;
        struct completion       kobj_unregister;
        void                    *governor_data;
@@ -112,6 +112,9 @@ static inline int cpuidle_get_last_residency(struct cpuidle_device *dev)
        return dev->last_residency;
 }
 
+extern struct cpuidle_driver *cpuidle_curr_driver;
+extern void cpuidle_idle_call(void);
+
 
 /****************************
  * CPUIDLE DRIVER INTERFACE *
@@ -133,6 +136,8 @@ extern void cpuidle_pause_and_lock(void);
 extern void cpuidle_resume_and_unlock(void);
 extern int cpuidle_enable_device(struct cpuidle_device *dev);
 extern void cpuidle_disable_device(struct cpuidle_device *dev);
+extern int common_idle_loop(struct cpuidle_device *dev,
+                       struct cpuidle_state *st, void (*idle)(void));
 
 #else
 
@@ -148,6 +153,8 @@ static inline void cpuidle_resume_and_unlock(void) { }
 static inline int cpuidle_enable_device(struct cpuidle_device *dev)
 {return 0;}
 static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
+static inline int common_idle_loop(struct cpuidle_device *dev,
+                       struct cpuidle_state *st, void (*idle)(void))
+{return 0;}
 
 #endif
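common_idle_loop() is only declared in this excerpt; its definition is
elsewhere in the series.  Given the signature, it presumably factors out
the small *_idle_loop() adapters seen in the apm, xen and process.c
hunks above; a sketch of the assumed shape:

	/* Assumed: run a plain void idle routine as the body of a cpuidle
	 * enter() callback.  Residency is measured centrally in
	 * cpuidle_idle_call(), so returning 0 is enough. */
	int common_idle_loop(struct cpuidle_device *dev,
			struct cpuidle_state *st, void (*idle)(void))
	{
		idle();
		return 0;
	}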
 
index fed9692..80c6700 100644 (file)
@@ -12,6 +12,8 @@ struct mem_section;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
+extern struct sysdev_class memory_sysdev_class;
+
 /*
  * Types for free bootmem.
  * The normal smallest mapcount is -1. Here is smaller value than it.