- Update to 3.2-rc1.
author Jeff Mahoney <jeffm@suse.com>
Wed, 16 Nov 2011 05:32:19 +0000 (00:32 -0500)
committer Jeff Mahoney <jeffm@suse.com>
Wed, 16 Nov 2011 05:32:19 +0000 (00:32 -0500)
  - Eliminate 212 patches.

suse-commit: 6318d5bd78708a9eae86c2c29377ad4062f66121

112 files changed:
Documentation/kernel-parameters.txt
Documentation/sysctl/kernel.txt
Makefile
arch/ia64/Kconfig
arch/ia64/kernel/acpi.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/legacy_serial.c
arch/powerpc/kernel/prom_init.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/xmon/xmon.c
arch/s390/Kconfig
arch/x86/Kconfig
arch/x86/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic/bigsmp_32.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apm_32.c
arch/x86/kernel/cpu/perf_event.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/e820.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/hpet.c
arch/x86/kernel/reboot.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/svm.c
arch/x86/kvm/x86.c
drivers/acpi/Kconfig
drivers/acpi/ec_sys.c
drivers/acpi/osl.c
drivers/char/Kconfig
drivers/connector/cn_proc.c
drivers/cpufreq/cpufreq_ondemand.c
drivers/hid/hid-apple.c
drivers/hid/hid-core.c
drivers/hid/hid-ids.h
drivers/idle/intel_idle.c
drivers/input/mouse/synaptics.c
drivers/input/mouse/synaptics.h
drivers/input/touchscreen/Kconfig
drivers/input/touchscreen/Makefile
drivers/isdn/mISDN/socket.c
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-least-pending.c
drivers/md/dm-memcache.c
drivers/md/dm-raid45.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/misc/Kconfig
drivers/net/ethernet/dec/tulip/tulip_core.c
drivers/net/ethernet/ibm/ehea/ehea_main.c
drivers/net/wireless/b43/main.c
drivers/scsi/device_handler/scsi_dh.c
drivers/scsi/megaraid/megaraid_mbox.c
drivers/scsi/scsi_error.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_netlink.c
drivers/scsi/scsi_scan.c
drivers/scsi/sd.c
drivers/tty/serial/8250.c
drivers/tty/vt/keyboard.c
drivers/tty/vt/vt.c
drivers/video/Kconfig
drivers/video/Makefile
drivers/video/console/vgacon.c
fs/Kconfig
fs/Makefile
fs/ext4/ext4.h
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/namei.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/namei.c
fs/nfs/inode.c
fs/super.c
include/acpi/acpiosxf.h
include/asm-generic/vmlinux.lds.h
include/linux/acpi.h
include/linux/blkdev.h
include/linux/device.h
include/linux/fs.h
include/linux/kernel.h
include/linux/mm.h
include/linux/module.h
include/linux/nfs_fs.h
include/linux/printk.h
init/Kconfig
init/main.c
kernel/Kconfig.preempt
kernel/Makefile
kernel/ksysfs.c
kernel/module.c
kernel/panic.c
kernel/printk.c
kernel/sysctl.c
kernel/sysctl_binary.c
lib/Kconfig.debug
mm/page_alloc.c
mm/thrash.c
mm/truncate.c
net/bridge/br_if.c
net/netfilter/Kconfig
scripts/Makefile.build
scripts/genksyms/genksyms.c
scripts/kconfig/Makefile
scripts/mod/modpost.c
security/apparmor/apparmorfs.c
security/apparmor/policy_unpack.c
virt/kvm/ioapic.c

Simple merge
Simple merge
diff --cc Makefile
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1361,23 -1343,7 +1343,22 @@@ static int __init dmi_ignore_irq0_timer
        }
        return 0;
  }
- #endif
  
 +static int __init force_acpi_rsdt(const struct dmi_system_id *d)
 +{
 +      if (!acpi_force) {
 +              printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
 +                     d->ident);
 +              acpi_rsdt_forced = 1;
 +      } else {
 +              printk(KERN_NOTICE
 +                     "Warning: acpi=force overrules DMI blacklist: "
 +                     "acpi=rsdt\n");
 +      }
 +      return 0;
 +
 +}
 +
  /*
   * If your system is blacklisted here, but you find that acpi=force
   * works for you, please contact linux-acpi@vger.kernel.org
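
A DMI quirk callback like the force_acpi_rsdt() added above only takes effect once it is referenced from a dmi_system_id table that dmi_check_system() walks during boot. A minimal sketch of that wiring, with hypothetical vendor/product strings and table name (the actual table entries are not part of the hunk shown):

static const struct dmi_system_id acpi_rsdt_dmi_table[] __initconst = {
	{
		/* Hypothetical board known to need acpi=rsdt. */
		.callback = force_acpi_rsdt,
		.ident    = "Example Vendor Example Board",
		.matches  = {
			DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_PRODUCT_NAME, "Example Board"),
		},
	},
	{ }	/* terminator */
};

/* Typically invoked once from early ACPI/boot setup: */
dmi_check_system(acpi_rsdt_dmi_table);
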
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1217,41 -1212,7 +1217,41 @@@ ENTRY(call_softirq
        CFI_ENDPROC
  END(call_softirq)
  
 +#ifdef CONFIG_STACK_UNWIND
 +ENTRY(arch_unwind_init_running)
 +      CFI_STARTPROC
 +      movq    %r15, R15(%rdi)
 +      movq    %r14, R14(%rdi)
 +      xchgq   %rsi, %rdx
 +      movq    %r13, R13(%rdi)
 +      movq    %r12, R12(%rdi)
 +      xorl    %eax, %eax
 +      movq    %rbp, RBP(%rdi)
 +      movq    %rbx, RBX(%rdi)
 +      movq    (%rsp), %r9
 +      xchgq   %rdx, %rcx
 +      movq    %rax, R11(%rdi)
 +      movq    %rax, R10(%rdi)
 +      movq    %rax, R9(%rdi)
 +      movq    %rax, R8(%rdi)
 +      movq    %rax, RAX(%rdi)
 +      movq    %rax, RCX(%rdi)
 +      movq    %rax, RDX(%rdi)
 +      movq    %rax, RSI(%rdi)
 +      movq    %rax, RDI(%rdi)
 +      movq    %rax, ORIG_RAX(%rdi)
 +      movq    %r9, RIP(%rdi)
 +      leaq    8(%rsp), %r9
 +      movq    $__KERNEL_CS, CS(%rdi)
 +      movq    %rax, EFLAGS(%rdi)
 +      movq    %r9, RSP(%rdi)
 +      movq    $__KERNEL_DS, SS(%rdi)
 +      jmpq    *%rcx
 +      CFI_ENDPROC
 +END(arch_unwind_init_running)
 +#endif
 +
- #ifdef CONFIG_PARAVIRT_XEN
+ #ifdef CONFIG_XEN
  zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
  
  /*
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1557,7 -1572,6 +1572,7 @@@ int kvm_set_msr_common(struct kvm_vcpu 
        case MSR_VM_HSAVE_PA:
        case MSR_AMD64_PATCH_LOADER:
                break;
-       case 0xe2:
++      case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
        case 0x200 ... 0x2ff:
                return set_msr_mtrr(vcpu, msr, data);
        case MSR_IA32_APICBASE:
@@@ -1859,9 -1878,11 +1879,12 @@@ int kvm_get_msr_common(struct kvm_vcpu 
        case MSR_K8_INT_PENDING_MSG:
        case MSR_AMD64_NB_CFG:
        case MSR_FAM10H_MMIO_CONF_BASE:
-       case 0xe2:
++      case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
                data = 0;
                break;
+       case MSR_IA32_UCODE_REV:
+               data = 0x100000000ULL;
+               break;
        case MSR_MTRRcap:
                data = 0x500 | KVM_NR_VAR_MTRR;
                break;
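
The two x86.c hunks above replace the bare MSR number 0xe2 with a symbolic name; going by the inline comments, the constant presumably resolves to that value, e.g. in arch/x86/include/asm/msr-index.h:

#define MSR_NHM_SNB_PKG_CST_CFG_CTL	0x000000e2
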
Simple merge
@@@ -11,7 -11,7 +11,8 @@@
  #include <linux/kernel.h>
  #include <linux/acpi.h>
  #include <linux/debugfs.h>
+ #include <linux/module.h>
 +#include <linux/uaccess.h>
  #include "internal.h"
  
  MODULE_AUTHOR("Thomas Renninger <trenn@suse.de>");
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -1036,14 -1363,13 +1471,16 @@@ int synaptics_init(struct psmouse *psmo
  
        priv->pkt_type = SYN_MODEL_NEWABS(priv->model_id) ? SYN_NEWABS : SYN_OLDABS;
  
-       printk(KERN_INFO "Synaptics Touchpad, model: %ld, fw: %ld.%ld, id: %#lx, caps: %#lx/%#lx/%#lx\n",
-               SYN_ID_MODEL(priv->identity),
-               SYN_ID_MAJOR(priv->identity), SYN_ID_MINOR(priv->identity),
-               priv->model_id, priv->capabilities, priv->ext_cap, priv->ext_cap_0c);
+       psmouse_info(psmouse,
+                    "Touchpad model: %ld, fw: %ld.%ld, id: %#lx, caps: %#lx/%#lx/%#lx\n",
+                    SYN_ID_MODEL(priv->identity),
+                    SYN_ID_MAJOR(priv->identity), SYN_ID_MINOR(priv->identity),
+                    priv->model_id,
+                    priv->capabilities, priv->ext_cap, priv->ext_cap_0c);
  
 +      if (synaptics_init_led(psmouse) < 0)
 +              goto init_fail;
 +
        set_input_params(psmouse->dev, priv);
  
        /*
@@@ -127,10 -139,11 +139,13 @@@ struct synaptics_hw_state 
        unsigned int down:1;
        unsigned char ext_buttons;
        signed char scroll;
+       /* As reported in last AGM-CONTACT packets */
+       struct synaptics_mt_state mt_state;
  };
  
 +struct synaptics_led;
 +
  struct synaptics_data {
        /* Data read from the touchpad */
        unsigned long int model_id;             /* Model-ID */
  
        struct serio *pt_port;                  /* Pass-through serio port */
  
-       struct synaptics_hw_state mt;           /* current gesture packet */
+       struct synaptics_mt_state mt_state;     /* Current mt finger state */
+       bool mt_state_lost;                     /* mt_state may be incorrect */
+       /*
+        * Last received Advanced Gesture Mode (AGM) packet. An AGM packet
+        * contains position data for a second contact, at half resolution.
+        */
+       struct synaptics_hw_state agm;
+       bool agm_pending;                       /* new AGM packet received */
 +      struct synaptics_led *led;
  };
  
  void synaptics_module_init(void);
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -38,7 -41,7 +41,8 @@@ obj-$(CONFIG_DM_MIRROR)               += dm-mirror.
  obj-$(CONFIG_DM_LOG_USERSPACE)        += dm-log-userspace.o
  obj-$(CONFIG_DM_ZERO)         += dm-zero.o
  obj-$(CONFIG_DM_RAID) += dm-raid.o
+ obj-$(CONFIG_DM_THIN_PROVISIONING)    += dm-thin-pool.o
 +obj-$(CONFIG_DM_RAID45)               += dm-raid45.o dm-log.o dm-memcache.o
  
  ifeq ($(CONFIG_DM_UEVENT),y)
  dm-mod-objs                   += dm-uevent.o
index 4bce57c,0000000..f4e98b7
mode 100644,000000..100644
--- /dev/null
@@@ -1,258 -1,0 +1,259 @@@
 +/*
 + * (C) Copyright 2008 Hewlett-Packard Development Company, L.P
 + *
 + * This file is released under the GPL.
 + */
 +
 +#include "dm-path-selector.h"
 +
 +#include <linux/slab.h>
++#include <linux/module.h>
 +
 +#define DM_MSG_PREFIX "multipath least-pending"
 +
 +/*-----------------------------------------------------------------
 +* Path-handling code, paths are held in lists
 +*---------------------------------------------------------------*/
 +struct path_info {
 +       struct list_head list;
 +       struct dm_path *path;
 +       unsigned repeat_count;
 +       atomic_t io_count;
 +};
 +
 +static void free_paths(struct list_head *paths)
 +{
 +       struct path_info *pi, *next;
 +
 +       list_for_each_entry_safe(pi, next, paths, list) {
 +              list_del(&pi->list);
 +              kfree(pi);
 +       }
 +}
 +
 +/*-----------------------------------------------------------------
 + * Least-pending selector
 + *---------------------------------------------------------------*/
 +
 +#define LPP_MIN_IO     1
 +
 +struct selector {
 +       struct list_head valid_paths;
 +       struct list_head invalid_paths;
 +};
 +
 +static struct selector *alloc_selector(void)
 +{
 +       struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
 +
 +       if (s) {
 +              INIT_LIST_HEAD(&s->valid_paths);
 +              INIT_LIST_HEAD(&s->invalid_paths);
 +       }
 +
 +       return s;
 +}
 +
 +static int lpp_create(struct path_selector *ps, unsigned argc, char **argv)
 +{
 +       struct selector *s;
 +
 +       s = alloc_selector();
 +       if (!s)
 +              return -ENOMEM;
 +
 +       ps->context = s;
 +       return 0;
 +}
 +
 +static void lpp_destroy(struct path_selector *ps)
 +{
 +       struct selector *s = ps->context;
 +
 +       free_paths(&s->valid_paths);
 +       free_paths(&s->invalid_paths);
 +       kfree(s);
 +       ps->context = NULL;
 +}
 +
 +static int lpp_status(struct path_selector *ps, struct dm_path *path,
 +                      status_type_t type, char *result, unsigned int maxlen)
 +{
 +       struct path_info *pi;
 +       int sz = 0;
 +
 +       if (!path)
 +              switch (type) {
 +              case STATUSTYPE_INFO:
 +                      DMEMIT("1 ");
 +              break;
 +              case STATUSTYPE_TABLE:
 +                      DMEMIT("0 ");
 +              break;
 +              }
 +       else {
 +              pi = path->pscontext;
 +              switch (type) {
 +              case STATUSTYPE_INFO:
 +                      DMEMIT("%u:%u ", pi->repeat_count,
 +                                       atomic_read(&pi->io_count));
 +              break;
 +              case STATUSTYPE_TABLE:
 +              break;
 +              }
 +      }
 +
 +       return sz;
 +}
 +
 +/*
 + * Called during initialisation to register each path with an
 + * optional repeat_count.
 + */
 +static int lpp_add_path(struct path_selector *ps, struct dm_path *path,
 +                      int argc, char **argv, char **error)
 +{
 +       struct selector *s = ps->context;
 +       struct path_info *pi;
 +       unsigned repeat_count = LPP_MIN_IO;
 +
 +      if (argc > 1) {
 +              *error = "least-pending ps: incorrect number of arguments";
 +              return -EINVAL;
 +      }
 +
 +       /* First path argument is number of I/Os before switching path */
 +       if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
 +              *error = "least-pending ps: invalid repeat count";
 +              return -EINVAL;
 +       }
 +
 +       /* allocate the path */
 +       pi = kmalloc(sizeof(*pi), GFP_KERNEL);
 +       if (!pi) {
 +              *error = "least-pending ps: Error allocating path context";
 +              return -ENOMEM;
 +       }
 +
 +       pi->path = path;
 +       pi->repeat_count = repeat_count;
 +       atomic_set(&pi->io_count, 0);
 +
 +       path->pscontext = pi;
 +
 +       list_add(&pi->list, &s->valid_paths);
 +
 +       return 0;
 +}
 +
 +static void lpp_fail_path(struct path_selector *ps, struct dm_path *p)
 +{
 +       struct selector *s = ps->context;
 +       struct path_info *pi = p->pscontext;
 +
 +       if (!pi)
 +              return;
 +
 +       atomic_set(&pi->io_count, 0);
 +
 +       list_move(&pi->list, &s->invalid_paths);
 +}
 +
 +static int lpp_reinstate_path(struct path_selector *ps, struct dm_path *p)
 +{
 +       struct selector *s = ps->context;
 +       struct path_info *pi = p->pscontext;
 +
 +       if (!pi)
 +              return 1;
 +
 +       list_move(&pi->list, &s->valid_paths);
 +
 +       return 0;
 +}
 +
 +static struct dm_path *lpp_select_path(struct path_selector *ps,
 +                                      unsigned *repeat_count,
 +                                      size_t nr_bytes)
 +{
 +       struct selector *s = ps->context;
 +       struct path_info *pi, *next, *least_io_path = NULL;
 +       struct list_head *paths;
 +
 +       if (list_empty(&s->valid_paths))
 +              return NULL;
 +
 +       paths = &s->valid_paths;
 +
 +       list_for_each_entry_safe(pi, next, paths, list) {
 +              if (!least_io_path || atomic_read(&least_io_path->io_count) > atomic_read(&pi->io_count))
 +                      least_io_path = pi;
 +              if (!atomic_read(&least_io_path->io_count))
 +                      break;
 +       }
 +
 +       if (!least_io_path)
 +              return NULL;
 +
 +       atomic_inc(&least_io_path->io_count);
 +       *repeat_count = least_io_path->repeat_count;
 +
 +       return least_io_path->path;
 +}
 +
 +static int lpp_end_io(struct path_selector *ps, struct dm_path *path,
 +                    size_t nr_bytes)
 +{
 +       struct path_info *pi = NULL;
 +
 +       pi = path->pscontext;
 +       if (!pi)
 +              return 1;
 +
 +       atomic_dec(&pi->io_count);
 +
 +       return 0;
 +}
 +
 +static struct path_selector_type lpp_ps = {
 +       .name = "least-pending",
 +       .module = THIS_MODULE,
 +       .table_args = 1,
 +       .info_args = 0,
 +       .create = lpp_create,
 +       .destroy = lpp_destroy,
 +       .status = lpp_status,
 +       .add_path = lpp_add_path,
 +       .fail_path = lpp_fail_path,
 +       .reinstate_path = lpp_reinstate_path,
 +       .select_path = lpp_select_path,
 +       .end_io = lpp_end_io,
 +};
 +
 +static int __init dm_lpp_init(void)
 +{
 +       int r = dm_register_path_selector(&lpp_ps);
 +
 +       if (r < 0)
 +              DMERR("register failed %d", r);
 +
 +       DMINFO("version 1.0.0 loaded");
 +
 +       return r;
 +}
 +
 +static void __exit dm_lpp_exit(void)
 +{
 +       int r = dm_unregister_path_selector(&lpp_ps);
 +
 +       if (r < 0)
 +              DMERR("unregister failed %d", r);
 +}
 +
 +module_init(dm_lpp_init);
 +module_exit(dm_lpp_exit);
 +
 +MODULE_DESCRIPTION(DM_NAME " least-pending multipath path selector");
 +MODULE_AUTHOR("Sakshi Chaitanya Veni <vsakshi@hp.com>");
 +MODULE_LICENSE("GPL");
 +
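
lpp_select_path() above is meant to walk the valid-path list tracking the path with the fewest in-flight I/Os, bailing out of the scan as soon as an idle path is found. A standalone sketch of that selection logic, using plain ints instead of atomics and kernel lists (all names hypothetical):

#include <stdio.h>

struct fake_path { const char *name; int io_count; };

/* Return the path with the fewest pending I/Os; stop early on an idle one. */
static struct fake_path *least_pending(struct fake_path *p, int n)
{
	struct fake_path *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (!best || p[i].io_count < best->io_count)
			best = &p[i];
		if (!best->io_count)
			break;
	}
	return best;
}

int main(void)
{
	struct fake_path paths[] = { { "pathA", 3 }, { "pathB", 1 }, { "pathC", 2 } };

	printf("selected %s\n", least_pending(paths, 3)->name);	/* prints pathB */
	return 0;
}

The in-kernel selector additionally bumps the winner's io_count when it is chosen, and lpp_end_io() drops it again on completion, which is what keeps the counts meaningful across concurrent requests.
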
index abfcd5f,0000000..2d7d914
mode 100644,000000..100644
--- /dev/null
@@@ -1,302 -1,0 +1,303 @@@
 +/*
 + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
 + *
 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
 + *
 + * Device-mapper memory object handling:
 + *
 + * o allocate/free total_pages in a per client page pool.
 + *
 + * o allocate/free memory objects with chunks (1..n) of
 + *   pages_per_chunk pages hanging off.
 + *
 + * This file is released under the GPL.
 + */
 +
 +#define       DM_MEM_CACHE_VERSION    "0.2"
 +
 +#include "dm.h"
 +#include "dm-memcache.h"
 +#include <linux/dm-io.h>
 +#include <linux/slab.h>
++#include <linux/module.h>
 +
 +struct dm_mem_cache_client {
 +      spinlock_t lock;
 +      mempool_t *objs_pool;
 +      struct page_list *free_list;
 +      unsigned objects;
 +      unsigned chunks;
 +      unsigned pages_per_chunk;
 +      unsigned free_pages;
 +      unsigned total_pages;
 +};
 +
 +/*
 + * Free pages and page_list elements of client.
 + */
 +static void free_cache_pages(struct page_list *list)
 +{
 +      while (list) {
 +              struct page_list *pl = list;
 +
 +              list = pl->next;
 +              BUG_ON(!pl->page);
 +              __free_page(pl->page);
 +              kfree(pl);
 +      }
 +}
 +
 +/*
 + * Alloc number of pages and page_list elements as required by client.
 + */
 +static struct page_list *alloc_cache_pages(unsigned pages)
 +{
 +      struct page_list *pl, *ret = NULL;
 +      struct page *page;
 +
 +      while (pages--) {
 +              page = alloc_page(GFP_NOIO);
 +              if (!page)
 +                      goto err;
 +
 +              pl = kmalloc(sizeof(*pl), GFP_NOIO);
 +              if (!pl) {
 +                      __free_page(page);
 +                      goto err;
 +              }
 +
 +              pl->page = page;
 +              pl->next = ret;
 +              ret = pl;
 +      }
 +
 +      return ret;
 +
 +err:
 +      free_cache_pages(ret);
 +      return NULL;
 +}
 +
 +/*
 + * Allocate page_list elements from the pool to chunks of the memory object.
 + */
 +static void alloc_chunks(struct dm_mem_cache_client *cl,
 +                       struct dm_mem_cache_object *obj)
 +{
 +      unsigned chunks = cl->chunks;
 +      unsigned long flags;
 +
 +      local_irq_save(flags);
 +      local_irq_disable();
 +      while (chunks--) {
 +              unsigned p = cl->pages_per_chunk;
 +
 +              obj[chunks].pl = NULL;
 +
 +              while (p--) {
 +                      struct page_list *pl;
 +
 +                      /* Take next element from free list */
 +                      spin_lock(&cl->lock);
 +                      pl = cl->free_list;
 +                      BUG_ON(!pl);
 +                      cl->free_list = pl->next;
 +                      spin_unlock(&cl->lock);
 +
 +                      pl->next = obj[chunks].pl;
 +                      obj[chunks].pl = pl;
 +              }
 +      }
 +
 +      local_irq_restore(flags);
 +}
 +
 +/*
 + * Free page_list elements putting them back onto free list
 + */
 +static void free_chunks(struct dm_mem_cache_client *cl,
 +                      struct dm_mem_cache_object *obj)
 +{
 +      unsigned chunks = cl->chunks;
 +      unsigned long flags;
 +      struct page_list *next, *pl;
 +
 +      local_irq_save(flags);
 +      local_irq_disable();
 +      while (chunks--) {
 +              for (pl = obj[chunks].pl; pl; pl = next) {
 +                      next = pl->next;
 +
 +                      spin_lock(&cl->lock);
 +                      pl->next = cl->free_list;
 +                      cl->free_list = pl;
 +                      cl->free_pages++;
 +                      spin_unlock(&cl->lock);
 +              }
 +      }
 +
 +      local_irq_restore(flags);
 +}
 +
 +/*
 + * Create/destroy dm memory cache client resources.
 + */
 +struct dm_mem_cache_client *
 +dm_mem_cache_client_create(unsigned objects, unsigned chunks,
 +                         unsigned pages_per_chunk)
 +{
 +      unsigned total_pages = objects * chunks * pages_per_chunk;
 +      struct dm_mem_cache_client *client;
 +
 +      BUG_ON(!total_pages);
 +      client = kzalloc(sizeof(*client), GFP_KERNEL);
 +      if (!client)
 +              return ERR_PTR(-ENOMEM);
 +
 +      client->objs_pool = mempool_create_kmalloc_pool(objects,
 +                              chunks * sizeof(struct dm_mem_cache_object));
 +      if (!client->objs_pool)
 +              goto err;
 +
 +      client->free_list = alloc_cache_pages(total_pages);
 +      if (!client->free_list)
 +              goto err1;
 +
 +      spin_lock_init(&client->lock);
 +      client->objects = objects;
 +      client->chunks = chunks;
 +      client->pages_per_chunk = pages_per_chunk;
 +      client->free_pages = client->total_pages = total_pages;
 +      return client;
 +
 +err1:
 +      mempool_destroy(client->objs_pool);
 +err:
 +      kfree(client);
 +      return ERR_PTR(-ENOMEM);
 +}
 +EXPORT_SYMBOL(dm_mem_cache_client_create);
 +
 +void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
 +{
 +      BUG_ON(cl->free_pages != cl->total_pages);
 +      free_cache_pages(cl->free_list);
 +      mempool_destroy(cl->objs_pool);
 +      kfree(cl);
 +}
 +EXPORT_SYMBOL(dm_mem_cache_client_destroy);
 +
 +/*
 + * Grow a client's cache by an amount of pages.
 + *
 + * Don't call from interrupt context!
 + */
 +int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
 +{
 +      unsigned pages = objects * cl->chunks * cl->pages_per_chunk;
 +      struct page_list *pl, *last;
 +
 +      BUG_ON(!pages);
 +      pl = alloc_cache_pages(pages);
 +      if (!pl)
 +              return -ENOMEM;
 +
 +      last = pl;
 +      while (last->next)
 +              last = last->next;
 +
 +      spin_lock_irq(&cl->lock);
 +      last->next = cl->free_list;
 +      cl->free_list = pl;
 +      cl->free_pages += pages;
 +      cl->total_pages += pages;
 +      cl->objects += objects;
 +      spin_unlock_irq(&cl->lock);
 +
 +      mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
 +      return 0;
 +}
 +EXPORT_SYMBOL(dm_mem_cache_grow);
 +
 +/* Shrink a client's cache by an amount of pages */
 +int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
 +{
 +      int r;
 +      unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
 +      unsigned long flags;
 +      struct page_list *last = NULL, *pl, *pos;
 +
 +      BUG_ON(!pages);
 +
 +      spin_lock_irqsave(&cl->lock, flags);
 +      pl = pos = cl->free_list;
 +      while (p-- && pos->next) {
 +              last = pos;
 +              pos = pos->next;
 +      }
 +
 +      if (++p)
 +              r = -ENOMEM;
 +      else {
 +              r = 0;
 +              cl->free_list = pos;
 +              cl->free_pages -= pages;
 +              cl->total_pages -= pages;
 +              cl->objects -= objects;
 +              last->next = NULL;
 +      }
 +      spin_unlock_irqrestore(&cl->lock, flags);
 +
 +      if (!r) {
 +              free_cache_pages(pl);
 +              mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
 +      }
 +
 +      return r;
 +}
 +EXPORT_SYMBOL(dm_mem_cache_shrink);
 +
 +/*
 + * Allocate/free a memory object
 + *
 + * Can be called from interrupt context
 + */
 +struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
 +{
 +      int r = 0;
 +      unsigned pages = cl->chunks * cl->pages_per_chunk;
 +      unsigned long flags;
 +      struct dm_mem_cache_object *obj;
 +
 +      obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
 +      if (!obj)
 +              return ERR_PTR(-ENOMEM);
 +
 +      spin_lock_irqsave(&cl->lock, flags);
 +      if (pages > cl->free_pages)
 +              r = -ENOMEM;
 +      else
 +              cl->free_pages -= pages;
 +      spin_unlock_irqrestore(&cl->lock, flags);
 +
 +      if (r) {
 +              mempool_free(obj, cl->objs_pool);
 +              return ERR_PTR(r);
 +      }
 +
 +      alloc_chunks(cl, obj);
 +      return obj;
 +}
 +EXPORT_SYMBOL(dm_mem_cache_alloc);
 +
 +void dm_mem_cache_free(struct dm_mem_cache_client *cl,
 +                     struct dm_mem_cache_object *obj)
 +{
 +      free_chunks(cl, obj);
 +      mempool_free(obj, cl->objs_pool);
 +}
 +EXPORT_SYMBOL(dm_mem_cache_free);
 +
 +MODULE_DESCRIPTION(DM_NAME " dm memory cache");
 +MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
 +MODULE_LICENSE("GPL");
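
dm-memcache exports a small create/alloc/free/destroy lifecycle. A minimal sketch of a caller built only from the functions shown above (the 16/2/4 sizing and the function name are illustrative, not from this commit):

/* Needs "dm-memcache.h"; sizing: 16 objects of 2 chunks x 4 pages each. */
static int memcache_example(void)
{
	struct dm_mem_cache_client *cl = dm_mem_cache_client_create(16, 2, 4);
	struct dm_mem_cache_object *obj;

	if (IS_ERR(cl))
		return PTR_ERR(cl);

	obj = dm_mem_cache_alloc(cl);		/* usable from interrupt context */
	if (IS_ERR(obj)) {
		dm_mem_cache_client_destroy(cl);
		return PTR_ERR(obj);
	}

	/* ... hang I/O off the obj[0].pl / obj[1].pl page lists ... */

	dm_mem_cache_free(cl, obj);
	dm_mem_cache_client_destroy(cl);
	return 0;
}
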
index 5312a16,0000000..fecc9b7
mode 100644,000000..100644
--- /dev/null
@@@ -1,4691 -1,0 +1,4692 @@@
 +/*
 + * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
 + *
 + * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
 + *
 + * This file is released under the GPL.
 + *
 + *
 + * Linux 2.6 Device Mapper RAID4 and RAID5 target.
 + *
 + * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
 + *
 + *
 + * Supports the following ATARAID vendor solutions (and SNIA DDF):
 + *
 + *    Adaptec HostRAID ASR
 + *    SNIA DDF1
 + *    Highpoint 37x
 + *    Highpoint 45x
 + *    Intel IMSM
 + *    Jmicron ATARAID
 + *    LSI Logic MegaRAID
 + *    NVidia RAID
 + *    Promise FastTrack
 + *    Silicon Image Medley
 + *    VIA Software RAID
 + *
 + * via the dmraid application.
 + *
 + *
 + * Features:
 + *
 + *    o RAID4 with dedicated and selectable parity device
 + *    o RAID5 with rotating parity (left+right, symmetric+asymmetric)
 + *    o recovery of out of sync device for initial
 + *      RAID set creation or after dead drive replacement
 + *    o run time optimization of xor algorithm used to calculate parity
 + *
 + *
 + * Thanks to MD for:
 + *    o the raid address calculation algorithm
 + *    o the base of the biovec <-> page list copier.
 + *
 + *
 + * Uses region hash to keep track of how many writes are in flight to
 + * regions in order to use dirty log to keep state of regions to recover:
 + *
 + *    o clean regions (those which are synchronized
 + *    and don't have write io in flight)
 + *    o dirty regions (those with write io in flight)
 + *
 + *
 + * On startup, any dirty regions are migrated to the
 + * 'nosync' state and are subject to recovery by the daemon.
 + *
 + * See raid_ctr() for table definition.
 + *
 + * ANALYZEME: recovery bandwidth
 + */
 +
 +static const char *version = "v0.2597k";
 +
 +#include "dm.h"
 +#include "dm-memcache.h"
 +#include "dm-raid45.h"
 +
 +#include <linux/kernel.h>
 +#include <linux/vmalloc.h>
 +#include <linux/raid/xor.h>
 +#include <linux/slab.h>
++#include <linux/module.h>
 +
 +#include <linux/bio.h>
 +#include <linux/dm-io.h>
 +#include <linux/dm-dirty-log.h>
 +#include <linux/dm-region-hash.h>
 +
 +
 +/*
 + * Configurable parameters
 + */
 +
 +/* Minimum/maximum and default # of selectable stripes. */
 +#define       STRIPES_MIN             8
 +#define       STRIPES_MAX             16384
 +#define       STRIPES_DEFAULT         80
 +
 +/* Maximum and default chunk size in sectors if not set in constructor. */
 +#define       CHUNK_SIZE_MIN          8
 +#define       CHUNK_SIZE_MAX          16384
 +#define       CHUNK_SIZE_DEFAULT      64
 +
 +/* Default io size in sectors if not set in constructor. */
 +#define       IO_SIZE_MIN             CHUNK_SIZE_MIN
 +#define       IO_SIZE_DEFAULT         IO_SIZE_MIN
 +
 +/* Recover io size default in sectors. */
 +#define       RECOVER_IO_SIZE_MIN             64
 +#define       RECOVER_IO_SIZE_DEFAULT         256
 +
 +/* Default, minimum and maximum percentage of recover io bandwidth. */
 +#define       BANDWIDTH_DEFAULT       10
 +#define       BANDWIDTH_MIN           1
 +#define       BANDWIDTH_MAX           100
 +
 +/* # of parallel recovered regions */
 +#define RECOVERY_STRIPES_MIN  1
 +#define RECOVERY_STRIPES_MAX  64
 +#define RECOVERY_STRIPES_DEFAULT      RECOVERY_STRIPES_MIN
 +/*
 + * END Configurable parameters
 + */
 +
 +#define       TARGET  "dm-raid45"
 +#define       DAEMON  "kraid45d"
 +#define       DM_MSG_PREFIX   TARGET
 +
 +#define       SECTORS_PER_PAGE        (PAGE_SIZE >> SECTOR_SHIFT)
 +
 +/* Amount/size for __xor(). */
 +#define       XOR_SIZE        PAGE_SIZE
 +
 +/* Ticks to run xor_speed() test for. */
 +#define       XOR_SPEED_TICKS 5
 +
 +/* Check value in range. */
 +#define       range_ok(i, min, max)   (i >= min && i <= max)
 +
 +/* Structure access macros. */
 +/* Derive raid_set from stripe_cache pointer. */
 +#define       RS(x)   container_of(x, struct raid_set, sc)
 +
 +/* Page reference. */
 +#define PAGE(stripe, p)  ((stripe)->obj[p].pl->page)
 +
 +/* Stripe chunk reference. */
 +#define CHUNK(stripe, p) ((stripe)->chunk + p)
 +
 +/* Bio list reference. */
 +#define       BL(stripe, p, rw)       (stripe->chunk[p].bl + rw)
 +#define       BL_CHUNK(chunk, rw)     (chunk->bl + rw)
 +
 +/* Page list reference. */
 +#define       PL(stripe, p)           (stripe->obj[p].pl)
 +/* END: structure access macros. */
 +
 +/* Factor out to dm-bio-list.h */
 +static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
 +{
 +      bio->bi_next = bl->head;
 +      bl->head = bio;
 +
 +      if (!bl->tail)
 +              bl->tail = bio;
 +}
 +
 +/* Factor out to dm.h */
 +#define TI_ERR_RET(str, ret) \
 +      do { ti->error = str; return ret; } while (0);
 +#define TI_ERR(str)     TI_ERR_RET(str, -EINVAL)
 +
 +/* Macro to define IO flag accessor inline functions. */
 +#define       BITOPS(name, what, var, flag) \
 +static inline int TestClear ## name ## what(struct var *v) \
 +{ return test_and_clear_bit(flag, &v->io.flags); } \
 +static inline int TestSet ## name ## what(struct var *v) \
 +{ return test_and_set_bit(flag, &v->io.flags); } \
 +static inline void Clear ## name ## what(struct var *v) \
 +{ clear_bit(flag, &v->io.flags); } \
 +static inline void Set ## name ## what(struct var *v) \
 +{ set_bit(flag, &v->io.flags); } \
 +static inline int name ## what(struct var *v) \
 +{ return test_bit(flag, &v->io.flags); }
 +
 +/*-----------------------------------------------------------------
 + * Stripe cache
 + *
 + * Cache for all reads and writes to raid sets (operational or degraded)
 + *
 + * We need to run all data to and from a RAID set through this cache,
 + * because parity chunks need to get calculated from data chunks
 + * or, in the degraded/resynchronization case, missing chunks need
 + * to be reconstructed using the other chunks of the stripe.
 + *---------------------------------------------------------------*/
 +/* Unique kmem cache name suffix # counter. */
 +static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
 +
 +/* A chunk within a stripe (holds bios hanging off). */
 +/* IO status flags for chunks of a stripe. */
 +enum chunk_flags {
 +      CHUNK_DIRTY,            /* Pages of chunk dirty; need writing. */
 +      CHUNK_ERROR,            /* IO error on any chunk page. */
 +      CHUNK_IO,               /* Allow/prohibit IO on chunk pages. */
 +      CHUNK_LOCKED,           /* Chunk pages locked during IO. */
 +      CHUNK_MUST_IO,          /* Chunk must io. */
 +      CHUNK_UNLOCK,           /* Enforce chunk unlock. */
 +      CHUNK_UPTODATE,         /* Chunk pages are uptodate. */
 +};
 +
 +enum bl_type {
 +      WRITE_QUEUED = WRITE + 1,
 +      WRITE_MERGED,
 +      NR_BL_TYPES,    /* Must be last one! */
 +};
 +struct stripe_chunk {
 +      atomic_t cnt;           /* Reference count. */
 +      struct stripe *stripe;  /* Backpointer to stripe for endio(). */
 +      /* Bio lists for reads, writes, and writes merged. */
 +      struct bio_list bl[NR_BL_TYPES];
 +      struct {
 +              unsigned long flags; /* IO status flags. */
 +      } io;
 +};
 +
 +/* Define chunk bit operations. */
 +BITOPS(Chunk, Dirty,   stripe_chunk, CHUNK_DIRTY)
 +BITOPS(Chunk, Error,   stripe_chunk, CHUNK_ERROR)
 +BITOPS(Chunk, Io,      stripe_chunk, CHUNK_IO)
 +BITOPS(Chunk, Locked,  stripe_chunk, CHUNK_LOCKED)
 +BITOPS(Chunk, MustIo,  stripe_chunk, CHUNK_MUST_IO)
 +BITOPS(Chunk, Unlock,  stripe_chunk, CHUNK_UNLOCK)
 +BITOPS(Chunk, Uptodate,        stripe_chunk, CHUNK_UPTODATE)
 +
 +/*
 + * Stripe linked list indexes. Keep order, because the stripe
 + * and the stripe cache rely on the first 3!
 + */
 +enum list_types {
 +      LIST_FLUSH,     /* Stripes to flush for io. */
 +      LIST_ENDIO,     /* Stripes to endio. */
 +      LIST_LRU,       /* Least recently used stripes. */
 +      SC_NR_LISTS,    /* # of lists in stripe cache. */
 +      LIST_HASH = SC_NR_LISTS,        /* Hashed stripes. */
 +      LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
 +      STRIPE_NR_LISTS,/* To size array in struct stripe. */
 +};
 +
 +/* Addressing region recovery. */
 +struct recover_addr {
 +      struct dm_region *reg;  /* Actual region to recover. */
 +      sector_t pos;   /* Position within region to recover. */
 +      sector_t end;   /* End of region to recover. */
 +};
 +
 +/* A stripe: the io object to handle all reads and writes to a RAID set. */
 +struct stripe {
 +      atomic_t cnt;                   /* Reference count. */
 +      struct stripe_cache *sc;        /* Backpointer to stripe cache. */
 +
 +      /*
 +       * 4 linked lists:
 +       *   o io list to flush io
 +       *   o endio list
 +       *   o LRU list to put stripes w/o reference count on
 +       *   o stripe cache hash
 +       */
 +      struct list_head lists[STRIPE_NR_LISTS];
 +
 +      sector_t key;    /* Hash key. */
 +      region_t region; /* Region stripe is mapped to. */
 +
 +      struct {
 +              unsigned long flags;    /* Stripe state flags (see below). */
 +
 +              /*
 +               * Pending ios in flight:
 +               *
 +               * used to control move of stripe to endio list
 +               */
 +              atomic_t pending;
 +
 +              /* Sectors to read and write for multi page stripe sets. */
 +              unsigned size;
 +      } io;
 +
 +      /* Address region recovery. */
 +      struct recover_addr *recover;
 +
 +      /* Lock on stripe (Future: for clustering). */
 +      void *lock;
 +
 +      struct {
 +              unsigned short parity;  /* Parity chunk index. */
 +              short recover;          /* Recovery chunk index. */
 +      } idx;
 +
 +      /*
 +       * This stripe's memory cache object (dm-mem-cache);
 +       * i.e. the io chunk pages.
 +       */
 +      struct dm_mem_cache_object *obj;
 +
 +      /* Array of stripe sets (dynamically allocated). */
 +      struct stripe_chunk chunk[0];
 +};
 +
 +/* States stripes can be in (flags field). */
 +enum stripe_states {
 +      STRIPE_ERROR,           /* io error on stripe. */
 +      STRIPE_MERGED,          /* Writes got merged to be written. */
 +      STRIPE_RBW,             /* Read-before-write stripe. */
 +      STRIPE_RECONSTRUCT,     /* Reconstruct of a missing chunk required. */
 +      STRIPE_RECONSTRUCTED,   /* Missing chunk has been reconstructed. */
 +      STRIPE_RECOVER,         /* Stripe used for RAID set recovery. */
 +};
 +
 +/* Define stripe bit operations. */
 +BITOPS(Stripe, Error,       stripe, STRIPE_ERROR)
 +BITOPS(Stripe, Merged,        stripe, STRIPE_MERGED)
 +BITOPS(Stripe, RBW,         stripe, STRIPE_RBW)
 +BITOPS(Stripe, Reconstruct,   stripe, STRIPE_RECONSTRUCT)
 +BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
 +BITOPS(Stripe, Recover,             stripe, STRIPE_RECOVER)
 +
 +/* A stripe hash. */
 +struct stripe_hash {
 +      struct list_head *hash;
 +      unsigned buckets;
 +      unsigned mask;
 +      unsigned prime;
 +      unsigned shift;
 +};
 +
 +enum sc_lock_types {
 +      LOCK_ENDIO,     /* Protect endio list. */
 +      NR_LOCKS,       /* To size array in struct stripe_cache. */
 +};
 +
 +/* A stripe cache. */
 +struct stripe_cache {
 +      /* Stripe hash. */
 +      struct stripe_hash hash;
 +
 +      spinlock_t locks[NR_LOCKS];     /* Locks to protect lists. */
 +
 +      /* Stripes with io to flush, stripes to endio and LRU lists. */
 +      struct list_head lists[SC_NR_LISTS];
 +
 +      /* Slab cache to allocate stripes from. */
 +      struct {
 +              struct kmem_cache *cache;       /* Cache itself. */
 +              char name[32];  /* Unique name. */
 +      } kc;
 +
 +      struct dm_io_client *dm_io_client; /* dm-io client resource context. */
 +
 +      /* dm-mem-cache client resource context. */
 +      struct dm_mem_cache_client *mem_cache_client;
 +
 +      int stripes_parm;           /* # stripes parameter from constructor. */
 +      atomic_t stripes;           /* actual # of stripes in cache. */
 +      atomic_t stripes_to_set;    /* # of stripes to resize cache to. */
 +      atomic_t stripes_last;      /* last # of stripes in cache. */
 +      atomic_t active_stripes;    /* actual # of active stripes in cache. */
 +
 +      /* REMOVEME: */
 +      atomic_t active_stripes_max; /* actual # of active stripes in cache. */
 +};
 +
 +/* Flag specs for raid_dev */
 +enum raid_dev_flags {
 +      DEV_FAILED,     /* Device failed. */
 +      DEV_IO_QUEUED,  /* Io got queued to device. */
 +};
 +
 +/* The raid device in a set. */
 +struct raid_dev {
 +      struct dm_dev *dev;
 +      sector_t start;         /* Offset to map to. */
 +      struct {        /* Using struct to be able to BITOPS(). */
 +              unsigned long flags;    /* raid_dev_flags. */
 +      } io;
 +};
 +
 +BITOPS(Dev, Failed,   raid_dev, DEV_FAILED)
 +BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)
 +
 +/* Flags spec for raid_set. */
 +enum raid_set_flags {
 +      RS_CHECK_OVERWRITE,     /* Check for chunk overwrites. */
 +      RS_DEAD,                /* RAID set inoperational. */
 +      RS_DEAD_ENDIO_MESSAGE,  /* RAID set dead endio one-off message. */
 +      RS_DEGRADED,            /* Io errors on RAID device. */
 +      RS_DEVEL_STATS,         /* REMOVEME: display status information. */
 +      RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
 +      RS_PROHIBIT_WRITES,     /* Prohibit writes on device failure. */
 +      RS_RECOVER,             /* Do recovery. */
 +      RS_RECOVERY_BANDWIDTH,  /* Allow recovery bandwidth (delayed bios). */
 +      RS_SC_BUSY,             /* Stripe cache busy -> send an event. */
 +      RS_SUSPEND,             /* Suspend RAID set. */
 +};
 +
 +/* REMOVEME: devel stats counters. */
 +enum stats_types {
 +      S_BIOS_READ,
 +      S_BIOS_ADDED_READ,
 +      S_BIOS_ENDIO_READ,
 +      S_BIOS_WRITE,
 +      S_BIOS_ADDED_WRITE,
 +      S_BIOS_ENDIO_WRITE,
 +      S_CAN_MERGE,
 +      S_CANT_MERGE,
 +      S_CONGESTED,
 +      S_DM_IO_READ,
 +      S_DM_IO_WRITE,
 +      S_BANDWIDTH,
 +      S_BARRIER,
 +      S_BIO_COPY_PL_NEXT,
 +      S_DEGRADED,
 +      S_DELAYED_BIOS,
 +      S_FLUSHS,
 +      S_HITS_1ST,
 +      S_IOS_POST,
 +      S_INSCACHE,
 +      S_MAX_LOOKUP,
 +      S_CHUNK_LOCKED,
 +      S_NO_BANDWIDTH,
 +      S_NOT_CONGESTED,
 +      S_NO_RW,
 +      S_NOSYNC,
 +      S_OVERWRITE,
 +      S_PROHIBITCHUNKIO,
 +      S_RECONSTRUCT_EI,
 +      S_RECONSTRUCT_DEV,
 +      S_RECONSTRUCT_SET,
 +      S_RECONSTRUCTED,
 +      S_REQUEUE,
 +      S_STRIPE_ERROR,
 +      S_SUM_DELAYED_BIOS,
 +      S_XORS,
 +      S_NR_STATS,     /* # of stats counters. Must be last! */
 +};
 +
 +/* Status type -> string mappings. */
 +struct stats_map {
 +      const enum stats_types type;
 +      const char *str;
 +};
 +
 +static struct stats_map stats_map[] = {
 +      { S_BIOS_READ, "r=" },
 +      { S_BIOS_ADDED_READ, "/" },
 +      { S_BIOS_ENDIO_READ, "/" },
 +      { S_BIOS_WRITE, " w=" },
 +      { S_BIOS_ADDED_WRITE, "/" },
 +      { S_BIOS_ENDIO_WRITE, "/" },
 +      { S_DM_IO_READ, " rc=" },
 +      { S_DM_IO_WRITE, " wc=" },
 +      { S_BANDWIDTH, "\nbw=" },
 +      { S_NO_BANDWIDTH, " no_bw=" },
 +      { S_BARRIER, "\nbarrier=" },
 +      { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
 +      { S_CAN_MERGE, "\nmerge=" },
 +      { S_CANT_MERGE, "/no_merge=" },
 +      { S_CHUNK_LOCKED, "\nchunk_locked=" },
 +      { S_CONGESTED, "\ncgst=" },
 +      { S_NOT_CONGESTED, "/not_cgst=" },
 +      { S_DEGRADED, "\ndegraded=" },
 +      { S_DELAYED_BIOS, "\ndel_bios=" },
 +      { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
 +      { S_FLUSHS, "\nflushs=" },
 +      { S_HITS_1ST, "\nhits_1st=" },
 +      { S_IOS_POST, " ios_post=" },
 +      { S_INSCACHE, " inscache=" },
 +      { S_MAX_LOOKUP, " maxlookup=" },
 +      { S_NO_RW, "\nno_rw=" },
 +      { S_NOSYNC, " nosync=" },
 +      { S_OVERWRITE, " ovr=" },
 +      { S_PROHIBITCHUNKIO, " prhbt_io=" },
 +      { S_RECONSTRUCT_EI, "\nrec_ei=" },
 +      { S_RECONSTRUCT_DEV, " rec_dev=" },
 +      { S_RECONSTRUCT_SET, " rec_set=" },
 +      { S_RECONSTRUCTED, " rec=" },
 +      { S_REQUEUE, " requeue=" },
 +      { S_STRIPE_ERROR, " stripe_err=" },
 +      { S_XORS, " xors=" },
 +};
 +
 +/*
 + * A RAID set.
 + */
 +#define       dm_rh_client    dm_region_hash
 +enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
 +typedef void (*xor_function_t)(unsigned count, unsigned long **data);
 +struct raid_set {
 +      struct dm_target *ti;   /* Target pointer. */
 +
 +      struct {
 +              unsigned long flags;    /* State flags. */
 +              struct mutex in_lock;   /* Protects central input list below. */
 +              struct mutex xor_lock;  /* Protects xor algorithm set. */
 +              struct bio_list in;     /* Pending ios (central input list). */
 +              struct bio_list work;   /* ios work set. */
 +              wait_queue_head_t suspendq;     /* suspend synchronization. */
 +              atomic_t in_process;    /* counter of queued bios (suspendq). */
 +              atomic_t in_process_max;/* counter of queued bios max. */
 +
 +              /* io work. */
 +              struct workqueue_struct *wq;
 +              struct delayed_work dws_do_raid;        /* For main worker. */
 +              struct work_struct ws_do_table_event;   /* For event worker. */
 +      } io;
 +
 +      /* Stripe locking abstraction. */
 +      struct dm_raid45_locking_type *locking;
 +
 +      struct stripe_cache sc; /* Stripe cache for this set. */
 +
 +      /* Xor optimization. */
 +      struct {
 +              struct xor_func *f;
 +              unsigned chunks;
 +              unsigned speed;
 +      } xor;
 +
 +      /* Recovery parameters. */
 +      struct recover {
 +              struct dm_dirty_log *dl;        /* Dirty log. */
 +              struct dm_rh_client *rh;        /* Region hash. */
 +
 +              struct dm_io_client *dm_io_client; /* recovery dm-io client. */
 +              /* dm-mem-cache client resource context for recovery stripes. */
 +              struct dm_mem_cache_client *mem_cache_client;
 +
 +              struct list_head stripes;       /* List of recovery stripes. */
 +
 +              region_t nr_regions;
 +              region_t nr_regions_to_recover;
 +              region_t nr_regions_recovered;
 +              unsigned long start_jiffies;
 +              unsigned long end_jiffies;
 +
 +              unsigned bandwidth;      /* Recovery bandwidth [%]. */
 +              unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
 +              unsigned bandwidth_parm; /*  " constructor parm. */
 +              unsigned io_size;        /* recovery io size <= region size. */
 +              unsigned io_size_parm;   /* recovery io size ctr parameter. */
 +              unsigned recovery;       /* Recovery allowed/prohibited. */
 +              unsigned recovery_stripes; /* # of parallel recovery stripes. */
 +
 +              /* recovery io throttling. */
 +              atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
 +              unsigned long last_jiffies;
 +      } recover;
 +
 +      /* RAID set parameters. */
 +      struct {
 +              struct raid_type *raid_type;    /* RAID type (eg, RAID4). */
 +              unsigned raid_parms;    /* # variable raid parameters. */
 +
 +              unsigned chunk_size;    /* Sectors per chunk. */
 +              unsigned chunk_size_parm;
 +              unsigned chunk_shift;   /* rsector chunk size shift. */
 +
 +              unsigned io_size;       /* Sectors per io. */
 +              unsigned io_size_parm;
 +              unsigned io_mask;       /* Mask for bio_copy_page_list(). */
 +              unsigned io_inv_mask;   /* Mask for raid_address(). */
 +
 +              sector_t sectors_per_dev;       /* Sectors per device. */
 +
 +              atomic_t failed_devs;           /* Amount of devices failed. */
 +
 +              /* Index of device to initialize. */
 +              int dev_to_init;
 +              int dev_to_init_parm;
 +
 +              /* Raid devices dynamically allocated. */
 +              unsigned raid_devs;     /* # of RAID devices below. */
 +              unsigned data_devs;     /* # of RAID data devices. */
 +
 +              int ei;         /* index of failed RAID device. */
 +
 +              /* Index of dedicated parity device (i.e. RAID4). */
 +              int pi;
 +              int pi_parm;    /* constructor parm for status output. */
 +      } set;
 +
 +      /* REMOVEME: devel stats counters. */
 +      atomic_t stats[S_NR_STATS];
 +
 +      /* Dynamically allocated temporary pointers for xor(). */
 +      unsigned long **data;
 +
 +      /* Dynamically allocated RAID devices. Alignment? */
 +      struct raid_dev dev[0];
 +};
 +
 +/* Define RAID set bit operations. */
 +BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
 +BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
 +BITOPS(RS, Dead, raid_set, RS_DEAD)
 +BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
 +BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
 +BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
 +BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
 +BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
 +BITOPS(RS, Recover, raid_set, RS_RECOVER)
 +BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
 +BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
 +#undef BITOPS
 +
 +/*-----------------------------------------------------------------
 + * Raid-4/5 set structures.
 + *---------------------------------------------------------------*/
 +/* RAID level definitions. */
 +enum raid_level {
 +      raid4,
 +      raid5,
 +};
 +
 +/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
 +enum raid_algorithm {
 +      none,
 +      left_asym,
 +      right_asym,
 +      left_sym,
 +      right_sym,
 +};
 +
 +struct raid_type {
 +      const char *name;               /* RAID algorithm. */
 +      const char *descr;              /* Descriptor text for logging. */
 +      const unsigned parity_devs;     /* # of parity devices. */
 +      const unsigned minimal_devs;    /* minimal # of devices in set. */
 +      const enum raid_level level;            /* RAID level. */
 +      const enum raid_algorithm algorithm;    /* RAID algorithm. */
 +};
 +
 +/* Supported raid types and properties. */
 +static struct raid_type raid_types[] = {
 +      {"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
 +      {"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
 +      {"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
 +      {"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
 +      {"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
 +};
 +
 +/* Address as calculated by raid_address(). */
 +struct raid_address {
 +      sector_t key;           /* Hash key (address of stripe % chunk_size). */
 +      unsigned di, pi;        /* Data and parity disks index. */
 +};
 +
 +/* REMOVEME: reset statistics counters. */
 +static void stats_reset(struct raid_set *rs)
 +{
 +      unsigned s = S_NR_STATS;
 +
 +      while (s--)
 +              atomic_set(rs->stats + s, 0);
 +}
 +
 +/*----------------------------------------------------------------
 + * RAID set management routines.
 + *--------------------------------------------------------------*/
 +/*
 + * Begin small helper functions.
 + */
 +/* No need to be called from region hash indirectly at dm_rh_dec(). */
 +static void wake_dummy(void *context) {}
 +
 +/* Return # of io reference. */
 +static int io_ref(struct raid_set *rs)
 +{
 +      return atomic_read(&rs->io.in_process);
 +}
 +
 +/* Get an io reference. */
 +static void io_get(struct raid_set *rs)
 +{
 +      int p = atomic_inc_return(&rs->io.in_process);
 +
 +      if (p > atomic_read(&rs->io.in_process_max))
 +              atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
 +}
 +
 +/* Put the io reference and conditionally wake io waiters. */
 +static void io_put(struct raid_set *rs)
 +{
 +      /* Intel: rebuild data corrupter? */
 +      if (atomic_dec_and_test(&rs->io.in_process))
 +              wake_up(&rs->io.suspendq);
 +      else
 +              BUG_ON(io_ref(rs) < 0);
 +}
 +
 +/* Wait until all io has been processed. */
 +static void wait_ios(struct raid_set *rs)
 +{
 +      wait_event(rs->io.suspendq, !io_ref(rs));
 +}
 +
 +/* Queue (optionally delayed) io work. */
 +static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
 +{
 +      queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
 +}
 +
 +/* Queue io work immediately (called from region hash too). */
 +static void wake_do_raid(void *context)
 +{
 +      struct raid_set *rs = context;
 +
 +      queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
 +}
 +
 +/* Calculate device sector offset. */
 +static sector_t _sector(struct raid_set *rs, struct bio *bio)
 +{
 +      sector_t sector = bio->bi_sector;
 +
 +      sector_div(sector, rs->set.data_devs);
 +      return sector;
 +}
 +
 +/* Return # of active stripes in stripe cache. */
 +static int sc_active(struct stripe_cache *sc)
 +{
 +      return atomic_read(&sc->active_stripes);
 +}
 +
 +/* Stripe cache busy indicator. */
 +static int sc_busy(struct raid_set *rs)
 +{
 +      return sc_active(&rs->sc) >
 +             atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
 +}
 +
 +/* Set chunks states. */
 +enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
 +static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
 +{
 +      switch (type) {
 +      case CLEAN:
 +              ClearChunkDirty(chunk);
 +              break;
 +      case DIRTY:
 +              SetChunkDirty(chunk);
 +              break;
 +      case ERROR:
 +              SetChunkError(chunk);
 +              SetStripeError(chunk->stripe);
 +              return;
 +      default:
 +              BUG();
 +      }
 +
 +      SetChunkUptodate(chunk);
 +      SetChunkIo(chunk);
 +      ClearChunkError(chunk);
 +}
 +
 +/* Return region state for a sector. */
 +static int region_state(struct raid_set *rs, sector_t sector,
 +                      enum dm_rh_region_states state)
 +{
 +      struct dm_rh_client *rh = rs->recover.rh;
 +      region_t region = dm_rh_sector_to_region(rh, sector);
 +
 +      return !!(dm_rh_get_state(rh, region, 1) & state);
 +}
 +
 +/*
 + * Return true in case a chunk should be read/written
 + *
 + * Conditions to read/write:
 + *    o chunk not uptodate
 + *    o chunk dirty
 + *
 + * Conditions to avoid io:
 + *    o io already ongoing on chunk
 + *    o io explicitly prohibited
 + */
 +static int chunk_io(struct stripe_chunk *chunk)
 +{
 +      /* 2nd run optimization (flag set below on first run). */
 +      if (TestClearChunkMustIo(chunk))
 +              return 1;
 +
 +      /* Avoid io if prohibited or a locked chunk. */
 +      if (!ChunkIo(chunk) || ChunkLocked(chunk))
 +              return 0;
 +
 +      if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
 +              SetChunkMustIo(chunk); /* 2nd run optimization. */
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Call a function on each chunk needing io unless device failed. */
 +static unsigned for_each_io_dev(struct stripe *stripe,
 +                              void (*f_io)(struct stripe *stripe, unsigned p))
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned p, r = 0;
 +
 +      for (p = 0; p < rs->set.raid_devs; p++) {
 +              if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
 +                      f_io(stripe, p);
 +                      r++;
 +              }
 +      }
 +
 +      return r;
 +}
 +
 +/*
 + * Index of device to calculate parity on.
 + *
 + * Either the parity device index *or* the selected
 + * device to init after a spare replacement.
 + */
 +static int dev_for_parity(struct stripe *stripe, int *sync)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);
 +
 +      *sync = !r;
 +
 +      /* Reconstruct a particular device ?. */
 +      if (r && rs->set.dev_to_init > -1)
 +              return rs->set.dev_to_init;
 +      else if (rs->set.raid_type->level == raid4)
 +              return rs->set.pi;
 +      else if (!StripeRecover(stripe))
 +              return stripe->idx.parity;
 +      else
 +              return -1;
 +}
 +
 +/* RAID set congested function. */
 +static int rs_congested(void *congested_data, int bdi_bits)
 +{
 +      int r;
 +      unsigned p;
 +      struct raid_set *rs = congested_data;
 +
 +      if (sc_busy(rs) || RSSuspend(rs) || RSProhibitWrites(rs))
 +              r = 1;
 +      else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
 +              /* If any of our component devices are overloaded. */
 +              struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);
 +
 +              r |= bdi_congested(&q->backing_dev_info, bdi_bits);
 +      }
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
 +      return r;
 +}
 +
 +/* RAID device degrade check. */
 +static void rs_check_degrade_dev(struct raid_set *rs,
 +                               struct stripe *stripe, unsigned p)
 +{
 +      if (TestSetDevFailed(rs->dev + p))
 +              return;
 +
 +      /* Throw an event in case of member device errors. */
 +      if ((atomic_inc_return(&rs->set.failed_devs) >
 +           rs->set.raid_type->parity_devs) &&
 +           !TestSetRSDead(rs)) {
 +              /* Display RAID set dead message once. */
 +              unsigned p;
 +              char buf[BDEVNAME_SIZE];
 +
 +              DMERR("FATAL: too many devices failed -> RAID set broken");
 +              for (p = 0; p < rs->set.raid_devs; p++) {
 +                      if (DevFailed(rs->dev + p))
 +                              DMERR("device /dev/%s failed",
 +                                    bdevname(rs->dev[p].dev->bdev, buf));
 +              }
 +      }
 +
 +      /* Only log the first member error. */
 +      if (!TestSetRSDegraded(rs)) {
 +              char buf[BDEVNAME_SIZE];
 +
 +              /* Store index for recovery. */
 +              rs->set.ei = p;
 +              DMERR("CRITICAL: %sio error on device /dev/%s "
 +                    "in region=%llu; DEGRADING RAID set\n",
 +                    stripe ? "" : "FAKED ",
 +                    bdevname(rs->dev[p].dev->bdev, buf),
 +                    (unsigned long long) (stripe ? stripe->key : 0));
 +              DMERR("further device error messages suppressed");
 +      }
 +
 +      /* Prohibit further writes to allow userspace to update metadata. */
 +      SetRSProhibitWrites(rs);
 +      schedule_work(&rs->io.ws_do_table_event);
 +}
 +
 +/* RAID set degrade check. */
 +static void rs_check_degrade(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned p = rs->set.raid_devs;
 +
 +      while (p--) {
 +              if (ChunkError(CHUNK(stripe, p)))
 +                      rs_check_degrade_dev(rs, stripe, p);
 +      }
 +}
 +
 +/* Lookup a RAID device by name or by major:minor number. */
 +static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
 +{
 +      unsigned p;
 +      struct raid_dev *dev;
 +
 +      /*
 +       * Must be an incremental loop, because the device array
 +       * can have empty slots still on calls from raid_ctr()
 +       */
 +      for (dev = rs->dev, p = 0;
 +           dev->dev && p < rs->set.raid_devs;
 +           dev++, p++) {
 +              if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
 +                      return p;
 +      }
 +
 +      return -ENODEV;
 +}
 +/*
 + * End small helper functions.
 + */
 +
 +/*
 + * Stripe hash functions
 + */
 +/* Initialize/destroy stripe hash. */
 +static int hash_init(struct stripe_hash *hash, unsigned stripes)
 +{
 +      unsigned buckets = roundup_pow_of_two(stripes >> 1);
 +      static unsigned hash_primes[] = {
 +              /* Table of primes for hash_fn/table size optimization. */
 +              1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
 +              1543, 3079, 6151, 12289, 24593, 49157, 98317,
 +      };
 +
 +      /* Allocate stripe hash buckets. */
 +      hash->hash = vmalloc(buckets * sizeof(*hash->hash));
 +      if (!hash->hash)
 +              return -ENOMEM;
 +
 +      hash->buckets = buckets;
 +      hash->mask = buckets - 1;
 +      hash->shift = ffs(buckets);
 +      if (hash->shift >= ARRAY_SIZE(hash_primes))
 +              hash->shift = ARRAY_SIZE(hash_primes) - 1;
 +
 +      BUG_ON(hash->shift < 2);
 +      hash->prime = hash_primes[hash->shift];
 +
 +      /* Initialize buckets. */
 +      while (buckets--)
 +              INIT_LIST_HEAD(hash->hash + buckets);
 +      return 0;
 +}
 +
 +static void hash_exit(struct stripe_hash *hash)
 +{
 +      if (hash->hash) {
 +              vfree(hash->hash);
 +              hash->hash = NULL;
 +      }
 +}
 +
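 +/*
 + * Map a stripe key to a bucket: multiply by a prime sized to the table,
 + * shift to mix in higher key bits and mask into the power-of-two range.
 + */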
 +static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
 +{
 +      return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
 +}
 +
 +static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
 +{
 +      return hash->hash + hash_fn(hash, key);
 +}
 +
 +/* Insert an entry into a hash. */
 +static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
 +{
 +      list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
 +}
 +
 +/* Lookup an entry in the stripe hash. */
 +static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
 +{
 +      unsigned look = 0;
 +      struct stripe *stripe;
 +      struct list_head *bucket = hash_bucket(&sc->hash, key);
 +
 +      list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
 +              look++;
 +
 +              if (stripe->key == key) {
 +                      /* REMOVEME: statistics. */
 +                      if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
 +                              atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
 +                      return stripe;
 +              }
 +      }
 +
 +      return NULL;
 +}
 +
 +/* Resize the stripe cache hash on size changes. */
 +static int sc_hash_resize(struct stripe_cache *sc)
 +{
 +      /* Resize indicated? */
 +      if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
 +              int r;
 +              struct stripe_hash hash;
 +
 +              r = hash_init(&hash, atomic_read(&sc->stripes));
 +              if (r)
 +                      return r;
 +
 +              if (sc->hash.hash) {
 +                      unsigned b = sc->hash.buckets;
 +                      struct list_head *pos, *tmp;
 +
 +                      /* Walk old buckets and insert into new. */
 +                      while (b--) {
 +                              list_for_each_safe(pos, tmp, sc->hash.hash + b)
 +                                  stripe_insert(&hash,
 +                                                list_entry(pos, struct stripe,
 +                                                           lists[LIST_HASH]));
 +                      }
 +
 +              }
 +
 +              hash_exit(&sc->hash);
 +              memcpy(&sc->hash, &hash, sizeof(sc->hash));
 +              atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
 +      }
 +
 +      return 0;
 +}
 +/* End stripe hash functions. */
 +
 +/* List add, delete, push and pop functions. */
 +/* Delete a list entry if it is on a list. */
 +#define       DEL_LIST(lh) \
 +      if (!list_empty(lh)) \
 +              list_del_init(lh);
 +
 +/* Delete stripe from hash. */
 +static void stripe_hash_del(struct stripe *stripe)
 +{
 +      DEL_LIST(stripe->lists + LIST_HASH);
 +}
 +
 +/* Return stripe reference count. */
 +static inline int stripe_ref(struct stripe *stripe)
 +{
 +      return atomic_read(&stripe->cnt);
 +}
 +
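 +/* Add stripe to flush list unless being reconstructed or already queued. */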
 +static void stripe_flush_add(struct stripe *stripe)
 +{
 +      struct stripe_cache *sc = stripe->sc;
 +      struct list_head *lh = stripe->lists + LIST_FLUSH;
 +
 +      if (!StripeReconstruct(stripe) && list_empty(lh))
 +              list_add_tail(lh, sc->lists + LIST_FLUSH);
 +}
 +
 +/*
 + * Add stripe to LRU (inactive) list.
 + *
 + * Need lock, because of concurrent access from message interface.
 + */
 +static void stripe_lru_add(struct stripe *stripe)
 +{
 +      if (!StripeRecover(stripe)) {
 +              struct list_head *lh = stripe->lists + LIST_LRU;
 +
 +              if (list_empty(lh))
 +                      list_add_tail(lh, stripe->sc->lists + LIST_LRU);
 +      }
 +}
 +
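 +/*
 + * Pop the first stripe off the given stripe cache list (NULL if empty);
 + * relies on 'sc' and 'stripe' locals at the expansion site.
 + */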
 +#define POP_LIST(list) \
 +      do { \
 +              if (list_empty(sc->lists + (list))) \
 +                      stripe = NULL; \
 +              else { \
 +                      stripe = list_first_entry(sc->lists + (list), \
 +                                                struct stripe, \
 +                                                lists[(list)]); \
 +                      list_del_init(stripe->lists + (list)); \
 +              } \
 +      } while (0);
 +
 +/* Pop an available stripe off the LRU list. */
 +static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
 +{
 +      struct stripe *stripe;
 +
 +      POP_LIST(LIST_LRU);
 +      return stripe;
 +}
 +
 +/* Pop an available stripe off the io list. */
 +static struct stripe *stripe_io_pop(struct stripe_cache *sc)
 +{
 +      struct stripe *stripe;
 +
 +      POP_LIST(LIST_FLUSH);
 +      return stripe;
 +}
 +
 +/* Push a stripe safely onto the endio list to be handled by do_endios(). */
 +static void stripe_endio_push(struct stripe *stripe)
 +{
 +      unsigned long flags;
 +      struct stripe_cache *sc = stripe->sc;
 +      struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
 +                       *sc_list = sc->lists + LIST_ENDIO;
 +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
 +
 +      /* This runs in parallel with do_endios(). */
 +      spin_lock_irqsave(lock, flags);
 +      if (list_empty(stripe_list))
 +              list_add_tail(stripe_list, sc_list);
 +      spin_unlock_irqrestore(lock, flags);
 +
 +      wake_do_raid(RS(sc)); /* Wake myself. */
 +}
 +
 +/* Pop a stripe safely off the endio list. */
 +static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
 +{
 +      struct stripe *stripe;
 +      spinlock_t *lock = sc->locks + LOCK_ENDIO;
 +
 +      /* This runs in parallel with endio(). */
 +      spin_lock_irq(lock);
 +      POP_LIST(LIST_ENDIO)
 +      spin_unlock_irq(lock);
 +      return stripe;
 +}
 +#undef POP_LIST
 +
 +/*
 + * Stripe cache locking functions
 + */
 +/* Dummy lock function for single host RAID4+5. */
 +static void *no_lock(sector_t key, enum dm_lock_type type)
 +{
 +      return &no_lock;
 +}
 +
 +/* Dummy unlock function for single host RAID4+5. */
 +static void no_unlock(void *lock_handle)
 +{
 +}
 +
 +/* No locking (for single host RAID 4+5). */
 +static struct dm_raid45_locking_type locking_none = {
 +      .lock = no_lock,
 +      .unlock = no_unlock,
 +};
 +
 +/* Lock a stripe (for clustering). */
 +static int
 +stripe_lock(struct stripe *stripe, int rw, sector_t key)
 +{
 +      stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ? DM_RAID45_SHARED : DM_RAID45_EX);
 +      return stripe->lock ? 0 : -EPERM;
 +}
 +
 +/* Unlock a stripe (for clustering). */
 +static void stripe_unlock(struct stripe *stripe)
 +{
 +      RS(stripe->sc)->locking->unlock(stripe->lock);
 +      stripe->lock = NULL;
 +}
 +
 +/* Test io pending on stripe. */
 +static int stripe_io_ref(struct stripe *stripe)
 +{
 +      return atomic_read(&stripe->io.pending);
 +}
 +
 +static void stripe_io_get(struct stripe *stripe)
 +{
 +      if (atomic_inc_return(&stripe->io.pending) == 1)
 +              /* REMOVEME: statistics */
 +              atomic_inc(&stripe->sc->active_stripes);
 +      else
 +              BUG_ON(stripe_io_ref(stripe) < 0);
 +}
 +
 +static void stripe_io_put(struct stripe *stripe)
 +{
 +      if (atomic_dec_and_test(&stripe->io.pending)) {
 +              if (unlikely(StripeRecover(stripe)))
 +                      /* Don't put recovery stripe on endio list. */
 +                      wake_do_raid(RS(stripe->sc));
 +              else
 +                      /* Add regular stripe to endio list and wake daemon. */
 +                      stripe_endio_push(stripe);
 +
 +              /* REMOVEME: statistics */
 +              atomic_dec(&stripe->sc->active_stripes);
 +      } else
 +              BUG_ON(stripe_io_ref(stripe) < 0);
 +}
 +
 +/* Take stripe reference out. */
 +static int stripe_get(struct stripe *stripe)
 +{
 +      int r;
 +      struct list_head *lh = stripe->lists + LIST_LRU;
 +
 +      /* Delete stripe from LRU (inactive) list if on. */
 +      DEL_LIST(lh);
 +      BUG_ON(stripe_ref(stripe) < 0);
 +
 +      /* Lock stripe on first reference */
 +      r = (atomic_inc_return(&stripe->cnt) == 1) ?
 +          stripe_lock(stripe, WRITE, stripe->key) : 0;
 +
 +      return r;
 +}
 +#undef DEL_LIST
 +
 +/* Return references on a chunk. */
 +static int chunk_ref(struct stripe_chunk *chunk)
 +{
 +      return atomic_read(&chunk->cnt);
 +}
 +
 +/* Take out reference on a chunk. */
 +static int chunk_get(struct stripe_chunk *chunk)
 +{
 +      return atomic_inc_return(&chunk->cnt);
 +}
 +
 +/* Drop reference on a chunk. */
 +static void chunk_put(struct stripe_chunk *chunk)
 +{
 +      BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
 +}
 +
 +/*
 + * Drop reference on a stripe.
 + *
 + * Move it to list of LRU stripes if zero.
 + */
 +static void stripe_put(struct stripe *stripe)
 +{
 +      if (atomic_dec_and_test(&stripe->cnt)) {
 +              BUG_ON(stripe_io_ref(stripe));
 +              stripe_unlock(stripe);
 +      } else
 +              BUG_ON(stripe_ref(stripe) < 0);
 +}
 +
 +/* Helper needed by for_each_io_dev(). */
 +static void stripe_get_references(struct stripe *stripe, unsigned p)
 +{
 +      /*
 +       * Another one to reference the stripe in
 +       * order to protect vs. LRU list moves.
 +       */
 +      io_get(RS(stripe->sc)); /* Global io references. */
 +      stripe_get(stripe);
 +      stripe_io_get(stripe);  /* One for each chunk io. */
 +}
 +
 +/* Helper for endio() to put all taken references. */
 +static void stripe_put_references(struct stripe *stripe)
 +{
 +      stripe_io_put(stripe);  /* One for each chunk io. */
 +      stripe_put(stripe);
 +      io_put(RS(stripe->sc));
 +}
 +
 +/*
 + * Stripe cache functions.
 + */
 +/*
 + * Invalidate all chunks (i.e. their pages) of a stripe.
 + *
 + * I only keep state for the whole chunk.
 + */
 +static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
 +{
 +      chunk->io.flags = 0;
 +}
 +
 +static void
 +stripe_chunks_invalidate(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--)
 +              stripe_chunk_invalidate(CHUNK(stripe, p));
 +}
 +
 +/* Prepare stripe for (re)use. */
 +static void stripe_invalidate(struct stripe *stripe)
 +{
 +      stripe->io.flags = 0;
 +      stripe->idx.parity = stripe->idx.recover = -1;
 +      stripe_chunks_invalidate(stripe);
 +}
 +
 +/*
 + * Allow io on all chunks of a stripe.
 + * If not set, IO will not occur; i.e. it's prohibited.
 + *
 + * Actual IO submission for allowed chunks depends
 + * on their !uptodate or dirty state.
 + */
 +static void stripe_allow_io(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--)
 +              SetChunkIo(CHUNK(stripe, p));
 +}
 +
 +/* Initialize a stripe. */
 +static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
 +{
 +      unsigned i, p = RS(sc)->set.raid_devs;
 +
 +      /* Work all io chunks. */
 +      while (p--) {
 +              struct stripe_chunk *chunk = CHUNK(stripe, p);
 +
 +              atomic_set(&chunk->cnt, 0);
 +              chunk->stripe = stripe;
 +              i = ARRAY_SIZE(chunk->bl);
 +              while (i--)
 +                      bio_list_init(chunk->bl + i);
 +      }
 +
 +      stripe->sc = sc;
 +
 +      i = ARRAY_SIZE(stripe->lists);
 +      while (i--)
 +              INIT_LIST_HEAD(stripe->lists + i);
 +
 +      stripe->io.size = RS(sc)->set.io_size;
 +      atomic_set(&stripe->cnt, 0);
 +      atomic_set(&stripe->io.pending, 0);
 +      stripe_invalidate(stripe);
 +}
 +
 +/* Number of pages per chunk. */
 +static inline unsigned chunk_pages(unsigned sectors)
 +{
 +      return dm_div_up(sectors, SECTORS_PER_PAGE);
 +}
 +
 +/* Number of pages per stripe. */
 +static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
 +{
 +      return chunk_pages(io_size) * rs->set.raid_devs;
 +}
 +
 +/* Initialize part of page_list (recovery). */
 +static void stripe_zero_pl_part(struct stripe *stripe, int p,
 +                              unsigned start, unsigned count)
 +{
 +      unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
 +      /* Get offset into the page_list. */
 +      struct page_list *pl = pl_elem(PL(stripe, p), o);
 +
 +      BUG_ON(!pl);
 +      while (pl && pages--) {
 +              BUG_ON(!pl->page);
 +              memset(page_address(pl->page), 0, PAGE_SIZE);
 +              pl = pl->next;
 +      }
 +}
 +
 +/* Initialize parity chunk of stripe. */
 +static void stripe_zero_chunk(struct stripe *stripe, int p)
 +{
 +      if (p > -1)
 +              stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
 +}
 +
 +/* Return dynamic stripe structure size. */
 +static size_t stripe_size(struct raid_set *rs)
 +{
 +      return sizeof(struct stripe) +
 +                    rs->set.raid_devs * sizeof(struct stripe_chunk);
 +}
 +
 +/* Allocate a stripe and its memory object. */
 +/* XXX adjust to cope with stripe cache and recovery stripe caches. */
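 +/*
 + * SC_GROW: grow the memory cache by one object before allocating;
 + * SC_KEEP: allocate from the already provisioned memory cache.
 + */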
 +enum grow { SC_GROW, SC_KEEP };
 +static struct stripe *stripe_alloc(struct stripe_cache *sc,
 +                                 struct dm_mem_cache_client *mc,
 +                                 enum grow grow)
 +{
 +      int r;
 +      struct stripe *stripe;
 +
 +      stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
 +      if (stripe) {
 +              /* Grow the dm-mem-cache by one object. */
 +              if (grow == SC_GROW) {
 +                      r = dm_mem_cache_grow(mc, 1);
 +                      if (r)
 +                              goto err_free;
 +              }
 +
 +              stripe->obj = dm_mem_cache_alloc(mc);
 +              if (IS_ERR(stripe->obj))
 +                      goto err_shrink;
 +
 +              stripe_init(sc, stripe);
 +      }
 +
 +      return stripe;
 +
 +err_shrink:
 +      if (grow == SC_GROW)
 +              dm_mem_cache_shrink(mc, 1);
 +err_free:
 +      kmem_cache_free(sc->kc.cache, stripe);
 +      return NULL;
 +}
 +
 +/*
 + * Free a stripe's memory object, shrink the
 + * memory cache and free the stripe itself.
 + */
 +static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
 +{
 +      dm_mem_cache_free(mc, stripe->obj);
 +      dm_mem_cache_shrink(mc, 1);
 +      kmem_cache_free(stripe->sc->kc.cache, stripe);
 +}
 +
 +/* Free the recovery stripes. */
 +static void stripe_recover_free(struct raid_set *rs)
 +{
 +      struct recover *rec = &rs->recover;
 +      struct dm_mem_cache_client *mc;
 +
 +      mc = rec->mem_cache_client;
 +      rec->mem_cache_client = NULL;
 +      if (mc) {
 +              struct stripe *stripe;
 +
 +              while (!list_empty(&rec->stripes)) {
 +                      stripe = list_first_entry(&rec->stripes, struct stripe,
 +                                                lists[LIST_RECOVER]);
 +                      list_del(stripe->lists + LIST_RECOVER);
 +                      kfree(stripe->recover);
 +                      stripe_free(stripe, mc);
 +              }
 +
 +              dm_mem_cache_client_destroy(mc);
 +              dm_io_client_destroy(rec->dm_io_client);
 +              rec->dm_io_client = NULL;
 +      }
 +}
 +
 +/* Grow stripe cache. */
 +static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
 +{
 +      int r = 0;
 +
 +      /* Try to allocate this many (additional) stripes. */
 +      while (stripes--) {
 +              struct stripe *stripe =
 +                      stripe_alloc(sc, sc->mem_cache_client, grow);
 +
 +              if (likely(stripe)) {
 +                      stripe_lru_add(stripe);
 +                      atomic_inc(&sc->stripes);
 +              } else {
 +                      r = -ENOMEM;
 +                      break;
 +              }
 +      }
 +
 +      return r ? r : sc_hash_resize(sc);
 +}
 +
 +/* Shrink stripe cache. */
 +static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
 +{
 +      int r = 0;
 +
 +      /* Try to get unused stripe from LRU list. */
 +      while (stripes--) {
 +              struct stripe *stripe;
 +
 +              stripe = stripe_lru_pop(sc);
 +              if (stripe) {
 +                      /* An LRU stripe may never have ios pending! */
 +                      BUG_ON(stripe_io_ref(stripe));
 +                      BUG_ON(stripe_ref(stripe));
 +                      atomic_dec(&sc->stripes);
 +                      /* Remove from hash if on before deletion. */
 +                      stripe_hash_del(stripe);
 +                      stripe_free(stripe, sc->mem_cache_client);
 +              } else {
 +                      r = -ENOENT;
 +                      break;
 +              }
 +      }
 +
 +      /* Check if stats are still sane. */
 +      if (atomic_read(&sc->active_stripes_max) >
 +          atomic_read(&sc->stripes))
 +              atomic_set(&sc->active_stripes_max, 0);
 +
 +      if (r)
 +              return r;
 +
 +      return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
 +}
 +
 +/* Create stripe cache and recovery. */
 +static int sc_init(struct raid_set *rs, unsigned stripes)
 +{
 +      unsigned i, r, rstripes;
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +      struct recover *rec = &rs->recover;
 +      struct mapped_device *md;
 +      struct gendisk *disk;
 +
 +      /* Initialize lists and locks. */
 +      i = ARRAY_SIZE(sc->lists);
 +      while (i--)
 +              INIT_LIST_HEAD(sc->lists + i);
 +
 +      INIT_LIST_HEAD(&rec->stripes);
 +
 +      /* Initialize endio and LRU list locks. */
 +      i = NR_LOCKS;
 +      while (i--)
 +              spin_lock_init(sc->locks + i);
 +
 +      /* Initialize atomic variables. */
 +      atomic_set(&sc->stripes, 0);
 +      atomic_set(&sc->stripes_to_set, 0);
 +      atomic_set(&sc->active_stripes, 0);
 +      atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
 +
 +      /*
 +       * We need a runtime unique # to suffix the kmem cache name
 +       * because we'll have one for each active RAID set.
 +       */
 +      md = dm_table_get_md(rs->ti->table);
 +      disk = dm_disk(md);
 +      snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
 +               disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
 +      sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
 +                                       0, 0, NULL);
 +      if (!sc->kc.cache)
 +              return -ENOMEM;
 +
 +      /* Create memory cache client context for RAID stripe cache. */
 +      sc->mem_cache_client =
 +              dm_mem_cache_client_create(stripes, rs->set.raid_devs,
 +                                         chunk_pages(rs->set.io_size));
 +      if (IS_ERR(sc->mem_cache_client))
 +              return PTR_ERR(sc->mem_cache_client);
 +
 +      /* Create memory cache client context for RAID recovery stripe(s). */
 +      rstripes = rec->recovery_stripes;
 +      rec->mem_cache_client =
 +              dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
 +                                         chunk_pages(rec->io_size));
 +      if (IS_ERR(rec->mem_cache_client))
 +              return PTR_ERR(rec->mem_cache_client);
 +
 +      /* Create dm-io client context for IO stripes. */
 +      sc->dm_io_client = dm_io_client_create();
 +      if (IS_ERR(sc->dm_io_client))
 +              return PTR_ERR(sc->dm_io_client);
 +
 +      /* FIXME: intermingled with stripe cache initialization. */
 +      /* Create dm-io client context for recovery stripes. */
 +      rec->dm_io_client = dm_io_client_create();
 +      if (IS_ERR(rec->dm_io_client))
 +              return PTR_ERR(rec->dm_io_client);
 +
 +      /* Allocate stripes for set recovery. */
 +      while (rstripes--) {
 +              stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
 +              if (!stripe)
 +                      return -ENOMEM;
 +
 +              stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
 +              if (!stripe->recover) {
 +                      stripe_free(stripe, rec->mem_cache_client);
 +                      return -ENOMEM;
 +              }
 +
 +              SetStripeRecover(stripe);
 +              stripe->io.size = rec->io_size;
 +              list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
 +              /* Don't add recovery stripes to LRU list! */
 +      }
 +
 +      /*
 +       * Allocate the stripe objects from the
 +       * cache and add them to the LRU list.
 +       */
 +      r = sc_grow(sc, stripes, SC_KEEP);
 +      if (!r)
 +              atomic_set(&sc->stripes_last, stripes);
 +
 +      return r;
 +}
 +
 +/* Destroy the stripe cache. */
 +static void sc_exit(struct stripe_cache *sc)
 +{
 +      struct raid_set *rs = RS(sc);
 +
 +      if (sc->kc.cache) {
 +              stripe_recover_free(rs);
 +              BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
 +              kmem_cache_destroy(sc->kc.cache);
 +              sc->kc.cache = NULL;
 +
 +              if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
 +                      dm_mem_cache_client_destroy(sc->mem_cache_client);
 +
 +              if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
 +                      dm_io_client_destroy(sc->dm_io_client);
 +
 +              hash_exit(&sc->hash);
 +      }
 +}
 +
 +/*
 + * Calculate RAID address
 + *
 + * Delivers tuple with the index of the data disk holding the chunk
 + * in the set, the parity disk's index and the start of the stripe
 + * within the address space of the set (used as the stripe cache hash key).
 + */
 +/* thx MD. */
 +static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
 +                                       struct raid_address *addr)
 +{
 +      sector_t stripe, tmp;
 +
 +      /*
 +       * chunk_number = sector / chunk_size
 +       * stripe_number = chunk_number / data_devs
 +       * di = chunk_number % data_devs
 +       */
 +      stripe = sector >> rs->set.chunk_shift;
 +      addr->di = sector_div(stripe, rs->set.data_devs);
 +
 +      switch (rs->set.raid_type->level) {
 +      case raid4:
 +              addr->pi = rs->set.pi;
 +              goto check_shift_di;
 +      case raid5:
 +              tmp = stripe;
 +              addr->pi = sector_div(tmp, rs->set.raid_devs);
 +
 +              switch (rs->set.raid_type->algorithm) {
 +              case left_asym:         /* Left asymmetric. */
 +                      addr->pi = rs->set.data_devs - addr->pi;
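 +                      /* Fall through. */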
 +              case right_asym:        /* Right asymmetric. */
 +check_shift_di:
 +                      if (addr->di >= addr->pi)
 +                              addr->di++;
 +                      break;
 +              case left_sym:          /* Left symmetric. */
 +                      addr->pi = rs->set.data_devs - addr->pi;
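 +                      /* Fall through. */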
 +              case right_sym:         /* Right symmetric. */
 +                      addr->di = (addr->pi + addr->di + 1) %
 +                                 rs->set.raid_devs;
 +                      break;
 +              case none: /* Ain't happen: RAID4 algorithm placeholder. */
 +                      BUG();
 +              }
 +      }
 +
 +      /*
 +       * Start offset of the stripe's chunk on any single device of the RAID
 +       * set, adjusted in case io size differs from chunk size.
 +       */
 +      addr->key = (stripe << rs->set.chunk_shift) +
 +                  (sector & rs->set.io_inv_mask);
 +      return addr;
 +}
 +
 +/*
 + * Copy data across between stripe pages and bio vectors.
 + *
 + * Pay attention to data alignment in stripe and bio pages.
 + */
 +static void bio_copy_page_list(int rw, struct stripe *stripe,
 +                             struct page_list *pl, struct bio *bio)
 +{
 +      unsigned i, page_offset;
 +      void *page_addr;
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct bio_vec *bv;
 +
 +      /* Get start page in page list for this sector. */
 +      i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
 +      pl = pl_elem(pl, i);
 +      BUG_ON(!pl);
 +      BUG_ON(!pl->page);
 +
 +      page_addr = page_address(pl->page);
 +      page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
 +
 +      /* Walk all segments and copy data across between bio_vecs and pages. */
 +      bio_for_each_segment(bv, bio, i) {
 +              int len = bv->bv_len, size;
 +              unsigned bio_offset = 0;
 +              void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
 +redo:
 +              size = (page_offset + len > PAGE_SIZE) ?
 +                     PAGE_SIZE - page_offset : len;
 +
 +              if (rw == READ)
 +                      memcpy(bio_addr + bio_offset,
 +                             page_addr + page_offset, size);
 +              else
 +                      memcpy(page_addr + page_offset,
 +                             bio_addr + bio_offset, size);
 +
 +              page_offset += size;
 +              if (page_offset == PAGE_SIZE) {
 +                      /*
 +                       * We reached the end of the chunk page ->
 +                       * need to refer to the next one to copy more data.
 +                       */
 +                      len -= size;
 +                      if (len) {
 +                              /* Get next page. */
 +                              pl = pl->next;
 +                              BUG_ON(!pl);
 +                              BUG_ON(!pl->page);
 +                              page_addr = page_address(pl->page);
 +                              page_offset = 0;
 +                              bio_offset += size;
 +                              /* REMOVEME: statistics. */
 +                              atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
 +                              goto redo;
 +                      }
 +              }
 +
 +              __bio_kunmap_atomic(bio_addr, KM_USER0);
 +      }
 +}
 +
 +/*
 + * Xor optimization macros.
 + */
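 +/*
 + * DECLARE_N declares pointers d0..dN-1 to the chunk buffers, DN(n) xors
 + * word n of chunks 1..N-1 into chunk 0, and X_K unrolls K consecutive
 + * word xors, so _XOR() can emit one function per (chunks, unroll) pair.
 + */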
 +/* Xor data pointer declaration and initialization macros. */
 +#define DECLARE_2     unsigned long *d0 = data[0], *d1 = data[1]
 +#define DECLARE_3     DECLARE_2, *d2 = data[2]
 +#define DECLARE_4     DECLARE_3, *d3 = data[3]
 +#define DECLARE_5     DECLARE_4, *d4 = data[4]
 +#define DECLARE_6     DECLARE_5, *d5 = data[5]
 +#define DECLARE_7     DECLARE_6, *d6 = data[6]
 +#define DECLARE_8     DECLARE_7, *d7 = data[7]
 +
 +/* Xor unroll macros. */
 +#define D2(n) d0[n] = d0[n] ^ d1[n]
 +#define D3(n) D2(n) ^ d2[n]
 +#define D4(n) D3(n) ^ d3[n]
 +#define D5(n) D4(n) ^ d4[n]
 +#define D6(n) D5(n) ^ d5[n]
 +#define D7(n) D6(n) ^ d6[n]
 +#define D8(n) D7(n) ^ d7[n]
 +
 +#define       X_2(macro, offset)      macro(offset); macro(offset + 1);
 +#define       X_4(macro, offset)      X_2(macro, offset); X_2(macro, offset + 2);
 +#define       X_8(macro, offset)      X_4(macro, offset); X_4(macro, offset + 4);
 +#define       X_16(macro, offset)     X_8(macro, offset); X_8(macro, offset + 8);
 +#define       X_32(macro, offset)     X_16(macro, offset); X_16(macro, offset + 16);
 +#define       X_64(macro, offset)     X_32(macro, offset); X_32(macro, offset + 32);
 +
 +/* Define a _xor_#chunks_#xors_per_run() function. */
 +#define       _XOR(chunks, xors_per_run) \
 +static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
 +{ \
 +      unsigned end = XOR_SIZE / sizeof(data[0]), i; \
 +      DECLARE_ ## chunks; \
 +\
 +      for (i = 0; i < end; i += xors_per_run) { \
 +              X_ ## xors_per_run(D ## chunks, i); \
 +      } \
 +}
 +
 +/* Define xor functions for 2 - 8 chunks and xors per run. */
 +#define       MAKE_XOR_PER_RUN(xors_per_run) \
 +      _XOR(2, xors_per_run); _XOR(3, xors_per_run); \
 +      _XOR(4, xors_per_run); _XOR(5, xors_per_run); \
 +      _XOR(6, xors_per_run); _XOR(7, xors_per_run); \
 +      _XOR(8, xors_per_run);
 +
 +MAKE_XOR_PER_RUN(8)   /* Define _xor_*_8() functions. */
 +MAKE_XOR_PER_RUN(16)  /* Define _xor_*_16() functions. */
 +MAKE_XOR_PER_RUN(32)  /* Define _xor_*_32() functions. */
 +MAKE_XOR_PER_RUN(64)  /* Define _xor_*_64() functions. */
 +
 +#define MAKE_XOR(xors_per_run) \
 +struct { \
 +      void (*f)(unsigned long **); \
 +} static xor_funcs ## xors_per_run[] = { \
 +      { NULL }, /* NULL pointers to optimize indexing in xor(). */ \
 +      { NULL }, \
 +      { _xor2_ ## xors_per_run }, \
 +      { _xor3_ ## xors_per_run }, \
 +      { _xor4_ ## xors_per_run }, \
 +      { _xor5_ ## xors_per_run }, \
 +      { _xor6_ ## xors_per_run }, \
 +      { _xor7_ ## xors_per_run }, \
 +      { _xor8_ ## xors_per_run }, \
 +}; \
 +\
 +static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
 +{ \
 +      /* Call respective function for amount of chunks. */ \
 +      xor_funcs ## xors_per_run[n].f(data); \
 +}
 +
 +/* Define xor_8() - xor_64 functions. */
 +MAKE_XOR(8)
 +MAKE_XOR(16)
 +MAKE_XOR(32)
 +MAKE_XOR(64)
 +/*
 + * END xor optimization macros.
 + */
 +
 +/* Maximum number of chunks, which can be xor'ed in one go. */
 +#define       XOR_CHUNKS_MAX  (ARRAY_SIZE(xor_funcs8) - 1)
 +
 +/* Wrapper to allow use of the crypto layer's xor_blocks() function. */
 +static void xor_blocks_wrapper(unsigned n, unsigned long **data)
 +{
 +      BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
 +      xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
 +}
 +
 +struct xor_func {
 +      xor_function_t f;
 +      const char *name;
 +} static xor_funcs[] = {
 +      { xor_64,  "xor_64" },
 +      { xor_32,  "xor_32" },
 +      { xor_16,  "xor_16" },
 +      { xor_8,   "xor_8"  },
 +      { xor_blocks_wrapper, "xor_blocks" },
 +};
 +
 +/*
 + * Check if a chunk has to be xored in/out:
 + *
 + * o if writes are queued
 + * o if writes are merged
 + * o if stripe is to be reconstructed
 + * o if recovery stripe
 + */
 +static inline int chunk_must_xor(struct stripe_chunk *chunk)
 +{
 +      if (ChunkUptodate(chunk)) {
 +              BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
 +                     !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));
 +
 +              if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
 +                  !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
 +                      return 1;
 +
 +              if (StripeReconstruct(chunk->stripe) ||
 +                  StripeRecover(chunk->stripe))
 +                      return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Calculate parity (xor).
 + *
 + * This indexes into the chunks of a stripe and their pages.
 + *
 + * All chunks which must be xored are xored into the indexed (@pi)
 + * chunk in maximum groups of xor.chunks.
 + */
 +static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned max_chunks = rs->xor.chunks, n = 1,
 +               o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
 +               p = rs->set.raid_devs;
 +      unsigned long **d = rs->data;
 +      xor_function_t xor_f = rs->xor.f->f;
 +
 +      BUG_ON(sector > stripe->io.size);
 +
 +      /* Address of parity page to xor into. */
 +      d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
 +
 +      while (p--) {
 +              /* Preset pointers to data pages. */
 +              if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
 +                      d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
 +
 +              /* If max chunks -> xor. */
 +              if (n == max_chunks) {
 +                      mutex_lock(&rs->io.xor_lock);
 +                      xor_f(n, d);
 +                      mutex_unlock(&rs->io.xor_lock);
 +                      n = 1;
 +              }
 +      }
 +
 +      /* If chunks -> xor. */
 +      if (n > 1) {
 +              mutex_lock(&rs->io.xor_lock);
 +              xor_f(n, d);
 +              mutex_unlock(&rs->io.xor_lock);
 +      }
 +}
 +
 +/* Common xor loop through all stripe page lists. */
 +static void common_xor(struct stripe *stripe, sector_t count,
 +                     unsigned off, unsigned pi)
 +{
 +      unsigned sector;
 +
 +      BUG_ON(!count);
 +      for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
 +              xor(stripe, pi, sector);
 +
 +      /* Set parity page uptodate and clean. */
 +      chunk_set(CHUNK(stripe, pi), CLEAN);
 +      atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
 +}
 +
 +/*
 + * Calculate parity sectors on intact stripes.
 + *
 + * Need to calculate the raid address for the recover stripe, because its
 + * chunk size differs and is typically larger than the io chunk size.
 + */
 +static void parity_xor(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int size_differs = stripe->io.size != rs->set.io_size;
 +      unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
 +               xor_size = chunk_size > io_size ? io_size : chunk_size;
 +      sector_t off;
 +
 +      /* This can be the recover stripe with a larger io size. */
 +      for (off = 0; off < io_size; off += xor_size) {
 +              /*
 +               * Recover stripe is likely bigger than regular io
 +               * ones and has no precalculated parity disk index ->
 +               * need to calculate RAID address.
 +               */
 +              if (unlikely(size_differs)) {
 +                      struct raid_address addr;
 +
 +                      raid_address(rs, (stripe->key + off) *
 +                                       rs->set.data_devs, &addr);
 +                      stripe->idx.parity = addr.pi;
 +                      stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
 +              }
 +
 +              common_xor(stripe, xor_size, off, stripe->idx.parity);
 +              chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
 +      }
 +}
 +
 +/* Reconstruct missing chunk. */
 +static void stripe_reconstruct(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      int p = rs->set.raid_devs, pr = stripe->idx.recover;
 +
 +      BUG_ON(pr < 0);
 +
 +      /* Check if all but the chunk to be reconstructed are uptodate. */
 +      while (p--)
 +              BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
 +                                               S_RECONSTRUCT_DEV));
 +      /* Zero chunk to be reconstructed. */
 +      stripe_zero_chunk(stripe, pr);
 +      common_xor(stripe, stripe->io.size, 0, pr);
 +}
 +
 +/*
 + * Recovery io throttling
 + */
 +/* Conditionally reset io counters. */
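 +/* Returns 1 when the counters were reset (at most once per second). */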
 +static int recover_io_reset(struct raid_set *rs)
 +{
 +      unsigned long j = jiffies;
 +
 +      /* Pay attention to jiffies overflows. */
 +      if (j > rs->recover.last_jiffies + HZ ||
 +          j < rs->recover.last_jiffies) {
 +              atomic_set(rs->recover.io_count + IO_WORK, 0);
 +              atomic_set(rs->recover.io_count + IO_RECOVER, 0);
 +              rs->recover.last_jiffies = j;
 +              return 1;
 +      }
 +
 +      return 0;
 +}
 +
 +/* Count ios. */
 +static void recover_io_count(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      atomic_inc(rs->recover.io_count +
 +                 (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
 +}
 +
 +/* Try getting a stripe either from the hash or from the LRU list. */
 +static struct stripe *stripe_find(struct raid_set *rs,
 +                                struct raid_address *addr)
 +{
 +      int r;
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +
 +      /* Try stripe from hash. */
 +      stripe = stripe_lookup(sc, addr->key);
 +      if (stripe) {
 +              r = stripe_get(stripe);
 +              if (r)
 +                      goto get_lock_failed;
 +
 +              atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
 +      } else {
 +              /* Not in hash -> try to get an LRU stripe. */
 +              stripe = stripe_lru_pop(sc);
 +              if (stripe) {
 +                      /*
 +                       * An LRU stripe may not be referenced
 +                       * and may never have ios pending!
 +                       */
 +                      BUG_ON(stripe_ref(stripe));
 +                      BUG_ON(stripe_io_ref(stripe));
 +
 +                      /* Remove from hash if on before reuse. */
 +                      stripe_hash_del(stripe);
 +
 +                      /* Invalidate before reinserting with changed key. */
 +                      stripe_invalidate(stripe);
 +
 +                      stripe->key = addr->key;
 +                      stripe->region = dm_rh_sector_to_region(rs->recover.rh,
 +                                                              addr->key);
 +                      stripe->idx.parity = addr->pi;
 +                      r = stripe_get(stripe);
 +                      if (r)
 +                              goto get_lock_failed;
 +
 +                      /* Insert stripe into the stripe hash. */
 +                      stripe_insert(&sc->hash, stripe);
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_INSCACHE);
 +              }
 +      }
 +
 +      return stripe;
 +
 +get_lock_failed:
 +      stripe_put(stripe);
 +      return NULL;
 +}
 +
 +/*
 + * Process end io
 + *
 + * I need to do it here because I can't do it in interrupt context.
 + */
 +/* End io all bios on a bio list. */
 +static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
 +                         int p, int error)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct bio *bio;
 +      struct page_list *pl = PL(stripe, p);
 +      struct stripe_chunk *chunk = CHUNK(stripe, p);
 +
 +      /* Update region counters. */
 +      while ((bio = bio_list_pop(bl))) {
 +              if (bio_data_dir(bio) == WRITE)
 +                      /* Drop io pending count for any writes. */
 +                      dm_rh_dec(rs->recover.rh, stripe->region);
 +              else if (!error)
 +                      /* Copy data across. */
 +                      bio_copy_page_list(READ, stripe, pl, bio);
 +
 +              bio_endio(bio, error);
 +
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
 +                         S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));
 +
 +              chunk_put(chunk);
 +              stripe_put(stripe);
 +              io_put(rs);     /* Wake any suspend waiters on last bio. */
 +      }
 +}
 +
 +/*
 + * End io all reads/writes on a stripe, copying
 + * read data across from stripe to bios and
 + * decrementing region counters for writes.
 + *
 + * Processing of ios depending on state:
 + * o no chunk error -> endio ok
 + * o degraded:
 + *   - chunk error and read -> ignore to be requeued
 + *   - chunk error and write -> endio ok
 + * o dead (more than parity_devs failed) and chunk error -> endio failed
 + */
 +static void stripe_endio(int rw, struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned p = rs->set.raid_devs;
 +      int write = (rw != READ);
 +
 +      while (p--) {
 +              struct stripe_chunk *chunk = CHUNK(stripe, p);
 +              struct bio_list *bl;
 +
 +              BUG_ON(ChunkLocked(chunk));
 +
 +              bl = BL_CHUNK(chunk, rw);
 +              if (bio_list_empty(bl))
 +                      continue;
 +
 +              if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
 +                      /* RAID set dead. */
 +                      if (unlikely(RSDead(rs)))
 +                              bio_list_endio(stripe, bl, p, -EIO);
 +                      /* RAID set degraded. */
 +                      else if (write)
 +                              bio_list_endio(stripe, bl, p, 0);
 +              } else {
 +                      BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
 +                      bio_list_endio(stripe, bl, p, 0);
 +              }
 +      }
 +}
 +
 +/* Fail all ios hanging off all bio lists of a stripe. */
 +static void stripe_fail_io(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned p = rs->set.raid_devs;
 +
 +      while (p--) {
 +              struct stripe_chunk *chunk = CHUNK(stripe, p);
 +              int i = ARRAY_SIZE(chunk->bl);
 +
 +              /* Fail all bios on all bio lists of the stripe. */
 +              while (i--) {
 +                      struct bio_list *bl = chunk->bl + i;
 +
 +                      if (!bio_list_empty(bl))
 +                              bio_list_endio(stripe, bl, p, -EIO);
 +              }
 +      }
 +
 +      /* Put stripe on LRU list. */
 +      BUG_ON(stripe_io_ref(stripe));
 +      BUG_ON(stripe_ref(stripe));
 +}
 +
 +/* Unlock all required chunks. */
 +static void stripe_chunks_unlock(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +      struct stripe_chunk *chunk;
 +
 +      while (p--) {
 +              chunk = CHUNK(stripe, p);
 +
 +              if (TestClearChunkUnlock(chunk))
 +                      ClearChunkLocked(chunk);
 +      }
 +}
 +
 +/*
 + * Queue reads and writes to a stripe by hanging
 + * their bios off the stripe's chunk read/write lists.
 + */
 +static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
 +                          struct bio_list *reject)
 +{
 +      struct raid_address addr;
 +      struct stripe *stripe;
 +
 +      stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
 +      if (stripe) {
 +              int r = 0, rw = bio_data_dir(bio);
 +
 +              /* Distinguish reads and writes. */
 +              bio_list_add(BL(stripe, addr.di, rw), bio);
 +
 +              if (rw == READ)
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_BIOS_ADDED_READ);
 +              else {
 +                      /* Increment pending write count on region. */
 +                      dm_rh_inc(rs->recover.rh, stripe->region);
 +                      r = 1;
 +
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
 +              }
 +
 +              /*
 +               * Put on io (flush) list in case of
 +               * initial bio queued to chunk.
 +               */
 +              if (chunk_get(CHUNK(stripe, addr.di)) == 1)
 +                      stripe_flush_add(stripe);
 +
 +              return r;
 +      }
 +
 +      /* Got no stripe from cache or failed to lock it -> reject bio. */
 +      bio_list_add(reject, bio);
 +      atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
 +      return 0;
 +}
 +
 +/*
 + * Handle all stripes by handing them to the daemon, because we can't
 + * map their chunk pages to copy the data in interrupt context.
 + *
 + * We don't want to handle them here either, while interrupts are disabled.
 + */
 +
 +/* Read/write endio function for dm-io (interrupt context). */
 +static void endio(unsigned long error, void *context)
 +{
 +      struct stripe_chunk *chunk = context;
 +
 +      if (unlikely(error)) {
 +              chunk_set(chunk, ERROR);
 +              /* REMOVEME: statistics. */
 +              atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
 +      } else
 +              chunk_set(chunk, CLEAN);
 +
 +      /*
 +       * For recovery stripes, I need to reset the locked flag
 +       * here, because those aren't processed in do_endios().
 +       */
 +      if (unlikely(StripeRecover(chunk->stripe)))
 +              ClearChunkLocked(chunk);
 +      else
 +              SetChunkUnlock(chunk);
 +
 +      /* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
 +      stripe_put_references(chunk->stripe);
 +}
 +
 +/* Read/Write a chunk asynchronously. */
 +static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
 +{
 +      struct stripe_cache *sc = stripe->sc;
 +      struct raid_set *rs = RS(sc);
 +      struct dm_mem_cache_object *obj = stripe->obj + p;
 +      struct page_list *pl = obj->pl;
 +      struct stripe_chunk *chunk = CHUNK(stripe, p);
 +      struct raid_dev *dev = rs->dev + p;
 +      struct dm_io_region io = {
 +              .bdev = dev->dev->bdev,
 +              .sector = stripe->key,
 +              .count = stripe->io.size,
 +      };
 +      struct dm_io_request control = {
 +              .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
 +              .mem = {
 +                      .type = DM_IO_PAGE_LIST,
 +                      .ptr.pl = pl,
 +                      .offset = 0,
 +              },
 +              .notify = {
 +                      .fn = endio,
 +                      .context = chunk,
 +              },
 +              .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
 +                                                sc->dm_io_client,
 +      };
 +
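 +      /*
 +       * Sanity: the chunk must not already be in flight and is either
 +       * uptodate and dirty (write) or neither (read).
 +       */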
 +      BUG_ON(ChunkLocked(chunk));
 +      BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
 +      BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
 +
 +      /*
 +       * Don't rw past end of device, which can happen, because
 +       * typically sectors_per_dev isn't divisible by io_size.
 +       */
 +      if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
 +              io.count = rs->set.sectors_per_dev - io.sector;
 +
 +      BUG_ON(!io.count);
 +      io.sector += dev->start;        /* Add <offset>. */
 +      if (RSRecover(rs))
 +              recover_io_count(stripe);       /* Recovery io accounting. */
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
 +                                                  S_DM_IO_READ));
 +      SetChunkLocked(chunk);
 +      SetDevIoQueued(dev);
 +      BUG_ON(dm_io(&control, 1, &io, NULL));
 +}
 +
 +/*
 + * Write dirty or read not uptodate page lists of a stripe.
 + */
 +static int stripe_chunks_rw(struct stripe *stripe)
 +{
 +      int r;
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      /*
 +       * Increment the pending count on the stripe
 +       * first, so that we don't race in endio().
 +       *
 +       * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
 +       *
 +       * o not uptodate
 +       * o dirtied by writes merged
 +       * o dirtied by parity calculations
 +       */
 +      r = for_each_io_dev(stripe, stripe_get_references);
 +      if (r) {
 +              /* Io needed: chunks are either not uptodate or dirty. */
 +              int max;        /* REMOVEME: */
 +              struct stripe_cache *sc = &rs->sc;
 +
 +              /* Submit actual io. */
 +              for_each_io_dev(stripe, stripe_chunk_rw);
 +
 +              /* REMOVEME: statistics */
 +              max = sc_active(sc);
 +              if (atomic_read(&sc->active_stripes_max) < max)
 +                      atomic_set(&sc->active_stripes_max, max);
 +
 +              atomic_inc(rs->stats + S_FLUSHS);
 +              /* END REMOVEME: statistics */
 +      }
 +
 +      return r;
 +}
 +
 +/* Merge in all writes hence dirtying respective chunks. */
 +static void stripe_merge_writes(struct stripe *stripe)
 +{
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--) {
 +              struct stripe_chunk *chunk = CHUNK(stripe, p);
 +              struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
 +
 +              if (!bio_list_empty(write)) {
 +                      struct bio *bio;
 +                      struct page_list *pl = stripe->obj[p].pl;
 +
 +                      /*
 +                       * We can play with the lists without holding a lock,
 +                       * because it is just us accessing them anyway.
 +                       */
 +                      bio_list_for_each(bio, write)
 +                              bio_copy_page_list(WRITE, stripe, pl, bio);
 +
 +                      bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
 +                      bio_list_init(write);
 +                      chunk_set(chunk, DIRTY);
 +              }
 +      }
 +}
 +
 +/* Queue all writes to get merged. */
 +static int stripe_queue_writes(struct stripe *stripe)
 +{
 +      int r = 0;
 +      unsigned p = RS(stripe->sc)->set.raid_devs;
 +
 +      while (p--) {
 +              struct stripe_chunk *chunk = CHUNK(stripe, p);
 +              struct bio_list *write = BL_CHUNK(chunk, WRITE);
 +
 +              if (!bio_list_empty(write)) {
 +                      bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
 +                      bio_list_init(write);
 +                      SetChunkIo(chunk);
 +                      r = 1;
 +              }
 +      }
 +
 +      return r;
 +}
 +
 +/* Check if a chunk gets completely overwritten. */
 +static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
 +{
 +      unsigned sectors = 0;
 +      struct bio *bio;
 +      struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
 +
 +      bio_list_for_each(bio, bl)
 +              sectors += bio_sectors(bio);
 +
 +      BUG_ON(sectors > RS(stripe->sc)->set.io_size);
 +      return sectors == RS(stripe->sc)->set.io_size;
 +}
 +
 +/*
 + * Avoid io on broken/reconstructed drive in order to
 + * reconstruct data on endio.
 + *
 + * (*1*) We set StripeReconstruct() in here, so that _do_endios()
 + *     will trigger a reconstruct call before resetting it.
 + */
 +static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
 +{
 +      struct stripe_chunk *chunk = CHUNK(stripe, pr);
 +
 +      /*
 +       * Allow io on all chunks but the indexed one: it is skipped because
 +       * its device is dead (degraded) or it will be reconstructed later.
 +       */
 +      /* Includes ClearChunkIo(), ClearChunkUptodate(). */
 +      stripe_chunk_invalidate(chunk);
 +      stripe->idx.recover = pr;
 +      SetStripeReconstruct(stripe);
 +
 +      /* REMOVEME: statistics. */
 +      atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
 +      return -EPERM;
 +}
 +
 +/* Chunk locked/uptodate and device failed tests. */
 +static struct stripe_chunk *
 +stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct stripe_chunk *chunk = CHUNK(stripe, p);
 +
 +      /* Can't access active chunks. */
 +      if (ChunkLocked(chunk)) {
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_CHUNK_LOCKED);
 +              return NULL;
 +      }
 +
 +      /* Can't access broken device. */
 +      if (ChunkError(chunk) || DevFailed(rs->dev + p))
 +              return NULL;
 +
 +      /* Can access uptodate chunks. */
 +      if (ChunkUptodate(chunk)) {
 +              (*chunks_uptodate)++;
 +              return NULL;
 +      }
 +
 +      return chunk;
 +}
 +
 +/*
 + * Degraded/reconstruction mode.
 + *
 + * Check stripe state to figure which chunks don't need IO.
 + *
 + * Returns 0 for fully operational, -EPERM for degraded/resynchronizing
 + * and -EBUSY while a previously requested reconstruction is still pending.
 + */
 +static int stripe_check_reconstruct(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      if (RSDead(rs)) {
 +              ClearStripeReconstruct(stripe);
 +              ClearStripeReconstructed(stripe);
 +              stripe_allow_io(stripe);
 +              return 0;
 +      }
 +
 +      /* Avoid further reconstruction setting, when already set. */
 +      if (StripeReconstruct(stripe)) {
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_RECONSTRUCT_SET);
 +              return -EBUSY;
 +      }
 +
 +      /* Initially allow io on all chunks. */
 +      stripe_allow_io(stripe);
 +
 +      /* Return if stripe is already reconstructed. */
 +      if (StripeReconstructed(stripe)) {
 +              atomic_inc(rs->stats + S_RECONSTRUCTED);
 +              return 0;
 +      }
 +
 +      /*
 +       * Degraded/reconstruction mode (device failed) ->
 +       * avoid io on the failed device.
 +       */
 +      if (unlikely(RSDegraded(rs))) {
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + S_DEGRADED);
 +              /* Allow IO on all devices but the dead one. */
 +              BUG_ON(rs->set.ei < 0);
 +              return stripe_chunk_set_io_flags(stripe, rs->set.ei);
 +      } else {
 +              int sync, pi = dev_for_parity(stripe, &sync);
 +
 +              /*
 +               * Reconstruction mode (ie. a particular (replaced) device or
 +               * some (rotating) parity chunk is being resynchronized) ->
 +               *   o make sure all needed chunks are read in
 +               *   o cope with 3/4 disk array special case where it
 +               *     doesn't make a difference to read in parity
 +               *     to xor data in/out
 +               */
 +              if (RSEnforceParityCreation(rs) || !sync) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_NOSYNC);
 +                      /* Allow IO on all devs but the one to reconstruct. */
 +                      return stripe_chunk_set_io_flags(stripe, pi);
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Check if a stripe is ready to merge writes,
 + * i.e. if all chunks are present to allow bios to be merged.
 + *
 + * We prohibit io on:
 + *
 + * o chunks without bios
 + * o chunks which get completely written over
 + */
 +static int stripe_merge_possible(struct stripe *stripe, int nosync)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned chunks_overwrite = 0, chunks_prohibited = 0,
 +               chunks_uptodate = 0, p = rs->set.raid_devs;
 +
 +      /* Walk all chunks. */
 +      while (p--) {
 +              struct stripe_chunk *chunk;
 +
 +              /* Prohibit io on broken devices. */
 +              if (DevFailed(rs->dev + p)) {
 +                      chunk = CHUNK(stripe, p);
 +                      goto prohibit_io;
 +              }
 +
 +              /* We can't optimize any further if no chunk. */
 +              chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
 +              if (!chunk || nosync)
 +                      continue;
 +
 +              /*
 +               * We have a chunk, which is not uptodate.
 +               *
 +               * If this is not parity and we don't have
 +               * reads queued, we can optimize further.
 +               */
 +              if (p != stripe->idx.parity &&
 +                  bio_list_empty(BL_CHUNK(chunk, READ)) &&
 +                  bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
 +                      if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
 +                              goto prohibit_io;
 +                      else if (RSCheckOverwrite(rs) &&
 +                               stripe_check_chunk_overwrite(stripe, p))
 +                              /* Completely overwritten chunk. */
 +                              chunks_overwrite++;
 +              }
 +
 +              /* Allow io for chunks with bios and overwritten ones. */
 +              SetChunkIo(chunk);
 +              continue;
 +
 +prohibit_io:
 +              /* No io for broken devices or for chunks w/o bios. */
 +              ClearChunkIo(chunk);
 +              chunks_prohibited++;
 +              /* REMOVEME: statistics. */
 +              atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
 +      }
 +
 +      /* All data chunks will get written over. */
 +      if (chunks_overwrite == rs->set.data_devs)
 +              atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
 +      else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
 +              /* We don't have enough chunks to merge. */
 +              atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
 +              return -EPERM;
 +      }
 +
 +      /*
 +       * If we have all chunks up to date or overwrite them, we
 +       * just zero the parity chunk and let stripe_rw() recreate it.
 +       */
 +      if (chunks_uptodate == rs->set.raid_devs ||
 +          chunks_overwrite == rs->set.data_devs) {
 +              stripe_zero_chunk(stripe, stripe->idx.parity);
 +              BUG_ON(StripeReconstruct(stripe));
 +              SetStripeReconstruct(stripe);   /* Enforce xor in caller. */
 +      } else {
 +              /*
 +               * With fewer chunks, we xor parity out.
 +               *
 +               * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
 +               *       so that only chunks with queued or merged writes
 +               *       are being xored.
 +               */
 +              parity_xor(stripe);
 +      }
 +
 +      /*
 +       * We do have enough chunks to merge.
 +       * All chunks are uptodate or get written over.
 +       */
 +      atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
 +      return 0;
 +}
 +
 +/*
 + * Avoid reading chunks in case we're fully operational.
 + *
 + * We prohibit io on any chunks without bios but the parity chunk.
 + */
 +static void stripe_avoid_reads(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      unsigned dummy = 0, p = rs->set.raid_devs;
 +
 +      /* Walk all chunks. */
 +      while (p--) {
 +              struct stripe_chunk *chunk =
 +                      stripe_chunk_check(stripe, p, &dummy);
 +
 +              if (!chunk)
 +                      continue;
 +
 +              /* If parity or any bios pending -> allow io. */
 +              if (chunk_ref(chunk) || p == stripe->idx.parity)
 +                      SetChunkIo(chunk);
 +              else {
 +                      ClearChunkIo(chunk);
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
 +              }
 +      }
 +}
 +
 +/*
 + * Read/write a stripe.
 + *
 + * All stripe read/write activity goes through this function
 + * unless recovery, which has to call stripe_chunk_rw() directly.
 + *
 + * Make sure we don't try already merged stripes in order
 + * to avoid data corruption.
 + *
 + * Check the state of the RAID set and if degraded (or
 + * resynchronizing for reads), read in all other chunks but
 + * the one on the dead/resynchronizing device in order to be
 + * able to reconstruct the missing one in _do_endios().
 + *
 + * Can be called on active stripes in order
 + * to dispatch new io on inactive chunks.
 + *
 + * States to cover:
 + *   o stripe to read and/or write
 + *   o stripe with error to reconstruct
 + */
 +static int stripe_rw(struct stripe *stripe)
 +{
 +      int nosync, r;
 +      struct raid_set *rs = RS(stripe->sc);
 +
 +      /*
 +       * Check whether a chunk needs to be reconstructed
 +       * because of a degraded set or a region that is out of sync.
 +       */
 +      nosync = stripe_check_reconstruct(stripe);
 +      switch (nosync) {
 +      case -EBUSY:
 +              return 0; /* Wait for stripe reconstruction to finish. */
 +      case -EPERM:
 +              goto io;
 +      }
 +
 +      /*
 +       * If we don't have merged writes pending, we can schedule
 +       * queued writes to be merged next without corrupting data.
 +       */
 +      if (!StripeMerged(stripe)) {
 +              r = stripe_queue_writes(stripe);
 +              if (r)
 +                      /* Writes got queued -> flag RBW. */
 +                      SetStripeRBW(stripe);
 +      }
 +
 +      /*
 +       * Merge all writes hanging off uptodate/overwritten
 +       * chunks of the stripe.
 +       */
 +      if (StripeRBW(stripe)) {
 +              r = stripe_merge_possible(stripe, nosync);
 +              if (!r) { /* Merge possible. */
 +                      struct stripe_chunk *chunk;
 +
 +                      /*
 +                       * I rely on valid parity in order
 +                       * to xor a fraction of chunks out
 +                       * of parity and back in.
 +                       */
 +                      stripe_merge_writes(stripe);    /* Merge writes in. */
 +                      parity_xor(stripe);             /* Update parity. */
 +                      ClearStripeReconstruct(stripe); /* Reset xor enforce. */
 +                      SetStripeMerged(stripe);        /* Writes merged. */
 +                      ClearStripeRBW(stripe);         /* Disable RBW. */
 +
 +                      /*
 +                       * REMOVEME: sanity check on parity chunk
 +                       *           states after writes got merged.
 +                       */
 +                      chunk = CHUNK(stripe, stripe->idx.parity);
 +                      BUG_ON(ChunkLocked(chunk));
 +                      BUG_ON(!ChunkUptodate(chunk));
 +                      BUG_ON(!ChunkDirty(chunk));
 +                      BUG_ON(!ChunkIo(chunk));
 +              }
 +      } else if (!nosync && !StripeMerged(stripe))
 +              /* Read avoidance if not degraded/resynchronizing/merged. */
 +              stripe_avoid_reads(stripe);
 +
 +io:
 +      /* Now submit any reads/writes for non-uptodate or dirty chunks. */
 +      r = stripe_chunks_rw(stripe);
 +      if (!r) {
 +              /*
 +               * No io submitted because of chunk io
 +               * prohibited or locked chunks/failed devices
 +               * -> push to end io list for processing.
 +               */
 +              stripe_endio_push(stripe);
 +              atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
 +      }
 +
 +      return r;
 +}
 +
 +/*
 + * Recovery functions
 + */
 +/* Read a stripe off a raid set for recovery. */
 +static int stripe_recover_read(struct stripe *stripe, int pi)
 +{
 +      BUG_ON(stripe_io_ref(stripe));
 +
 +      /* Invalidate all chunks so that they get read in. */
 +      stripe_chunks_invalidate(stripe);
 +      stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
 +
 +      /*
 +       * If we are reconstructing a particular device, we can avoid
 +       * reading the respective chunk in, because we're going to
 +       * reconstruct it anyway.
 +       *
 +       * We can't do that for resynchronization of rotating parity,
 +       * because the recovery stripe chunk size is typically larger
 +       * than the set's chunk size.
 +       */
 +      if (pi > -1)
 +              ClearChunkIo(CHUNK(stripe, pi));
 +
 +      return stripe_chunks_rw(stripe);
 +}
 +
 +/* Write a stripe to a raid set for recovery. */
 +static int stripe_recover_write(struct stripe *stripe, int pi)
 +{
 +      BUG_ON(stripe_io_ref(stripe));
 +
 +      /*
 +       * If this is a reconstruct of a particular device, then
 +       * reconstruct the respective chunk, else create parity chunk.
 +       */
 +      if (pi > -1) {
 +              stripe_zero_chunk(stripe, pi);
 +              common_xor(stripe, stripe->io.size, 0, pi);
 +              chunk_set(CHUNK(stripe, pi), DIRTY);
 +      } else
 +              parity_xor(stripe);
 +
 +      return stripe_chunks_rw(stripe);
 +}
 +
 +/* Read/write a recovery stripe. */
 +static int stripe_recover_rw(struct stripe *stripe)
 +{
 +      int r = 0, sync = 0;
 +
 +      /* Read/write flip-flop. */
 +      if (TestClearStripeRBW(stripe)) {
 +              SetStripeMerged(stripe);
 +              stripe->key = stripe->recover->pos;
 +              r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
 +              BUG_ON(!r);
 +      } else if (TestClearStripeMerged(stripe)) {
 +              r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
 +              BUG_ON(!r);
 +      }
 +
 +      BUG_ON(sync);
 +      return r;
 +}
 +
 +/* Is recovery bandwidth available? */
 +static int recover_bandwidth(struct raid_set *rs)
 +{
 +      int r, work;
 +
 +      /* On reset or when bios delayed -> allow recovery. */
 +      r = recover_io_reset(rs);
 +      if (r || RSBandwidth(rs))
 +              goto out;
 +
 +      work = atomic_read(rs->recover.io_count + IO_WORK);
 +      if (work) {
 +              /* Pay attention to larger recover stripe size. */
 +              int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
 +                                        rs->recover.io_size / rs->set.io_size;
 +
 +              /*
 +               * Don't use more than given bandwidth
 +               * of the work io for recovery.
 +               */
 +              if (recover > work / rs->recover.bandwidth_work) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_NO_BANDWIDTH);
 +                      return 0;
 +              }
 +      }
 +
 +out:
 +      atomic_inc(rs->stats + S_BANDWIDTH);    /* REMOVEME: statistics. */
 +      return 1;
 +}
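 +
 +/*
 + * Worked example for the throttle above (hypothetical numbers, merely to
 + * illustrate the check in recover_bandwidth(); none of these values come
 + * from this patch): recover_set_bandwidth() stores bandwidth_work =
 + * 100 / bandwidth, so bandwidth = 25% gives bandwidth_work = 4.  With
 + * work = 4000 application ios in flight and a recovery io size 4 times
 + * the set io size, 250 pending recovery ios scale to 250 * 4 = 1000,
 + * which is not greater than work / bandwidth_work = 4000 / 4 = 1000,
 + * so recovery may proceed; one more pending recovery io would make the
 + * check fail and recovery would be deferred.
 + */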
 +
 +/* Try to get a region to recover. */
 +static int stripe_recover_get_region(struct stripe *stripe)
 +{
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct recover *rec = &rs->recover;
 +      struct recover_addr *addr = stripe->recover;
 +      struct dm_dirty_log *dl = rec->dl;
 +      struct dm_rh_client *rh = rec->rh;
 +
 +      BUG_ON(!dl);
 +      BUG_ON(!rh);
 +
 +      /* Return that we already have a region, so it gets finished first during suspension. */
 +      if (addr->reg)
 +              return 1;
 +
 +      if (RSSuspend(rs))
 +              return -EPERM;
 +
 +      if (dl->type->get_sync_count(dl) >= rec->nr_regions)
 +              return -ENOENT;
 +
 +      /* If we don't have enough bandwidth, we don't proceed with recovery. */
 +      if (!recover_bandwidth(rs))
 +              return -EAGAIN;
 +
 +      /* Start quiescing a region. */
 +      dm_rh_recovery_prepare(rh);
 +      addr->reg = dm_rh_recovery_start(rh);
 +      if (!addr->reg)
 +              return -EAGAIN;
 +
 +      addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
 +      addr->end = addr->pos + dm_rh_get_region_size(rh);
 +
 +      /*
 +       * Take one global io reference out for the
 +       * whole region, which is going to be released
 +       * when the region is completely done with.
 +       */
 +      io_get(rs);
 +      return 0;
 +}
 +
 +/* Update region hash state. */
 +enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
 +static void recover_rh_update(struct stripe *stripe, enum recover_type success)
 +{
 +      struct recover_addr *addr = stripe->recover;
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct recover *rec = &rs->recover;
 +
 +      if (!addr->reg) {
 +              DMERR("%s- Called w/o region", __func__);
 +              return;
 +      }
 +
 +      dm_rh_recovery_end(addr->reg, success);
 +      if (success)
 +              rec->nr_regions_recovered++;
 +
 +      addr->reg = NULL;
 +
 +      /*
 +       * Completely done with this region ->
 +       * release the 1st io reference.
 +       */
 +      io_put(rs);
 +}
 +
 +/* Set start of recovery state. */
 +static void set_start_recovery(struct raid_set *rs)
 +{
 +      /* Initialize recovery. */
 +      rs->recover.start_jiffies = jiffies;
 +      rs->recover.end_jiffies = 0;
 +}
 +
 +/* Set end of recovery state. */
 +static void set_end_recovery(struct raid_set *rs)
 +{
 +      ClearRSRecover(rs);
 +/* Caution: do not reset this any more -> 'i' stays in the status output and userspace might rely on it disappearing! */
 +      rs->set.dev_to_init = -1;
 +
 +      /* Check for jiffies overrun. */
 +      rs->recover.end_jiffies = jiffies;
 +      if (rs->recover.end_jiffies < rs->recover.start_jiffies)
 +              rs->recover.end_jiffies = ~0;
 +}
 +
 +/* Handle recovery on one recovery stripe. */
 +static int _do_recovery(struct stripe *stripe)
 +{
 +      int r;
 +      struct raid_set *rs = RS(stripe->sc);
 +      struct recover_addr *addr = stripe->recover;
 +
 +      /* If recovery is active -> return. */
 +      if (stripe_io_ref(stripe))
 +              return 1;
 +
 +      /* IO error is fatal for recovery -> stop it. */
 +      if (unlikely(StripeError(stripe)))
 +              goto err;
 +
 +      /* Recovery end required. */
 +      if (unlikely(RSDegraded(rs)))
 +              goto err;
 +
 +      /* Get a region to recover. */
 +      r = stripe_recover_get_region(stripe);
 +      switch (r) {
 +      case 0: /* Got a new region: flag initial read before write. */
 +              SetStripeRBW(stripe);
 +      case 1: /* Have a region in the works. */
 +              break;
 +      case -EAGAIN:
 +              /* No bandwidth/quiesced region yet, try later. */
 +              if (!io_ref(rs))
 +                      wake_do_raid_delayed(rs, HZ / 4);
 +      case -EPERM:
 +              /* Suspend. */
 +              return 1;
 +      case -ENOENT:   /* No more regions to recover. */
 +              schedule_work(&rs->io.ws_do_table_event);
 +              return 0;
 +      default:
 +              BUG();
 +      }
 +
 +      /* Read/write a recover stripe. */
 +      r = stripe_recover_rw(stripe);
 +      if (r)
 +              /* IO initiated. */
 +              return 1;
 +
 +      /* Read and write finished -> update recovery position within region. */
 +      addr->pos += stripe->io.size;
 +
 +      /* If we're at end of region, update region hash. */
 +      if (addr->pos >= addr->end ||
 +          addr->pos >= rs->set.sectors_per_dev)
 +              recover_rh_update(stripe, REC_SUCCESS);
 +      else
 +              /* Prepare to read next region segment. */
 +              SetStripeRBW(stripe);
 +
 +      /* Schedule myself for another round... */
 +      wake_do_raid(rs);
 +      return 1;
 +
 +err:
 +      /* FIXME: rather try recovering other regions on error? */
 +      rs_check_degrade(stripe);
 +      recover_rh_update(stripe, REC_FAILURE);
 +
 +      /* Check state of partially recovered array. */
 +      if (RSDegraded(rs) && !RSDead(rs) &&
 +          rs->set.dev_to_init != -1 &&
 +          rs->set.ei != rs->set.dev_to_init) {
 +              /* Broken drive != drive to recover -> FATAL. */
 +              SetRSDead(rs);
 +              DMERR("FATAL: failed device != device to initialize -> "
 +                    "RAID set broken");
 +      }
 +
 +      if (StripeError(stripe) || RSDegraded(rs)) {
 +              char buf[BDEVNAME_SIZE];
 +
 +              DMERR("stopping recovery due to "
 +                    "ERROR on /dev/%s, stripe at offset %llu",
 +                    bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
 +                    (unsigned long long) stripe->key);
 +
 +      }
 +
 +      /* Make sure, that all quiesced regions get released. */
 +      while (addr->reg) {
 +              dm_rh_recovery_end(addr->reg, -EIO);
 +              addr->reg = dm_rh_recovery_start(rs->recover.rh);
 +      }
 +
 +      return 0;
 +}
 +
 +/* Called by main io daemon to recover regions. */
 +static int do_recovery(struct raid_set *rs)
 +{
 +      if (RSRecover(rs)) {
 +              int r = 0;
 +              struct stripe *stripe;
 +
 +              list_for_each_entry(stripe, &rs->recover.stripes,
 +                                  lists[LIST_RECOVER])
 +                      r += _do_recovery(stripe);
 +
 +              if (r)
 +                      return r;
 +
 +              set_end_recovery(rs);
 +              stripe_recover_free(rs);
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * END recovery functions
 + */
 +
 +/* End io process all stripes handed in by endio() callback. */
 +static void _do_endios(struct raid_set *rs, struct stripe *stripe,
 +                     struct list_head *flush_list)
 +{
 +      /* First unlock all required chunks. */
 +      stripe_chunks_unlock(stripe);
 +
 +      /*
 +       * If an io error on a stripe occurred, degrade the RAID set
 +       * and try to endio as many bios as possible. If any bios can't
 +       * be endio processed, requeue the stripe (stripe_ref() != 0).
 +       */
 +      if (TestClearStripeError(stripe)) {
 +              /*
 +               * FIXME: if read, rewrite the failed chunk after reconstruction
 +               *        in order to trigger disk bad sector relocation.
 +               */
 +              rs_check_degrade(stripe); /* Resets ChunkError(). */
 +              ClearStripeReconstruct(stripe);
 +              ClearStripeReconstructed(stripe);
 +
 +              /*
 +               * FIXME: if write, don't endio writes in flight and don't
 +               *        allow for new writes until userspace has updated
 +               *        its metadata.
 +               */
 +      }
 +
 +      /* Got to reconstruct a missing chunk. */
 +      if (StripeReconstruct(stripe)) {
 +              /*
 +               * (*2*) We use StripeReconstruct() to allow for
 +               *       all chunks to be xored into the reconstructed
 +               *       one (see chunk_must_xor()).
 +               */
 +              stripe_reconstruct(stripe);
 +
 +              /*
 +               * (*3*) Now we reset StripeReconstruct() and flag
 +               *       StripeReconstructed() to show to stripe_rw(),
 +               *       that we have reconstructed a missing chunk.
 +               */
 +              ClearStripeReconstruct(stripe);
 +              SetStripeReconstructed(stripe);
 +
 +              /* FIXME: reschedule to be written in case of read. */
 +              /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
 +                      chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
 +                      stripe_chunks_rw(stripe);
 +              } */
 +
 +              stripe->idx.recover = -1;
 +      }
 +
 +      /*
 +       * Now that we finally got a complete stripe, we
 +       * can process the rest of the end ios on reads.
 +       */
 +      stripe_endio(READ, stripe);
 +
 +      /* End io all merged writes if not prohibited. */
 +      if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
 +              ClearStripeMerged(stripe);
 +              stripe_endio(WRITE_MERGED, stripe);
 +      }
 +
 +      /* If RAID set is dead -> fail any ios to dead drives. */
 +      if (RSDead(rs)) {
 +              if (!TestSetRSDeadEndioMessage(rs))
 +                      DMERR("RAID set dead: failing ios to dead devices");
 +
 +              stripe_fail_io(stripe);
 +      }
 +
 +      /*
 +       * We have stripe references still,
 +       * because of read before writes or IO errors ->
 +       * got to put on flush list for processing.
 +       */
 +      if (stripe_ref(stripe)) {
 +              BUG_ON(!list_empty(stripe->lists + LIST_LRU));
 +              list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
 +              atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
 +      } else
 +              stripe_lru_add(stripe);
 +}
 +
 +/* Pop any endio stripes off of the endio list and belabour them. */
 +static void do_endios(struct raid_set *rs)
 +{
 +      struct stripe_cache *sc = &rs->sc;
 +      struct stripe *stripe;
 +      /* IO flush list for sorted requeued stripes. */
 +      struct list_head flush_list;
 +
 +      INIT_LIST_HEAD(&flush_list);
 +
 +      while ((stripe = stripe_endio_pop(sc))) {
 +              /* Avoid endio on stripes with newly io'ed chunks. */
 +              if (!stripe_io_ref(stripe))
 +                      _do_endios(rs, stripe, &flush_list);
 +      }
 +
 +      /*
 +       * Insert any requeued stripes in the proper
 +       * order at the beginning of the io (flush) list.
 +       */
 +      list_splice(&flush_list, sc->lists + LIST_FLUSH);
 +}
 +
 +/* Flush any stripes on the io list. */
 +static int do_flush(struct raid_set *rs)
 +{
 +      int r = 0;
 +      struct stripe *stripe;
 +
 +      while ((stripe = stripe_io_pop(&rs->sc)))
 +              r += stripe_rw(stripe); /* Read/write stripe. */
 +
 +      return r;
 +}
 +
 +/* Stripe cache resizing. */
 +static void do_sc_resize(struct raid_set *rs)
 +{
 +      unsigned set = atomic_read(&rs->sc.stripes_to_set);
 +
 +      if (set) {
 +              unsigned cur = atomic_read(&rs->sc.stripes);
 +              int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
 +                                    sc_shrink(&rs->sc, cur - set);
 +
 +              /* Flag end of resizing if ok. */
 +              if (!r)
 +                      atomic_set(&rs->sc.stripes_to_set, 0);
 +      }
 +}
 +
 +/*
 + * Process all ios
 + *
 + * We do different things with the io depending
 + * on the state of the region that it is in:
 + *
 + * o reads: hang off stripe cache or postpone if full
 + *
 + * o writes:
 + *
 + *  CLEAN/DIRTY/NOSYNC:       increment pending and hang io off stripe's stripe set.
 + *                    In case stripe cache is full or busy, postpone the io.
 + *
 + *  RECOVERING:               delay the io until recovery of the region completes.
 + *
 + */
 +static void do_ios(struct raid_set *rs, struct bio_list *ios)
 +{
 +      int r;
 +      unsigned flush = 0, delay = 0;
 +      sector_t sector;
 +      struct dm_rh_client *rh = rs->recover.rh;
 +      struct bio *bio;
 +      struct bio_list reject;
 +
 +      bio_list_init(&reject);
 +
 +      /*
 +       * Classify each io:
 +       *    o delay writes to recovering regions (let reads go through)
 +       *    o queue io to all other regions
 +       */
 +      while ((bio = bio_list_pop(ios))) {
 +              /*
 +               * In case we get a barrier bio, push it back onto
 +               * the input queue unless all work queues are empty
 +               * and the stripe cache is inactive.
 +               */
 +              if (bio->bi_rw & REQ_FLUSH) {
 +                      /* REMOVEME: statistics. */
 +                      atomic_inc(rs->stats + S_BARRIER);
 +                      if (delay ||
 +                          !list_empty(rs->sc.lists + LIST_FLUSH) ||
 +                          !bio_list_empty(&reject) ||
 +                          sc_active(&rs->sc)) {
 +                              bio_list_push(ios, bio);
 +                              break;
 +                      }
 +              }
 +
 +              /* If writes prohibited because of failures -> postpone. */
 +              if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
 +                      bio_list_add(&reject, bio);
 +                      continue;
 +              }
 +
 +              /* Check for recovering regions. */
 +              sector = _sector(rs, bio);
 +              r = region_state(rs, sector, DM_RH_RECOVERING);
 +              if (unlikely(r)) {
 +                      delay++;
 +                      /* Wait writing to recovering regions. */
 +                      dm_rh_delay_by_region(rh, bio,
 +                                            dm_rh_sector_to_region(rh,
 +                                                                   sector));
 +                      /* REMOVEME: statistics.*/
 +                      atomic_inc(rs->stats + S_DELAYED_BIOS);
 +                      atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
 +
 +                      /* Force bandwidth tests in recovery. */
 +                      SetRSBandwidth(rs);
 +              } else {
 +                      /*
 +                       * Process ios to non-recovering regions by queueing
 +                       * them to stripes (does dm_rh_inc() for writes).
 +                       */
 +                      flush += stripe_queue_bio(rs, bio, &reject);
 +              }
 +      }
 +
 +      if (flush) {
 +              /* FIXME: better error handling. */
 +              r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
 +              if (r)
 +                      DMERR_LIMIT("dirty log flush");
 +      }
 +
 +      /* Merge any rejected bios back to the head of the input list. */
 +      bio_list_merge_head(ios, &reject);
 +}
 +
 +/* Send an event in case we're getting too busy. */
 +static void do_busy_event(struct raid_set *rs)
 +{
 +      if (sc_busy(rs)) {
 +              if (!TestSetRSScBusy(rs))
 +                      schedule_work(&rs->io.ws_do_table_event);
 +      } else
 +              ClearRSScBusy(rs);
 +}
 +
 +/* Throw an event. */
 +static void do_table_event(struct work_struct *ws)
 +{
 +      struct raid_set *rs = container_of(ws, struct raid_set,
 +                                         io.ws_do_table_event);
 +      dm_table_event(rs->ti->table);
 +}
 +
 +
 +/*-----------------------------------------------------------------
 + * RAID daemon
 + *---------------------------------------------------------------*/
 +/*
 + * o belabour all end ios
 + * o update the region hash states
 + * o optionally shrink the stripe cache
 + * o optionally do recovery
 + * o unplug any component raid devices with queued bios
 + * o grab the input queue
 + * o work on all requeued or new ios and perform stripe cache flushes
 + * o unplug any component raid devices with queued bios
 + * o check whether the stripe cache is getting too busy and throw an event if so
 + */
 +static void do_raid(struct work_struct *ws)
 +{
 +      int r;
 +      struct raid_set *rs = container_of(ws, struct raid_set,
 +                                         io.dws_do_raid.work);
 +      struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
 +
 +      /*
 +       * We always need to end io, so that ios can get errored in
 +       * case the set failed and the region counters get decremented
 +       * before we update region hash states and go any further.
 +       */
 +      do_endios(rs);
 +      dm_rh_update_states(rs->recover.rh, 1);
 +
 +      /*
 +       * Now that we've end io'd, which may have put stripes on the LRU list
 +       * to allow for shrinking, we resize the stripe cache if requested.
 +       */
 +      do_sc_resize(rs);
 +
 +      /* Try to recover regions. */
 +      r = do_recovery(rs);
 +
 +      /* Quickly grab all new ios queued and add them to the work list. */
 +      mutex_lock(&rs->io.in_lock);
 +      bio_list_merge(ios, ios_in);
 +      bio_list_init(ios_in);
 +      mutex_unlock(&rs->io.in_lock);
 +
 +      if (!bio_list_empty(ios))
 +              do_ios(rs, ios); /* Got ios to work into the cache. */
 +
 +      r = do_flush(rs);               /* Flush any stripes on io list. */
 +
 +      do_busy_event(rs);      /* Check if we got too busy. */
 +}
 +
 +/*
 + * Callback for region hash to dispatch
 + * delayed bios queued to recovered regions
 + * (gets called via dm_rh_update_states()).
 + */
 +static void dispatch_delayed_bios(void *context, struct bio_list *bl)
 +{
 +      struct raid_set *rs = context;
 +      struct bio *bio;
 +
 +      /* REMOVEME: statistics; decrement pending delayed bios counter. */
 +      bio_list_for_each(bio, bl)
 +              atomic_dec(rs->stats + S_DELAYED_BIOS);
 +
 +      /* Merge region hash private list to work list. */
 +      bio_list_merge_head(&rs->io.work, bl);
 +      bio_list_init(bl);
 +      ClearRSBandwidth(rs);
 +}
 +
 +/*************************************************************
 + * Constructor helpers
 + *************************************************************/
 +/* Calculate MB/sec. */
 +static unsigned mbpers(struct raid_set *rs, unsigned io_size)
 +{
 +      return to_bytes((rs->xor.speed * rs->set.data_devs *
 +                       io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
 +}
 +
 +/*
 + * Discover fastest xor algorithm and # of chunks combination.
 + */
 +/* Calculate speed of particular algorithm and # of chunks. */
 +static unsigned xor_speed(struct stripe *stripe)
 +{
 +      int ticks = XOR_SPEED_TICKS;
 +      unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
 +      unsigned long j;
 +
 +      /* Set uptodate so that common_xor()->xor() will belabour chunks. */
 +      while (p--)
 +              SetChunkUptodate(CHUNK(stripe, p));
 +
 +      /* Wait for next tick. */
 +      for (j = jiffies; j == jiffies; );
 +
 +      /* Do xors for a few ticks. */
 +      while (ticks--) {
 +              unsigned xors = 0;
 +
 +              for (j = jiffies; j == jiffies; ) {
 +                      mb();
 +                      common_xor(stripe, stripe->io.size, 0, 0);
 +                      mb();
 +                      xors++;
 +                      mb();
 +              }
 +
 +              if (xors > r)
 +                      r = xors;
 +      }
 +
 +      return r;
 +}
 +
 +/* Define for xor multi recovery stripe optimization runs. */
 +#define DMRAID45_XOR_TEST
 +
 +/* Optimize xor algorithm for this RAID set. */
 +static unsigned xor_optimize(struct raid_set *rs)
 +{
 +      unsigned chunks_max = 2, speed_max = 0;
 +      struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
 +      struct stripe *stripe;
 +      unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
 +
 +      BUG_ON(list_empty(&rs->recover.stripes));
 +#ifndef DMRAID45_XOR_TEST
 +      stripe = list_first_entry(&rs->recover.stripes, struct stripe,
 +                                lists[LIST_RECOVER]);
 +#endif
 +
 +      /* Try all xor functions. */
 +      while (f-- > xor_funcs) {
 +              unsigned speed;
 +
 +#ifdef DMRAID45_XOR_TEST
 +              list_for_each_entry(stripe, &rs->recover.stripes,
 +                                  lists[LIST_RECOVER]) {
 +                      io_size = stripe->io.size;
 +#endif
 +
 +                      /* Set actual xor function for common_xor(). */
 +                      rs->xor.f = f;
 +                      rs->xor.chunks = (f->f == xor_blocks_wrapper ?
 +                                        (MAX_XOR_BLOCKS + 1) :
 +                                        XOR_CHUNKS_MAX);
 +                      if (rs->xor.chunks > rs->set.raid_devs)
 +                              rs->xor.chunks = rs->set.raid_devs;
 +
 +                      for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
 +                              speed = xor_speed(stripe);
 +
 +#ifdef DMRAID45_XOR_TEST
 +                              if (f->f == xor_blocks_wrapper) {
 +                                      if (speed > speed_xor_blocks)
 +                                              speed_xor_blocks = speed;
 +                              } else if (speed > speed_hm)
 +                                      speed_hm = speed;
 +
 +                              if (speed < speed_min)
 +                                      speed_min = speed;
 +#endif
 +
 +                              if (speed > speed_max) {
 +                                      speed_max = speed;
 +                                      chunks_max = rs->xor.chunks;
 +                                      f_max = f;
 +                              }
 +                      }
 +#ifdef DMRAID45_XOR_TEST
 +              }
 +#endif
 +      }
 +
 +      /* Memorize optimal parameters. */
 +      rs->xor.f = f_max;
 +      rs->xor.chunks = chunks_max;
 +#ifdef DMRAID45_XOR_TEST
 +      DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
 +             speed_max == speed_hm ? "HM" : "NB",
 +             rs->recover.recovery_stripes, io_size, speed_min,
 +             speed_xor_blocks, speed_hm, speed_max);
 +#endif
 +      return speed_max;
 +}
 +
 +/*
 + * Allocate a RAID context (a RAID set)
 + */
 +/* Structure for variable RAID parameters. */
 +struct variable_parms {
 +      int bandwidth;
 +      int bandwidth_parm;
 +      int chunk_size;
 +      int chunk_size_parm;
 +      int io_size;
 +      int io_size_parm;
 +      int stripes;
 +      int stripes_parm;
 +      int recover_io_size;
 +      int recover_io_size_parm;
 +      int raid_parms;
 +      int recovery;
 +      int recovery_stripes;
 +      int recovery_stripes_parm;
 +};
 +
 +static struct raid_set *
 +context_alloc(struct raid_type *raid_type, struct variable_parms *p,
 +            unsigned raid_devs, sector_t sectors_per_dev,
 +            struct dm_target *ti, unsigned dl_parms, char **argv)
 +{
 +      int r;
 +      size_t len;
 +      sector_t region_size, ti_len;
 +      struct raid_set *rs = NULL;
 +      struct dm_dirty_log *dl;
 +      struct recover *rec;
 +
 +      /*
 +       * Create the dirty log
 +       *
 +       * We need to change length for the dirty log constructor,
 +       * because we want the number of regions for all stripes derived
 +       * from the single device size, so that we can keep region
 +       * size = 2^^n independent of the number of devices
 +       */
 +      ti_len = ti->len;
 +      ti->len = sectors_per_dev;
 +      dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
 +      ti->len = ti_len;
 +      if (!dl)
 +              goto bad_dirty_log;
 +
 +      /* Chunk size *must* be smaller than region size. */
 +      region_size = dl->type->get_region_size(dl);
 +      if (p->chunk_size > region_size)
 +              goto bad_chunk_size;
 +
 +      /* Recover io size *must* be smaller than region size as well. */
 +      if (p->recover_io_size > region_size)
 +              goto bad_recover_io_size;
 +
 +      /* Size and allocate the RAID set structure. */
 +      len = sizeof(*rs->data) + sizeof(*rs->dev);
 +      if (dm_array_too_big(sizeof(*rs), len, raid_devs))
 +              goto bad_array;
 +
 +      len = sizeof(*rs) + raid_devs * len;
 +      rs = kzalloc(len, GFP_KERNEL);
 +      if (!rs)
 +              goto bad_alloc;
 +
 +      rec = &rs->recover;
 +      atomic_set(&rs->io.in_process, 0);
 +      atomic_set(&rs->io.in_process_max, 0);
 +      rec->io_size = p->recover_io_size;
 +
 +      /* Pointer to data array. */
 +      rs->data = (unsigned long **)
 +                 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
 +      rec->dl = dl;
 +      rs->set.raid_devs = raid_devs;
 +      rs->set.data_devs = raid_devs - raid_type->parity_devs;
 +      rs->set.raid_type = raid_type;
 +
 +      rs->set.raid_parms = p->raid_parms;
 +      rs->set.chunk_size_parm = p->chunk_size_parm;
 +      rs->set.io_size_parm = p->io_size_parm;
 +      rs->sc.stripes_parm = p->stripes_parm;
 +      rec->io_size_parm = p->recover_io_size_parm;
 +      rec->bandwidth_parm = p->bandwidth_parm;
 +      rec->recovery = p->recovery;
 +      rec->recovery_stripes = p->recovery_stripes;
 +
 +      /*
 +       * Set chunk and io size and respective shifts
 +       * (used to avoid divisions)
 +       */
 +      rs->set.chunk_size = p->chunk_size;
 +      rs->set.chunk_shift = ffs(p->chunk_size) - 1;
 +
 +      rs->set.io_size = p->io_size;
 +      rs->set.io_mask = p->io_size - 1;
 +      /* Mask to adjust address key in case io_size != chunk_size. */
 +      rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
 +
 +      rs->set.sectors_per_dev = sectors_per_dev;
 +
 +      rs->set.ei = -1;        /* Indicate no failed device. */
 +      atomic_set(&rs->set.failed_devs, 0);
 +
 +      rs->ti = ti;
 +
 +      atomic_set(rec->io_count + IO_WORK, 0);
 +      atomic_set(rec->io_count + IO_RECOVER, 0);
 +
 +      /* Initialize io lock and queues. */
 +      mutex_init(&rs->io.in_lock);
 +      mutex_init(&rs->io.xor_lock);
 +      bio_list_init(&rs->io.in);
 +      bio_list_init(&rs->io.work);
 +
 +      init_waitqueue_head(&rs->io.suspendq);  /* Suspend waiters (dm-io). */
 +
 +      rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
 +      rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
 +                      wake_dummy, wake_do_raid, 0, p->recovery_stripes,
 +                      dl, region_size, rec->nr_regions);
 +      if (IS_ERR(rec->rh))
 +              goto bad_rh;
 +
 +      /* Initialize stripe cache. */
 +      r = sc_init(rs, p->stripes);
 +      if (r)
 +              goto bad_sc;
 +
 +      /* REMOVEME: statistics. */
 +      stats_reset(rs);
 +      ClearRSDevelStats(rs);  /* Disable development status. */
 +      return rs;
 +
 +bad_dirty_log:
 +      TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
 +
 +bad_chunk_size:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
 +
 +bad_recover_io_size:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR_RET("Recover stripe io size larger than region size",
 +                      ERR_PTR(-EINVAL));
 +
 +bad_array:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
 +
 +bad_alloc:
 +      dm_dirty_log_destroy(dl);
 +      TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
 +
 +bad_rh:
 +      dm_dirty_log_destroy(dl);
 +      ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
 +      goto free_rs;
 +
 +bad_sc:
 +      dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
 +      sc_exit(&rs->sc);
 +      ti->error = DM_MSG_PREFIX "Error creating stripe cache";
 +free_rs:
 +      kfree(rs);
 +      return ERR_PTR(-ENOMEM);
 +}
 +
 +/* Free a RAID context (a RAID set). */
 +static void context_free(struct raid_set *rs, unsigned p)
 +{
 +      while (p--)
 +              dm_put_device(rs->ti, rs->dev[p].dev);
 +
 +      sc_exit(&rs->sc);
 +      dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
 +      kfree(rs);
 +}
 +
 +/* Create work queue and initialize delayed work. */
 +static int rs_workqueue_init(struct raid_set *rs)
 +{
 +      struct dm_target *ti = rs->ti;
 +
 +      rs->io.wq = create_singlethread_workqueue(DAEMON);
 +      if (!rs->io.wq)
 +              TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
 +
 +      INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
 +      INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
 +      return 0;
 +}
 +
 +/* Return pointer to raid_type structure for raid name. */
 +static struct raid_type *get_raid_type(char *name)
 +{
 +      struct raid_type *r = ARRAY_END(raid_types);
 +
 +      while (r-- > raid_types) {
 +              if (!strcmp(r->name, name))
 +                      return r;
 +      }
 +
 +      return NULL;
 +}
 +
 +/* FIXME: factor out to dm core. */
 +static int multiple(sector_t a, sector_t b, sector_t *n)
 +{
 +      sector_t r = a;
 +
 +      sector_div(r, b);
 +      *n = r;
 +      return a == r * b;
 +}
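 +
 +/*
 + * Usage sketch for multiple() (illustrative values only):
 + *
 + *   sector_t n;
 + *
 + *   multiple(1000, 8, &n);  n == 125, returns 1 (1000 == 125 * 8)
 + *   multiple(1001, 8, &n);  n == 125, returns 0 (remainder of 1)
 + */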
 +
 +/* Log RAID set information to kernel log. */
 +static void rs_log(struct raid_set *rs, unsigned io_size)
 +{
 +      unsigned p;
 +      char buf[BDEVNAME_SIZE];
 +
 +      for (p = 0; p < rs->set.raid_devs; p++)
 +              DMINFO("/dev/%s is raid disk %u%s",
 +                              bdevname(rs->dev[p].dev->bdev, buf), p,
 +                              (p == rs->set.pi) ? " (parity)" : "");
 +
 +      DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
 +             "algorithm \"%s\", %u chunks with %uMB/s\n"
 +             "%s set with net %u/%u devices",
 +             rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
 +             atomic_read(&rs->sc.stripes),
 +             rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
 +             rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
 +}
 +
 +/* Get all devices and offsets. */
 +static int dev_parms(struct raid_set *rs, char **argv, int *p)
 +{
 +      struct dm_target *ti = rs->ti;
 +
 +DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
 +      for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
 +              int r;
 +              unsigned long long tmp;
 +              struct raid_dev *dev = rs->dev + *p;
 +
 +              /* Get offset and device. */
 +              if (sscanf(argv[1], "%llu", &tmp) != 1 ||
 +                  tmp > rs->set.sectors_per_dev)
 +                      TI_ERR("Invalid RAID device offset parameter");
 +
 +              dev->start = tmp;
 +              r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
 +                                &dev->dev);
 +              if (r)
 +                      TI_ERR_RET("RAID device lookup failure", r);
 +
 +              r = raid_dev_lookup(rs, dev);
 +              if (r != -ENODEV && r < *p) {
 +                      (*p)++; /* Ensure dm_put_device() on actual device. */
 +                      TI_ERR_RET("Duplicate RAID device", -ENXIO);
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/* Set recovery bandwidth. */
 +static void
 +recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
 +{
 +      rs->recover.bandwidth = bandwidth;
 +      rs->recover.bandwidth_work = 100 / bandwidth;
 +}
 +
 +/* Handle variable number of RAID parameters. */
 +static int get_raid_variable_parms(struct dm_target *ti, char **argv,
 +                                 struct variable_parms *vp)
 +{
 +      int p, value;
 +      struct {
 +              int action; /* -1: skip, 0: no power2 check, 1: power2 check */
 +              char *errmsg;
 +              int min, max;
 +              int *var, *var2, *var3;
 +      } argctr[] = {
 +              { 1,
 +                "Invalid chunk size; must be -1 or 2^^n and <= 16384",
 +                IO_SIZE_MIN, CHUNK_SIZE_MAX,
 +                &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
 +              { 0,
 +                "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
 +                STRIPES_MIN, STRIPES_MAX,
 +                &vp->stripes_parm, &vp->stripes, NULL },
 +              { 1,
 +                "Invalid io size; must be -1 or >= 8, 2^^n and less equal "
 +                "min(BIO_MAX_SECTORS/2, chunk size)",
 +                IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
 +                &vp->io_size_parm, &vp->io_size, NULL },
 +              { 1,
 +                "Invalid recovery io size; must be -1 or "
 +                "2^^n and less equal BIO_MAX_SECTORS/2",
 +                RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
 +                &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
 +              { 0,
 +                "Invalid recovery bandwidth percentage; "
 +                "must be -1 or > 0 and <= 100",
 +                BANDWIDTH_MIN, BANDWIDTH_MAX,
 +                &vp->bandwidth_parm, &vp->bandwidth, NULL },
 +              /* Handle sync argument separately in loop. */
 +              { -1,
 +                "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
 +              { 0,
 +                "Invalid number of recovery stripes; "
 +                "must be -1, > 0 and <= 64",
 +                RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
 +                &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
 +      }, *varp;
 +
 +      /* Fetch # of variable raid parameters. */
 +      if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
 +          !range_ok(vp->raid_parms, 0, 7))
 +              TI_ERR("Bad variable raid parameters number");
 +
 +      /* Preset variable RAID parameters. */
 +      vp->chunk_size = CHUNK_SIZE_DEFAULT;
 +      vp->io_size = IO_SIZE_DEFAULT;
 +      vp->stripes = STRIPES_DEFAULT;
 +      vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
 +      vp->bandwidth = BANDWIDTH_DEFAULT;
 +      vp->recovery = 1;
 +      vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
 +
 +      /* Walk the array of argument constraints for all given ones. */
 +      for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
 +              BUG_ON(varp >= ARRAY_END(argctr));
 +
 +              /* Special case for "[no]sync" string argument. */
 +              if (varp->action < 0) {
 +                      if (!strcmp(*argv, "sync"))
 +                              ;
 +                      else if (!strcmp(*argv, "nosync"))
 +                              vp->recovery = 0;
 +                      else
 +                              TI_ERR(varp->errmsg);
 +
 +                      argv++;
 +                      continue;
 +              }
 +
 +              /*
 +               * Special case for io_size depending
 +               * on previously set chunk size.
 +               */
 +              if (p == 2)
 +                      varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
 +
 +              if (sscanf(*(argv++), "%d", &value) != 1 ||
 +                  (value != -1 &&
 +                   ((varp->action && !is_power_of_2(value)) ||
 +                    !range_ok(value, varp->min, varp->max))))
 +                      TI_ERR(varp->errmsg);
 +
 +              *varp->var = value;
 +              if (value != -1) {
 +                      if (varp->var2)
 +                              *varp->var2 = value;
 +                      if (varp->var3)
 +                              *varp->var3 = value;
 +              }
 +      }
 +
 +      return 0;
 +}
 +
 +/* Parse optional locking parameters. */
 +static int get_raid_locking_parms(struct dm_target *ti, char **argv,
 +                                int *locking_parms,
 +                                struct dm_raid45_locking_type **locking_type)
 +{
 +      if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
 +              char *lckstr = argv[1];
 +              size_t lcksz = strlen(lckstr);
 +
 +              if (!strnicmp(lckstr, "none", lcksz)) {
 +                      *locking_type = &locking_none;
 +                      *locking_parms = 2;
 +              } else if (!strnicmp(lckstr, "cluster", lcksz)) {
 +                      DMERR("locking type \"%s\" not yet implemented",
 +                            lckstr);
 +                      return -EINVAL;
 +              } else {
 +                      DMERR("unknown locking type \"%s\"", lckstr);
 +                      return -EINVAL;
 +              }
 +      }
 +
 +      *locking_parms = 0;
 +      *locking_type = &locking_none;
 +      return 0;
 +}
 +
 +/* Set backing device read ahead properties of RAID set. */
 +static void rs_set_read_ahead(struct raid_set *rs,
 +                            unsigned sectors, unsigned stripes)
 +{
 +      unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
 +      struct mapped_device *md = dm_table_get_md(rs->ti->table);
 +      struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
 +
 +      /* Set read-ahead for the RAID set and the component devices. */
 +      if (ra_pages) {
 +              unsigned p = rs->set.raid_devs;
 +
 +              bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
 +
 +              while (p--) {
 +                      struct request_queue *q =
 +                              bdev_get_queue(rs->dev[p].dev->bdev);
 +
 +                      q->backing_dev_info.ra_pages = ra_pages;
 +              }
 +      }
 +}
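 +
 +/*
 + * Worked example (assumed values, only to illustrate the arithmetic above):
 + * raid_ctr() calls this with sectors = 2 * chunk_size and stripes = 2.
 + * Assuming 4 KiB pages (SECTORS_PER_PAGE = 8) and a hypothetical chunk
 + * size of 64 sectors, ra_pages = dm_div_up(128, 8) = 16 pages for each
 + * component device, and the RAID set's bdi gets 2 * 16 * data_devs
 + * pages, e.g. 128 pages (512 KiB) for a five drive RAID5 set.
 + */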
 +
 +/* Set congested function. */
 +static void rs_set_congested_fn(struct raid_set *rs)
 +{
 +      struct mapped_device *md = dm_table_get_md(rs->ti->table);
 +      struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
 +
 +      /* Set congested function and data. */
 +      bdi->congested_fn = rs_congested;
 +      bdi->congested_data = rs;
 +}
 +
 +/*
 + * Construct a RAID4/5 mapping:
 + *
 + * log_type #log_params <log_params> \
 + * raid_type [#parity_dev] #raid_variable_params <raid_params> \
 + * [locking "none"/"cluster"]
 + * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
 + *
 + * log_type = "core"/"disk",
 + * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
 + * log_params = [dirty_log_path] region_size [[no]sync])
 + *
 + * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
 + *
 + * #parity_dev = N if raid_type = "raid4"
 + * o N = -1: pick default = last device
 + * o N >= 0 and < #raid_devs: parity device index
 + *
 + * #raid_variable_params = 0-7; raid_params (-1 = default):
 + *   [chunk_size [#stripes [io_size [recover_io_size \
 + *    [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
 + *   o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
 + *     and <= CHUNK_SIZE_MAX)
 + *   o #stripes is number of stripes allocated to stripe cache
 + *     (must be > 1 and < STRIPES_MAX)
 + *   o io_size (io unit size per device in sectors; must be 2^^n and > 8)
 + *   o recover_io_size (io unit size per device for recovery in sectors;
 + *     must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
 + *   o %recovery_bandwidth is the maximum share of io bandwidth spent on recovery during
 + *     application io (1-100%)
 + *   o recovery switch = [sync|nosync]
 + *   o #recovery_stripes is the number of recovery stripes used for
 + *     parallel recovery of the RAID set
 + * If raid_variable_params = 0, defaults will be used.
 + * Any raid_variable_param can be set to -1 to apply a default
 + *
 + * #raid_devs = N (N >= 3)
 + *
 + * #dev_to_initialize = N
 + * -1: initialize parity on all devices
 + * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
 + * of a failed device's content after replacement
 + *
 + * <dev_path> = device_path (eg, /dev/sdd1)
 + * <offset>   = begin at offset on <dev_path>
 + *
 + */
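 +/*
 + * Illustrative table line (hypothetical device names and sizes, assuming
 + * the target registers under the name "raid45"; not taken from this patch):
 + *
 + *   0 41943040 raid45 core 2 8192 nosync raid5_ls 0 5 -1 \
 + *     /dev/sda1 0 /dev/sdb1 0 /dev/sdc1 0 /dev/sdd1 0 /dev/sde1 0
 + *
 + * i.e. a core dirty log with a region size of 8192 sectors and no initial
 + * resynchronization, the raid5_ls layout, all variable RAID parameters
 + * left at their defaults (0), five devices, parity initialized conforming
 + * to the algorithm (-1), and each component device used from offset 0.
 + */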
 +#define       MIN_PARMS       13
 +static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +      int dev_to_init, dl_parms, i, locking_parms,
 +          parity_parm, pi = -1, r, raid_devs;
 +      sector_t tmp, sectors_per_dev;
 +      struct dm_raid45_locking_type *locking;
 +      struct raid_set *rs;
 +      struct raid_type *raid_type;
 +      struct variable_parms parms;
 +
 +      /* Ensure minimum number of parameters. */
 +      if (argc < MIN_PARMS)
 +              TI_ERR("Not enough parameters");
 +
 +      /* Fetch # of dirty log parameters. */
 +      if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
 +          !range_ok(dl_parms, 1, 4711)) /* ;-) */
 +              TI_ERR("Bad dirty log parameters number");
 +
 +      /* Check raid_type. */
 +      raid_type = get_raid_type(argv[dl_parms + 2]);
 +      if (!raid_type)
 +              TI_ERR("Bad raid type");
 +
 +      /* In case of RAID4, parity drive is selectable. */
 +      parity_parm = !!(raid_type->level == raid4);
 +
 +      /* Handle variable number of RAID parameters. */
 +      r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
 +                                  &parms);
 +      if (r)
 +              return r;
 +
 +      /* Handle any locking parameters. */
 +      r = get_raid_locking_parms(ti,
 +                                 argv + dl_parms + parity_parm +
 +                                 parms.raid_parms + 4,
 +                                 &locking_parms, &locking);
 +      if (r)
 +              return r;
 +
 +      /* # of raid devices. */
 +      i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
 +      if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
 +          raid_devs < raid_type->minimal_devs)
 +              TI_ERR("Invalid number of raid devices");
 +
 +      /* In case of RAID4, check parity drive index is in limits. */
 +      if (raid_type->level == raid4) {
 +              /* Fetch index of parity device. */
 +              if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
 +                  (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
 +                      TI_ERR("Invalid RAID4 parity device index");
 +      }
 +
 +      /*
 +       * Index of device to initialize starts at 0
 +       *
 +       * o -1 -> don't initialize a selected device;
 +       *         initialize parity conforming to algorithm
 +       * o 0..raid_devs-1 -> initialize respective device
 +       *   (used for reconstruction of a replaced device)
 +       */
 +      if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
 +                 locking_parms + 5], "%d", &dev_to_init) != 1 ||
 +          !range_ok(dev_to_init, -1, raid_devs - 1))
 +              TI_ERR("Invalid number for raid device to initialize");
 +
 +      /* Check # of raid device arguments. */
 +      if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
 +          2 * raid_devs)
 +              TI_ERR("Wrong number of raid device/offset arguments");
 +
 +      /*
 +       * Check that the table length is divisible
 +       * without remainder by (raid_devs - parity_devs)
 +       */
 +      if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
 +                    &sectors_per_dev))
 +              TI_ERR("Target length not divisible by number of data devices");
 +
 +      /*
 +       * Check that the device size is
 +       * divisible without remainder by the chunk size
 +       */
 +      if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
 +              TI_ERR("Device length not divisible by chunk_size");
 +
 +      /****************************************************************
 +       * Now that we checked the constructor arguments ->
 +       * let's allocate the RAID set
 +       ****************************************************************/
 +      rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
 +                         ti, dl_parms, argv);
 +      if (IS_ERR(rs))
 +              return PTR_ERR(rs);
 +
 +
 +      rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
 +      rs->set.pi = rs->set.pi_parm = pi;
 +
 +      /* Set RAID4 parity drive index. */
 +      if (raid_type->level == raid4)
 +              rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
 +
 +      recover_set_bandwidth(rs, parms.bandwidth);
 +
 +      /* Use locking type to lock stripe access. */
 +      rs->locking = locking;
 +
 +      /* Get the device/offset tupels. */
 +      argv += dl_parms + 6 + parity_parm + parms.raid_parms;
 +      r = dev_parms(rs, argv, &i);
 +      if (r)
 +              goto err;
 +
 +      /* Set backing device information (eg. read ahead). */
 +      rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
 +                            2 /* # of stripes */);
 +      rs_set_congested_fn(rs); /* Set congested function. */
 +      SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
 +      rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
 +
 +      /* Set for recovery of any nosync regions. */
 +      if (parms.recovery)
 +              SetRSRecover(rs);
 +      else {
 +              /*
 +               * Need to free recovery stripe(s) here in case
 +               * of nosync, because xor_optimize uses one.
 +               */
 +              set_start_recovery(rs);
 +              set_end_recovery(rs);
 +              stripe_recover_free(rs);
 +      }
 +
 +      /*
 +       * Enforce parity chunk creation for small numbers of
 +       * array members, where xoring parity out and back in
 +       * doesn't gain us the performance it does with larger
 +       * numbers of array members.
 +       */
 +      if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
 +              SetRSEnforceParityCreation(rs);
 +
 +      /*
 +       * Make sure that dm core only hands maximum io size
 +       * length down and pays attention to io boundaries.
 +       */
 +      ti->split_io = rs->set.io_size;
 +      ti->private = rs;
 +
 +      /* Initialize work queue to handle this RAID set's io. */
 +      r = rs_workqueue_init(rs);
 +      if (r)
 +              goto err;
 +
 +      rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
 +      return 0;
 +
 +err:
 +      context_free(rs, i);
 +      return r;
 +}
 +
 +/*
 + * Destruct a raid mapping
 + */
 +static void raid_dtr(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +
 +      destroy_workqueue(rs->io.wq);
 +      context_free(rs, rs->set.raid_devs);
 +}
 +
 +/* Raid mapping function. */
 +static int raid_map(struct dm_target *ti, struct bio *bio,
 +                  union map_info *map_context)
 +{
 +      /* I don't want to waste stripe cache capacity. */
 +      if (bio_rw(bio) == READA)
 +              return -EIO;
 +      else {
 +              struct raid_set *rs = ti->private;
 +
 +              /*
 +               * Get io reference to be waiting for to drop
 +               * to zero on device suspension/destruction.
 +               */
 +              io_get(rs);
 +              bio->bi_sector -= ti->begin;    /* Remap sector. */
 +
 +              /* Queue io to RAID set. */
 +              mutex_lock(&rs->io.in_lock);
 +              bio_list_add(&rs->io.in, bio);
 +              mutex_unlock(&rs->io.in_lock);
 +
 +              /* Wake daemon to process input list. */
 +              wake_do_raid(rs);
 +
 +              /* REMOVEME: statistics. */
 +              atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
 +                                      S_BIOS_READ : S_BIOS_WRITE));
 +              return DM_MAPIO_SUBMITTED;      /* Handle later. */
 +      }
 +}
 +
 +/* Device suspend. */
 +static void raid_presuspend(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +      struct dm_dirty_log *dl = rs->recover.dl;
 +
 +      SetRSSuspend(rs);
 +
 +      if (RSRecover(rs))
 +              dm_rh_stop_recovery(rs->recover.rh);
 +
 +      cancel_delayed_work(&rs->io.dws_do_raid);
 +      flush_workqueue(rs->io.wq);
 +      wait_ios(rs);   /* Wait for completion of all ios being processed. */
 +
 +      if (dl->type->presuspend && dl->type->presuspend(dl))
 +              /* FIXME: need better error handling. */
 +              DMWARN("log presuspend failed");
 +}
 +
 +static void raid_postsuspend(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +      struct dm_dirty_log *dl = rs->recover.dl;
 +
 +      if (dl->type->postsuspend && dl->type->postsuspend(dl))
 +              /* FIXME: need better error handling. */
 +              DMWARN("log postsuspend failed");
 +}
 +
 +/* Device resume. */
 +static void raid_resume(struct dm_target *ti)
 +{
 +      struct raid_set *rs = ti->private;
 +      struct recover *rec = &rs->recover;
 +      struct dm_dirty_log *dl = rec->dl;
 +
 +      DMINFO("%s...", __func__);
 +
 +      /* Resume dirty log. */
 +      if (dl->type->resume && dl->type->resume(dl))
 +              /* FIXME: need better error handling. */
 +              DMWARN("log resume failed");
 +
 +      rec->nr_regions_to_recover =
 +              rec->nr_regions - dl->type->get_sync_count(dl);
 +
 +      /* Restart any unfinished recovery. */
 +      if (RSRecover(rs)) {
 +              set_start_recovery(rs);
 +              dm_rh_start_recovery(rec->rh);
 +      }
 +
 +      ClearRSSuspend(rs);
 +}
 +
 +/* Return stripe cache size. */
 +static unsigned sc_size(struct raid_set *rs)
 +{
 +      return to_sector(atomic_read(&rs->sc.stripes) *
 +                       (sizeof(struct stripe) +
 +                        (sizeof(struct stripe_chunk) +
 +                         (sizeof(struct page_list) +
 +                          to_bytes(rs->set.io_size) *
 +                          rs->set.raid_devs)) +
 +                        (rs->recover.end_jiffies ?
 +                         0 : rs->recover.recovery_stripes *
 +                         to_bytes(rs->set.raid_devs * rs->recover.io_size))));
 +}
 +
 +/* REMOVEME: status output for development. */
 +static void raid_devel_stats(struct dm_target *ti, char *result,
 +                           unsigned *size, unsigned maxlen)
 +{
 +      unsigned sz = *size;
 +      unsigned long j;
 +      char buf[BDEVNAME_SIZE], *p;
 +      struct stats_map *sm;
 +      struct raid_set *rs = ti->private;
 +      struct recover *rec = &rs->recover;
 +      struct timespec ts;
 +
 +      DMEMIT("%s %s=%u bw=%u\n",
 +             version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
 +      DMEMIT("act_ios=%d ", io_ref(rs));
 +      DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
 +      DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
 +      DMEMIT("act_stripes_max=%d\n",
 +             atomic_read(&rs->sc.active_stripes_max));
 +
 +      for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
 +              DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
 +
 +      DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
 +      DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
 +             atomic_read(&rs->sc.stripes), rs->set.io_size,
 +             rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
 +             sc_size(rs));
 +
 +      j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
 +          rec->start_jiffies;
 +      jiffies_to_timespec(j, &ts);
 +      sprintf(buf, "%ld.%09ld", ts.tv_sec, ts.tv_nsec); /* zero-pad ns */
 +      p = strchr(buf, '.');
 +      p[3] = 0;
 +
 +      DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
 +             (unsigned long long) rec->nr_regions_recovered,
 +             (unsigned long long) rec->nr_regions_to_recover,
 +             (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
 +
 +      *size = sz;
 +}
 +
 +static int raid_status(struct dm_target *ti, status_type_t type,
 +                     char *result, unsigned maxlen)
 +{
 +      unsigned p, sz = 0;
 +      char buf[BDEVNAME_SIZE];
 +      struct raid_set *rs = ti->private;
 +      struct dm_dirty_log *dl = rs->recover.dl;
 +      int raid_parms[] = {
 +              rs->set.chunk_size_parm,
 +              rs->sc.stripes_parm,
 +              rs->set.io_size_parm,
 +              rs->recover.io_size_parm,
 +              rs->recover.bandwidth_parm,
 +              -2,     /* Placeholder: reported as 'sync'/'nosync' below. */
 +              rs->recover.recovery_stripes,
 +      };
 +
 +      switch (type) {
 +      case STATUSTYPE_INFO:
 +              /* REMOVEME: statistics. */
 +              if (RSDevelStats(rs))
 +                      raid_devel_stats(ti, result, &sz, maxlen);
 +
 +              DMEMIT("%u ", rs->set.raid_devs);
 +
 +              for (p = 0; p < rs->set.raid_devs; p++)
 +                      DMEMIT("%s ",
 +                             format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
 +
 +              DMEMIT("2 ");
 +              for (p = 0; p < rs->set.raid_devs; p++) {
 +                      DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
 +
 +                      if (p == rs->set.pi)
 +                              DMEMIT("p");
 +
 +                      if (p == rs->set.dev_to_init)
 +                              DMEMIT("i");
 +              }
 +
 +              DMEMIT(" %llu/%llu ",
 +                    (unsigned long long) dl->type->get_sync_count(dl),
 +                    (unsigned long long) rs->recover.nr_regions);
 +
 +              sz += dl->type->status(dl, type, result+sz, maxlen-sz);
 +              break;
 +      case STATUSTYPE_TABLE:
 +              sz = rs->recover.dl->type->status(rs->recover.dl, type,
 +                                                result, maxlen);
 +              DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
 +
 +              for (p = 0; p < rs->set.raid_parms; p++) {
 +                      if (raid_parms[p] > -2)
 +                              DMEMIT("%d ", raid_parms[p]);
 +                      else
 +                              DMEMIT("%s ", rs->recover.recovery ?
 +                                            "sync" : "nosync");
 +              }
 +
 +              DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
 +
 +              for (p = 0; p < rs->set.raid_devs; p++)
 +                      DMEMIT("%s %llu ",
 +                             format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
 +                             (unsigned long long) rs->dev[p].start);
 +      }
 +
 +      return 0;
 +}
 +
 +/*
 + * Message interface
 + */
 +/* Turn a delta into an absolute value. */
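 +/* E.g. with a current value of 50: 'set 10' -> 10, 'grow 10' -> 60, 'shrink 10' -> 40. */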
 +static int _absolute(char *action, int act, int r)
 +{
 +      size_t len = strlen(action);
 +
 +      if (len < 2)
 +              len = 2;
 +
 +      /* Make delta absolute. */
 +      if (!strncmp("set", action, len))
 +              ;
 +      else if (!strncmp("grow", action, len))
 +              r += act;
 +      else if (!strncmp("shrink", action, len))
 +              r = act - r;
 +      else
 +              r = -EINVAL;
 +
 +      return r;
 +}
 +
 +/* Change recovery io bandwidth. */
 +static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
 +                          enum raid_set_flags flag)
 +{
 +      int act = rs->recover.bandwidth, bandwidth;
 +
 +      if (argc != 2)
 +              return -EINVAL;
 +
 +      if (sscanf(argv[1], "%d", &bandwidth) == 1 &&
 +          range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
 +              /* Make delta bandwidth absolute. */
 +              bandwidth = _absolute(argv[0], act, bandwidth);
 +
 +              /* Check range. */
 +              if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
 +                      recover_set_bandwidth(rs, bandwidth);
 +                      return 0;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/* Set/reset development feature flags. */
 +static int devel_flags(struct raid_set *rs, int argc, char **argv,
 +                     enum raid_set_flags flag)
 +{
 +      size_t len;
 +
 +      if (argc != 1)
 +              return -EINVAL;
 +
 +      len = strlen(argv[0]);
 +      if (len < 2)
 +              len = 2;
 +
 +      if (!strncmp(argv[0], "on", len))
 +              return test_and_set_bit(flag, &rs->io.flags) ? -EPERM : 0;
 +      else if (!strncmp(argv[0], "off", len))
 +              return test_and_clear_bit(flag, &rs->io.flags) ? 0 : -EPERM;
 +      else if (!strncmp(argv[0], "reset", len)) {
 +              if (flag == RS_DEVEL_STATS) {
 +                      if (test_bit(flag, &rs->io.flags)) {
 +                              stats_reset(rs);
 +                              return 0;
 +                      } else
 +                              return -EPERM;
 +              } else {
 +                      set_bit(flag, &rs->io.flags);
 +                      return 0;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/* Resize the stripe cache. */
 +static int sc_resize(struct raid_set *rs, int argc, char **argv,
 +                   enum raid_set_flags flag)
 +{
 +      int act, stripes;
 +
 +      if (argc != 2)
 +              return -EINVAL;
 +
 +      /* Deny permission in case the daemon is still resizing. */
 +      if (atomic_read(&rs->sc.stripes_to_set))
 +              return -EPERM;
 +
 +      if (sscanf(argv[1], "%d", &stripes) == 1 &&
 +          stripes > 0) {
 +              act = atomic_read(&rs->sc.stripes);
 +
 +              /* Make delta stripes absolute. */
 +              stripes = _absolute(argv[0], act, stripes);
 +
 +              /*
 +               * Check range and that the # of stripes changes.
 +               * We leave the resizing to the worker.
 +               */
 +              if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
 +                  stripes != atomic_read(&rs->sc.stripes)) {
 +                      atomic_set(&rs->sc.stripes_to_set, stripes);
 +                      wake_do_raid(rs);
 +                      return 0;
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/* Change xor algorithm and number of chunks. */
 +static int xor_set(struct raid_set *rs, int argc, char **argv,
 +                 enum raid_set_flags flag)
 +{
 +      if (argc == 2) {
 +              int chunks;
 +              char *algorithm = argv[0];
 +              struct xor_func *f = ARRAY_END(xor_funcs);
 +
 +              if (sscanf(argv[1], "%d", &chunks) == 1 &&
 +                  range_ok(chunks, 2, XOR_CHUNKS_MAX) &&
 +                  chunks <= rs->set.raid_devs) {
 +                      while (f-- > xor_funcs) {
 +                              if (!strcmp(algorithm, f->name)) {
 +                                      unsigned io_size = 0;
 +                                      struct stripe *stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client, SC_GROW);
 +
 +                                      DMINFO("xor: %s", f->name);
 +                                      if (f->f == xor_blocks_wrapper &&
 +                                          chunks > MAX_XOR_BLOCKS + 1) {
 +                                              DMERR("chunks > MAX_XOR_BLOCKS"
 +                                                    " + 1");
 +                                              break;
 +                                      }
 +
 +                                      mutex_lock(&rs->io.xor_lock);
 +                                      rs->xor.f = f;
 +                                      rs->xor.chunks = chunks;
 +                                      rs->xor.speed = 0;
 +                                      mutex_unlock(&rs->io.xor_lock);
 +
 +                                      if (stripe) {
 +                                              rs->xor.speed = xor_speed(stripe);
 +                                              io_size = stripe->io.size;
 +                                              stripe_free(stripe, rs->sc.mem_cache_client);
 +                                      }
 +
 +                                      rs_log(rs, io_size);
 +                                      return 0;
 +                              }
 +                      }
 +              }
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/*
 + * Allow writes after they got prohibited because of a device failure.
 + *
 + * This needs to be called after userspace updated metadata state
 + * based on an event being thrown during device failure processing.
 + */
 +static int allow_writes(struct raid_set *rs, int argc, char **argv,
 +                      enum raid_set_flags flag)
 +{
 +      if (TestClearRSProhibitWrites(rs)) {
 +      DMINFO("%s waking", __func__);
 +              wake_do_raid(rs);
 +              return 0;
 +      }
 +
 +      return -EPERM;
 +}
 +
 +/* Parse the RAID message. */
 +/*
 + * 'all[ow_writes]'
 + * 'ba[ndwidth] {se[t],g[row],sh[rink]} #'    # e.g. 'ba se 50'
 + * 'o[verwrite] {on,of[f],r[eset]}'           # e.g. 'o of'
 + * 'sta[tistics] {on,of[f],r[eset]}'          # e.g. 'stat of'
 + * 'str[ipe_cache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
 + * 'xor algorithm #chunks'                    # e.g. 'xor xor_8 5'
 + *
 + */
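 +/*
 + * Example (device name and values are illustrative):
 + * 'dmsetup message my_raid45 0 bandwidth set 40' arrives here
 + * with argv = { "bandwidth", "set", "40" }.
 + */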
 +static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +      if (argc) {
 +              size_t len = strlen(argv[0]);
 +              struct raid_set *rs = ti->private;
 +              struct {
 +                      const char *name;
 +                      int (*f) (struct raid_set *rs, int argc, char **argv,
 +                                enum raid_set_flags flag);
 +                      enum raid_set_flags flag;
 +              } msg_descr[] = {
 +                      { "allow_writes", allow_writes, 0 },
 +                      { "bandwidth", bandwidth_change, 0 },
 +                      { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
 +                      { "statistics", devel_flags, RS_DEVEL_STATS },
 +                      { "stripe_cache", sc_resize, 0 },
 +                      { "xor", xor_set, 0 },
 +              }, *m = ARRAY_END(msg_descr);
 +
 +              if (len < 3)
 +                      len = 3;
 +
 +              while (m-- > msg_descr) {
 +                      if (!strncmp(argv[0], m->name, len))
 +                              return m->f(rs, argc - 1, argv + 1, m->flag);
 +              }
 +
 +      }
 +
 +      return -EINVAL;
 +}
 +/*
 + * END message interface
 + */
 +
 +/* Provide io hints. */
 +static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 +{
 +      struct raid_set *rs = ti->private;
 +
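 +      /* Expose the chunk size as minimum io size and a full stripe
 +       * (chunk size * number of data devices) as optimal io size. */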
 +      blk_limits_io_min(limits, rs->set.chunk_size);
 +      blk_limits_io_opt(limits, rs->set.chunk_size * rs->set.data_devs);
 +}
 +
 +static struct target_type raid_target = {
 +      .name = "raid45",
 +      .version = {1, 0, 0},
 +      .module = THIS_MODULE,
 +      .ctr = raid_ctr,
 +      .dtr = raid_dtr,
 +      .map = raid_map,
 +      .presuspend = raid_presuspend,
 +      .postsuspend = raid_postsuspend,
 +      .resume = raid_resume,
 +      .status = raid_status,
 +      .message = raid_message,
 +      .io_hints = raid_io_hints,
 +};
 +
 +static void init_exit(const char *bad_msg, const char *good_msg, int r)
 +{
 +      if (r)
 +              DMERR("Failed to %sregister target [%d]", bad_msg, r);
 +      else
 +              DMINFO("%s %s", good_msg, version);
 +}
 +
 +static int __init dm_raid_init(void)
 +{
 +      int r = dm_register_target(&raid_target);
 +
 +      init_exit("", "initialized", r);
 +      return r;
 +}
 +
 +static void __exit dm_raid_exit(void)
 +{
 +      dm_unregister_target(&raid_target);
 +      init_exit("un", "exit", 0);
 +}
 +
 +/* Module hooks. */
 +module_init(dm_raid_init);
 +module_exit(dm_raid_exit);
 +
 +MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
 +MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
 +MODULE_LICENSE("GPL");
 +MODULE_ALIAS("dm-raid4");
 +MODULE_ALIAS("dm-raid5");
Simple merge
diff --cc drivers/md/dm.c
Simple merge
Simple merge
index 0000000,9656dd0..dbdd4c5
mode 000000,100644..100644
--- /dev/null
+++ b/drivers/net/ethernet/dec/tulip/tulip_core.c
@@@ -1,0 -1,2008 +1,2012 @@@
+ /*    tulip_core.c: A DEC 21x4x-family ethernet driver for Linux.
+       Copyright 2000,2001  The Linux Kernel Team
+       Written/copyright 1994-2001 by Donald Becker.
+       This software may be used and distributed according to the terms
+       of the GNU General Public License, incorporated herein by reference.
+       Please submit bugs to http://bugzilla.kernel.org/ .
+ */
+ #define pr_fmt(fmt) "tulip: " fmt
+ #define DRV_NAME      "tulip"
+ #ifdef CONFIG_TULIP_NAPI
+ #define DRV_VERSION    "1.1.15-NAPI" /* Keep at least for test */
+ #else
+ #define DRV_VERSION   "1.1.15"
+ #endif
+ #define DRV_RELDATE   "Feb 27, 2007"
+ #include <linux/module.h>
+ #include <linux/pci.h>
+ #include <linux/slab.h>
+ #include "tulip.h"
+ #include <linux/init.h>
+ #include <linux/interrupt.h>
+ #include <linux/etherdevice.h>
+ #include <linux/delay.h>
+ #include <linux/mii.h>
+ #include <linux/crc32.h>
+ #include <asm/unaligned.h>
+ #include <asm/uaccess.h>
+ #ifdef CONFIG_SPARC
+ #include <asm/prom.h>
+ #endif
+ static char version[] __devinitdata =
+       "Linux Tulip driver version " DRV_VERSION " (" DRV_RELDATE ")\n";
+ /* A few user-configurable values. */
+ /* Maximum events (Rx packets, etc.) to handle at each interrupt. */
+ static unsigned int max_interrupt_work = 25;
+ #define MAX_UNITS 8
+ /* Used to pass the full-duplex flag, etc. */
+ static int full_duplex[MAX_UNITS];
+ static int options[MAX_UNITS];
+ static int mtu[MAX_UNITS];                    /* Jumbo MTU for interfaces. */
+ /*  The possible media types that can be set in options[] are: */
+ const char * const medianame[32] = {
+       "10baseT", "10base2", "AUI", "100baseTx",
+       "10baseT-FDX", "100baseTx-FDX", "100baseT4", "100baseFx",
+       "100baseFx-FDX", "MII 10baseT", "MII 10baseT-FDX", "MII",
+       "10baseT(forced)", "MII 100baseTx", "MII 100baseTx-FDX", "MII 100baseT4",
+       "MII 100baseFx-HDX", "MII 100baseFx-FDX", "Home-PNA 1Mbps", "Invalid-19",
+       "","","","", "","","","",  "","","","Transceiver reset",
+ };
+ /* Set the copy breakpoint for the copy-only-tiny-buffer Rx structure. */
+ #if defined(__alpha__) || defined(__arm__) || defined(__hppa__) || \
+       defined(CONFIG_SPARC) || defined(__ia64__) || \
+       defined(__sh__) || defined(__mips__)
+ static int rx_copybreak = 1518;
+ #else
+ static int rx_copybreak = 100;
+ #endif
+ /*
+   Set the bus performance register.
+       Typical: Set 16 longword cache alignment, no burst limit.
+       Cache alignment bits 15:14           Burst length 13:8
+               0000    No alignment  0x00000000 unlimited              0800 8 longwords
+               4000    8  longwords            0100 1 longword         1000 16 longwords
+               8000    16 longwords            0200 2 longwords        2000 32 longwords
+               C000    32  longwords           0400 4 longwords
+       Warning: many older 486 systems are broken and require setting 0x00A04800
+          8 longword cache alignment, 8 longword burst.
+       ToDo: Non-Intel setting could be better.
+ */
+ #if defined(__alpha__) || defined(__ia64__)
+ static int csr0 = 0x01A00000 | 0xE000;
+ #elif defined(__i386__) || defined(__powerpc__) || defined(__x86_64__)
+ static int csr0 = 0x01A00000 | 0x8000;
+ #elif defined(CONFIG_SPARC) || defined(__hppa__)
+ /* The UltraSparc PCI controllers will disconnect at every 64-byte
+  * crossing anyways so it makes no sense to tell Tulip to burst
+  * any more than that.
+  */
+ static int csr0 = 0x01A00000 | 0x9000;
+ #elif defined(__arm__) || defined(__sh__)
+ static int csr0 = 0x01A00000 | 0x4800;
+ #elif defined(__mips__)
+ static int csr0 = 0x00200000 | 0x4000;
+ #else
+ #warning Processor architecture undefined!
+ static int csr0 = 0x00A00000 | 0x4800;
+ #endif
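+ /* For example, the 0x8000 used on x86 above selects 16-longword cache
+    alignment with an unlimited burst length (bits 15:14 = 10, 13:8 = 0). */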
+ /* Operational parameters that usually are not changed. */
+ /* Time in jiffies before concluding the transmitter is hung. */
+ #define TX_TIMEOUT  (4*HZ)
+ MODULE_AUTHOR("The Linux Kernel Team");
+ MODULE_DESCRIPTION("Digital 21*4* Tulip ethernet driver");
+ MODULE_LICENSE("GPL");
+ MODULE_VERSION(DRV_VERSION);
+ module_param(tulip_debug, int, 0);
+ module_param(max_interrupt_work, int, 0);
+ module_param(rx_copybreak, int, 0);
+ module_param(csr0, int, 0);
+ module_param_array(options, int, NULL, 0);
+ module_param_array(full_duplex, int, NULL, 0);
+ #ifdef TULIP_DEBUG
+ int tulip_debug = TULIP_DEBUG;
+ #else
+ int tulip_debug = 1;
+ #endif
+ static void tulip_timer(unsigned long data)
+ {
+       struct net_device *dev = (struct net_device *)data;
+       struct tulip_private *tp = netdev_priv(dev);
+       if (netif_running(dev))
+               schedule_work(&tp->media_work);
+ }
+ /*
+  * This table is used during operation for capabilities and media timer.
+  *
+  * It is indexed via the values in 'enum chips'
+  */
+ struct tulip_chip_table tulip_tbl[] = {
+   { }, /* placeholder for array, slot unused currently */
+   { }, /* placeholder for array, slot unused currently */
+   /* DC21140 */
+   { "Digital DS21140 Tulip", 128, 0x0001ebef,
+       HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | HAS_PCI_MWI, tulip_timer,
+       tulip_media_task },
+   /* DC21142, DC21143 */
+   { "Digital DS21142/43 Tulip", 128, 0x0801fbff,
+       HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_ACPI | HAS_NWAY
+       | HAS_INTR_MITIGATION | HAS_PCI_MWI, tulip_timer, t21142_media_task },
+   /* LC82C168 */
+   { "Lite-On 82c168 PNIC", 256, 0x0001fbef,
+       HAS_MII | HAS_PNICNWAY, pnic_timer, },
+   /* MX98713 */
+   { "Macronix 98713 PMAC", 128, 0x0001ebef,
+       HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM, mxic_timer, },
+   /* MX98715 */
+   { "Macronix 98715 PMAC", 256, 0x0001ebef,
+       HAS_MEDIA_TABLE, mxic_timer, },
+   /* MX98725 */
+   { "Macronix 98725 PMAC", 256, 0x0001ebef,
+       HAS_MEDIA_TABLE, mxic_timer, },
+   /* AX88140 */
+   { "ASIX AX88140", 128, 0x0001fbff,
+       HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | MC_HASH_ONLY
+       | IS_ASIX, tulip_timer, tulip_media_task },
+   /* PNIC2 */
+   { "Lite-On PNIC-II", 256, 0x0801fbff,
+       HAS_MII | HAS_NWAY | HAS_8023X | HAS_PCI_MWI, pnic2_timer, },
+   /* COMET */
+   { "ADMtek Comet", 256, 0x0001abef,
+       HAS_MII | MC_HASH_ONLY | COMET_MAC_ADDR, comet_timer, },
+   /* COMPEX9881 */
+   { "Compex 9881 PMAC", 128, 0x0001ebef,
+       HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM, mxic_timer, },
+   /* I21145 */
+   { "Intel DS21145 Tulip", 128, 0x0801fbff,
+       HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_ACPI
+       | HAS_NWAY | HAS_PCI_MWI, tulip_timer, tulip_media_task },
+   /* DM910X */
+ #ifdef CONFIG_TULIP_DM910X
+   { "Davicom DM9102/DM9102A", 128, 0x0001ebef,
+       HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | HAS_ACPI,
+       tulip_timer, tulip_media_task },
+ #else
+   { NULL },
+ #endif
+   /* RS7112 */
+   { "Conexant LANfinity", 256, 0x0001ebef,
+       HAS_MII | HAS_ACPI, tulip_timer, tulip_media_task },
+ };
+ static DEFINE_PCI_DEVICE_TABLE(tulip_pci_tbl) = {
+       { 0x1011, 0x0009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DC21140 },
+       { 0x1011, 0x0019, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DC21143 },
+       { 0x11AD, 0x0002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, LC82C168 },
+       { 0x10d9, 0x0512, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98713 },
+       { 0x10d9, 0x0531, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98715 },
+ /*    { 0x10d9, 0x0531, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98725 },*/
+       { 0x125B, 0x1400, PCI_ANY_ID, PCI_ANY_ID, 0, 0, AX88140 },
+       { 0x11AD, 0xc115, PCI_ANY_ID, PCI_ANY_ID, 0, 0, PNIC2 },
+       { 0x1317, 0x0981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1317, 0x0985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1317, 0x1985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1317, 0x9511, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x13D1, 0xAB02, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x13D1, 0xAB03, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x13D1, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x104A, 0x0981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x104A, 0x2774, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1259, 0xa120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x11F6, 0x9881, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMPEX9881 },
+       { 0x8086, 0x0039, PCI_ANY_ID, PCI_ANY_ID, 0, 0, I21145 },
+ #ifdef CONFIG_TULIP_DM910X
+       { 0x1282, 0x9100, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DM910X },
+       { 0x1282, 0x9102, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DM910X },
+ #endif
+       { 0x1113, 0x1216, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1113, 0x1217, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98715 },
+       { 0x1113, 0x9511, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1186, 0x1541, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1186, 0x1561, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1186, 0x1591, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x14f1, 0x1803, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CONEXANT },
+       { 0x1626, 0x8410, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1737, 0xAB09, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x1737, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x17B3, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { 0x10b7, 0x9300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* 3Com 3CSOHO100B-TX */
+       { 0x14ea, 0xab08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* Planex FNW-3602-TX */
+       { 0x1414, 0x0001, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* Microsoft MN-120 */
+       { 0x1414, 0x0002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+       { } /* terminate list */
+ };
+ MODULE_DEVICE_TABLE(pci, tulip_pci_tbl);
+ /* A full-duplex map for media types. */
+ const char tulip_media_cap[32] =
+ {0,0,0,16,  3,19,16,24,  27,4,7,5, 0,20,23,20,  28,31,0,0, };
+ static void tulip_tx_timeout(struct net_device *dev);
+ static void tulip_init_ring(struct net_device *dev);
+ static void tulip_free_ring(struct net_device *dev);
+ static netdev_tx_t tulip_start_xmit(struct sk_buff *skb,
+                                         struct net_device *dev);
+ static int tulip_open(struct net_device *dev);
+ static int tulip_close(struct net_device *dev);
+ static void tulip_up(struct net_device *dev);
+ static void tulip_down(struct net_device *dev);
+ static struct net_device_stats *tulip_get_stats(struct net_device *dev);
+ static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+ static void set_rx_mode(struct net_device *dev);
+ static void tulip_set_wolopts(struct pci_dev *pdev, u32 wolopts);
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ static void poll_tulip(struct net_device *dev);
+ #endif
+ static void tulip_set_power_state (struct tulip_private *tp,
+                                  int sleep, int snooze)
+ {
+       if (tp->flags & HAS_ACPI) {
+               u32 tmp, newtmp;
+               pci_read_config_dword (tp->pdev, CFDD, &tmp);
+               newtmp = tmp & ~(CFDD_Sleep | CFDD_Snooze);
+               if (sleep)
+                       newtmp |= CFDD_Sleep;
+               else if (snooze)
+                       newtmp |= CFDD_Snooze;
+               if (tmp != newtmp)
+                       pci_write_config_dword (tp->pdev, CFDD, newtmp);
+       }
+ }
+ static void tulip_up(struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       int next_tick = 3*HZ;
+       u32 reg;
+       int i;
+ #ifdef CONFIG_TULIP_NAPI
+       napi_enable(&tp->napi);
+ #endif
+       /* Wake the chip from sleep/snooze mode. */
+       tulip_set_power_state (tp, 0, 0);
+       /* Disable all WOL events */
+       pci_enable_wake(tp->pdev, PCI_D3hot, 0);
+       pci_enable_wake(tp->pdev, PCI_D3cold, 0);
+       tulip_set_wolopts(tp->pdev, 0);
+       /* On some chip revs we must set the MII/SYM port before the reset!? */
+       if (tp->mii_cnt  ||  (tp->mtable  &&  tp->mtable->has_mii))
+               iowrite32(0x00040000, ioaddr + CSR6);
+       /* Reset the chip, holding bit 0 set at least 50 PCI cycles. */
+       iowrite32(0x00000001, ioaddr + CSR0);
+       pci_read_config_dword(tp->pdev, PCI_COMMAND, &reg);  /* flush write */
+       udelay(100);
+       /* Deassert reset.
+          Wait the specified 50 PCI cycles after a reset by initializing
+          Tx and Rx queues and the address filter list. */
+       iowrite32(tp->csr0, ioaddr + CSR0);
+       pci_read_config_dword(tp->pdev, PCI_COMMAND, &reg);  /* flush write */
+       udelay(100);
+       if (tulip_debug > 1)
+               netdev_dbg(dev, "tulip_up(), irq==%d\n", dev->irq);
+       iowrite32(tp->rx_ring_dma, ioaddr + CSR3);
+       iowrite32(tp->tx_ring_dma, ioaddr + CSR4);
+       tp->cur_rx = tp->cur_tx = 0;
+       tp->dirty_rx = tp->dirty_tx = 0;
+       if (tp->flags & MC_HASH_ONLY) {
+               u32 addr_low = get_unaligned_le32(dev->dev_addr);
+               u32 addr_high = get_unaligned_le16(dev->dev_addr + 4);
+               if (tp->chip_id == AX88140) {
+                       iowrite32(0, ioaddr + CSR13);
+                       iowrite32(addr_low,  ioaddr + CSR14);
+                       iowrite32(1, ioaddr + CSR13);
+                       iowrite32(addr_high, ioaddr + CSR14);
+               } else if (tp->flags & COMET_MAC_ADDR) {
+                       iowrite32(addr_low,  ioaddr + 0xA4);
+                       iowrite32(addr_high, ioaddr + 0xA8);
+                       iowrite32(0, ioaddr + CSR27);
+                       iowrite32(0, ioaddr + CSR28);
+               }
+       } else {
+               /* This is set_rx_mode(), but without starting the transmitter. */
+               u16 *eaddrs = (u16 *)dev->dev_addr;
+               u16 *setup_frm = &tp->setup_frame[15*6];
+               dma_addr_t mapping;
+               /* 21140 bug: you must add the broadcast address. */
+               memset(tp->setup_frame, 0xff, sizeof(tp->setup_frame));
+               /* Fill the final entry of the table with our physical address. */
+               *setup_frm++ = eaddrs[0]; *setup_frm++ = eaddrs[0];
+               *setup_frm++ = eaddrs[1]; *setup_frm++ = eaddrs[1];
+               *setup_frm++ = eaddrs[2]; *setup_frm++ = eaddrs[2];
+               mapping = pci_map_single(tp->pdev, tp->setup_frame,
+                                        sizeof(tp->setup_frame),
+                                        PCI_DMA_TODEVICE);
+               tp->tx_buffers[tp->cur_tx].skb = NULL;
+               tp->tx_buffers[tp->cur_tx].mapping = mapping;
+               /* Put the setup frame on the Tx list. */
+               tp->tx_ring[tp->cur_tx].length = cpu_to_le32(0x08000000 | 192);
+               tp->tx_ring[tp->cur_tx].buffer1 = cpu_to_le32(mapping);
+               tp->tx_ring[tp->cur_tx].status = cpu_to_le32(DescOwned);
+               tp->cur_tx++;
+       }
+       tp->saved_if_port = dev->if_port;
+       if (dev->if_port == 0)
+               dev->if_port = tp->default_port;
+       /* Allow selecting a default media. */
+       i = 0;
+       if (tp->mtable == NULL)
+               goto media_picked;
+       if (dev->if_port) {
+               int looking_for = tulip_media_cap[dev->if_port] & MediaIsMII ? 11 :
+                       (dev->if_port == 12 ? 0 : dev->if_port);
+               for (i = 0; i < tp->mtable->leafcount; i++)
+                       if (tp->mtable->mleaf[i].media == looking_for) {
+                               dev_info(&dev->dev,
+                                        "Using user-specified media %s\n",
+                                        medianame[dev->if_port]);
+                               goto media_picked;
+                       }
+       }
+       if ((tp->mtable->defaultmedia & 0x0800) == 0) {
+               int looking_for = tp->mtable->defaultmedia & MEDIA_MASK;
+               for (i = 0; i < tp->mtable->leafcount; i++)
+                       if (tp->mtable->mleaf[i].media == looking_for) {
+                               dev_info(&dev->dev,
+                                        "Using EEPROM-set media %s\n",
+                                        medianame[looking_for]);
+                               goto media_picked;
+                       }
+       }
+       /* Start sensing first non-full-duplex media. */
+       for (i = tp->mtable->leafcount - 1;
+                (tulip_media_cap[tp->mtable->mleaf[i].media] & MediaAlwaysFD) && i > 0; i--)
+               ;
+ media_picked:
+       tp->csr6 = 0;
+       tp->cur_index = i;
+       tp->nwayset = 0;
+       if (dev->if_port) {
+               if (tp->chip_id == DC21143  &&
+                   (tulip_media_cap[dev->if_port] & MediaIsMII)) {
+                       /* We must reset the media CSRs when we force-select MII mode. */
+                       iowrite32(0x0000, ioaddr + CSR13);
+                       iowrite32(0x0000, ioaddr + CSR14);
+                       iowrite32(0x0008, ioaddr + CSR15);
+               }
+               tulip_select_media(dev, 1);
+       } else if (tp->chip_id == DC21142) {
+               if (tp->mii_cnt) {
+                       tulip_select_media(dev, 1);
+                       if (tulip_debug > 1)
+                               dev_info(&dev->dev,
+                                        "Using MII transceiver %d, status %04x\n",
+                                        tp->phys[0],
+                                        tulip_mdio_read(dev, tp->phys[0], 1));
+                       iowrite32(csr6_mask_defstate, ioaddr + CSR6);
+                       tp->csr6 = csr6_mask_hdcap;
+                       dev->if_port = 11;
+                       iowrite32(0x0000, ioaddr + CSR13);
+                       iowrite32(0x0000, ioaddr + CSR14);
+               } else
+                       t21142_start_nway(dev);
+       } else if (tp->chip_id == PNIC2) {
+               /* for initial startup advertise 10/100 Full and Half */
+               tp->sym_advertise = 0x01E0;
+               /* enable autonegotiate end interrupt */
+               iowrite32(ioread32(ioaddr+CSR5)| 0x00008010, ioaddr + CSR5);
+               iowrite32(ioread32(ioaddr+CSR7)| 0x00008010, ioaddr + CSR7);
+               pnic2_start_nway(dev);
+       } else if (tp->chip_id == LC82C168  &&  ! tp->medialock) {
+               if (tp->mii_cnt) {
+                       dev->if_port = 11;
+                       tp->csr6 = 0x814C0000 | (tp->full_duplex ? 0x0200 : 0);
+                       iowrite32(0x0001, ioaddr + CSR15);
+               } else if (ioread32(ioaddr + CSR5) & TPLnkPass)
+                       pnic_do_nway(dev);
+               else {
+                       /* Start with 10mbps to do autonegotiation. */
+                       iowrite32(0x32, ioaddr + CSR12);
+                       tp->csr6 = 0x00420000;
+                       iowrite32(0x0001B078, ioaddr + 0xB8);
+                       iowrite32(0x0201B078, ioaddr + 0xB8);
+                       next_tick = 1*HZ;
+               }
+       } else if ((tp->chip_id == MX98713 || tp->chip_id == COMPEX9881) &&
+                  ! tp->medialock) {
+               dev->if_port = 0;
+               tp->csr6 = 0x01880000 | (tp->full_duplex ? 0x0200 : 0);
+               iowrite32(0x0f370000 | ioread16(ioaddr + 0x80), ioaddr + 0x80);
+       } else if (tp->chip_id == MX98715 || tp->chip_id == MX98725) {
+               /* Provided by BOLO, Macronix - 12/10/1998. */
+               dev->if_port = 0;
+               tp->csr6 = 0x01a80200;
+               iowrite32(0x0f370000 | ioread16(ioaddr + 0x80), ioaddr + 0x80);
+               iowrite32(0x11000 | ioread16(ioaddr + 0xa0), ioaddr + 0xa0);
+       } else if (tp->chip_id == COMET || tp->chip_id == CONEXANT) {
+               /* Enable automatic Tx underrun recovery. */
+               iowrite32(ioread32(ioaddr + 0x88) | 1, ioaddr + 0x88);
+               dev->if_port = tp->mii_cnt ? 11 : 0;
+               tp->csr6 = 0x00040000;
+       } else if (tp->chip_id == AX88140) {
+               tp->csr6 = tp->mii_cnt ? 0x00040100 : 0x00000100;
+       } else
+               tulip_select_media(dev, 1);
+       /* Start the chip's Tx to process setup frame. */
+       tulip_stop_rxtx(tp);
+       barrier();
+       udelay(5);
+       iowrite32(tp->csr6 | TxOn, ioaddr + CSR6);
+       /* Enable interrupts by setting the interrupt mask. */
+       iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR5);
+       iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7);
+       tulip_start_rxtx(tp);
+       iowrite32(0, ioaddr + CSR2);            /* Rx poll demand */
+       if (tulip_debug > 2) {
+               netdev_dbg(dev, "Done tulip_up(), CSR0 %08x, CSR5 %08x CSR6 %08x\n",
+                          ioread32(ioaddr + CSR0),
+                          ioread32(ioaddr + CSR5),
+                          ioread32(ioaddr + CSR6));
+       }
+       /* Set the timer to switch to check for link beat and perhaps switch
+          to an alternate media type. */
+       tp->timer.expires = RUN_AT(next_tick);
+       add_timer(&tp->timer);
+ #ifdef CONFIG_TULIP_NAPI
+       init_timer(&tp->oom_timer);
+       tp->oom_timer.data = (unsigned long)dev;
+       tp->oom_timer.function = oom_timer;
+ #endif
+ }
+ static int
+ tulip_open(struct net_device *dev)
+ {
+       int retval;
+       tulip_init_ring (dev);
+       retval = request_irq(dev->irq, tulip_interrupt, IRQF_SHARED, dev->name, dev);
+       if (retval)
+               goto free_ring;
+       tulip_up (dev);
+       netif_start_queue (dev);
+       return 0;
+ free_ring:
+       tulip_free_ring (dev);
+       return retval;
+ }
+ static void tulip_tx_timeout(struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       unsigned long flags;
+       spin_lock_irqsave (&tp->lock, flags);
+       if (tulip_media_cap[dev->if_port] & MediaIsMII) {
+               /* Do nothing -- the media monitor should handle this. */
+               if (tulip_debug > 1)
+                       dev_warn(&dev->dev,
+                                "Transmit timeout using MII device\n");
+       } else if (tp->chip_id == DC21140 || tp->chip_id == DC21142 ||
+                  tp->chip_id == MX98713 || tp->chip_id == COMPEX9881 ||
+                  tp->chip_id == DM910X) {
+               dev_warn(&dev->dev,
+                        "21140 transmit timed out, status %08x, SIA %08x %08x %08x %08x, resetting...\n",
+                        ioread32(ioaddr + CSR5), ioread32(ioaddr + CSR12),
+                        ioread32(ioaddr + CSR13), ioread32(ioaddr + CSR14),
+                        ioread32(ioaddr + CSR15));
+               tp->timeout_recovery = 1;
+               schedule_work(&tp->media_work);
+               goto out_unlock;
+       } else if (tp->chip_id == PNIC2) {
+               dev_warn(&dev->dev,
+                        "PNIC2 transmit timed out, status %08x, CSR6/7 %08x / %08x CSR12 %08x, resetting...\n",
+                        (int)ioread32(ioaddr + CSR5),
+                        (int)ioread32(ioaddr + CSR6),
+                        (int)ioread32(ioaddr + CSR7),
+                        (int)ioread32(ioaddr + CSR12));
+       } else {
+               dev_warn(&dev->dev,
+                        "Transmit timed out, status %08x, CSR12 %08x, resetting...\n",
+                        ioread32(ioaddr + CSR5), ioread32(ioaddr + CSR12));
+               dev->if_port = 0;
+       }
+ #if defined(way_too_many_messages)
+       if (tulip_debug > 3) {
+               int i;
+               for (i = 0; i < RX_RING_SIZE; i++) {
+                       u8 *buf = (u8 *)(tp->rx_ring[i].buffer1);
+                       int j;
+                       printk(KERN_DEBUG
+                              "%2d: %08x %08x %08x %08x  %02x %02x %02x\n",
+                              i,
+                              (unsigned int)tp->rx_ring[i].status,
+                              (unsigned int)tp->rx_ring[i].length,
+                              (unsigned int)tp->rx_ring[i].buffer1,
+                              (unsigned int)tp->rx_ring[i].buffer2,
+                              buf[0], buf[1], buf[2]);
+                       for (j = 0; buf[j] != 0xee && j < 1600; j++)
+                               if (j < 100)
+                                       pr_cont(" %02x", buf[j]);
+                       pr_cont(" j=%d\n", j);
+               }
+               printk(KERN_DEBUG "  Rx ring %p: ", tp->rx_ring);
+               for (i = 0; i < RX_RING_SIZE; i++)
+                       pr_cont(" %08x", (unsigned int)tp->rx_ring[i].status);
+               printk(KERN_DEBUG "  Tx ring %p: ", tp->tx_ring);
+               for (i = 0; i < TX_RING_SIZE; i++)
+                       pr_cont(" %08x", (unsigned int)tp->tx_ring[i].status);
+               pr_cont("\n");
+       }
+ #endif
+       tulip_tx_timeout_complete(tp, ioaddr);
+ out_unlock:
+       spin_unlock_irqrestore (&tp->lock, flags);
+       dev->trans_start = jiffies; /* prevent tx timeout */
+       netif_wake_queue (dev);
+ }
+ /* Initialize the Rx and Tx rings, along with various 'dev' bits. */
+ static void tulip_init_ring(struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       int i;
+       tp->susp_rx = 0;
+       tp->ttimer = 0;
+       tp->nir = 0;
+       for (i = 0; i < RX_RING_SIZE; i++) {
+               tp->rx_ring[i].status = 0x00000000;
+               tp->rx_ring[i].length = cpu_to_le32(PKT_BUF_SZ);
+               tp->rx_ring[i].buffer2 = cpu_to_le32(tp->rx_ring_dma + sizeof(struct tulip_rx_desc) * (i + 1));
+               tp->rx_buffers[i].skb = NULL;
+               tp->rx_buffers[i].mapping = 0;
+       }
+       /* Mark the last entry as wrapping the ring. */
+       tp->rx_ring[i-1].length = cpu_to_le32(PKT_BUF_SZ | DESC_RING_WRAP);
+       tp->rx_ring[i-1].buffer2 = cpu_to_le32(tp->rx_ring_dma);
+       for (i = 0; i < RX_RING_SIZE; i++) {
+               dma_addr_t mapping;
+               /* Note the receive buffer must be longword aligned.
+                  dev_alloc_skb() provides 16 byte alignment.  But do *not*
+                  use skb_reserve() to align the IP header! */
+               struct sk_buff *skb = dev_alloc_skb(PKT_BUF_SZ);
+               tp->rx_buffers[i].skb = skb;
+               if (skb == NULL)
+                       break;
+               mapping = pci_map_single(tp->pdev, skb->data,
+                                        PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+               tp->rx_buffers[i].mapping = mapping;
+               skb->dev = dev;                 /* Mark as being used by this device. */
+               tp->rx_ring[i].status = cpu_to_le32(DescOwned); /* Owned by Tulip chip */
+               tp->rx_ring[i].buffer1 = cpu_to_le32(mapping);
+       }
+       tp->dirty_rx = (unsigned int)(i - RX_RING_SIZE);
+       /* The Tx buffer descriptor is filled in as needed, but we
+          do need to clear the ownership bit. */
+       for (i = 0; i < TX_RING_SIZE; i++) {
+               tp->tx_buffers[i].skb = NULL;
+               tp->tx_buffers[i].mapping = 0;
+               tp->tx_ring[i].status = 0x00000000;
+               tp->tx_ring[i].buffer2 = cpu_to_le32(tp->tx_ring_dma + sizeof(struct tulip_tx_desc) * (i + 1));
+       }
+       tp->tx_ring[i-1].buffer2 = cpu_to_le32(tp->tx_ring_dma);
+ }
+ static netdev_tx_t
+ tulip_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       int entry;
+       u32 flag;
+       dma_addr_t mapping;
+       unsigned long flags;
+       spin_lock_irqsave(&tp->lock, flags);
+       /* Calculate the next Tx descriptor entry. */
+       entry = tp->cur_tx % TX_RING_SIZE;
+       tp->tx_buffers[entry].skb = skb;
+       mapping = pci_map_single(tp->pdev, skb->data,
+                                skb->len, PCI_DMA_TODEVICE);
+       tp->tx_buffers[entry].mapping = mapping;
+       tp->tx_ring[entry].buffer1 = cpu_to_le32(mapping);
+       if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE/2) {/* Typical path */
+               flag = 0x60000000; /* No interrupt */
+       } else if (tp->cur_tx - tp->dirty_tx == TX_RING_SIZE/2) {
+               flag = 0xe0000000; /* Tx-done intr. */
+       } else if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE - 2) {
+               flag = 0x60000000; /* No Tx-done intr. */
+       } else {                /* Leave room for set_rx_mode() to fill entries. */
+               flag = 0xe0000000; /* Tx-done intr. */
+               netif_stop_queue(dev);
+       }
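+       /* The last ring entry must also carry DESC_RING_WRAP so the
+        * chip wraps back to descriptor 0. */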
+       if (entry == TX_RING_SIZE-1)
+               flag = 0xe0000000 | DESC_RING_WRAP;
+       tp->tx_ring[entry].length = cpu_to_le32(skb->len | flag);
+       /* if we were using Transmit Automatic Polling, we would need a
+        * wmb() here. */
+       tp->tx_ring[entry].status = cpu_to_le32(DescOwned);
+       wmb();
+       tp->cur_tx++;
+       /* Trigger an immediate transmit demand. */
+       iowrite32(0, tp->base_addr + CSR1);
+       spin_unlock_irqrestore(&tp->lock, flags);
+       return NETDEV_TX_OK;
+ }
+ static void tulip_clean_tx_ring(struct tulip_private *tp)
+ {
+       unsigned int dirty_tx;
+       for (dirty_tx = tp->dirty_tx ; tp->cur_tx - dirty_tx > 0;
+               dirty_tx++) {
+               int entry = dirty_tx % TX_RING_SIZE;
+               int status = le32_to_cpu(tp->tx_ring[entry].status);
+               if (status < 0) {
+                       tp->dev->stats.tx_errors++;     /* It wasn't Txed */
+                       tp->tx_ring[entry].status = 0;
+               }
+               /* Check for Tx filter setup frames. */
+               if (tp->tx_buffers[entry].skb == NULL) {
+                       /* test because dummy frames not mapped */
+                       if (tp->tx_buffers[entry].mapping)
+                               pci_unmap_single(tp->pdev,
+                                       tp->tx_buffers[entry].mapping,
+                                       sizeof(tp->setup_frame),
+                                       PCI_DMA_TODEVICE);
+                       continue;
+               }
+               pci_unmap_single(tp->pdev, tp->tx_buffers[entry].mapping,
+                               tp->tx_buffers[entry].skb->len,
+                               PCI_DMA_TODEVICE);
+               /* Free the original skb. */
+               dev_kfree_skb_irq(tp->tx_buffers[entry].skb);
+               tp->tx_buffers[entry].skb = NULL;
+               tp->tx_buffers[entry].mapping = 0;
+       }
+ }
+ static void tulip_down (struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       unsigned long flags;
+       cancel_work_sync(&tp->media_work);
+ #ifdef CONFIG_TULIP_NAPI
+       napi_disable(&tp->napi);
+ #endif
+       del_timer_sync (&tp->timer);
+ #ifdef CONFIG_TULIP_NAPI
+       del_timer_sync (&tp->oom_timer);
+ #endif
+       spin_lock_irqsave (&tp->lock, flags);
+       /* Disable interrupts by clearing the interrupt mask. */
+       iowrite32 (0x00000000, ioaddr + CSR7);
+       /* Stop the Tx and Rx processes. */
+       tulip_stop_rxtx(tp);
+       /* prepare receive buffers */
+       tulip_refill_rx(dev);
+       /* release any unconsumed transmit buffers */
+       tulip_clean_tx_ring(tp);
+       if (ioread32(ioaddr + CSR6) != 0xffffffff)
+               dev->stats.rx_missed_errors += ioread32(ioaddr + CSR8) & 0xffff;
+       spin_unlock_irqrestore (&tp->lock, flags);
+       init_timer(&tp->timer);
+       tp->timer.data = (unsigned long)dev;
+       tp->timer.function = tulip_tbl[tp->chip_id].media_timer;
+       dev->if_port = tp->saved_if_port;
+       /* Leave the driver in snooze, not sleep, mode. */
+       tulip_set_power_state (tp, 0, 1);
+ }
+ static void tulip_free_ring (struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       int i;
+       /* Free all the skbuffs in the Rx queue. */
+       for (i = 0; i < RX_RING_SIZE; i++) {
+               struct sk_buff *skb = tp->rx_buffers[i].skb;
+               dma_addr_t mapping = tp->rx_buffers[i].mapping;
+               tp->rx_buffers[i].skb = NULL;
+               tp->rx_buffers[i].mapping = 0;
+               tp->rx_ring[i].status = 0;      /* Not owned by Tulip chip. */
+               tp->rx_ring[i].length = 0;
+               /* An invalid address. */
+               tp->rx_ring[i].buffer1 = cpu_to_le32(0xBADF00D0);
+               if (skb) {
+                       pci_unmap_single(tp->pdev, mapping, PKT_BUF_SZ,
+                                        PCI_DMA_FROMDEVICE);
+                       dev_kfree_skb (skb);
+               }
+       }
+       for (i = 0; i < TX_RING_SIZE; i++) {
+               struct sk_buff *skb = tp->tx_buffers[i].skb;
+               if (skb != NULL) {
+                       pci_unmap_single(tp->pdev, tp->tx_buffers[i].mapping,
+                                        skb->len, PCI_DMA_TODEVICE);
+                       dev_kfree_skb (skb);
+               }
+               tp->tx_buffers[i].skb = NULL;
+               tp->tx_buffers[i].mapping = 0;
+       }
+ }
+ static int tulip_close (struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       netif_stop_queue (dev);
+       tulip_down (dev);
+       if (tulip_debug > 1)
+               netdev_dbg(dev, "Shutting down ethercard, status was %02x\n",
+                          ioread32 (ioaddr + CSR5));
+       free_irq (dev->irq, dev);
+       tulip_free_ring (dev);
+       return 0;
+ }
+ static struct net_device_stats *tulip_get_stats(struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       if (netif_running(dev)) {
+               unsigned long flags;
+               spin_lock_irqsave (&tp->lock, flags);
+               dev->stats.rx_missed_errors += ioread32(ioaddr + CSR8) & 0xffff;
+               spin_unlock_irqrestore(&tp->lock, flags);
+       }
+       return &dev->stats;
+ }
+ static void tulip_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+ {
+       struct tulip_private *np = netdev_priv(dev);
+       strcpy(info->driver, DRV_NAME);
+       strcpy(info->version, DRV_VERSION);
+       strcpy(info->bus_info, pci_name(np->pdev));
+ }
+ static int tulip_ethtool_set_wol(struct net_device *dev,
+                                struct ethtool_wolinfo *wolinfo)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       if (wolinfo->wolopts & (~tp->wolinfo.supported))
+                  return -EOPNOTSUPP;
+       tp->wolinfo.wolopts = wolinfo->wolopts;
+       device_set_wakeup_enable(&tp->pdev->dev, tp->wolinfo.wolopts);
+       return 0;
+ }
+ static void tulip_ethtool_get_wol(struct net_device *dev,
+                                 struct ethtool_wolinfo *wolinfo)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       wolinfo->supported = tp->wolinfo.supported;
+       wolinfo->wolopts = tp->wolinfo.wolopts;
+       return;
+ }
+ static const struct ethtool_ops ops = {
+       .get_drvinfo = tulip_get_drvinfo,
+       .set_wol     = tulip_ethtool_set_wol,
+       .get_wol     = tulip_ethtool_get_wol,
+ };
+ /* Provide ioctl() calls to examine the MII xcvr state. */
+ static int private_ioctl (struct net_device *dev, struct ifreq *rq, int cmd)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       struct mii_ioctl_data *data = if_mii(rq);
+       const unsigned int phy_idx = 0;
+       int phy = tp->phys[phy_idx] & 0x1f;
+       unsigned int regnum = data->reg_num;
+       switch (cmd) {
+       case SIOCGMIIPHY:               /* Get address of MII PHY in use. */
+               if (tp->mii_cnt)
+                       data->phy_id = phy;
+               else if (tp->flags & HAS_NWAY)
+                       data->phy_id = 32;
+               else if (tp->chip_id == COMET)
+                       data->phy_id = 1;
+               else
+                       return -ENODEV;
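+               /* fall through to the register read */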
+       case SIOCGMIIREG:               /* Read MII PHY register. */
+               if (data->phy_id == 32 && (tp->flags & HAS_NWAY)) {
+                       int csr12 = ioread32 (ioaddr + CSR12);
+                       int csr14 = ioread32 (ioaddr + CSR14);
+                       switch (regnum) {
+                       case 0:
+                                 if (((csr14<<5) & 0x1000) ||
+                                         (dev->if_port == 5 && tp->nwayset))
+                                         data->val_out = 0x1000;
+                                 else
+                                         data->val_out = (tulip_media_cap[dev->if_port]&MediaIs100 ? 0x2000 : 0)
+                                                 | (tulip_media_cap[dev->if_port]&MediaIsFD ? 0x0100 : 0);
+                               break;
+                       case 1:
+                                 data->val_out =
+                                       0x1848 +
+                                       ((csr12&0x7000) == 0x5000 ? 0x20 : 0) +
+                                       ((csr12&0x06) == 6 ? 0 : 4);
+                                 data->val_out |= 0x6048;
+                               break;
+                       case 4:
+                                 /* Advertised value, bogus 10baseTx-FD value from CSR6. */
+                                 data->val_out =
+                                       ((ioread32(ioaddr + CSR6) >> 3) & 0x0040) +
+                                       ((csr14 >> 1) & 0x20) + 1;
+                                 data->val_out |= ((csr14 >> 9) & 0x03C0);
+                               break;
+                       case 5: data->val_out = tp->lpar; break;
+                       default: data->val_out = 0; break;
+                       }
+               } else {
+                       data->val_out = tulip_mdio_read (dev, data->phy_id & 0x1f, regnum);
+               }
+               return 0;
+       case SIOCSMIIREG:               /* Write MII PHY register. */
+               if (regnum & ~0x1f)
+                       return -EINVAL;
+               if (data->phy_id == phy) {
+                       u16 value = data->val_in;
+                       switch (regnum) {
+                       case 0: /* Check for autonegotiation on or reset. */
+                               tp->full_duplex_lock = (value & 0x9000) ? 0 : 1;
+                               if (tp->full_duplex_lock)
+                                       tp->full_duplex = (value & 0x0100) ? 1 : 0;
+                               break;
+                       case 4:
+                               tp->advertising[phy_idx] =
+                               tp->mii_advertise = data->val_in;
+                               break;
+                       }
+               }
+               if (data->phy_id == 32 && (tp->flags & HAS_NWAY)) {
+                       u16 value = data->val_in;
+                       if (regnum == 0) {
+                               if ((value & 0x1200) == 0x1200) {
+                                       if (tp->chip_id == PNIC2) {
+                                               pnic2_start_nway(dev);
+                                       } else {
+                                               t21142_start_nway(dev);
+                                       }
+                               }
+                       } else if (regnum == 4)
+                               tp->sym_advertise = value;
+               } else {
+                       tulip_mdio_write (dev, data->phy_id & 0x1f, regnum, data->val_in);
+               }
+               return 0;
+       default:
+               return -EOPNOTSUPP;
+       }
+       return -EOPNOTSUPP;
+ }
+ /* Set or clear the multicast filter for this adaptor.
+    Note that we only use exclusion around actually queueing the
+    new frame, not around filling tp->setup_frame.  This is non-deterministic
+    when re-entered but still correct. */
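+ /* set_bit_le(i, p) sets bit i of byte array p with little-endian bit
+    ordering, as expected by the 512-bit multicast hash table below. */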
+ #undef set_bit_le
+ #define set_bit_le(i,p) do { ((char *)(p))[(i)/8] |= (1<<((i)%8)); } while(0)
+ static void build_setup_frame_hash(u16 *setup_frm, struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       u16 hash_table[32];
+       struct netdev_hw_addr *ha;
+       int i;
+       u16 *eaddrs;
+       memset(hash_table, 0, sizeof(hash_table));
+       set_bit_le(255, hash_table);                    /* Broadcast entry */
+       /* This should work on big-endian machines as well. */
+       netdev_for_each_mc_addr(ha, dev) {
+               int index = ether_crc_le(ETH_ALEN, ha->addr) & 0x1ff;
+               set_bit_le(index, hash_table);
+       }
+       for (i = 0; i < 32; i++) {
+               *setup_frm++ = hash_table[i];
+               *setup_frm++ = hash_table[i];
+       }
+       setup_frm = &tp->setup_frame[13*6];
+       /* Fill the final entry with our physical address. */
+       eaddrs = (u16 *)dev->dev_addr;
+       *setup_frm++ = eaddrs[0]; *setup_frm++ = eaddrs[0];
+       *setup_frm++ = eaddrs[1]; *setup_frm++ = eaddrs[1];
+       *setup_frm++ = eaddrs[2]; *setup_frm++ = eaddrs[2];
+ }
+ static void build_setup_frame_perfect(u16 *setup_frm, struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       struct netdev_hw_addr *ha;
+       u16 *eaddrs;
+       /* We have <= 14 addresses so we can use the wonderful
+          16 address perfect filtering of the Tulip. */
+       netdev_for_each_mc_addr(ha, dev) {
+               eaddrs = (u16 *) ha->addr;
+               *setup_frm++ = *eaddrs; *setup_frm++ = *eaddrs++;
+               *setup_frm++ = *eaddrs; *setup_frm++ = *eaddrs++;
+               *setup_frm++ = *eaddrs; *setup_frm++ = *eaddrs++;
+       }
+       /* Fill the unused entries with the broadcast address. */
+       memset(setup_frm, 0xff, (15 - netdev_mc_count(dev)) * 12);
+       setup_frm = &tp->setup_frame[15*6];
+       /* Fill the final entry with our physical address. */
+       eaddrs = (u16 *)dev->dev_addr;
+       *setup_frm++ = eaddrs[0]; *setup_frm++ = eaddrs[0];
+       *setup_frm++ = eaddrs[1]; *setup_frm++ = eaddrs[1];
+       *setup_frm++ = eaddrs[2]; *setup_frm++ = eaddrs[2];
+ }
+ static void set_rx_mode(struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       int csr6;
+       csr6 = ioread32(ioaddr + CSR6) & ~0x00D5;
+       tp->csr6 &= ~0x00D5;
+       if (dev->flags & IFF_PROMISC) {                 /* Set promiscuous. */
+               tp->csr6 |= AcceptAllMulticast | AcceptAllPhys;
+               csr6 |= AcceptAllMulticast | AcceptAllPhys;
+       } else if ((netdev_mc_count(dev) > 1000) ||
+                  (dev->flags & IFF_ALLMULTI)) {
+               /* Too many to filter well -- accept all multicasts. */
+               tp->csr6 |= AcceptAllMulticast;
+               csr6 |= AcceptAllMulticast;
+       } else  if (tp->flags & MC_HASH_ONLY) {
+               /* Some work-alikes have only a 64-entry hash filter table. */
+               /* Should verify correctness on big-endian/__powerpc__ */
+               struct netdev_hw_addr *ha;
+               if (netdev_mc_count(dev) > 64) {
+                       /* Arbitrary non-effective limit. */
+                       tp->csr6 |= AcceptAllMulticast;
+                       csr6 |= AcceptAllMulticast;
+               } else {
+                       u32 mc_filter[2] = {0, 0};               /* Multicast hash filter */
+                       int filterbit;
+                       netdev_for_each_mc_addr(ha, dev) {
+                               if (tp->flags & COMET_MAC_ADDR)
+                                       filterbit = ether_crc_le(ETH_ALEN,
+                                                                ha->addr);
+                               else
+                                       filterbit = ether_crc(ETH_ALEN,
+                                                             ha->addr) >> 26;
+                               filterbit &= 0x3f;
+                               mc_filter[filterbit >> 5] |= 1 << (filterbit & 31);
+                               if (tulip_debug > 2)
+                                       dev_info(&dev->dev,
+                                                "Added filter for %pM  %08x bit %d\n",
+                                                ha->addr,
+                                                ether_crc(ETH_ALEN, ha->addr),
+                                                filterbit);
+                       }
+                       if (mc_filter[0] == tp->mc_filter[0]  &&
+                               mc_filter[1] == tp->mc_filter[1])
+                               ;                               /* No change. */
+                       else if (tp->flags & IS_ASIX) {
+                               iowrite32(2, ioaddr + CSR13);
+                               iowrite32(mc_filter[0], ioaddr + CSR14);
+                               iowrite32(3, ioaddr + CSR13);
+                               iowrite32(mc_filter[1], ioaddr + CSR14);
+                       } else if (tp->flags & COMET_MAC_ADDR) {
+                               iowrite32(mc_filter[0], ioaddr + CSR27);
+                               iowrite32(mc_filter[1], ioaddr + CSR28);
+                       }
+                       tp->mc_filter[0] = mc_filter[0];
+                       tp->mc_filter[1] = mc_filter[1];
+               }
+       } else {
+               unsigned long flags;
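+               /* Setup-frame descriptor flags: 0x08000000 marks a setup packet
+                  and 192 is the setup buffer length; the hash path below uses
+                  0x08400000 to also select hash filtering. */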
+               u32 tx_flags = 0x08000000 | 192;
+               /* Note that only the low-address shortword of setup_frame is valid!
+                  The values are doubled for big-endian architectures. */
+               if (netdev_mc_count(dev) > 14) {
+                       /* Must use a multicast hash table. */
+                       build_setup_frame_hash(tp->setup_frame, dev);
+                       tx_flags = 0x08400000 | 192;
+               } else {
+                       build_setup_frame_perfect(tp->setup_frame, dev);
+               }
+               spin_lock_irqsave(&tp->lock, flags);
+               if (tp->cur_tx - tp->dirty_tx > TX_RING_SIZE - 2) {
+                       /* Same setup recently queued, we need not add it. */
+               } else {
+                       unsigned int entry;
+                       int dummy = -1;
+                       /* Now add this frame to the Tx list. */
+                       entry = tp->cur_tx++ % TX_RING_SIZE;
+                       if (entry != 0) {
+                               /* Avoid a chip errata by prefixing a dummy entry. */
+                               tp->tx_buffers[entry].skb = NULL;
+                               tp->tx_buffers[entry].mapping = 0;
+                               tp->tx_ring[entry].length =
+                                       (entry == TX_RING_SIZE-1) ? cpu_to_le32(DESC_RING_WRAP) : 0;
+                               tp->tx_ring[entry].buffer1 = 0;
+                               /* Must set DescOwned later to avoid race with chip */
+                               dummy = entry;
+                               entry = tp->cur_tx++ % TX_RING_SIZE;
+                       }
+                       tp->tx_buffers[entry].skb = NULL;
+                       tp->tx_buffers[entry].mapping =
+                               pci_map_single(tp->pdev, tp->setup_frame,
+                                              sizeof(tp->setup_frame),
+                                              PCI_DMA_TODEVICE);
+                       /* Put the setup frame on the Tx list. */
+                       if (entry == TX_RING_SIZE-1)
+                               tx_flags |= DESC_RING_WRAP;             /* Wrap ring. */
+                       tp->tx_ring[entry].length = cpu_to_le32(tx_flags);
+                       tp->tx_ring[entry].buffer1 =
+                               cpu_to_le32(tp->tx_buffers[entry].mapping);
+                       tp->tx_ring[entry].status = cpu_to_le32(DescOwned);
+                       if (dummy >= 0)
+                               tp->tx_ring[dummy].status = cpu_to_le32(DescOwned);
+                       if (tp->cur_tx - tp->dirty_tx >= TX_RING_SIZE - 2)
+                               netif_stop_queue(dev);
+                       /* Trigger an immediate transmit demand. */
+                       iowrite32(0, ioaddr + CSR1);
+               }
+               spin_unlock_irqrestore(&tp->lock, flags);
+       }
+       iowrite32(csr6, ioaddr + CSR6);
+ }
+ #ifdef CONFIG_TULIP_MWI
+ static void __devinit tulip_mwi_config (struct pci_dev *pdev,
+                                       struct net_device *dev)
+ {
+       struct tulip_private *tp = netdev_priv(dev);
+       u8 cache;
+       u16 pci_command;
+       u32 csr0;
+       if (tulip_debug > 3)
+               netdev_dbg(dev, "tulip_mwi_config()\n");
+       tp->csr0 = csr0 = 0;
+       /* if we have any cache line size at all, we can do MRM and MWI */
+       csr0 |= MRM | MWI;
+       /* Enable MWI in the standard PCI command bit.
+        * Check for the case where MWI is desired but not available
+        */
+       pci_try_set_mwi(pdev);
+       /* read result from hardware (in case bit refused to enable) */
+       pci_read_config_word(pdev, PCI_COMMAND, &pci_command);
+       if ((csr0 & MWI) && (!(pci_command & PCI_COMMAND_INVALIDATE)))
+               csr0 &= ~MWI;
+       /* if cache line size hardwired to zero, no MWI */
+       pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache);
+       if ((csr0 & MWI) && (cache == 0)) {
+               csr0 &= ~MWI;
+               pci_clear_mwi(pdev);
+       }
+       /* assign per-cacheline-size cache alignment and
+        * burst length values
+        */
+       switch (cache) {
+       case 8:
+               csr0 |= MRL | (1 << CALShift) | (16 << BurstLenShift);
+               break;
+       case 16:
+               csr0 |= MRL | (2 << CALShift) | (16 << BurstLenShift);
+               break;
+       case 32:
+               csr0 |= MRL | (3 << CALShift) | (32 << BurstLenShift);
+               break;
+       default:
+               cache = 0;
+               break;
+       }
+       /* if we have a good cache line size, we by now have a good
+        * csr0, so save it and exit
+        */
+       if (cache)
+               goto out;
+       /* we don't have a good csr0 or cache line size, disable MWI */
+       if (csr0 & MWI) {
+               pci_clear_mwi(pdev);
+               csr0 &= ~MWI;
+       }
+       /* sane defaults for burst length and cache alignment
+        * originally from de4x5 driver
+        */
+       csr0 |= (8 << BurstLenShift) | (1 << CALShift);
+ out:
+       tp->csr0 = csr0;
+       if (tulip_debug > 2)
+               netdev_dbg(dev, "MWI config cacheline=%d, csr0=%08x\n",
+                          cache, csr0);
+ }
+ #endif
+ /*
+  *    Chips that have the MRM/reserved bit quirk and the burst quirk. That
+  *    is the DM910X and the on chip ULi devices
+  */
+ static int tulip_uli_dm_quirk(struct pci_dev *pdev)
+ {
+       if (pdev->vendor == 0x1282 && pdev->device == 0x9102)
+               return 1;
+       return 0;
+ }
+ static const struct net_device_ops tulip_netdev_ops = {
+       .ndo_open               = tulip_open,
+       .ndo_start_xmit         = tulip_start_xmit,
+       .ndo_tx_timeout         = tulip_tx_timeout,
+       .ndo_stop               = tulip_close,
+       .ndo_get_stats          = tulip_get_stats,
+       .ndo_do_ioctl           = private_ioctl,
+       .ndo_set_rx_mode        = set_rx_mode,
+       .ndo_change_mtu         = eth_change_mtu,
+       .ndo_set_mac_address    = eth_mac_addr,
+       .ndo_validate_addr      = eth_validate_addr,
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+       .ndo_poll_controller     = poll_tulip,
+ #endif
+ };
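+ /* Early 486-era host chipsets (Intel Saturn, SiS 496) that get the
+    conservative CSR0 settings forced in tulip_init_one() below. */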
+ DEFINE_PCI_DEVICE_TABLE(early_486_chipsets) = {
+       { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82424) },
+       { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_496) },
+       { },
+ };
+ static int __devinit tulip_init_one (struct pci_dev *pdev,
+                                    const struct pci_device_id *ent)
+ {
+       struct tulip_private *tp;
+       /* See note below on the multiport cards. */
+       static unsigned char last_phys_addr[6] = {0x00, 'L', 'i', 'n', 'u', 'x'};
+       static int last_irq;
+       static int multiport_cnt;       /* For four-port boards w/one EEPROM */
+       int i, irq;
+       unsigned short sum;
+       unsigned char *ee_data;
+       struct net_device *dev;
+       void __iomem *ioaddr;
+       static int board_idx = -1;
+       int chip_idx = ent->driver_data;
+       const char *chip_name = tulip_tbl[chip_idx].chip_name;
+       unsigned int eeprom_missing = 0;
+       unsigned int force_csr0 = 0;
+ #ifndef MODULE
+       if (tulip_debug > 0)
+               printk_once(KERN_INFO "%s", version);
+ #endif
+       board_idx++;
+       /*
+        *      LanMedia boards wire a tulip chip to a WAN interface and need a
+        *      very different driver (the lmc driver).
+        */
+       if (pdev->subsystem_vendor == PCI_VENDOR_ID_LMC) {
+               pr_err("skipping LMC card\n");
+               return -ENODEV;
+       } else if (pdev->subsystem_vendor == PCI_VENDOR_ID_SBE &&
+                  (pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_T3E3 ||
+                   pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P0 ||
+                   pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P1)) {
+               pr_err("skipping SBE T3E3 port\n");
+               return -ENODEV;
+       }
+       /*
+        *      DM910x chips should be handled by the dmfe driver, except
+        *      on-board chips on SPARC systems.  Also, early DM9100s need
+        *      software CRC which only the dmfe driver supports.
+        */
+ #ifdef CONFIG_TULIP_DM910X
+       if (chip_idx == DM910X) {
+               struct device_node *dp;
+               if (pdev->vendor == 0x1282 && pdev->device == 0x9100 &&
+                   pdev->revision < 0x30) {
+                       pr_info("skipping early DM9100 with Crc bug (use dmfe)\n");
+                       return -ENODEV;
+               }
+               dp = pci_device_to_OF_node(pdev);
+               if (!(dp && of_get_property(dp, "local-mac-address", NULL))) {
+                       pr_info("skipping DM910x expansion card (use dmfe)\n");
+                       return -ENODEV;
+               }
+       }
+ #endif
+       /*
+        *      Looks for early PCI chipsets where people report hangs
+        *      without the workarounds being on.
+        */
+       /* 1. Intel Saturn. Switch to 8 long words burst, 8 long word cache
+             aligned.  Aries might need this too. The Saturn errata are not
+             pretty reading but thankfully it's an old 486 chipset.
+          2. The dreaded SiS496 486 chipset. Same workaround as Intel
+             Saturn.
+       */
+       if (pci_dev_present(early_486_chipsets)) {
+               csr0 = MRL | MRM | (8 << BurstLenShift) | (1 << CALShift);
+               force_csr0 = 1;
+       }
+       /* bugfix: the ASIX must have a burst limit or horrible things happen. */
+       if (chip_idx == AX88140) {
+               if ((csr0 & 0x3f00) == 0)
+                       csr0 |= 0x2000;
+       }
+       /* PNIC doesn't have MWI/MRL/MRM... */
+       if (chip_idx == LC82C168)
+               csr0 &= ~0xfff10000; /* zero reserved bits 31:20, 16 */
+       /* DM9102A has troubles with MRM & clear reserved bits 24:22, 20, 16, 7:1 */
+       if (tulip_uli_dm_quirk(pdev)) {
+               csr0 &= ~0x01f100ff;
+ #if defined(CONFIG_SPARC)
+                 csr0 = (csr0 & ~0xff00) | 0xe000;
+ #endif
+       }
+       /*
+        *      And back to business
+        */
+       i = pci_enable_device(pdev);
+       if (i) {
+               pr_err("Cannot enable tulip board #%d, aborting\n", board_idx);
+               return i;
+       }
+       /* The chip will fail to enter a low-power state later unless
+        * first explicitly commanded into D0 */
+       if (pci_set_power_state(pdev, PCI_D0)) {
+               pr_notice("Failed to set power state to D0\n");
+       }
+       irq = pdev->irq;
+       /* alloc_etherdev ensures aligned and zeroed private structures */
+       dev = alloc_etherdev (sizeof (*tp));
+       if (!dev) {
+               pr_err("ether device alloc failed, aborting\n");
+               return -ENOMEM;
+       }
+       SET_NETDEV_DEV(dev, &pdev->dev);
+       if (pci_resource_len (pdev, 0) < tulip_tbl[chip_idx].io_size) {
+               pr_err("%s: I/O region (0x%llx@0x%llx) too small, aborting\n",
+                      pci_name(pdev),
+                      (unsigned long long)pci_resource_len (pdev, 0),
+                      (unsigned long long)pci_resource_start (pdev, 0));
+               goto err_out_free_netdev;
+       }
+       /* grab all resources from both PIO and MMIO regions, as we
+        * don't want anyone else messing around with our hardware */
+       if (pci_request_regions (pdev, DRV_NAME))
+               goto err_out_free_netdev;
+       ioaddr =  pci_iomap(pdev, TULIP_BAR, tulip_tbl[chip_idx].io_size);
+       if (!ioaddr)
+               goto err_out_free_res;
+       /*
+        * initialize private data structure 'tp'
+        * it is zeroed and aligned in alloc_etherdev
+        */
+       tp = netdev_priv(dev);
+       tp->dev = dev;
+       tp->rx_ring = pci_alloc_consistent(pdev,
+                                          sizeof(struct tulip_rx_desc) * RX_RING_SIZE +
+                                          sizeof(struct tulip_tx_desc) * TX_RING_SIZE,
+                                          &tp->rx_ring_dma);
+       if (!tp->rx_ring)
+               goto err_out_mtable;
+       tp->tx_ring = (struct tulip_tx_desc *)(tp->rx_ring + RX_RING_SIZE);
+       tp->tx_ring_dma = tp->rx_ring_dma + sizeof(struct tulip_rx_desc) * RX_RING_SIZE;
+       tp->chip_id = chip_idx;
+       tp->flags = tulip_tbl[chip_idx].flags;
+       tp->wolinfo.supported = 0;
+       tp->wolinfo.wolopts = 0;
+       /* COMET: Enable power management only for AN983B */
+       if (chip_idx == COMET ) {
+               u32 sig;
+               pci_read_config_dword (pdev, 0x80, &sig);
+               if (sig == 0x09811317) {
+                       tp->flags |= COMET_PM;
+                       tp->wolinfo.supported = WAKE_PHY | WAKE_MAGIC;
+                       pr_info("%s: Enabled WOL support for AN983B\n",
+                               __func__);
+               }
+       }
+       tp->pdev = pdev;
+       tp->base_addr = ioaddr;
+       tp->revision = pdev->revision;
+       tp->csr0 = csr0;
+       spin_lock_init(&tp->lock);
+       spin_lock_init(&tp->mii_lock);
+       init_timer(&tp->timer);
+       tp->timer.data = (unsigned long)dev;
+       tp->timer.function = tulip_tbl[tp->chip_id].media_timer;
+       INIT_WORK(&tp->media_work, tulip_tbl[tp->chip_id].media_task);
+       dev->base_addr = (unsigned long)ioaddr;
+ #ifdef CONFIG_TULIP_MWI
+       if (!force_csr0 && (tp->flags & HAS_PCI_MWI))
+               tulip_mwi_config (pdev, dev);
+ #endif
+       /* Stop the chip's Tx and Rx processes. */
+       tulip_stop_rxtx(tp);
+       pci_set_master(pdev);
+ #ifdef CONFIG_GSC
+       if (pdev->subsystem_vendor == PCI_VENDOR_ID_HP) {
+               switch (pdev->subsystem_device) {
+               default:
+                       break;
+               case 0x1061:
+               case 0x1062:
+               case 0x1063:
+               case 0x1098:
+               case 0x1099:
+               case 0x10EE:
+                       tp->flags |= HAS_SWAPPED_SEEPROM | NEEDS_FAKE_MEDIA_TABLE;
+                       chip_name = "GSC DS21140 Tulip";
+               }
+       }
+ #endif
+       /* Clear the missed-packet counter. */
+       ioread32(ioaddr + CSR8);
+       /* The station address ROM is read byte serially.  The register must
+          be polled, waiting for the value to be read bit serially from the
+          EEPROM.
+          */
+       ee_data = tp->eeprom;
+       memset(ee_data, 0, sizeof(tp->eeprom));
+       sum = 0;
+       if (chip_idx == LC82C168) {
+               for (i = 0; i < 3; i++) {
+                       int value, boguscnt = 100000;
+                       iowrite32(0x600 | i, ioaddr + 0x98);
+                       do {
+                               value = ioread32(ioaddr + CSR9);
+                       } while (value < 0  && --boguscnt > 0);
+                       put_unaligned_le16(value, ((__le16 *)dev->dev_addr) + i);
+                       sum += value & 0xffff;
+               }
+       } else if (chip_idx == COMET) {
+               /* No need to read the EEPROM. */
+               put_unaligned_le32(ioread32(ioaddr + 0xA4), dev->dev_addr);
+               put_unaligned_le16(ioread32(ioaddr + 0xA8), dev->dev_addr + 4);
+               for (i = 0; i < 6; i ++)
+                       sum += dev->dev_addr[i];
+       } else {
+               /* A serial EEPROM interface, we read now and sort it out later. */
+               int sa_offset = 0;
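+               /* Detect 6- vs 8-bit EEPROM addressing, then read the whole
+                  EEPROM contents into ee_data. */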
+               int ee_addr_size = tulip_read_eeprom(dev, 0xff, 8) & 0x40000 ? 8 : 6;
+               int ee_max_addr = ((1 << ee_addr_size) - 1) * sizeof(u16);
+               if (ee_max_addr > sizeof(tp->eeprom))
+                       ee_max_addr = sizeof(tp->eeprom);
+               for (i = 0; i < ee_max_addr ; i += sizeof(u16)) {
+                       u16 data = tulip_read_eeprom(dev, i/2, ee_addr_size);
+                       ee_data[i] = data & 0xff;
+                       ee_data[i + 1] = data >> 8;
+               }
+               /* DEC now has a specification (see Notes) but early board makers
+                  just put the address in the first EEPROM locations. */
+               /* This does  memcmp(ee_data, ee_data+16, 8) */
+               for (i = 0; i < 8; i ++)
+                       if (ee_data[i] != ee_data[16+i])
+                               sa_offset = 20;
+               if (chip_idx == CONEXANT) {
+                       /* Check that the tuple type and length is correct. */
+                       if (ee_data[0x198] == 0x04  &&  ee_data[0x199] == 6)
+                               sa_offset = 0x19A;
+               } else if (ee_data[0] == 0xff  &&  ee_data[1] == 0xff &&
+                                  ee_data[2] == 0) {
+                       sa_offset = 2;          /* Grrr, damn Matrox boards. */
+                       multiport_cnt = 4;
+               }
+ #ifdef CONFIG_MIPS_COBALT
+               if ((pdev->bus->number == 0) &&
+                   ((PCI_SLOT(pdev->devfn) == 7) ||
+                    (PCI_SLOT(pdev->devfn) == 12))) {
+                       /* Cobalt MAC address in first EEPROM locations. */
+                       sa_offset = 0;
+                       /* Ensure our media table fixup gets applied */
+                       memcpy(ee_data + 16, ee_data, 8);
+               }
+ #endif
+ #ifdef CONFIG_GSC
+               /* Check to see if we have a broken srom */
+               if (ee_data[0] == 0x61 && ee_data[1] == 0x10) {
+                       /* pci_vendor_id and subsystem_id are swapped */
+                       ee_data[0] = ee_data[2];
+                       ee_data[1] = ee_data[3];
+                       ee_data[2] = 0x61;
+                       ee_data[3] = 0x10;
+                       /* HSC-PCI boards need to be byte-swapped and shifted
+                        * up 1 word.  This shift needs to happen at the end
+                        * of the MAC first because of the 2 byte overlap.
+                        */
+                       for (i = 4; i >= 0; i -= 2) {
+                               ee_data[17 + i + 3] = ee_data[17 + i];
+                               ee_data[16 + i + 5] = ee_data[16 + i];
+                       }
+               }
+ #endif
+               for (i = 0; i < 6; i ++) {
+                       dev->dev_addr[i] = ee_data[i + sa_offset];
+                       sum += ee_data[i + sa_offset];
+               }
+       }
+       /* Lite-On boards have the address byte-swapped. */
+       if ((dev->dev_addr[0] == 0xA0 ||
+            dev->dev_addr[0] == 0xC0 ||
+            dev->dev_addr[0] == 0x02) &&
+           dev->dev_addr[1] == 0x00)
+               for (i = 0; i < 6; i+=2) {
+                       char tmp = dev->dev_addr[i];
+                       dev->dev_addr[i] = dev->dev_addr[i+1];
+                       dev->dev_addr[i+1] = tmp;
+               }
+       /* On the Zynx 315 Etherarray and other multiport boards only the
+          first Tulip has an EEPROM.
+          On Sparc systems the mac address is held in the OBP property
+          "local-mac-address".
+          The addresses of the subsequent ports are derived from the first.
+          Many PCI BIOSes also incorrectly report the IRQ line, so we correct
+          that here as well. */
+       if (sum == 0  || sum == 6*0xff) {
+ #if defined(CONFIG_SPARC)
+               struct device_node *dp = pci_device_to_OF_node(pdev);
+               const unsigned char *addr;
+               int len;
+ #endif
+               eeprom_missing = 1;
+               for (i = 0; i < 5; i++)
+                       dev->dev_addr[i] = last_phys_addr[i];
+               dev->dev_addr[i] = last_phys_addr[i] + 1;
+ #if defined(CONFIG_SPARC)
+               addr = of_get_property(dp, "local-mac-address", &len);
+               if (addr && len == 6)
+                       memcpy(dev->dev_addr, addr, 6);
+ #endif
+ #if defined(__i386__) || defined(__x86_64__)  /* Patch up x86 BIOS bug. */
+               if (last_irq)
+                       irq = last_irq;
+ #endif
+       }
+       for (i = 0; i < 6; i++)
+               last_phys_addr[i] = dev->dev_addr[i];
+       last_irq = irq;
+       dev->irq = irq;
+       /* The lower four bits are the media type. */
+       if (board_idx >= 0  &&  board_idx < MAX_UNITS) {
+               if (options[board_idx] & MEDIA_MASK)
+                       tp->default_port = options[board_idx] & MEDIA_MASK;
+               if ((options[board_idx] & FullDuplex) || full_duplex[board_idx] > 0)
+                       tp->full_duplex = 1;
+               if (mtu[board_idx] > 0)
+                       dev->mtu = mtu[board_idx];
+       }
+       if (dev->mem_start & MEDIA_MASK)
+               tp->default_port = dev->mem_start & MEDIA_MASK;
+       if (tp->default_port) {
+               pr_info(DRV_NAME "%d: Transceiver selection forced to %s\n",
+                       board_idx, medianame[tp->default_port & MEDIA_MASK]);
+               tp->medialock = 1;
+               if (tulip_media_cap[tp->default_port] & MediaAlwaysFD)
+                       tp->full_duplex = 1;
+       }
+       if (tp->full_duplex)
+               tp->full_duplex_lock = 1;
+       if (tulip_media_cap[tp->default_port] & MediaIsMII) {
+               static const u16 media2advert[] = {
+                       0x20, 0x40, 0x03e0, 0x60, 0x80, 0x100, 0x200
+               };
+               tp->mii_advertise = media2advert[tp->default_port - 9];
+               tp->mii_advertise |= (tp->flags & HAS_8023X); /* Matching bits! */
+       }
+       if (tp->flags & HAS_MEDIA_TABLE) {
+               sprintf(dev->name, DRV_NAME "%d", board_idx);   /* hack */
+               tulip_parse_eeprom(dev);
+               strcpy(dev->name, "eth%d");                     /* un-hack */
+       }
+       if ((tp->flags & ALWAYS_CHECK_MII) ||
+               (tp->mtable  &&  tp->mtable->has_mii) ||
+               ( ! tp->mtable  &&  (tp->flags & HAS_MII))) {
+               if (tp->mtable  &&  tp->mtable->has_mii) {
+                       for (i = 0; i < tp->mtable->leafcount; i++)
+                               if (tp->mtable->mleaf[i].media == 11) {
+                                       tp->cur_index = i;
+                                       tp->saved_if_port = dev->if_port;
+                                       tulip_select_media(dev, 2);
+                                       dev->if_port = tp->saved_if_port;
+                                       break;
+                               }
+               }
+               /* Find the connected MII xcvrs.
+                  Doing this in open() would allow detecting external xcvrs
+                  later, but takes much time. */
+               tulip_find_mii (dev, board_idx);
+       }
+       /* The Tulip-specific entries in the device structure. */
+       dev->netdev_ops = &tulip_netdev_ops;
+       dev->watchdog_timeo = TX_TIMEOUT;
+ #ifdef CONFIG_TULIP_NAPI
+       netif_napi_add(dev, &tp->napi, tulip_poll, 16);
+ #endif
+       SET_ETHTOOL_OPS(dev, &ops);
+       if (register_netdev(dev))
+               goto err_out_free_ring;
+       pci_set_drvdata(pdev, dev);
+       dev_info(&dev->dev,
+ #ifdef CONFIG_TULIP_MMIO
+                "%s rev %d at MMIO %#llx,%s %pM, IRQ %d\n",
+ #else
+                "%s rev %d at Port %#llx,%s %pM, IRQ %d\n",
+ #endif
+                chip_name, pdev->revision,
+                (unsigned long long)pci_resource_start(pdev, TULIP_BAR),
+                eeprom_missing ? " EEPROM not present," : "",
+                dev->dev_addr, irq);
+       if (tp->chip_id == PNIC2)
+               tp->link_change = pnic2_lnk_change;
+       else if (tp->flags & HAS_NWAY)
+               tp->link_change = t21142_lnk_change;
+       else if (tp->flags & HAS_PNICNWAY)
+               tp->link_change = pnic_lnk_change;
+       /* Reset the xcvr interface and turn on heartbeat. */
+       switch (chip_idx) {
+       case DC21140:
+       case DM910X:
+       default:
+               if (tp->mtable)
+                       iowrite32(tp->mtable->csr12dir | 0x100, ioaddr + CSR12);
+               break;
+       case DC21142:
+               if (tp->mii_cnt  ||  tulip_media_cap[dev->if_port] & MediaIsMII) {
+                       iowrite32(csr6_mask_defstate, ioaddr + CSR6);
+                       iowrite32(0x0000, ioaddr + CSR13);
+                       iowrite32(0x0000, ioaddr + CSR14);
+                       iowrite32(csr6_mask_hdcap, ioaddr + CSR6);
+               } else
+                       t21142_start_nway(dev);
+               break;
+       case PNIC2:
+               /* just do a reset for sanity's sake */
+               iowrite32(0x0000, ioaddr + CSR13);
+               iowrite32(0x0000, ioaddr + CSR14);
+               break;
+       case LC82C168:
+               if ( ! tp->mii_cnt) {
+                       tp->nway = 1;
+                       tp->nwayset = 0;
+                       iowrite32(csr6_ttm | csr6_ca, ioaddr + CSR6);
+                       iowrite32(0x30, ioaddr + CSR12);
+                       iowrite32(0x0001F078, ioaddr + CSR6);
+                       iowrite32(0x0201F078, ioaddr + CSR6); /* Turn on autonegotiation. */
+               }
+               break;
+       case MX98713:
+       case COMPEX9881:
+               iowrite32(0x00000000, ioaddr + CSR6);
+               iowrite32(0x000711C0, ioaddr + CSR14); /* Turn on NWay. */
+               iowrite32(0x00000001, ioaddr + CSR13);
+               break;
+       case MX98715:
+       case MX98725:
+               iowrite32(0x01a80000, ioaddr + CSR6);
+               iowrite32(0xFFFFFFFF, ioaddr + CSR14);
+               iowrite32(0x00001000, ioaddr + CSR12);
+               break;
+       case COMET:
+               /* No initialization necessary. */
+               break;
+       }
+       /* put the chip in snooze mode until opened */
+       tulip_set_power_state (tp, 0, 1);
+       return 0;
+ err_out_free_ring:
+       pci_free_consistent (pdev,
+                            sizeof (struct tulip_rx_desc) * RX_RING_SIZE +
+                            sizeof (struct tulip_tx_desc) * TX_RING_SIZE,
+                            tp->rx_ring, tp->rx_ring_dma);
+ err_out_mtable:
+       kfree (tp->mtable);
+       pci_iounmap(pdev, ioaddr);
+ err_out_free_res:
+       pci_release_regions (pdev);
+ err_out_free_netdev:
+       free_netdev (dev);
+       return -ENODEV;
+ }
+ /* set the registers according to the given wolopts */
+ static void tulip_set_wolopts (struct pci_dev *pdev, u32 wolopts)
+ {
+       struct net_device *dev = pci_get_drvdata(pdev);
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       if (tp->flags & COMET_PM) {
+               unsigned int tmp;
+
+               tmp = ioread32(ioaddr + CSR18);
+               tmp &= ~(comet_csr18_pmes_sticky | comet_csr18_apm_mode | comet_csr18_d3a);
+               tmp |= comet_csr18_pm_mode;
+               iowrite32(tmp, ioaddr + CSR18);
+
+               /* Set the Wake-up Control/Status Register to the given WOL options */
+               tmp = ioread32(ioaddr + CSR13);
+               tmp &= ~(comet_csr13_linkoffe | comet_csr13_linkone | comet_csr13_wfre | comet_csr13_lsce | comet_csr13_mpre);
+               if (wolopts & WAKE_MAGIC)
+                       tmp |= comet_csr13_mpre;
+               if (wolopts & WAKE_PHY)
+                       tmp |= comet_csr13_linkoffe | comet_csr13_linkone | comet_csr13_lsce;
+               /* Clear the event flags */
+               tmp |= comet_csr13_wfr | comet_csr13_mpr | comet_csr13_lsc;
+               iowrite32(tmp, ioaddr + CSR13);
+       }
+ }
+ #ifdef CONFIG_PM
+ static int tulip_suspend (struct pci_dev *pdev, pm_message_t state)
+ {
+       pci_power_t pstate;
+       struct net_device *dev = pci_get_drvdata(pdev);
+       struct tulip_private *tp = netdev_priv(dev);
+       if (!dev)
+               return -EINVAL;
+       if (!netif_running(dev))
+               goto save_state;
+       tulip_down(dev);
+       netif_device_detach(dev);
+       free_irq(dev->irq, dev);
+ save_state:
+       pci_save_state(pdev);
+       pci_disable_device(pdev);
+       pstate = pci_choose_state(pdev, state);
+       if (state.event == PM_EVENT_SUSPEND && pstate != PCI_D0) {
+               int rc;
+               tulip_set_wolopts(pdev, tp->wolinfo.wolopts);
+               rc = pci_enable_wake(pdev, pstate, tp->wolinfo.wolopts);
+               if (rc)
+                       pr_err("pci_enable_wake failed (%d)\n", rc);
+       }
+       pci_set_power_state(pdev, pstate);
+       return 0;
+ }
+ static int tulip_resume(struct pci_dev *pdev)
+ {
+       struct net_device *dev = pci_get_drvdata(pdev);
+       struct tulip_private *tp = netdev_priv(dev);
+       void __iomem *ioaddr = tp->base_addr;
+       int retval;
+       unsigned int tmp;
+       if (!dev)
+               return -EINVAL;
+       pci_set_power_state(pdev, PCI_D0);
+       pci_restore_state(pdev);
+       if (!netif_running(dev))
+               return 0;
+       if ((retval = pci_enable_device(pdev))) {
+               pr_err("pci_enable_device failed in resume\n");
+               return retval;
+       }
+       if ((retval = request_irq(dev->irq, tulip_interrupt, IRQF_SHARED, dev->name, dev))) {
+               pr_err("request_irq failed in resume\n");
+               return retval;
+       }
+       if (tp->flags & COMET_PM) {
+               pci_enable_wake(pdev, PCI_D3hot, 0);
+               pci_enable_wake(pdev, PCI_D3cold, 0);
+               /* Clear the PMES flag */
+               tmp = ioread32(ioaddr + CSR20);
+               tmp |= comet_csr20_pmes;
+               iowrite32(tmp, ioaddr + CSR20);
+               /* Disable all wake-up events */
+               tulip_set_wolopts(pdev, 0);
+       }
+       netif_device_attach(dev);
+       if (netif_running(dev))
+               tulip_up(dev);
+       return 0;
+ }
+ #endif /* CONFIG_PM */
+ static void __devexit tulip_remove_one (struct pci_dev *pdev)
+ {
+       struct net_device *dev = pci_get_drvdata (pdev);
+       struct tulip_private *tp;
+       if (!dev)
+               return;
+       tp = netdev_priv(dev);
++
++      /* shoot NIC in the head before deallocating descriptors */
++      pci_disable_device(tp->pdev);
++
+       unregister_netdev(dev);
+       pci_free_consistent (pdev,
+                            sizeof (struct tulip_rx_desc) * RX_RING_SIZE +
+                            sizeof (struct tulip_tx_desc) * TX_RING_SIZE,
+                            tp->rx_ring, tp->rx_ring_dma);
+       kfree (tp->mtable);
+       pci_iounmap(pdev, tp->base_addr);
+       free_netdev (dev);
+       pci_release_regions (pdev);
+       pci_set_drvdata (pdev, NULL);
+       /* pci_power_off (pdev, -1); */
+ }
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ /*
+  * Polling 'interrupt' - used by things like netconsole to send skbs
+  * without having to re-enable interrupts. It's not called while
+  * the interrupt routine is executing.
+  */
+ static void poll_tulip (struct net_device *dev)
+ {
+       /* disable_irq here is not very nice, but with the lockless
+          interrupt handler we have no other choice. */
+       disable_irq(dev->irq);
+       tulip_interrupt (dev->irq, dev);
+       enable_irq(dev->irq);
+ }
+ #endif
+ static struct pci_driver tulip_driver = {
+       .name           = DRV_NAME,
+       .id_table       = tulip_pci_tbl,
+       .probe          = tulip_init_one,
+       .remove         = __devexit_p(tulip_remove_one),
+ #ifdef CONFIG_PM
+       .suspend        = tulip_suspend,
+       .resume         = tulip_resume,
+ #endif /* CONFIG_PM */
+ };
+ static int __init tulip_init (void)
+ {
+ #ifdef MODULE
+       pr_info("%s", version);
+ #endif
+       /* copy module parms into globals */
+       tulip_rx_copybreak = rx_copybreak;
+       tulip_max_interrupt_work = max_interrupt_work;
+       /* probe for and init boards */
+       return pci_register_driver(&tulip_driver);
+ }
+ static void __exit tulip_cleanup (void)
+ {
+       pci_unregister_driver (&tulip_driver);
+ }
+ module_init(tulip_init);
+ module_exit(tulip_cleanup);
index 0000000,37b70f7..5fdd5bb
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,3556 +1,3568 @@@
+ /*
+  *  linux/drivers/net/ehea/ehea_main.c
+  *
+  *  eHEA ethernet device driver for IBM eServer System p
+  *
+  *  (C) Copyright IBM Corp. 2006
+  *
+  *  Authors:
+  *     Christoph Raisch <raisch@de.ibm.com>
+  *     Jan-Bernd Themann <themann@de.ibm.com>
+  *     Thomas Klein <tklein@de.ibm.com>
+  *
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2, or (at your option)
+  * any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.        See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+  */
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ #include <linux/in.h>
+ #include <linux/ip.h>
+ #include <linux/tcp.h>
+ #include <linux/udp.h>
+ #include <linux/if.h>
+ #include <linux/list.h>
+ #include <linux/slab.h>
+ #include <linux/if_ether.h>
+ #include <linux/notifier.h>
+ #include <linux/reboot.h>
+ #include <linux/memory.h>
+ #include <asm/kexec.h>
+ #include <linux/mutex.h>
+ #include <linux/prefetch.h>
+ #include <net/ip.h>
+ #include "ehea.h"
+ #include "ehea_qmr.h"
+ #include "ehea_phyp.h"
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+ MODULE_DESCRIPTION("IBM eServer HEA Driver");
+ MODULE_VERSION(DRV_VERSION);
+ static int msg_level = -1;
+ static int rq1_entries = EHEA_DEF_ENTRIES_RQ1;
+ static int rq2_entries = EHEA_DEF_ENTRIES_RQ2;
+ static int rq3_entries = EHEA_DEF_ENTRIES_RQ3;
+ static int sq_entries = EHEA_DEF_ENTRIES_SQ;
+ static int use_mcs = 1;
+ static int prop_carrier_state;
+ module_param(msg_level, int, 0);
+ module_param(rq1_entries, int, 0);
+ module_param(rq2_entries, int, 0);
+ module_param(rq3_entries, int, 0);
+ module_param(sq_entries, int, 0);
+ module_param(prop_carrier_state, int, 0);
+ module_param(use_mcs, int, 0);
+ MODULE_PARM_DESC(msg_level, "msg_level");
+ MODULE_PARM_DESC(prop_carrier_state, "Propagate carrier state of physical "
+                "port to stack. 1:yes, 0:no. Default = 0");
+ MODULE_PARM_DESC(rq3_entries, "Number of entries for Receive Queue 3 "
+                "[2^x - 1], x = [6..14]. Default = "
+                __MODULE_STRING(EHEA_DEF_ENTRIES_RQ3));
+ MODULE_PARM_DESC(rq2_entries, "Number of entries for Receive Queue 2 "
+                "[2^x - 1], x = [6..14]. Default = "
+                __MODULE_STRING(EHEA_DEF_ENTRIES_RQ2));
+ MODULE_PARM_DESC(rq1_entries, "Number of entries for Receive Queue 1 "
+                "[2^x - 1], x = [6..14]. Default = "
+                __MODULE_STRING(EHEA_DEF_ENTRIES_RQ1));
+ MODULE_PARM_DESC(sq_entries, "Number of entries for the Send Queue "
+                "[2^x - 1], x = [6..14]. Default = "
+                __MODULE_STRING(EHEA_DEF_ENTRIES_SQ));
+ MODULE_PARM_DESC(use_mcs, "Multiple receive queues, 1: enable, 0: disable. "
+                "Default = 1");
+ static int port_name_cnt;
+ static LIST_HEAD(adapter_list);
+ static unsigned long ehea_driver_flags;
+ static DEFINE_MUTEX(dlpar_mem_lock);
+ struct ehea_fw_handle_array ehea_fw_handles;
+ struct ehea_bcmc_reg_array ehea_bcmc_regs;
+ static int __devinit ehea_probe_adapter(struct platform_device *dev,
+                                       const struct of_device_id *id);
+ static int __devexit ehea_remove(struct platform_device *dev);
++static struct of_device_id ehea_module_device_table[] = {
++      {
++              .name = "lhea",
++              .compatible = "IBM,lhea",
++      },
++      {
++              .type = "network",
++              .compatible = "IBM,lhea-ethernet",
++      },
++      {},
++};
++MODULE_DEVICE_TABLE(of, ehea_module_device_table);
++
+ static struct of_device_id ehea_device_table[] = {
+       {
+               .name = "lhea",
+               .compatible = "IBM,lhea",
+       },
+       {},
+ };
 -MODULE_DEVICE_TABLE(of, ehea_device_table);
+ static struct of_platform_driver ehea_driver = {
+       .driver = {
+               .name = "ehea",
+               .owner = THIS_MODULE,
+               .of_match_table = ehea_device_table,
+       },
+       .probe = ehea_probe_adapter,
+       .remove = ehea_remove,
+ };
+ void ehea_dump(void *adr, int len, char *msg)
+ {
+       int x;
+       unsigned char *deb = adr;
+       for (x = 0; x < len; x += 16) {
+               pr_info("%s adr=%p ofs=%04x %016llx %016llx\n",
+                       msg, deb, x, *((u64 *)&deb[0]), *((u64 *)&deb[8]));
+               deb += 16;
+       }
+ }
+ void ehea_schedule_port_reset(struct ehea_port *port)
+ {
+       if (!test_bit(__EHEA_DISABLE_PORT_RESET, &port->flags))
+               schedule_work(&port->reset_task);
+ }
+ static void ehea_update_firmware_handles(void)
+ {
+       struct ehea_fw_handle_entry *arr = NULL;
+       struct ehea_adapter *adapter;
+       int num_adapters = 0;
+       int num_ports = 0;
+       int num_portres = 0;
+       int i = 0;
+       int num_fw_handles, k, l;
+       /* Determine number of handles */
+       mutex_lock(&ehea_fw_handles.lock);
+       list_for_each_entry(adapter, &adapter_list, list) {
+               num_adapters++;
+               for (k = 0; k < EHEA_MAX_PORTS; k++) {
+                       struct ehea_port *port = adapter->port[k];
+                       if (!port || (port->state != EHEA_PORT_UP))
+                               continue;
+                       num_ports++;
+                       num_portres += port->num_def_qps;
+               }
+       }
+       num_fw_handles = num_adapters * EHEA_NUM_ADAPTER_FW_HANDLES +
+                        num_ports * EHEA_NUM_PORT_FW_HANDLES +
+                        num_portres * EHEA_NUM_PORTRES_FW_HANDLES;
+       if (num_fw_handles) {
+               arr = kcalloc(num_fw_handles, sizeof(*arr), GFP_KERNEL);
+               if (!arr)
+                       goto out;  /* Keep the existing array */
+       } else
+               goto out_update;
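+       /* Second pass: record the firmware handles of every adapter, port and
+        * port resource into the new array. */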
+       list_for_each_entry(adapter, &adapter_list, list) {
+               if (num_adapters == 0)
+                       break;
+               for (k = 0; k < EHEA_MAX_PORTS; k++) {
+                       struct ehea_port *port = adapter->port[k];
+                       if (!port || (port->state != EHEA_PORT_UP) ||
+                           (num_ports == 0))
+                               continue;
+                       for (l = 0; l < port->num_def_qps; l++) {
+                               struct ehea_port_res *pr = &port->port_res[l];
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->qp->fw_handle;
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->send_cq->fw_handle;
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->recv_cq->fw_handle;
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->eq->fw_handle;
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->send_mr.handle;
+                               arr[i].adh = adapter->handle;
+                               arr[i++].fwh = pr->recv_mr.handle;
+                       }
+                       arr[i].adh = adapter->handle;
+                       arr[i++].fwh = port->qp_eq->fw_handle;
+                       num_ports--;
+               }
+               arr[i].adh = adapter->handle;
+               arr[i++].fwh = adapter->neq->fw_handle;
+               if (adapter->mr.handle) {
+                       arr[i].adh = adapter->handle;
+                       arr[i++].fwh = adapter->mr.handle;
+               }
+               num_adapters--;
+       }
+ out_update:
+       kfree(ehea_fw_handles.arr);
+       ehea_fw_handles.arr = arr;
+       ehea_fw_handles.num_entries = i;
+ out:
+       mutex_unlock(&ehea_fw_handles.lock);
+ }
+ static void ehea_update_bcmc_registrations(void)
+ {
+       unsigned long flags;
+       struct ehea_bcmc_reg_entry *arr = NULL;
+       struct ehea_adapter *adapter;
+       struct ehea_mc_list *mc_entry;
+       int num_registrations = 0;
+       int i = 0;
+       int k;
+       spin_lock_irqsave(&ehea_bcmc_regs.lock, flags);
+       /* Determine number of registrations */
+       list_for_each_entry(adapter, &adapter_list, list)
+               for (k = 0; k < EHEA_MAX_PORTS; k++) {
+                       struct ehea_port *port = adapter->port[k];
+                       if (!port || (port->state != EHEA_PORT_UP))
+                               continue;
+                       num_registrations += 2; /* Broadcast registrations */
+                       list_for_each_entry(mc_entry, &port->mc_list->list,list)
+                               num_registrations += 2;
+               }
+       if (num_registrations) {
+               arr = kcalloc(num_registrations, sizeof(*arr), GFP_ATOMIC);
+               if (!arr)
+                       goto out;  /* Keep the existing array */
+       } else
+               goto out_update;
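+       /* Second pass: record broadcast and multicast registrations for every
+        * active port. */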
+       list_for_each_entry(adapter, &adapter_list, list) {
+               for (k = 0; k < EHEA_MAX_PORTS; k++) {
+                       struct ehea_port *port = adapter->port[k];
+                       if (!port || (port->state != EHEA_PORT_UP))
+                               continue;
+                       if (num_registrations == 0)
+                               goto out_update;
+                       arr[i].adh = adapter->handle;
+                       arr[i].port_id = port->logical_port_id;
+                       arr[i].reg_type = EHEA_BCMC_BROADCAST |
+                                         EHEA_BCMC_UNTAGGED;
+                       arr[i++].macaddr = port->mac_addr;
+                       arr[i].adh = adapter->handle;
+                       arr[i].port_id = port->logical_port_id;
+                       arr[i].reg_type = EHEA_BCMC_BROADCAST |
+                                         EHEA_BCMC_VLANID_ALL;
+                       arr[i++].macaddr = port->mac_addr;
+                       num_registrations -= 2;
+                       list_for_each_entry(mc_entry,
+                                           &port->mc_list->list, list) {
+                               if (num_registrations == 0)
+                                       goto out_update;
+                               arr[i].adh = adapter->handle;
+                               arr[i].port_id = port->logical_port_id;
+                               arr[i].reg_type = EHEA_BCMC_SCOPE_ALL |
+                                                 EHEA_BCMC_MULTICAST |
+                                                 EHEA_BCMC_UNTAGGED;
+                               arr[i++].macaddr = mc_entry->macaddr;
+                               arr[i].adh = adapter->handle;
+                               arr[i].port_id = port->logical_port_id;
+                               arr[i].reg_type = EHEA_BCMC_SCOPE_ALL |
+                                                 EHEA_BCMC_MULTICAST |
+                                                 EHEA_BCMC_VLANID_ALL;
+                               arr[i++].macaddr = mc_entry->macaddr;
+                               num_registrations -= 2;
+                       }
+               }
+       }
+ out_update:
+       kfree(ehea_bcmc_regs.arr);
+       ehea_bcmc_regs.arr = arr;
+       ehea_bcmc_regs.num_entries = i;
+ out:
+       spin_unlock_irqrestore(&ehea_bcmc_regs.lock, flags);
+ }
+ static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
+                                       struct rtnl_link_stats64 *stats)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       u64 rx_packets = 0, tx_packets = 0, rx_bytes = 0, tx_bytes = 0;
+       int i;
+       for (i = 0; i < port->num_def_qps; i++) {
+               rx_packets += port->port_res[i].rx_packets;
+               rx_bytes   += port->port_res[i].rx_bytes;
+       }
+       for (i = 0; i < port->num_def_qps; i++) {
+               tx_packets += port->port_res[i].tx_packets;
+               tx_bytes   += port->port_res[i].tx_bytes;
+       }
+       stats->tx_packets = tx_packets;
+       stats->rx_bytes = rx_bytes;
+       stats->tx_bytes = tx_bytes;
+       stats->rx_packets = rx_packets;
+       return &port->stats;
+ }
+ static void ehea_update_stats(struct work_struct *work)
+ {
+       struct ehea_port *port =
+               container_of(work, struct ehea_port, stats_work.work);
+       struct net_device *dev = port->netdev;
+       struct rtnl_link_stats64 *stats = &port->stats;
+       struct hcp_ehea_port_cb2 *cb2;
+       u64 hret;
+       cb2 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb2) {
+               netdev_err(dev, "No mem for cb2. Some interface statistics were not updated\n");
+               goto resched;
+       }
+       hret = ehea_h_query_ehea_port(port->adapter->handle,
+                                     port->logical_port_id,
+                                     H_PORT_CB2, H_PORT_CB2_ALL, cb2);
+       if (hret != H_SUCCESS) {
+               netdev_err(dev, "query_ehea_port failed\n");
+               goto out_herr;
+       }
+       if (netif_msg_hw(port))
+               ehea_dump(cb2, sizeof(*cb2), "net_device_stats");
+       stats->multicast = cb2->rxmcp;
+       stats->rx_errors = cb2->rxuerr;
+ out_herr:
+       free_page((unsigned long)cb2);
+ resched:
+       schedule_delayed_work(&port->stats_work, msecs_to_jiffies(1000));
+ }
+ static void ehea_refill_rq1(struct ehea_port_res *pr, int index, int nr_of_wqes)
+ {
+       struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+       struct net_device *dev = pr->port->netdev;
+       int max_index_mask = pr->rq1_skba.len - 1;
+       int fill_wqes = pr->rq1_skba.os_skbs + nr_of_wqes;
+       int adder = 0;
+       int i;
+       pr->rq1_skba.os_skbs = 0;
+       if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+               if (nr_of_wqes > 0)
+                       pr->rq1_skba.index = index;
+               pr->rq1_skba.os_skbs = fill_wqes;
+               return;
+       }
+       for (i = 0; i < fill_wqes; i++) {
+               if (!skb_arr_rq1[index]) {
+                       skb_arr_rq1[index] = netdev_alloc_skb(dev,
+                                                             EHEA_L_PKT_SIZE);
+                       if (!skb_arr_rq1[index]) {
+                               netdev_info(dev, "Unable to allocate enough skb in the array\n");
+                               pr->rq1_skba.os_skbs = fill_wqes - i;
+                               break;
+                       }
+               }
+               index--;
+               index &= max_index_mask;
+               adder++;
+       }
+       if (adder == 0)
+               return;
+       /* Ring doorbell */
+       ehea_update_rq1a(pr->qp, adder);
+ }
+ static void ehea_init_fill_rq1(struct ehea_port_res *pr, int nr_rq1a)
+ {
+       struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+       struct net_device *dev = pr->port->netdev;
+       int i;
+       if (nr_rq1a > pr->rq1_skba.len) {
+               netdev_err(dev, "NR_RQ1A bigger than skb array len\n");
+               return;
+       }
+       for (i = 0; i < nr_rq1a; i++) {
+               skb_arr_rq1[i] = netdev_alloc_skb(dev, EHEA_L_PKT_SIZE);
+               if (!skb_arr_rq1[i]) {
+                       netdev_info(dev, "Not enough memory to allocate skb array\n");
+                       break;
+               }
+       }
+       /* Ring doorbell */
+       ehea_update_rq1a(pr->qp, i - 1);
+ }
+ static int ehea_refill_rq_def(struct ehea_port_res *pr,
+                             struct ehea_q_skb_arr *q_skba, int rq_nr,
+                             int num_wqes, int wqe_type, int packet_size)
+ {
+       struct net_device *dev = pr->port->netdev;
+       struct ehea_qp *qp = pr->qp;
+       struct sk_buff **skb_arr = q_skba->arr;
+       struct ehea_rwqe *rwqe;
+       int i, index, max_index_mask, fill_wqes;
+       int adder = 0;
+       int ret = 0;
+       fill_wqes = q_skba->os_skbs + num_wqes;
+       q_skba->os_skbs = 0;
+       if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+               q_skba->os_skbs = fill_wqes;
+               return ret;
+       }
+       index = q_skba->index;
+       max_index_mask = q_skba->len - 1;
+       for (i = 0; i < fill_wqes; i++) {
+               u64 tmp_addr;
+               struct sk_buff *skb;
+               skb = netdev_alloc_skb_ip_align(dev, packet_size);
+               if (!skb) {
+                       q_skba->os_skbs = fill_wqes - i;
+                       if (q_skba->os_skbs == q_skba->len - 2) {
+                               netdev_info(pr->port->netdev,
+                                           "rq%i ran dry - no mem for skb\n",
+                                           rq_nr);
+                               ret = -ENOMEM;
+                       }
+                       break;
+               }
+               skb_arr[index] = skb;
+               tmp_addr = ehea_map_vaddr(skb->data);
+               if (tmp_addr == -1) {
+                       dev_kfree_skb(skb);
+                       q_skba->os_skbs = fill_wqes - i;
+                       ret = 0;
+                       break;
+               }
+               rwqe = ehea_get_next_rwqe(qp, rq_nr);
+               rwqe->wr_id = EHEA_BMASK_SET(EHEA_WR_ID_TYPE, wqe_type)
+                           | EHEA_BMASK_SET(EHEA_WR_ID_INDEX, index);
+               rwqe->sg_list[0].l_key = pr->recv_mr.lkey;
+               rwqe->sg_list[0].vaddr = tmp_addr;
+               rwqe->sg_list[0].len = packet_size;
+               rwqe->data_segments = 1;
+               index++;
+               index &= max_index_mask;
+               adder++;
+       }
+       q_skba->index = index;
+       if (adder == 0)
+               goto out;
+       /* Ring doorbell */
+       iosync();
+       if (rq_nr == 2)
+               ehea_update_rq2a(pr->qp, adder);
+       else
+               ehea_update_rq3a(pr->qp, adder);
+ out:
+       return ret;
+ }
+ static int ehea_refill_rq2(struct ehea_port_res *pr, int nr_of_wqes)
+ {
+       return ehea_refill_rq_def(pr, &pr->rq2_skba, 2,
+                                 nr_of_wqes, EHEA_RWQE2_TYPE,
+                                 EHEA_RQ2_PKT_SIZE);
+ }
+ static int ehea_refill_rq3(struct ehea_port_res *pr, int nr_of_wqes)
+ {
+       return ehea_refill_rq_def(pr, &pr->rq3_skba, 3,
+                                 nr_of_wqes, EHEA_RWQE3_TYPE,
+                                 EHEA_MAX_PACKET_SIZE);
+ }
+ static inline int ehea_check_cqe(struct ehea_cqe *cqe, int *rq_num)
+ {
+       *rq_num = (cqe->type & EHEA_CQE_TYPE_RQ) >> 5;
+       if ((cqe->status & EHEA_CQE_STAT_ERR_MASK) == 0)
+               return 0;
+       if (((cqe->status & EHEA_CQE_STAT_ERR_TCP) != 0) &&
+           (cqe->header_length == 0))
+               return 0;
+       return -EINVAL;
+ }
+ static inline void ehea_fill_skb(struct net_device *dev,
+                                struct sk_buff *skb, struct ehea_cqe *cqe,
+                                struct ehea_port_res *pr)
+ {
+       int length = cqe->num_bytes_transfered - 4;     /*remove CRC */
+       skb_put(skb, length);
+       skb->protocol = eth_type_trans(skb, dev);
+       /* The packet was not an IPV4 packet so a complemented checksum was
+          calculated. The value is found in the Internet Checksum field. */
+       if (cqe->status & EHEA_CQE_BLIND_CKSUM) {
+               skb->ip_summed = CHECKSUM_COMPLETE;
+               skb->csum = csum_unfold(~cqe->inet_checksum_value);
+       } else
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       skb_record_rx_queue(skb, pr - &pr->port->port_res[0]);
+ }
+ static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array,
+                                              int arr_len,
+                                              struct ehea_cqe *cqe)
+ {
+       int skb_index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, cqe->wr_id);
+       struct sk_buff *skb;
+       void *pref;
+       int x;
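+       /* Prefetch the next entry's skb and its data to warm the cache for the following completion */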
+       x = skb_index + 1;
+       x &= (arr_len - 1);
+       pref = skb_array[x];
+       if (pref) {
+               prefetchw(pref);
+               prefetchw(pref + EHEA_CACHE_LINE);
+               pref = (skb_array[x]->data);
+               prefetch(pref);
+               prefetch(pref + EHEA_CACHE_LINE);
+               prefetch(pref + EHEA_CACHE_LINE * 2);
+               prefetch(pref + EHEA_CACHE_LINE * 3);
+       }
+       skb = skb_array[skb_index];
+       skb_array[skb_index] = NULL;
+       return skb;
+ }
+ static inline struct sk_buff *get_skb_by_index_ll(struct sk_buff **skb_array,
+                                                 int arr_len, int wqe_index)
+ {
+       struct sk_buff *skb;
+       void *pref;
+       int x;
+       x = wqe_index + 1;
+       x &= (arr_len - 1);
+       pref = skb_array[x];
+       if (pref) {
+               prefetchw(pref);
+               prefetchw(pref + EHEA_CACHE_LINE);
+               pref = (skb_array[x]->data);
+               prefetchw(pref);
+               prefetchw(pref + EHEA_CACHE_LINE);
+       }
+       skb = skb_array[wqe_index];
+       skb_array[wqe_index] = NULL;
+       return skb;
+ }
+ static int ehea_treat_poll_error(struct ehea_port_res *pr, int rq,
+                                struct ehea_cqe *cqe, int *processed_rq2,
+                                int *processed_rq3)
+ {
+       struct sk_buff *skb;
+       if (cqe->status & EHEA_CQE_STAT_ERR_TCP)
+               pr->p_stats.err_tcp_cksum++;
+       if (cqe->status & EHEA_CQE_STAT_ERR_IP)
+               pr->p_stats.err_ip_cksum++;
+       if (cqe->status & EHEA_CQE_STAT_ERR_CRC)
+               pr->p_stats.err_frame_crc++;
+       if (rq == 2) {
+               *processed_rq2 += 1;
+               skb = get_skb_by_index(pr->rq2_skba.arr, pr->rq2_skba.len, cqe);
+               dev_kfree_skb(skb);
+       } else if (rq == 3) {
+               *processed_rq3 += 1;
+               skb = get_skb_by_index(pr->rq3_skba.arr, pr->rq3_skba.len, cqe);
+               dev_kfree_skb(skb);
+       }
+       if (cqe->status & EHEA_CQE_STAT_FAT_ERR_MASK) {
+               if (netif_msg_rx_err(pr->port)) {
+                       pr_err("Critical receive error for QP %d. Resetting port.\n",
+                              pr->qp->init_attr.qp_nr);
+                       ehea_dump(cqe, sizeof(*cqe), "CQE");
+               }
+               ehea_schedule_port_reset(pr->port);
+               return 1;
+       }
+       return 0;
+ }
+ static int ehea_proc_rwqes(struct net_device *dev,
+                          struct ehea_port_res *pr,
+                          int budget)
+ {
+       struct ehea_port *port = pr->port;
+       struct ehea_qp *qp = pr->qp;
+       struct ehea_cqe *cqe;
+       struct sk_buff *skb;
+       struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+       struct sk_buff **skb_arr_rq2 = pr->rq2_skba.arr;
+       struct sk_buff **skb_arr_rq3 = pr->rq3_skba.arr;
+       int skb_arr_rq1_len = pr->rq1_skba.len;
+       int skb_arr_rq2_len = pr->rq2_skba.len;
+       int skb_arr_rq3_len = pr->rq3_skba.len;
+       int processed, processed_rq1, processed_rq2, processed_rq3;
+       u64 processed_bytes = 0;
+       int wqe_index, last_wqe_index, rq, port_reset;
+       processed = processed_rq1 = processed_rq2 = processed_rq3 = 0;
+       last_wqe_index = 0;
+       cqe = ehea_poll_rq1(qp, &wqe_index);
+       while ((processed < budget) && cqe) {
+               ehea_inc_rq1(qp);
+               processed_rq1++;
+               processed++;
+               if (netif_msg_rx_status(port))
+                       ehea_dump(cqe, sizeof(*cqe), "CQE");
+               last_wqe_index = wqe_index;
+               rmb();
+               if (!ehea_check_cqe(cqe, &rq)) {
+                       if (rq == 1) {
+                               /* LL RQ1 */
+                               skb = get_skb_by_index_ll(skb_arr_rq1,
+                                                         skb_arr_rq1_len,
+                                                         wqe_index);
+                               if (unlikely(!skb)) {
+                                       netif_info(port, rx_err, dev,
+                                                 "LL rq1: skb=NULL\n");
+                                       skb = netdev_alloc_skb(dev,
+                                                              EHEA_L_PKT_SIZE);
+                                       if (!skb) {
+                                               netdev_err(dev, "Not enough memory to allocate skb\n");
+                                               break;
+                                       }
+                               }
+                               skb_copy_to_linear_data(skb, ((char *)cqe) + 64,
+                                                cqe->num_bytes_transfered - 4);
+                               ehea_fill_skb(dev, skb, cqe, pr);
+                       } else if (rq == 2) {
+                               /* RQ2 */
+                               skb = get_skb_by_index(skb_arr_rq2,
+                                                      skb_arr_rq2_len, cqe);
+                               if (unlikely(!skb)) {
+                                       netif_err(port, rx_err, dev,
+                                                 "rq2: skb=NULL\n");
+                                       break;
+                               }
+                               ehea_fill_skb(dev, skb, cqe, pr);
+                               processed_rq2++;
+                       } else {
+                               /* RQ3 */
+                               skb = get_skb_by_index(skb_arr_rq3,
+                                                      skb_arr_rq3_len, cqe);
+                               if (unlikely(!skb)) {
+                                       netif_err(port, rx_err, dev,
+                                                 "rq3: skb=NULL\n");
+                                       break;
+                               }
+                               ehea_fill_skb(dev, skb, cqe, pr);
+                               processed_rq3++;
+                       }
+                       processed_bytes += skb->len;
+                       if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
+                               __vlan_hwaccel_put_tag(skb, cqe->vlan_tag);
+                       napi_gro_receive(&pr->napi, skb);
+               } else {
+                       pr->p_stats.poll_receive_errors++;
+                       port_reset = ehea_treat_poll_error(pr, rq, cqe,
+                                                          &processed_rq2,
+                                                          &processed_rq3);
+                       if (port_reset)
+                               break;
+               }
+               cqe = ehea_poll_rq1(qp, &wqe_index);
+       }
+       pr->rx_packets += processed;
+       pr->rx_bytes += processed_bytes;
+       ehea_refill_rq1(pr, last_wqe_index, processed_rq1);
+       ehea_refill_rq2(pr, processed_rq2);
+       ehea_refill_rq3(pr, processed_rq3);
+       return processed;
+ }
+ #define SWQE_RESTART_CHECK 0xdeadbeaff00d0000ull
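+ /* Magic wr_id for the dummy WQE posted by check_sqs(); its completion
+  * proves the send queue is still being processed by the hardware.
+  */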
+ static void reset_sq_restart_flag(struct ehea_port *port)
+ {
+       int i;
+       for (i = 0; i < port->num_def_qps; i++) {
+               struct ehea_port_res *pr = &port->port_res[i];
+               pr->sq_restart_flag = 0;
+       }
+       wake_up(&port->restart_wq);
+ }
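+ /* Post a signalled dummy WQE on each send queue and wait for its
+  * completion; if none arrives, HW and SW queues are out of sync and
+  * the port is reset.
+  */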
+ static void check_sqs(struct ehea_port *port)
+ {
+       struct ehea_swqe *swqe;
+       int swqe_index;
+       int i, k;
+       for (i = 0; i < port->num_def_qps; i++) {
+               struct ehea_port_res *pr = &port->port_res[i];
+               int ret;
+               k = 0;
+               swqe = ehea_get_swqe(pr->qp, &swqe_index);
+               memset(swqe, 0, SWQE_HEADER_SIZE);
+               atomic_dec(&pr->swqe_avail);
+               swqe->tx_control |= EHEA_SWQE_PURGE;
+               swqe->wr_id = SWQE_RESTART_CHECK;
+               swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+               swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT;
+               swqe->immediate_data_length = 80;
+               ehea_post_swqe(pr->qp, swqe);
+               ret = wait_event_timeout(port->restart_wq,
+                                        pr->sq_restart_flag == 0,
+                                        msecs_to_jiffies(100));
+               if (!ret) {
+                       pr_err("HW/SW queues out of sync\n");
+                       ehea_schedule_port_reset(pr->port);
+                       return;
+               }
+       }
+ }
+ static struct ehea_cqe *ehea_proc_cqes(struct ehea_port_res *pr, int my_quota)
+ {
+       struct sk_buff *skb;
+       struct ehea_cq *send_cq = pr->send_cq;
+       struct ehea_cqe *cqe;
+       int quota = my_quota;
+       int cqe_counter = 0;
+       int swqe_av = 0;
+       int index;
+       struct netdev_queue *txq = netdev_get_tx_queue(pr->port->netdev,
+                                               pr - &pr->port->port_res[0]);
+       cqe = ehea_poll_cq(send_cq);
+       while (cqe && (quota > 0)) {
+               ehea_inc_cq(send_cq);
+               cqe_counter++;
+               rmb();
+               if (cqe->wr_id == SWQE_RESTART_CHECK) {
+                       pr->sq_restart_flag = 1;
+                       swqe_av++;
+                       break;
+               }
+               if (cqe->status & EHEA_CQE_STAT_ERR_MASK) {
+                       pr_err("Bad send completion status=0x%04X\n",
+                              cqe->status);
+                       if (netif_msg_tx_err(pr->port))
+                               ehea_dump(cqe, sizeof(*cqe), "Send CQE");
+                       if (cqe->status & EHEA_CQE_STAT_RESET_MASK) {
+                               pr_err("Resetting port\n");
+                               ehea_schedule_port_reset(pr->port);
+                               break;
+                       }
+               }
+               if (netif_msg_tx_done(pr->port))
+                       ehea_dump(cqe, sizeof(*cqe), "CQE");
+               if (likely(EHEA_BMASK_GET(EHEA_WR_ID_TYPE, cqe->wr_id)
+                          == EHEA_SWQE2_TYPE)) {
+                       index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, cqe->wr_id);
+                       skb = pr->sq_skba.arr[index];
+                       dev_kfree_skb(skb);
+                       pr->sq_skba.arr[index] = NULL;
+               }
+               swqe_av += EHEA_BMASK_GET(EHEA_WR_ID_REFILL, cqe->wr_id);
+               quota--;
+               cqe = ehea_poll_cq(send_cq);
+       }
+       ehea_update_feca(send_cq, cqe_counter);
+       atomic_add(swqe_av, &pr->swqe_avail);
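+       /* Wake a stopped TX queue once enough SWQEs are free again;
+        * re-check under the tx lock to avoid racing with the xmit path.
+        */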
+       if (unlikely(netif_tx_queue_stopped(txq) &&
+                    (atomic_read(&pr->swqe_avail) >= pr->swqe_refill_th))) {
+               __netif_tx_lock(txq, smp_processor_id());
+               if (netif_tx_queue_stopped(txq) &&
+                   (atomic_read(&pr->swqe_avail) >= pr->swqe_refill_th))
+                       netif_tx_wake_queue(txq);
+               __netif_tx_unlock(txq);
+       }
+       wake_up(&pr->port->swqe_avail_wq);
+       return cqe;
+ }
+ #define EHEA_POLL_MAX_CQES 65535
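+ /* Upper bound on send completions processed in one NAPI poll pass */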
+ static int ehea_poll(struct napi_struct *napi, int budget)
+ {
+       struct ehea_port_res *pr = container_of(napi, struct ehea_port_res,
+                                               napi);
+       struct net_device *dev = pr->port->netdev;
+       struct ehea_cqe *cqe;
+       struct ehea_cqe *cqe_skb = NULL;
+       int wqe_index;
+       int rx = 0;
+       cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
+       rx += ehea_proc_rwqes(dev, pr, budget - rx);
+       while (rx != budget) {
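+               /* Out of work before the budget was spent: complete NAPI,
+                * re-arm the receive and send CQs, then re-check for events
+                * that raced in and reschedule if necessary.
+                */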
+               napi_complete(napi);
+               ehea_reset_cq_ep(pr->recv_cq);
+               ehea_reset_cq_ep(pr->send_cq);
+               ehea_reset_cq_n1(pr->recv_cq);
+               ehea_reset_cq_n1(pr->send_cq);
+               rmb();
+               cqe = ehea_poll_rq1(pr->qp, &wqe_index);
+               cqe_skb = ehea_poll_cq(pr->send_cq);
+               if (!cqe && !cqe_skb)
+                       return rx;
+               if (!napi_reschedule(napi))
+                       return rx;
+               cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
+               rx += ehea_proc_rwqes(dev, pr, budget - rx);
+       }
+       return rx;
+ }
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ static void ehea_netpoll(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       int i;
+       for (i = 0; i < port->num_def_qps; i++)
+               napi_schedule(&port->port_res[i].napi);
+ }
+ #endif
+ static irqreturn_t ehea_recv_irq_handler(int irq, void *param)
+ {
+       struct ehea_port_res *pr = param;
+       napi_schedule(&pr->napi);
+       return IRQ_HANDLED;
+ }
+ static irqreturn_t ehea_qp_aff_irq_handler(int irq, void *param)
+ {
+       struct ehea_port *port = param;
+       struct ehea_eqe *eqe;
+       struct ehea_qp *qp;
+       u32 qp_token;
+       u64 resource_type, aer, aerr;
+       int reset_port = 0;
+       eqe = ehea_poll_eq(port->qp_eq);
+       while (eqe) {
+               qp_token = EHEA_BMASK_GET(EHEA_EQE_QP_TOKEN, eqe->entry);
+               pr_err("QP aff_err: entry=0x%llx, token=0x%x\n",
+                      eqe->entry, qp_token);
+               qp = port->port_res[qp_token].qp;
+               resource_type = ehea_error_data(port->adapter, qp->fw_handle,
+                                               &aer, &aerr);
+               if (resource_type == EHEA_AER_RESTYPE_QP) {
+                       if ((aer & EHEA_AER_RESET_MASK) ||
+                           (aerr & EHEA_AERR_RESET_MASK))
+                                reset_port = 1;
+               } else
+                       reset_port = 1;   /* Reset in case of CQ or EQ error */
+               eqe = ehea_poll_eq(port->qp_eq);
+       }
+       if (reset_port) {
+               pr_err("Resetting port\n");
+               ehea_schedule_port_reset(port);
+       }
+       return IRQ_HANDLED;
+ }
+ static struct ehea_port *ehea_get_port(struct ehea_adapter *adapter,
+                                      int logical_port)
+ {
+       int i;
+       for (i = 0; i < EHEA_MAX_PORTS; i++)
+               if (adapter->port[i])
+                       if (adapter->port[i]->logical_port_id == logical_port)
+                               return adapter->port[i];
+       return NULL;
+ }
+ int ehea_sense_port_attr(struct ehea_port *port)
+ {
+       int ret;
+       u64 hret;
+       struct hcp_ehea_port_cb0 *cb0;
+       /* may be called via ehea_neq_tasklet() */
+       cb0 = (void *)get_zeroed_page(GFP_ATOMIC);
+       if (!cb0) {
+               pr_err("no mem for cb0\n");
+               ret = -ENOMEM;
+               goto out;
+       }
+       hret = ehea_h_query_ehea_port(port->adapter->handle,
+                                     port->logical_port_id, H_PORT_CB0,
+                                     EHEA_BMASK_SET(H_PORT_CB0_ALL, 0xFFFF),
+                                     cb0);
+       if (hret != H_SUCCESS) {
+               ret = -EIO;
+               goto out_free;
+       }
+       /* MAC address */
+       port->mac_addr = cb0->port_mac_addr << 16;
+       if (!is_valid_ether_addr((u8 *)&port->mac_addr)) {
+               ret = -EADDRNOTAVAIL;
+               goto out_free;
+       }
+       /* Port speed */
+       switch (cb0->port_speed) {
+       case H_SPEED_10M_H:
+               port->port_speed = EHEA_SPEED_10M;
+               port->full_duplex = 0;
+               break;
+       case H_SPEED_10M_F:
+               port->port_speed = EHEA_SPEED_10M;
+               port->full_duplex = 1;
+               break;
+       case H_SPEED_100M_H:
+               port->port_speed = EHEA_SPEED_100M;
+               port->full_duplex = 0;
+               break;
+       case H_SPEED_100M_F:
+               port->port_speed = EHEA_SPEED_100M;
+               port->full_duplex = 1;
+               break;
+       case H_SPEED_1G_F:
+               port->port_speed = EHEA_SPEED_1G;
+               port->full_duplex = 1;
+               break;
+       case H_SPEED_10G_F:
+               port->port_speed = EHEA_SPEED_10G;
+               port->full_duplex = 1;
+               break;
+       default:
+               port->port_speed = 0;
+               port->full_duplex = 0;
+               break;
+       }
+       port->autoneg = 1;
+       port->num_mcs = cb0->num_default_qps;
+       /* Number of default QPs */
+       if (use_mcs)
+               port->num_def_qps = cb0->num_default_qps;
+       else
+               port->num_def_qps = 1;
+       if (!port->num_def_qps) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+       ret = 0;
+ out_free:
+       if (ret || netif_msg_probe(port))
+               ehea_dump(cb0, sizeof(*cb0), "ehea_sense_port_attr");
+       free_page((unsigned long)cb0);
+ out:
+       return ret;
+ }
+ int ehea_set_portspeed(struct ehea_port *port, u32 port_speed)
+ {
+       struct hcp_ehea_port_cb4 *cb4;
+       u64 hret;
+       int ret = 0;
+       cb4 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb4) {
+               pr_err("no mem for cb4\n");
+               ret = -ENOMEM;
+               goto out;
+       }
+       cb4->port_speed = port_speed;
+       netif_carrier_off(port->netdev);
+       hret = ehea_h_modify_ehea_port(port->adapter->handle,
+                                      port->logical_port_id,
+                                      H_PORT_CB4, H_PORT_CB4_SPEED, cb4);
+       if (hret == H_SUCCESS) {
+               port->autoneg = port_speed == EHEA_SPEED_AUTONEG ? 1 : 0;
+               hret = ehea_h_query_ehea_port(port->adapter->handle,
+                                             port->logical_port_id,
+                                             H_PORT_CB4, H_PORT_CB4_SPEED,
+                                             cb4);
+               if (hret == H_SUCCESS) {
+                       switch (cb4->port_speed) {
+                       case H_SPEED_10M_H:
+                               port->port_speed = EHEA_SPEED_10M;
+                               port->full_duplex = 0;
+                               break;
+                       case H_SPEED_10M_F:
+                               port->port_speed = EHEA_SPEED_10M;
+                               port->full_duplex = 1;
+                               break;
+                       case H_SPEED_100M_H:
+                               port->port_speed = EHEA_SPEED_100M;
+                               port->full_duplex = 0;
+                               break;
+                       case H_SPEED_100M_F:
+                               port->port_speed = EHEA_SPEED_100M;
+                               port->full_duplex = 1;
+                               break;
+                       case H_SPEED_1G_F:
+                               port->port_speed = EHEA_SPEED_1G;
+                               port->full_duplex = 1;
+                               break;
+                       case H_SPEED_10G_F:
+                               port->port_speed = EHEA_SPEED_10G;
+                               port->full_duplex = 1;
+                               break;
+                       default:
+                               port->port_speed = 0;
+                               port->full_duplex = 0;
+                               break;
+                       }
+               } else {
+                       pr_err("Failed sensing port speed\n");
+                       ret = -EIO;
+               }
+       } else {
+               if (hret == H_AUTHORITY) {
+                       pr_info("Hypervisor denied setting port speed\n");
+                       ret = -EPERM;
+               } else {
+                       ret = -EIO;
+                       pr_err("Failed setting port speed\n");
+               }
+       }
+       if (!prop_carrier_state || (port->phy_link == EHEA_PHY_LINK_UP))
+               netif_carrier_on(port->netdev);
+       free_page((unsigned long)cb4);
+ out:
+       return ret;
+ }
+ static void ehea_parse_eqe(struct ehea_adapter *adapter, u64 eqe)
+ {
+       int ret;
+       u8 ec;
+       u8 portnum;
+       struct ehea_port *port;
+       struct net_device *dev;
+       ec = EHEA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+       portnum = EHEA_BMASK_GET(NEQE_PORTNUM, eqe);
+       port = ehea_get_port(adapter, portnum);
+       if (!port) {
+               pr_err("unknown portnum %x\n", portnum);
+               return;
+       }
+       dev = port->netdev;
+       switch (ec) {
+       case EHEA_EC_PORTSTATE_CHG:     /* port state change */
+               if (EHEA_BMASK_GET(NEQE_PORT_UP, eqe)) {
+                       if (!netif_carrier_ok(dev)) {
+                               ret = ehea_sense_port_attr(port);
+                               if (ret) {
+                                       netdev_err(dev, "failed resensing port attributes\n");
+                                       break;
+                               }
+                               netif_info(port, link, dev,
+                                          "Logical port up: %dMbps %s Duplex\n",
+                                          port->port_speed,
+                                          port->full_duplex == 1 ?
+                                          "Full" : "Half");
+                               netif_carrier_on(dev);
+                               netif_wake_queue(dev);
+                       }
+               } else
+                       if (netif_carrier_ok(dev)) {
+                               netif_info(port, link, dev,
+                                          "Logical port down\n");
+                               netif_carrier_off(dev);
+                               netif_tx_disable(dev);
+                       }
+               if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PORT_UP, eqe)) {
+                       port->phy_link = EHEA_PHY_LINK_UP;
+                       netif_info(port, link, dev,
+                                  "Physical port up\n");
+                       if (prop_carrier_state)
+                               netif_carrier_on(dev);
+               } else {
+                       port->phy_link = EHEA_PHY_LINK_DOWN;
+                       netif_info(port, link, dev,
+                                  "Physical port down\n");
+                       if (prop_carrier_state)
+                               netif_carrier_off(dev);
+               }
+               if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PRIMARY, eqe))
+                       netdev_info(dev,
+                                   "External switch port is primary port\n");
+               else
+                       netdev_info(dev,
+                                   "External switch port is backup port\n");
+               break;
+       case EHEA_EC_ADAPTER_MALFUNC:
+               netdev_err(dev, "Adapter malfunction\n");
+               break;
+       case EHEA_EC_PORT_MALFUNC:
+               netdev_info(dev, "Port malfunction\n");
+               netif_carrier_off(dev);
+               netif_tx_disable(dev);
+               break;
+       default:
+               netdev_err(dev, "unknown event code %x, eqe=0x%llX\n", ec, eqe);
+               break;
+       }
+ }
+ static void ehea_neq_tasklet(unsigned long data)
+ {
+       struct ehea_adapter *adapter = (struct ehea_adapter *)data;
+       struct ehea_eqe *eqe;
+       u64 event_mask;
+       eqe = ehea_poll_eq(adapter->neq);
+       pr_debug("eqe=%p\n", eqe);
+       while (eqe) {
+               pr_debug("*eqe=%lx\n", (unsigned long) eqe->entry);
+               ehea_parse_eqe(adapter, eqe->entry);
+               eqe = ehea_poll_eq(adapter->neq);
+               pr_debug("next eqe=%p\n", eqe);
+       }
+       event_mask = EHEA_BMASK_SET(NELR_PORTSTATE_CHG, 1)
+                  | EHEA_BMASK_SET(NELR_ADAPTER_MALFUNC, 1)
+                  | EHEA_BMASK_SET(NELR_PORT_MALFUNC, 1);
+       ehea_h_reset_events(adapter->handle,
+                           adapter->neq->fw_handle, event_mask);
+ }
+ static irqreturn_t ehea_interrupt_neq(int irq, void *param)
+ {
+       struct ehea_adapter *adapter = param;
+       tasklet_hi_schedule(&adapter->neq_tasklet);
+       return IRQ_HANDLED;
+ }
+ static int ehea_fill_port_res(struct ehea_port_res *pr)
+ {
+       int ret;
+       struct ehea_qp_init_attr *init_attr = &pr->qp->init_attr;
+       ehea_init_fill_rq1(pr, pr->rq1_skba.len);
+       ret = ehea_refill_rq2(pr, init_attr->act_nr_rwqes_rq2 - 1);
+       ret |= ehea_refill_rq3(pr, init_attr->act_nr_rwqes_rq3 - 1);
+       return ret;
+ }
+ static int ehea_reg_interrupts(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_port_res *pr;
+       int i, ret;
+       snprintf(port->int_aff_name, EHEA_IRQ_NAME_SIZE - 1, "%s-aff",
+                dev->name);
+       ret = ibmebus_request_irq(port->qp_eq->attr.ist1,
+                                 ehea_qp_aff_irq_handler,
+                                 IRQF_DISABLED, port->int_aff_name, port);
+       if (ret) {
+               netdev_err(dev, "failed registering irq for qp_aff_irq_handler:ist=%X\n",
+                          port->qp_eq->attr.ist1);
+               goto out_free_qpeq;
+       }
+       netif_info(port, ifup, dev,
+                  "irq_handle 0x%X for function qp_aff_irq_handler registered\n",
+                  port->qp_eq->attr.ist1);
+       for (i = 0; i < port->num_def_qps; i++) {
+               pr = &port->port_res[i];
+               snprintf(pr->int_send_name, EHEA_IRQ_NAME_SIZE - 1,
+                        "%s-queue%d", dev->name, i);
+               ret = ibmebus_request_irq(pr->eq->attr.ist1,
+                                         ehea_recv_irq_handler,
+                                         IRQF_DISABLED, pr->int_send_name,
+                                         pr);
+               if (ret) {
+                       netdev_err(dev, "failed registering irq for ehea_queue port_res_nr:%d, ist=%X\n",
+                                  i, pr->eq->attr.ist1);
+                       goto out_free_req;
+               }
+               netif_info(port, ifup, dev,
+                          "irq_handle 0x%X for function ehea_queue_int %d registered\n",
+                          pr->eq->attr.ist1, i);
+       }
+ out:
+       return ret;
+ out_free_req:
+       while (--i >= 0) {
+               u32 ist = port->port_res[i].eq->attr.ist1;
+               ibmebus_free_irq(ist, &port->port_res[i]);
+       }
+ out_free_qpeq:
+       ibmebus_free_irq(port->qp_eq->attr.ist1, port);
+       i = port->num_def_qps;
+       goto out;
+ }
+ static void ehea_free_interrupts(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_port_res *pr;
+       int i;
+       /* send */
+       for (i = 0; i < port->num_def_qps; i++) {
+               pr = &port->port_res[i];
+               ibmebus_free_irq(pr->eq->attr.ist1, pr);
+               netif_info(port, intr, dev,
+                          "free send irq for res %d with handle 0x%X\n",
+                          i, pr->eq->attr.ist1);
+       }
+       /* associated events */
+       ibmebus_free_irq(port->qp_eq->attr.ist1, port);
+       netif_info(port, intr, dev,
+                  "associated event interrupt for handle 0x%X freed\n",
+                  port->qp_eq->attr.ist1);
+ }
+ static int ehea_configure_port(struct ehea_port *port)
+ {
+       int ret, i;
+       u64 hret, mask;
+       struct hcp_ehea_port_cb0 *cb0;
+       ret = -ENOMEM;
+       cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb0)
+               goto out;
+       cb0->port_rc = EHEA_BMASK_SET(PXLY_RC_VALID, 1)
+                    | EHEA_BMASK_SET(PXLY_RC_IP_CHKSUM, 1)
+                    | EHEA_BMASK_SET(PXLY_RC_TCP_UDP_CHKSUM, 1)
+                    | EHEA_BMASK_SET(PXLY_RC_VLAN_XTRACT, 1)
+                    | EHEA_BMASK_SET(PXLY_RC_VLAN_TAG_FILTER,
+                                     PXLY_RC_VLAN_FILTER)
+                    | EHEA_BMASK_SET(PXLY_RC_JUMBO_FRAME, 1);
+       for (i = 0; i < port->num_mcs; i++)
+               if (use_mcs)
+                       cb0->default_qpn_arr[i] =
+                               port->port_res[i].qp->init_attr.qp_nr;
+               else
+                       cb0->default_qpn_arr[i] =
+                               port->port_res[0].qp->init_attr.qp_nr;
+       if (netif_msg_ifup(port))
+               ehea_dump(cb0, sizeof(*cb0), "ehea_configure_port");
+       mask = EHEA_BMASK_SET(H_PORT_CB0_PRC, 1)
+            | EHEA_BMASK_SET(H_PORT_CB0_DEFQPNARRAY, 1);
+       hret = ehea_h_modify_ehea_port(port->adapter->handle,
+                                      port->logical_port_id,
+                                      H_PORT_CB0, mask, cb0);
+       ret = -EIO;
+       if (hret != H_SUCCESS)
+               goto out_free;
+       ret = 0;
+ out_free:
+       free_page((unsigned long)cb0);
+ out:
+       return ret;
+ }
+ int ehea_gen_smrs(struct ehea_port_res *pr)
+ {
+       int ret;
+       struct ehea_adapter *adapter = pr->port->adapter;
+       ret = ehea_gen_smr(adapter, &adapter->mr, &pr->send_mr);
+       if (ret)
+               goto out;
+       ret = ehea_gen_smr(adapter, &adapter->mr, &pr->recv_mr);
+       if (ret)
+               goto out_free;
+       return 0;
+ out_free:
+       ehea_rem_mr(&pr->send_mr);
+ out:
+       pr_err("Generating SMRS failed\n");
+       return -EIO;
+ }
+ int ehea_rem_smrs(struct ehea_port_res *pr)
+ {
+       if ((ehea_rem_mr(&pr->send_mr)) ||
+           (ehea_rem_mr(&pr->recv_mr)))
+               return -EIO;
+       else
+               return 0;
+ }
+ static int ehea_init_q_skba(struct ehea_q_skb_arr *q_skba, int max_q_entries)
+ {
+       int arr_size = sizeof(void *) * max_q_entries;
+       q_skba->arr = vzalloc(arr_size);
+       if (!q_skba->arr)
+               return -ENOMEM;
+       q_skba->len = max_q_entries;
+       q_skba->index = 0;
+       q_skba->os_skbs = 0;
+       return 0;
+ }
+ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
+                             struct port_res_cfg *pr_cfg, int queue_token)
+ {
+       struct ehea_adapter *adapter = port->adapter;
+       enum ehea_eq_type eq_type = EHEA_EQ;
+       struct ehea_qp_init_attr *init_attr = NULL;
+       int ret = -EIO;
+       u64 tx_bytes, rx_bytes, tx_packets, rx_packets;
+       tx_bytes = pr->tx_bytes;
+       tx_packets = pr->tx_packets;
+       rx_bytes = pr->rx_bytes;
+       rx_packets = pr->rx_packets;
+       memset(pr, 0, sizeof(struct ehea_port_res));
+       pr->tx_bytes = tx_bytes;
+       pr->tx_packets = tx_packets;
+       pr->rx_bytes = rx_bytes;
+       pr->rx_packets = rx_packets;
+       pr->port = port;
+       pr->eq = ehea_create_eq(adapter, eq_type, EHEA_MAX_ENTRIES_EQ, 0);
+       if (!pr->eq) {
+               pr_err("create_eq failed (eq)\n");
+               goto out_free;
+       }
+       pr->recv_cq = ehea_create_cq(adapter, pr_cfg->max_entries_rcq,
+                                    pr->eq->fw_handle,
+                                    port->logical_port_id);
+       if (!pr->recv_cq) {
+               pr_err("create_cq failed (cq_recv)\n");
+               goto out_free;
+       }
+       pr->send_cq = ehea_create_cq(adapter, pr_cfg->max_entries_scq,
+                                    pr->eq->fw_handle,
+                                    port->logical_port_id);
+       if (!pr->send_cq) {
+               pr_err("create_cq failed (cq_send)\n");
+               goto out_free;
+       }
+       if (netif_msg_ifup(port))
+               pr_info("Send CQ: act_nr_cqes=%d, Recv CQ: act_nr_cqes=%d\n",
+                       pr->send_cq->attr.act_nr_of_cqes,
+                       pr->recv_cq->attr.act_nr_of_cqes);
+       init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+       if (!init_attr) {
+               ret = -ENOMEM;
+               pr_err("no mem for ehea_qp_init_attr\n");
+               goto out_free;
+       }
+       init_attr->low_lat_rq1 = 1;
+       init_attr->signalingtype = 1;   /* generate CQE if specified in WQE */
+       init_attr->rq_count = 3;
+       init_attr->qp_token = queue_token;
+       init_attr->max_nr_send_wqes = pr_cfg->max_entries_sq;
+       init_attr->max_nr_rwqes_rq1 = pr_cfg->max_entries_rq1;
+       init_attr->max_nr_rwqes_rq2 = pr_cfg->max_entries_rq2;
+       init_attr->max_nr_rwqes_rq3 = pr_cfg->max_entries_rq3;
+       init_attr->wqe_size_enc_sq = EHEA_SG_SQ;
+       init_attr->wqe_size_enc_rq1 = EHEA_SG_RQ1;
+       init_attr->wqe_size_enc_rq2 = EHEA_SG_RQ2;
+       init_attr->wqe_size_enc_rq3 = EHEA_SG_RQ3;
+       init_attr->rq2_threshold = EHEA_RQ2_THRESHOLD;
+       init_attr->rq3_threshold = EHEA_RQ3_THRESHOLD;
+       init_attr->port_nr = port->logical_port_id;
+       init_attr->send_cq_handle = pr->send_cq->fw_handle;
+       init_attr->recv_cq_handle = pr->recv_cq->fw_handle;
+       init_attr->aff_eq_handle = port->qp_eq->fw_handle;
+       pr->qp = ehea_create_qp(adapter, adapter->pd, init_attr);
+       if (!pr->qp) {
+               pr_err("create_qp failed\n");
+               ret = -EIO;
+               goto out_free;
+       }
+       if (netif_msg_ifup(port))
+               pr_info("QP: qp_nr=%d\n act_nr_snd_wqe=%d\n nr_rwqe_rq1=%d\n nr_rwqe_rq2=%d\n nr_rwqe_rq3=%d\n",
+                       init_attr->qp_nr,
+                       init_attr->act_nr_send_wqes,
+                       init_attr->act_nr_rwqes_rq1,
+                       init_attr->act_nr_rwqes_rq2,
+                       init_attr->act_nr_rwqes_rq3);
+       pr->sq_skba_size = init_attr->act_nr_send_wqes + 1;
+       ret = ehea_init_q_skba(&pr->sq_skba, pr->sq_skba_size);
+       ret |= ehea_init_q_skba(&pr->rq1_skba, init_attr->act_nr_rwqes_rq1 + 1);
+       ret |= ehea_init_q_skba(&pr->rq2_skba, init_attr->act_nr_rwqes_rq2 + 1);
+       ret |= ehea_init_q_skba(&pr->rq3_skba, init_attr->act_nr_rwqes_rq3 + 1);
+       if (ret)
+               goto out_free;
+       pr->swqe_refill_th = init_attr->act_nr_send_wqes / 10;
+       if (ehea_gen_smrs(pr) != 0) {
+               ret = -EIO;
+               goto out_free;
+       }
+       atomic_set(&pr->swqe_avail, init_attr->act_nr_send_wqes - 1);
+       kfree(init_attr);
+       netif_napi_add(pr->port->netdev, &pr->napi, ehea_poll, 64);
+       ret = 0;
+       goto out;
+ out_free:
+       kfree(init_attr);
+       vfree(pr->sq_skba.arr);
+       vfree(pr->rq1_skba.arr);
+       vfree(pr->rq2_skba.arr);
+       vfree(pr->rq3_skba.arr);
+       ehea_destroy_qp(pr->qp);
+       ehea_destroy_cq(pr->send_cq);
+       ehea_destroy_cq(pr->recv_cq);
+       ehea_destroy_eq(pr->eq);
+ out:
+       return ret;
+ }
+ static int ehea_clean_portres(struct ehea_port *port, struct ehea_port_res *pr)
+ {
+       int ret, i;
+       if (pr->qp)
+               netif_napi_del(&pr->napi);
+       ret = ehea_destroy_qp(pr->qp);
+       if (!ret) {
+               ehea_destroy_cq(pr->send_cq);
+               ehea_destroy_cq(pr->recv_cq);
+               ehea_destroy_eq(pr->eq);
+               for (i = 0; i < pr->rq1_skba.len; i++)
+                       if (pr->rq1_skba.arr[i])
+                               dev_kfree_skb(pr->rq1_skba.arr[i]);
+               for (i = 0; i < pr->rq2_skba.len; i++)
+                       if (pr->rq2_skba.arr[i])
+                               dev_kfree_skb(pr->rq2_skba.arr[i]);
+               for (i = 0; i < pr->rq3_skba.len; i++)
+                       if (pr->rq3_skba.arr[i])
+                               dev_kfree_skb(pr->rq3_skba.arr[i]);
+               for (i = 0; i < pr->sq_skba.len; i++)
+                       if (pr->sq_skba.arr[i])
+                               dev_kfree_skb(pr->sq_skba.arr[i]);
+               vfree(pr->rq1_skba.arr);
+               vfree(pr->rq2_skba.arr);
+               vfree(pr->rq3_skba.arr);
+               vfree(pr->sq_skba.arr);
+               ret = ehea_rem_smrs(pr);
+       }
+       return ret;
+ }
+ static void write_swqe2_immediate(struct sk_buff *skb, struct ehea_swqe *swqe,
+                                 u32 lkey)
+ {
+       int skb_data_size = skb_headlen(skb);
+       u8 *imm_data = &swqe->u.immdata_desc.immediate_data[0];
+       struct ehea_vsgentry *sg1entry = &swqe->u.immdata_desc.sg_entry;
+       unsigned int immediate_len = SWQE2_MAX_IMM;
+       swqe->descriptors = 0;
+       if (skb_is_gso(skb)) {
+               swqe->tx_control |= EHEA_SWQE_TSO;
+               swqe->mss = skb_shinfo(skb)->gso_size;
+               /*
+                * For TSO packets we only copy the headers into the
+                * immediate area.
+                */
+               immediate_len = ETH_HLEN + ip_hdrlen(skb) + tcp_hdrlen(skb);
+       }
+       if (skb_is_gso(skb) || skb_data_size >= SWQE2_MAX_IMM) {
+               skb_copy_from_linear_data(skb, imm_data, immediate_len);
+               swqe->immediate_data_length = immediate_len;
+               if (skb_data_size > immediate_len) {
+                       sg1entry->l_key = lkey;
+                       sg1entry->len = skb_data_size - immediate_len;
+                       sg1entry->vaddr =
+                               ehea_map_vaddr(skb->data + immediate_len);
+                       swqe->descriptors++;
+               }
+       } else {
+               skb_copy_from_linear_data(skb, imm_data, skb_data_size);
+               swqe->immediate_data_length = skb_data_size;
+       }
+ }
+ static inline void write_swqe2_data(struct sk_buff *skb, struct net_device *dev,
+                                   struct ehea_swqe *swqe, u32 lkey)
+ {
+       struct ehea_vsgentry *sg_list, *sg1entry, *sgentry;
+       skb_frag_t *frag;
+       int nfrags, sg1entry_contains_frag_data, i;
+       nfrags = skb_shinfo(skb)->nr_frags;
+       sg1entry = &swqe->u.immdata_desc.sg_entry;
+       sg_list = (struct ehea_vsgentry *)&swqe->u.immdata_desc.sg_list;
+       sg1entry_contains_frag_data = 0;
+       write_swqe2_immediate(skb, swqe, lkey);
+       /* write descriptors */
+       if (nfrags > 0) {
+               if (swqe->descriptors == 0) {
+                       /* sg1entry not yet used */
+                       frag = &skb_shinfo(skb)->frags[0];
+                       /* copy sg1entry data */
+                       sg1entry->l_key = lkey;
+                       sg1entry->len = skb_frag_size(frag);
+                       sg1entry->vaddr =
+                               ehea_map_vaddr(skb_frag_address(frag));
+                       swqe->descriptors++;
+                       sg1entry_contains_frag_data = 1;
+               }
+               for (i = sg1entry_contains_frag_data; i < nfrags; i++) {
+                       frag = &skb_shinfo(skb)->frags[i];
+                       sgentry = &sg_list[i - sg1entry_contains_frag_data];
+                       sgentry->l_key = lkey;
+                       sgentry->len = skb_frag_size(frag);
+                       sgentry->vaddr = ehea_map_vaddr(skb_frag_address(frag));
+                       swqe->descriptors++;
+               }
+       }
+ }
+ static int ehea_broadcast_reg_helper(struct ehea_port *port, u32 hcallid)
+ {
+       int ret = 0;
+       u64 hret;
+       u8 reg_type;
+       /* De/Register untagged packets */
+       reg_type = EHEA_BCMC_BROADCAST | EHEA_BCMC_UNTAGGED;
+       hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+                                    port->logical_port_id,
+                                    reg_type, port->mac_addr, 0, hcallid);
+       if (hret != H_SUCCESS) {
+               pr_err("%sregistering bc address failed (untagged)\n",
+                      hcallid == H_REG_BCMC ? "" : "de");
+               ret = -EIO;
+               goto out_herr;
+       }
+       /* De/Register VLAN packets */
+       reg_type = EHEA_BCMC_BROADCAST | EHEA_BCMC_VLANID_ALL;
+       hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+                                    port->logical_port_id,
+                                    reg_type, port->mac_addr, 0, hcallid);
+       if (hret != H_SUCCESS) {
+               pr_err("%sregistering bc address failed (vlan)\n",
+                      hcallid == H_REG_BCMC ? "" : "de");
+               ret = -EIO;
+       }
+ out_herr:
+       return ret;
+ }
+ static int ehea_set_mac_addr(struct net_device *dev, void *sa)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct sockaddr *mac_addr = sa;
+       struct hcp_ehea_port_cb0 *cb0;
+       int ret;
+       u64 hret;
+       if (!is_valid_ether_addr(mac_addr->sa_data)) {
+               ret = -EADDRNOTAVAIL;
+               goto out;
+       }
+       cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb0) {
+               pr_err("no mem for cb0\n");
+               ret = -ENOMEM;
+               goto out;
+       }
+       memcpy(&(cb0->port_mac_addr), &(mac_addr->sa_data[0]), ETH_ALEN);
+       cb0->port_mac_addr = cb0->port_mac_addr >> 16;
+       hret = ehea_h_modify_ehea_port(port->adapter->handle,
+                                      port->logical_port_id, H_PORT_CB0,
+                                      EHEA_BMASK_SET(H_PORT_CB0_MAC, 1), cb0);
+       if (hret != H_SUCCESS) {
+               ret = -EIO;
+               goto out_free;
+       }
+       memcpy(dev->dev_addr, mac_addr->sa_data, dev->addr_len);
+       /* Deregister old MAC in pHYP */
+       if (port->state == EHEA_PORT_UP) {
+               ret = ehea_broadcast_reg_helper(port, H_DEREG_BCMC);
+               if (ret)
+                       goto out_upregs;
+       }
+       port->mac_addr = cb0->port_mac_addr << 16;
+       /* Register new MAC in pHYP */
+       if (port->state == EHEA_PORT_UP) {
+               ret = ehea_broadcast_reg_helper(port, H_REG_BCMC);
+               if (ret)
+                       goto out_upregs;
+       }
+       ret = 0;
+ out_upregs:
+       ehea_update_bcmc_registrations();
+ out_free:
+       free_page((unsigned long)cb0);
+ out:
+       return ret;
+ }
+ static void ehea_promiscuous_error(u64 hret, int enable)
+ {
+       if (hret == H_AUTHORITY)
+               pr_info("Hypervisor denied %sabling promiscuous mode\n",
+                       enable == 1 ? "en" : "dis");
+       else
+               pr_err("failed %sabling promiscuous mode\n",
+                      enable == 1 ? "en" : "dis");
+ }
+ static void ehea_promiscuous(struct net_device *dev, int enable)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct hcp_ehea_port_cb7 *cb7;
+       u64 hret;
+       if (enable == port->promisc)
+               return;
+       cb7 = (void *)get_zeroed_page(GFP_ATOMIC);
+       if (!cb7) {
+               pr_err("no mem for cb7\n");
+               goto out;
+       }
+       /* Modify Pxs_DUCQPN in CB7 */
+       cb7->def_uc_qpn = enable == 1 ? port->port_res[0].qp->fw_handle : 0;
+       hret = ehea_h_modify_ehea_port(port->adapter->handle,
+                                      port->logical_port_id,
+                                      H_PORT_CB7, H_PORT_CB7_DUCQPN, cb7);
+       if (hret) {
+               ehea_promiscuous_error(hret, enable);
+               goto out;
+       }
+       port->promisc = enable;
+ out:
+       free_page((unsigned long)cb7);
+ }
+ static u64 ehea_multicast_reg_helper(struct ehea_port *port, u64 mc_mac_addr,
+                                    u32 hcallid)
+ {
+       u64 hret;
+       u8 reg_type;
+       reg_type = EHEA_BCMC_SCOPE_ALL | EHEA_BCMC_MULTICAST
+                | EHEA_BCMC_UNTAGGED;
+       hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+                                    port->logical_port_id,
+                                    reg_type, mc_mac_addr, 0, hcallid);
+       if (hret)
+               goto out;
+       reg_type = EHEA_BCMC_SCOPE_ALL | EHEA_BCMC_MULTICAST
+                | EHEA_BCMC_VLANID_ALL;
+       hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+                                    port->logical_port_id,
+                                    reg_type, mc_mac_addr, 0, hcallid);
+ out:
+       return hret;
+ }
+ static int ehea_drop_multicast_list(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_mc_list *mc_entry = port->mc_list;
+       struct list_head *pos;
+       struct list_head *temp;
+       int ret = 0;
+       u64 hret;
+       list_for_each_safe(pos, temp, &(port->mc_list->list)) {
+               mc_entry = list_entry(pos, struct ehea_mc_list, list);
+               hret = ehea_multicast_reg_helper(port, mc_entry->macaddr,
+                                                H_DEREG_BCMC);
+               if (hret) {
+                       pr_err("failed deregistering mcast MAC\n");
+                       ret = -EIO;
+               }
+               list_del(pos);
+               kfree(mc_entry);
+       }
+       return ret;
+ }
+ static void ehea_allmulti(struct net_device *dev, int enable)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       u64 hret;
+       if (!port->allmulti) {
+               if (enable) {
+                       /* Enable ALLMULTI */
+                       ehea_drop_multicast_list(dev);
+                       hret = ehea_multicast_reg_helper(port, 0, H_REG_BCMC);
+                       if (!hret)
+                               port->allmulti = 1;
+                       else
+                               netdev_err(dev,
+                                          "failed enabling IFF_ALLMULTI\n");
+               }
+       } else
+               if (!enable) {
+                       /* Disable ALLMULTI */
+                       hret = ehea_multicast_reg_helper(port, 0, H_DEREG_BCMC);
+                       if (!hret)
+                               port->allmulti = 0;
+                       else
+                               netdev_err(dev,
+                                          "failed disabling IFF_ALLMULTI\n");
+               }
+ }
+ static void ehea_add_multicast_entry(struct ehea_port *port, u8 *mc_mac_addr)
+ {
+       struct ehea_mc_list *ehea_mcl_entry;
+       u64 hret;
+       ehea_mcl_entry = kzalloc(sizeof(*ehea_mcl_entry), GFP_ATOMIC);
+       if (!ehea_mcl_entry) {
+               pr_err("no mem for mcl_entry\n");
+               return;
+       }
+       INIT_LIST_HEAD(&ehea_mcl_entry->list);
+       memcpy(&ehea_mcl_entry->macaddr, mc_mac_addr, ETH_ALEN);
+       hret = ehea_multicast_reg_helper(port, ehea_mcl_entry->macaddr,
+                                        H_REG_BCMC);
+       if (!hret)
+               list_add(&ehea_mcl_entry->list, &port->mc_list->list);
+       else {
+               pr_err("failed registering mcast MAC\n");
+               kfree(ehea_mcl_entry);
+       }
+ }
+ static void ehea_set_multicast_list(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct netdev_hw_addr *ha;
+       int ret;
+       if (port->promisc) {
+               ehea_promiscuous(dev, 1);
+               return;
+       }
+       ehea_promiscuous(dev, 0);
+       if (dev->flags & IFF_ALLMULTI) {
+               ehea_allmulti(dev, 1);
+               goto out;
+       }
+       ehea_allmulti(dev, 0);
+       if (!netdev_mc_empty(dev)) {
+               ret = ehea_drop_multicast_list(dev);
+               if (ret) {
+                       /* Dropping the current multicast list failed.
+                        * Enabling ALL_MULTI is the best we can do.
+                        */
+                       ehea_allmulti(dev, 1);
+               }
+               if (netdev_mc_count(dev) > port->adapter->max_mc_mac) {
+                       pr_info("Mcast registration limit reached (0x%llx). Use ALLMULTI!\n",
+                               port->adapter->max_mc_mac);
+                       goto out;
+               }
+               netdev_for_each_mc_addr(ha, dev)
+                       ehea_add_multicast_entry(port, ha->addr);
+       }
+ out:
+       ehea_update_bcmc_registrations();
+ }
+ static int ehea_change_mtu(struct net_device *dev, int new_mtu)
+ {
+       if ((new_mtu < 68) || (new_mtu > EHEA_MAX_PACKET_SIZE))
+               return -EINVAL;
+       dev->mtu = new_mtu;
+       return 0;
+ }
+ static void xmit_common(struct sk_buff *skb, struct ehea_swqe *swqe)
+ {
+       swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT | EHEA_SWQE_CRC;
+       if (skb->protocol != htons(ETH_P_IP))
+               return;
+       if (skb->ip_summed == CHECKSUM_PARTIAL)
+               swqe->tx_control |= EHEA_SWQE_IP_CHECKSUM;
+       swqe->ip_start = skb_network_offset(skb);
+       swqe->ip_end = swqe->ip_start + ip_hdrlen(skb) - 1;
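+       /* tcp_offset points at the L4 checksum field so the adapter can insert the checksum */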
+       switch (ip_hdr(skb)->protocol) {
+       case IPPROTO_UDP:
+               if (skb->ip_summed == CHECKSUM_PARTIAL)
+                       swqe->tx_control |= EHEA_SWQE_TCP_CHECKSUM;
+               swqe->tcp_offset = swqe->ip_end + 1 +
+                                  offsetof(struct udphdr, check);
+               break;
+       case IPPROTO_TCP:
+               if (skb->ip_summed == CHECKSUM_PARTIAL)
+                       swqe->tx_control |= EHEA_SWQE_TCP_CHECKSUM;
+               swqe->tcp_offset = swqe->ip_end + 1 +
+                                  offsetof(struct tcphdr, check);
+               break;
+       }
+ }
+ static void ehea_xmit2(struct sk_buff *skb, struct net_device *dev,
+                      struct ehea_swqe *swqe, u32 lkey)
+ {
+       swqe->tx_control |= EHEA_SWQE_DESCRIPTORS_PRESENT;
+       xmit_common(skb, swqe);
+       write_swqe2_data(skb, dev, swqe, lkey);
+ }
+ static void ehea_xmit3(struct sk_buff *skb, struct net_device *dev,
+                      struct ehea_swqe *swqe)
+ {
+       u8 *imm_data = &swqe->u.immdata_nodesc.immediate_data[0];
+       xmit_common(skb, swqe);
+       if (!skb->data_len)
+               skb_copy_from_linear_data(skb, imm_data, skb->len);
+       else
+               skb_copy_bits(skb, 0, imm_data, skb->len);
+       swqe->immediate_data_length = skb->len;
+       dev_kfree_skb(skb);
+ }
+ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_swqe *swqe;
+       u32 lkey;
+       int swqe_index;
+       struct ehea_port_res *pr;
+       struct netdev_queue *txq;
+       pr = &port->port_res[skb_get_queue_mapping(skb)];
+       txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+       swqe = ehea_get_swqe(pr->qp, &swqe_index);
+       memset(swqe, 0, SWQE_HEADER_SIZE);
+       atomic_dec(&pr->swqe_avail);
+       if (vlan_tx_tag_present(skb)) {
+               swqe->tx_control |= EHEA_SWQE_VLAN_INSERT;
+               swqe->vlan_tag = vlan_tx_tag_get(skb);
+       }
+       pr->tx_packets++;
+       pr->tx_bytes += skb->len;
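+       /* Small frames are sent entirely as immediate data (SWQE3);
+        * larger ones use SWQE2 descriptors and keep the skb around
+        * until the send completion frees it.
+        */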
+       if (skb->len <= SWQE3_MAX_IMM) {
+               u32 sig_iv = port->sig_comp_iv;
+               u32 swqe_num = pr->swqe_id_counter;
+               ehea_xmit3(skb, dev, swqe);
+               swqe->wr_id = EHEA_BMASK_SET(EHEA_WR_ID_TYPE, EHEA_SWQE3_TYPE)
+                       | EHEA_BMASK_SET(EHEA_WR_ID_COUNT, swqe_num);
+               if (pr->swqe_ll_count >= (sig_iv - 1)) {
+                       swqe->wr_id |= EHEA_BMASK_SET(EHEA_WR_ID_REFILL,
+                                                     sig_iv);
+                       swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+                       pr->swqe_ll_count = 0;
+               } else
+                       pr->swqe_ll_count += 1;
+       } else {
+               swqe->wr_id =
+                       EHEA_BMASK_SET(EHEA_WR_ID_TYPE, EHEA_SWQE2_TYPE)
+                     | EHEA_BMASK_SET(EHEA_WR_ID_COUNT, pr->swqe_id_counter)
+                     | EHEA_BMASK_SET(EHEA_WR_ID_REFILL, 1)
+                     | EHEA_BMASK_SET(EHEA_WR_ID_INDEX, pr->sq_skba.index);
+               pr->sq_skba.arr[pr->sq_skba.index] = skb;
+               pr->sq_skba.index++;
+               pr->sq_skba.index &= (pr->sq_skba.len - 1);
+               lkey = pr->send_mr.lkey;
+               ehea_xmit2(skb, dev, swqe, lkey);
+               swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+       }
+       pr->swqe_id_counter += 1;
+       netif_info(port, tx_queued, dev,
+                  "post swqe on QP %d\n", pr->qp->init_attr.qp_nr);
+       if (netif_msg_tx_queued(port))
+               ehea_dump(swqe, 512, "swqe");
+       if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+               netif_tx_stop_queue(txq);
+               swqe->tx_control |= EHEA_SWQE_PURGE;
+       }
+       ehea_post_swqe(pr->qp, swqe);
+       if (unlikely(atomic_read(&pr->swqe_avail) <= 1)) {
+               pr->p_stats.queue_stopped++;
+               netif_tx_stop_queue(txq);
+       }
+       return NETDEV_TX_OK;
+ }
+ static void ehea_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_adapter *adapter = port->adapter;
+       struct hcp_ehea_port_cb1 *cb1;
+       int index;
+       u64 hret;
+       cb1 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb1) {
+               pr_err("no mem for cb1\n");
+               goto out;
+       }
+       hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
+                                     H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_port failed\n");
+               goto out;
+       }
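+       /* vlan_filter[] holds one bit per VID, 64 VIDs per u64, MSB first */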
+       index = (vid / 64);
+       cb1->vlan_filter[index] |= ((u64)(0x8000000000000000 >> (vid & 0x3F)));
+       hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
+                                      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+       if (hret != H_SUCCESS)
+               pr_err("modify_ehea_port failed\n");
+ out:
+       free_page((unsigned long)cb1);
+       return;
+ }
+ static void ehea_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_adapter *adapter = port->adapter;
+       struct hcp_ehea_port_cb1 *cb1;
+       int index;
+       u64 hret;
+       cb1 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb1) {
+               pr_err("no mem for cb1\n");
+               goto out;
+       }
+       hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
+                                     H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_port failed\n");
+               goto out;
+       }
+       index = (vid / 64);
+       cb1->vlan_filter[index] &= ~((u64)(0x8000000000000000 >> (vid & 0x3F)));
+       hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
+                                      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+       if (hret != H_SUCCESS)
+               pr_err("modify_ehea_port failed\n");
+ out:
+       free_page((unsigned long)cb1);
+ }
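+ /* Step the QP through INITIALIZED -> ENABLED -> RDY2SND, re-querying
+  * the control block before each state transition.
+  */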
+ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
+ {
+       int ret = -EIO;
+       u64 hret;
+       u16 dummy16 = 0;
+       u64 dummy64 = 0;
+       struct hcp_modify_qp_cb0 *cb0;
+       cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb0) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                   EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_qp failed (1)\n");
+               goto out;
+       }
+       cb0->qp_ctl_reg = H_QP_CR_STATE_INITIALIZED;
+       hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                    EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+                                    &dummy64, &dummy64, &dummy16, &dummy16);
+       if (hret != H_SUCCESS) {
+               pr_err("modify_ehea_qp failed (1)\n");
+               goto out;
+       }
+       hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                   EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_qp failed (2)\n");
+               goto out;
+       }
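+       /* step 2: enable the QP while it stays INITIALIZED */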
+       cb0->qp_ctl_reg = H_QP_CR_ENABLED | H_QP_CR_STATE_INITIALIZED;
+       hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                    EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+                                    &dummy64, &dummy64, &dummy16, &dummy16);
+       if (hret != H_SUCCESS) {
+               pr_err("modify_ehea_qp failed (2)\n");
+               goto out;
+       }
+       hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                   EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_qp failed (3)\n");
+               goto out;
+       }
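+       /* step 3: advance the enabled QP to ready-to-send */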
+       cb0->qp_ctl_reg = H_QP_CR_ENABLED | H_QP_CR_STATE_RDY2SND;
+       hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                    EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+                                    &dummy64, &dummy64, &dummy16, &dummy16);
+       if (hret != H_SUCCESS) {
+               pr_err("modify_ehea_qp failed (3)\n");
+               goto out;
+       }
+       hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                   EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+       if (hret != H_SUCCESS) {
+               pr_err("query_ehea_qp failed (4)\n");
+               goto out;
+       }
+       ret = 0;
+ out:
+       free_page((unsigned long)cb0);
+       return ret;
+ }
+ static int ehea_port_res_setup(struct ehea_port *port, int def_qps)
+ {
+       int ret, i;
+       struct port_res_cfg pr_cfg, pr_cfg_small_rx;
+       enum ehea_eq_type eq_type = EHEA_EQ;
+       port->qp_eq = ehea_create_eq(port->adapter, eq_type,
+                                  EHEA_MAX_ENTRIES_EQ, 1);
+       if (!port->qp_eq) {
+               ret = -EINVAL;
+               pr_err("ehea_create_eq failed (qp_eq)\n");
+               goto out_kill_eq;
+       }
+       pr_cfg.max_entries_rcq = rq1_entries + rq2_entries + rq3_entries;
+       pr_cfg.max_entries_scq = sq_entries * 2;
+       pr_cfg.max_entries_sq = sq_entries;
+       pr_cfg.max_entries_rq1 = rq1_entries;
+       pr_cfg.max_entries_rq2 = rq2_entries;
+       pr_cfg.max_entries_rq3 = rq3_entries;
+       pr_cfg_small_rx.max_entries_rcq = 1;
+       pr_cfg_small_rx.max_entries_scq = sq_entries;
+       pr_cfg_small_rx.max_entries_sq = sq_entries;
+       pr_cfg_small_rx.max_entries_rq1 = 1;
+       pr_cfg_small_rx.max_entries_rq2 = 1;
+       pr_cfg_small_rx.max_entries_rq3 = 1;
+       for (i = 0; i < def_qps; i++) {
+               ret = ehea_init_port_res(port, &port->port_res[i], &pr_cfg, i);
+               if (ret)
+                       goto out_clean_pr;
+       }
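+       /* identical loop bounds: this loop never executes, so pr_cfg_small_rx stays unused */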
+       for (i = def_qps; i < def_qps; i++) {
+               ret = ehea_init_port_res(port, &port->port_res[i],
+                                        &pr_cfg_small_rx, i);
+               if (ret)
+                       goto out_clean_pr;
+       }
+       return 0;
+ out_clean_pr:
+       while (--i >= 0)
+               ehea_clean_portres(port, &port->port_res[i]);
+ out_kill_eq:
+       ehea_destroy_eq(port->qp_eq);
+       return ret;
+ }
+ static int ehea_clean_all_portres(struct ehea_port *port)
+ {
+       int ret = 0;
+       int i;
+       for (i = 0; i < port->num_def_qps; i++)
+               ret |= ehea_clean_portres(port, &port->port_res[i]);
+       ret |= ehea_destroy_eq(port->qp_eq);
+       return ret;
+ }
+ static void ehea_remove_adapter_mr(struct ehea_adapter *adapter)
+ {
+       if (adapter->active_ports)
+               return;
+       ehea_rem_mr(&adapter->mr);
+ }
+ static int ehea_add_adapter_mr(struct ehea_adapter *adapter)
+ {
+       if (adapter->active_ports)
+               return 0;
+       return ehea_reg_kernel_mr(adapter, &adapter->mr);
+ }
+ static int ehea_up(struct net_device *dev)
+ {
+       int ret, i;
+       struct ehea_port *port = netdev_priv(dev);
+       if (port->state == EHEA_PORT_UP)
+               return 0;
+       ret = ehea_port_res_setup(port, port->num_def_qps);
+       if (ret) {
+               netdev_err(dev, "port_res_failed\n");
+               goto out;
+       }
+       /* Set default QP for this port */
+       ret = ehea_configure_port(port);
+       if (ret) {
+               netdev_err(dev, "ehea_configure_port failed. ret:%d\n", ret);
+               goto out_clean_pr;
+       }
+       ret = ehea_reg_interrupts(dev);
+       if (ret) {
+               netdev_err(dev, "reg_interrupts failed. ret:%d\n", ret);
+               goto out_clean_pr;
+       }
+       for (i = 0; i < port->num_def_qps; i++) {
+               ret = ehea_activate_qp(port->adapter, port->port_res[i].qp);
+               if (ret) {
+                       netdev_err(dev, "activate_qp failed\n");
+                       goto out_free_irqs;
+               }
+       }
+       for (i = 0; i < port->num_def_qps; i++) {
+               ret = ehea_fill_port_res(&port->port_res[i]);
+               if (ret) {
+                       netdev_err(dev, "ehea_fill_port_res failed\n");
+                       goto out_free_irqs;
+               }
+       }
+       ret = ehea_broadcast_reg_helper(port, H_REG_BCMC);
+       if (ret) {
+               ret = -EIO;
+               goto out_free_irqs;
+       }
+       port->state = EHEA_PORT_UP;
+       ret = 0;
+       goto out;
+ out_free_irqs:
+       ehea_free_interrupts(dev);
+ out_clean_pr:
+       ehea_clean_all_portres(port);
+ out:
+       if (ret)
+               netdev_info(dev, "Failed starting. ret=%i\n", ret);
+       ehea_update_bcmc_registrations();
+       ehea_update_firmware_handles();
+       return ret;
+ }
+ static void port_napi_disable(struct ehea_port *port)
+ {
+       int i;
+       for (i = 0; i < port->num_def_qps; i++)
+               napi_disable(&port->port_res[i].napi);
+ }
+ static void port_napi_enable(struct ehea_port *port)
+ {
+       int i;
+       for (i = 0; i < port->num_def_qps; i++)
+               napi_enable(&port->port_res[i].napi);
+ }
+ static int ehea_open(struct net_device *dev)
+ {
+       int ret;
+       struct ehea_port *port = netdev_priv(dev);
+       mutex_lock(&port->port_lock);
+       netif_info(port, ifup, dev, "enabling port\n");
+       ret = ehea_up(dev);
+       if (!ret) {
+               port_napi_enable(port);
+               netif_tx_start_all_queues(dev);
+       }
+       mutex_unlock(&port->port_lock);
+       schedule_delayed_work(&port->stats_work, msecs_to_jiffies(1000));
+       return ret;
+ }
+ static int ehea_down(struct net_device *dev)
+ {
+       int ret;
+       struct ehea_port *port = netdev_priv(dev);
+       if (port->state == EHEA_PORT_DOWN)
+               return 0;
+       ehea_drop_multicast_list(dev);
+       ehea_broadcast_reg_helper(port, H_DEREG_BCMC);
+       ehea_free_interrupts(dev);
+       port->state = EHEA_PORT_DOWN;
+       ehea_update_bcmc_registrations();
+       ret = ehea_clean_all_portres(port);
+       if (ret)
+               netdev_info(dev, "Failed freeing resources. ret=%i\n", ret);
+       ehea_update_firmware_handles();
+       return ret;
+ }
+ static int ehea_stop(struct net_device *dev)
+ {
+       int ret;
+       struct ehea_port *port = netdev_priv(dev);
+       netif_info(port, ifdown, dev, "disabling port\n");
+       set_bit(__EHEA_DISABLE_PORT_RESET, &port->flags);
+       cancel_work_sync(&port->reset_task);
+       cancel_delayed_work_sync(&port->stats_work);
+       mutex_lock(&port->port_lock);
+       netif_tx_stop_all_queues(dev);
+       port_napi_disable(port);
+       ret = ehea_down(dev);
+       mutex_unlock(&port->port_lock);
+       clear_bit(__EHEA_DISABLE_PORT_RESET, &port->flags);
+       return ret;
+ }
+ static void ehea_purge_sq(struct ehea_qp *orig_qp)
+ {
+       struct ehea_qp qp = *orig_qp;
+       struct ehea_qp_init_attr *init_attr = &qp.init_attr;
+       struct ehea_swqe *swqe;
+       int wqe_index;
+       int i;
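+       /* mark every send WQE for purge so the hardware discards it instead of transmitting */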
+       for (i = 0; i < init_attr->act_nr_send_wqes; i++) {
+               swqe = ehea_get_swqe(&qp, &wqe_index);
+               swqe->tx_control |= EHEA_SWQE_PURGE;
+       }
+ }
+ static void ehea_flush_sq(struct ehea_port *port)
+ {
+       int i;
+       for (i = 0; i < port->num_def_qps; i++) {
+               struct ehea_port_res *pr = &port->port_res[i];
+               int swqe_max = pr->sq_skba_size - 2 - pr->swqe_ll_count;
+               int ret;
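+               /* wait up to 100ms for the send queue to drain to swqe_max free entries */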
+               ret = wait_event_timeout(port->swqe_avail_wq,
+                        atomic_read(&pr->swqe_avail) >= swqe_max,
+                        msecs_to_jiffies(100));
+               if (!ret) {
+                       pr_err("WARNING: sq not flushed completely\n");
+                       break;
+               }
+       }
+ }
+ int ehea_stop_qps(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_adapter *adapter = port->adapter;
+       struct hcp_modify_qp_cb0 *cb0;
+       int ret = -EIO;
+       int dret;
+       int i;
+       u64 hret;
+       u64 dummy64 = 0;
+       u16 dummy16 = 0;
+       cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb0) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; i < (port->num_def_qps); i++) {
+               struct ehea_port_res *pr =  &port->port_res[i];
+               struct ehea_qp *qp = pr->qp;
+               /* Purge send queue */
+               ehea_purge_sq(qp);
+               /* Disable queue pair */
+               hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                           EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+                                           cb0);
+               if (hret != H_SUCCESS) {
+                       pr_err("query_ehea_qp failed (1)\n");
+                       goto out;
+               }
+               cb0->qp_ctl_reg = (cb0->qp_ctl_reg & H_QP_CR_RES_STATE) << 8;
+               cb0->qp_ctl_reg &= ~H_QP_CR_ENABLED;
+               hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                            EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG,
+                                                           1), cb0, &dummy64,
+                                            &dummy64, &dummy16, &dummy16);
+               if (hret != H_SUCCESS) {
+                       pr_err("modify_ehea_qp failed (1)\n");
+                       goto out;
+               }
+               hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                           EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+                                           cb0);
+               if (hret != H_SUCCESS) {
+                       pr_err("query_ehea_qp failed (2)\n");
+                       goto out;
+               }
+               /* deregister shared memory regions */
+               dret = ehea_rem_smrs(pr);
+               if (dret) {
+                       pr_err("unreg shared memory region failed\n");
+                       goto out;
+               }
+       }
+       ret = 0;
+ out:
+       free_page((unsigned long)cb0);
+       return ret;
+ }
+ void ehea_update_rqs(struct ehea_qp *orig_qp, struct ehea_port_res *pr)
+ {
+       struct ehea_qp qp = *orig_qp;
+       struct ehea_qp_init_attr *init_attr = &qp.init_attr;
+       struct ehea_rwqe *rwqe;
+       struct sk_buff **skba_rq2 = pr->rq2_skba.arr;
+       struct sk_buff **skba_rq3 = pr->rq3_skba.arr;
+       struct sk_buff *skb;
+       u32 lkey = pr->recv_mr.lkey;
+       int i;
+       int index;
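+       /* rewrite the RQ2/RQ3 buffer descriptors with the new memory region key and addresses */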
+       for (i = 0; i < init_attr->act_nr_rwqes_rq2 + 1; i++) {
+               rwqe = ehea_get_next_rwqe(&qp, 2);
+               rwqe->sg_list[0].l_key = lkey;
+               index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, rwqe->wr_id);
+               skb = skba_rq2[index];
+               if (skb)
+                       rwqe->sg_list[0].vaddr = ehea_map_vaddr(skb->data);
+       }
+       for (i = 0; i < init_attr->act_nr_rwqes_rq3 + 1; i++) {
+               rwqe = ehea_get_next_rwqe(&qp, 3);
+               rwqe->sg_list[0].l_key = lkey;
+               index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, rwqe->wr_id);
+               skb = skba_rq3[index];
+               if (skb)
+                       rwqe->sg_list[0].vaddr = ehea_map_vaddr(skb->data);
+       }
+ }
+ int ehea_restart_qps(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
+       struct ehea_adapter *adapter = port->adapter;
+       int ret = 0;
+       int i;
+       struct hcp_modify_qp_cb0 *cb0;
+       u64 hret;
+       u64 dummy64 = 0;
+       u16 dummy16 = 0;
+       cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb0) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       for (i = 0; i < (port->num_def_qps); i++) {
+               struct ehea_port_res *pr =  &port->port_res[i];
+               struct ehea_qp *qp = pr->qp;
+               ret = ehea_gen_smrs(pr);
+               if (ret) {
+                       netdev_err(dev, "creation of shared memory regions failed\n");
+                       goto out;
+               }
+               ehea_update_rqs(qp, pr);
+               /* Enable queue pair */
+               hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                           EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+                                           cb0);
+               if (hret != H_SUCCESS) {
+                       netdev_err(dev, "query_ehea_qp failed (1)\n");
+                       goto out;
+               }
+               cb0->qp_ctl_reg = (cb0->qp_ctl_reg & H_QP_CR_RES_STATE) << 8;
+               cb0->qp_ctl_reg |= H_QP_CR_ENABLED;
+               hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                            EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG,
+                                                           1), cb0, &dummy64,
+                                            &dummy64, &dummy16, &dummy16);
+               if (hret != H_SUCCESS) {
+                       netdev_err(dev, "modify_ehea_qp failed (1)\n");
+                       goto out;
+               }
+               hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+                                           EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+                                           cb0);
+               if (hret != H_SUCCESS) {
+                       netdev_err(dev, "query_ehea_qp failed (2)\n");
+                       goto out;
+               }
+               /* refill entire queue */
+               ehea_refill_rq1(pr, pr->rq1_skba.index, 0);
+               ehea_refill_rq2(pr, 0);
+               ehea_refill_rq3(pr, 0);
+       }
+ out:
+       free_page((unsigned long)cb0);
+       return ret;
+ }
+ static void ehea_reset_port(struct work_struct *work)
+ {
+       int ret;
+       struct ehea_port *port =
+               container_of(work, struct ehea_port, reset_task);
+       struct net_device *dev = port->netdev;
+       mutex_lock(&dlpar_mem_lock);
+       port->resets++;
+       mutex_lock(&port->port_lock);
+       netif_tx_disable(dev);
+       port_napi_disable(port);
+       ehea_down(dev);
+       ret = ehea_up(dev);
+       if (ret)
+               goto out;
+       ehea_set_multicast_list(dev);
+       netif_info(port, timer, dev, "reset successful\n");
+       port_napi_enable(port);
+       netif_tx_wake_all_queues(dev);
+ out:
+       mutex_unlock(&port->port_lock);
+       mutex_unlock(&dlpar_mem_lock);
+ }
+ static void ehea_rereg_mrs(void)
+ {
+       int ret, i;
+       struct ehea_adapter *adapter;
+       pr_info("LPAR memory changed - re-initializing driver\n");
+       list_for_each_entry(adapter, &adapter_list, list)
+               if (adapter->active_ports) {
+                       /* Shutdown all ports */
+                       for (i = 0; i < EHEA_MAX_PORTS; i++) {
+                               struct ehea_port *port = adapter->port[i];
+                               struct net_device *dev;
+                               if (!port)
+                                       continue;
+                               dev = port->netdev;
+                               if (dev->flags & IFF_UP) {
+                                       mutex_lock(&port->port_lock);
+                                       netif_tx_disable(dev);
+                                       ehea_flush_sq(port);
+                                       ret = ehea_stop_qps(dev);
+                                       if (ret) {
+                                               mutex_unlock(&port->port_lock);
+                                               goto out;
+                                       }
+                                       port_napi_disable(port);
+                                       mutex_unlock(&port->port_lock);
+                               }
+                               reset_sq_restart_flag(port);
+                       }
+                       /* Unregister old memory region */
+                       ret = ehea_rem_mr(&adapter->mr);
+                       if (ret) {
+                               pr_err("unregister MR failed - driver inoperable!\n");
+                               goto out;
+                       }
+               }
+       clear_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+       list_for_each_entry(adapter, &adapter_list, list)
+               if (adapter->active_ports) {
+                       /* Register new memory region */
+                       ret = ehea_reg_kernel_mr(adapter, &adapter->mr);
+                       if (ret) {
+                               pr_err("register MR failed - driver inoperable!\n");
+                               goto out;
+                       }
+                       /* Restart all ports */
+                       for (i = 0; i < EHEA_MAX_PORTS; i++) {
+                               struct ehea_port *port = adapter->port[i];
+                               if (port) {
+                                       struct net_device *dev = port->netdev;
+                                       if (dev->flags & IFF_UP) {
+                                               mutex_lock(&port->port_lock);
+                                               ret = ehea_restart_qps(dev);
+                                               if (!ret) {
+                                                       check_sqs(port);
+                                                       port_napi_enable(port);
+                                                       netif_tx_wake_all_queues(dev);
+                                               } else {
+                                                       netdev_err(dev, "Unable to restart QPS\n");
+                                               }
+                                               mutex_unlock(&port->port_lock);
+                                       }
+                               }
+                       }
+               }
+       pr_info("re-initializing driver complete\n");
+ out:
+       return;
+ }
+ static void ehea_tx_watchdog(struct net_device *dev)
+ {
+       struct ehea_port *port = netdev_priv(dev);
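+       /* reset the port only if the link is up and no memory re-registration is stopping transfers */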
+       if (netif_carrier_ok(dev) &&
+           !test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))
+               ehea_schedule_port_reset(port);
+ }
+ int ehea_sense_adapter_attr(struct ehea_adapter *adapter)
+ {
+       struct hcp_query_ehea *cb;
+       u64 hret;
+       int ret;
+       cb = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       hret = ehea_h_query_ehea(adapter->handle, cb);
+       if (hret != H_SUCCESS) {
+               ret = -EIO;
+               goto out_herr;
+       }
+       adapter->max_mc_mac = cb->max_mc_mac - 1;
+       ret = 0;
+ out_herr:
+       free_page((unsigned long)cb);
+ out:
+       return ret;
+ }
+ int ehea_get_jumboframe_status(struct ehea_port *port, int *jumbo)
+ {
+       struct hcp_ehea_port_cb4 *cb4;
+       u64 hret;
+       int ret = 0;
+       *jumbo = 0;
+       /* (Try to) enable jumbo frames */
+       cb4 = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!cb4) {
+               pr_err("no mem for cb4\n");
+               ret = -ENOMEM;
+               goto out;
+       } else {
+               hret = ehea_h_query_ehea_port(port->adapter->handle,
+                                             port->logical_port_id,
+                                             H_PORT_CB4,
+                                             H_PORT_CB4_JUMBO, cb4);
+               if (hret == H_SUCCESS) {
+                       if (cb4->jumbo_frame)
+                               *jumbo = 1;
+                       else {
+                               cb4->jumbo_frame = 1;
+                               hret = ehea_h_modify_ehea_port(port->adapter->
+                                                              handle,
+                                                              port->
+                                                              logical_port_id,
+                                                              H_PORT_CB4,
+                                                              H_PORT_CB4_JUMBO,
+                                                              cb4);
+                               if (hret == H_SUCCESS)
+                                       *jumbo = 1;
+                       }
+               } else
+                       ret = -EINVAL;
+               free_page((unsigned long)cb4);
+       }
+ out:
+       return ret;
+ }
+ static ssize_t ehea_show_port_id(struct device *dev,
+                                struct device_attribute *attr, char *buf)
+ {
+       struct ehea_port *port = container_of(dev, struct ehea_port, ofdev.dev);
+       return sprintf(buf, "%d", port->logical_port_id);
+ }
+ static DEVICE_ATTR(log_port_id, S_IRUSR | S_IRGRP | S_IROTH, ehea_show_port_id,
+                  NULL);
+ static void __devinit logical_port_release(struct device *dev)
+ {
+       struct ehea_port *port = container_of(dev, struct ehea_port, ofdev.dev);
+       of_node_put(port->ofdev.dev.of_node);
+ }
+ static struct device *ehea_register_port(struct ehea_port *port,
+                                        struct device_node *dn)
+ {
+       int ret;
+       port->ofdev.dev.of_node = of_node_get(dn);
+       port->ofdev.dev.parent = &port->adapter->ofdev->dev;
+       port->ofdev.dev.bus = &ibmebus_bus_type;
+       dev_set_name(&port->ofdev.dev, "port%d", port_name_cnt++);
+       port->ofdev.dev.release = logical_port_release;
+       ret = of_device_register(&port->ofdev);
+       if (ret) {
+               pr_err("failed to register device. ret=%d\n", ret);
+               goto out;
+       }
+       ret = device_create_file(&port->ofdev.dev, &dev_attr_log_port_id);
+       if (ret) {
+               pr_err("failed to register attributes, ret=%d\n", ret);
+               goto out_unreg_of_dev;
+       }
+       return &port->ofdev.dev;
+ out_unreg_of_dev:
+       of_device_unregister(&port->ofdev);
+ out:
+       return NULL;
+ }
+ static void ehea_unregister_port(struct ehea_port *port)
+ {
+       device_remove_file(&port->ofdev.dev, &dev_attr_log_port_id);
+       of_device_unregister(&port->ofdev);
+ }
+ static const struct net_device_ops ehea_netdev_ops = {
+       .ndo_open               = ehea_open,
+       .ndo_stop               = ehea_stop,
+       .ndo_start_xmit         = ehea_start_xmit,
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+       .ndo_poll_controller    = ehea_netpoll,
+ #endif
+       .ndo_get_stats64        = ehea_get_stats64,
+       .ndo_set_mac_address    = ehea_set_mac_addr,
+       .ndo_validate_addr      = eth_validate_addr,
+       .ndo_set_rx_mode        = ehea_set_multicast_list,
+       .ndo_change_mtu         = ehea_change_mtu,
+       .ndo_vlan_rx_add_vid    = ehea_vlan_rx_add_vid,
+       .ndo_vlan_rx_kill_vid   = ehea_vlan_rx_kill_vid,
+       .ndo_tx_timeout         = ehea_tx_watchdog,
+ };
+ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
+                                        u32 logical_port_id,
+                                        struct device_node *dn)
+ {
+       int ret;
+       struct net_device *dev;
+       struct ehea_port *port;
+       struct device *port_dev;
+       int jumbo;
+       /* allocate memory for the port structures */
+       dev = alloc_etherdev_mq(sizeof(struct ehea_port), EHEA_MAX_PORT_RES);
+       if (!dev) {
+               pr_err("no mem for net_device\n");
+               ret = -ENOMEM;
+               goto out_err;
+       }
+       port = netdev_priv(dev);
+       mutex_init(&port->port_lock);
+       port->state = EHEA_PORT_DOWN;
+       port->sig_comp_iv = sq_entries / 10;
+       port->adapter = adapter;
+       port->netdev = dev;
+       port->logical_port_id = logical_port_id;
+       port->msg_enable = netif_msg_init(msg_level, EHEA_MSG_DEFAULT);
+       port->mc_list = kzalloc(sizeof(struct ehea_mc_list), GFP_KERNEL);
+       if (!port->mc_list) {
+               ret = -ENOMEM;
+               goto out_free_ethdev;
+       }
+       INIT_LIST_HEAD(&port->mc_list->list);
+       ret = ehea_sense_port_attr(port);
+       if (ret)
+               goto out_free_mc_list;
+       netif_set_real_num_rx_queues(dev, port->num_def_qps);
+       netif_set_real_num_tx_queues(dev, port->num_def_qps);
+       port_dev = ehea_register_port(port, dn);
+       if (!port_dev)
+               goto out_free_mc_list;
+       SET_NETDEV_DEV(dev, port_dev);
+       /* initialize net_device structure */
+       memcpy(dev->dev_addr, &port->mac_addr, ETH_ALEN);
+       dev->netdev_ops = &ehea_netdev_ops;
+       ehea_set_ethtool_ops(dev);
+       dev->hw_features = NETIF_F_SG | NETIF_F_TSO
+                     | NETIF_F_IP_CSUM | NETIF_F_HW_VLAN_TX | NETIF_F_LRO;
+       dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO
+                     | NETIF_F_HIGHDMA | NETIF_F_IP_CSUM | NETIF_F_HW_VLAN_TX
+                     | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER
+                     | NETIF_F_RXCSUM;
+       dev->vlan_features = NETIF_F_SG | NETIF_F_TSO | NETIF_F_HIGHDMA |
+                       NETIF_F_IP_CSUM;
+       dev->watchdog_timeo = EHEA_WATCH_DOG_TIMEOUT;
+       INIT_WORK(&port->reset_task, ehea_reset_port);
+       INIT_DELAYED_WORK(&port->stats_work, ehea_update_stats);
+       init_waitqueue_head(&port->swqe_avail_wq);
+       init_waitqueue_head(&port->restart_wq);
+       memset(&port->stats, 0, sizeof(struct net_device_stats));
+       ret = register_netdev(dev);
+       if (ret) {
+               pr_err("register_netdev failed. ret=%d\n", ret);
+               goto out_unreg_port;
+       }
+       ret = ehea_get_jumboframe_status(port, &jumbo);
+       if (ret)
+               netdev_err(dev, "failed determining jumbo frame status\n");
+       netdev_info(dev, "Jumbo frames are %sabled\n",
+                   jumbo == 1 ? "en" : "dis");
+       adapter->active_ports++;
+       return port;
+ out_unreg_port:
+       ehea_unregister_port(port);
+ out_free_mc_list:
+       kfree(port->mc_list);
+ out_free_ethdev:
+       free_netdev(dev);
+ out_err:
+       pr_err("setting up logical port with id=%d failed, ret=%d\n",
+              logical_port_id, ret);
+       return NULL;
+ }
+ static void ehea_shutdown_single_port(struct ehea_port *port)
+ {
+       struct ehea_adapter *adapter = port->adapter;
+       cancel_work_sync(&port->reset_task);
+       cancel_delayed_work_sync(&port->stats_work);
+       unregister_netdev(port->netdev);
+       ehea_unregister_port(port);
+       kfree(port->mc_list);
+       free_netdev(port->netdev);
+       adapter->active_ports--;
+ }
+ static int ehea_setup_ports(struct ehea_adapter *adapter)
+ {
+       struct device_node *lhea_dn;
+       struct device_node *eth_dn = NULL;
+       const u32 *dn_log_port_id;
+       int i = 0;
+       lhea_dn = adapter->ofdev->dev.of_node;
+       while ((eth_dn = of_get_next_child(lhea_dn, eth_dn))) {
+               dn_log_port_id = of_get_property(eth_dn, "ibm,hea-port-no",
+                                                NULL);
+               if (!dn_log_port_id) {
+                       pr_err("bad device node: eth_dn name=%s\n",
+                              eth_dn->full_name);
+                       continue;
+               }
+               if (ehea_add_adapter_mr(adapter)) {
+                       pr_err("creating MR failed\n");
+                       of_node_put(eth_dn);
+                       return -EIO;
+               }
+               adapter->port[i] = ehea_setup_single_port(adapter,
+                                                         *dn_log_port_id,
+                                                         eth_dn);
+               if (adapter->port[i])
+                       netdev_info(adapter->port[i]->netdev,
+                                   "logical port id #%d\n", *dn_log_port_id);
+               else
+                       ehea_remove_adapter_mr(adapter);
+               i++;
+       }
+       return 0;
+ }
+ static struct device_node *ehea_get_eth_dn(struct ehea_adapter *adapter,
+                                          u32 logical_port_id)
+ {
+       struct device_node *lhea_dn;
+       struct device_node *eth_dn = NULL;
+       const u32 *dn_log_port_id;
+       lhea_dn = adapter->ofdev->dev.of_node;
+       while ((eth_dn = of_get_next_child(lhea_dn, eth_dn))) {
+               dn_log_port_id = of_get_property(eth_dn, "ibm,hea-port-no",
+                                                NULL);
+               if (dn_log_port_id)
+                       if (*dn_log_port_id == logical_port_id)
+                               return eth_dn;
+       }
+       return NULL;
+ }
+ static ssize_t ehea_probe_port(struct device *dev,
+                              struct device_attribute *attr,
+                              const char *buf, size_t count)
+ {
+       struct ehea_adapter *adapter = dev_get_drvdata(dev);
+       struct ehea_port *port;
+       struct device_node *eth_dn = NULL;
+       int i;
+       u32 logical_port_id;
+       sscanf(buf, "%d", &logical_port_id);
+       port = ehea_get_port(adapter, logical_port_id);
+       if (port) {
+               netdev_info(port->netdev, "adding port with logical port id=%d failed: port already configured\n",
+                           logical_port_id);
+               return -EINVAL;
+       }
+       eth_dn = ehea_get_eth_dn(adapter, logical_port_id);
+       if (!eth_dn) {
+               pr_info("no logical port with id %d found\n", logical_port_id);
+               return -EINVAL;
+       }
+       if (ehea_add_adapter_mr(adapter)) {
+               pr_err("creating MR failed\n");
+               return -EIO;
+       }
+       port = ehea_setup_single_port(adapter, logical_port_id, eth_dn);
+       of_node_put(eth_dn);
+       if (port) {
+               for (i = 0; i < EHEA_MAX_PORTS; i++)
+                       if (!adapter->port[i]) {
+                               adapter->port[i] = port;
+                               break;
+                       }
+               netdev_info(port->netdev, "added: (logical port id=%d)\n",
+                           logical_port_id);
+       } else {
+               ehea_remove_adapter_mr(adapter);
+               return -EIO;
+       }
+       return (ssize_t) count;
+ }
+ static ssize_t ehea_remove_port(struct device *dev,
+                               struct device_attribute *attr,
+                               const char *buf, size_t count)
+ {
+       struct ehea_adapter *adapter = dev_get_drvdata(dev);
+       struct ehea_port *port;
+       int i;
+       u32 logical_port_id;
+       sscanf(buf, "%d", &logical_port_id);
+       port = ehea_get_port(adapter, logical_port_id);
+       if (port) {
+               netdev_info(port->netdev, "removed: (logical port id=%d)\n",
+                           logical_port_id);
+               ehea_shutdown_single_port(port);
+               for (i = 0; i < EHEA_MAX_PORTS; i++)
+                       if (adapter->port[i] == port) {
+                               adapter->port[i] = NULL;
+                               break;
+                       }
+       } else {
+               pr_err("removing port with logical port id=%d failed. port not configured.\n",
+                      logical_port_id);
+               return -EINVAL;
+       }
+       ehea_remove_adapter_mr(adapter);
+       return (ssize_t) count;
+ }
+ static DEVICE_ATTR(probe_port, S_IWUSR, NULL, ehea_probe_port);
+ static DEVICE_ATTR(remove_port, S_IWUSR, NULL, ehea_remove_port);
+ int ehea_create_device_sysfs(struct platform_device *dev)
+ {
+       int ret = device_create_file(&dev->dev, &dev_attr_probe_port);
+       if (ret)
+               goto out;
+       ret = device_create_file(&dev->dev, &dev_attr_remove_port);
+ out:
+       return ret;
+ }
+ void ehea_remove_device_sysfs(struct platform_device *dev)
+ {
+       device_remove_file(&dev->dev, &dev_attr_probe_port);
+       device_remove_file(&dev->dev, &dev_attr_remove_port);
+ }
+ static int __devinit ehea_probe_adapter(struct platform_device *dev,
+                                       const struct of_device_id *id)
+ {
+       struct ehea_adapter *adapter;
+       const u64 *adapter_handle;
+       int ret;
+       if (!dev || !dev->dev.of_node) {
+               pr_err("Invalid ibmebus device probed\n");
+               return -EINVAL;
+       }
+       adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+       if (!adapter) {
+               ret = -ENOMEM;
+               dev_err(&dev->dev, "no mem for ehea_adapter\n");
+               goto out;
+       }
+       list_add(&adapter->list, &adapter_list);
+       adapter->ofdev = dev;
+       adapter_handle = of_get_property(dev->dev.of_node, "ibm,hea-handle",
+                                        NULL);
+       if (adapter_handle)
+               adapter->handle = *adapter_handle;
+       if (!adapter->handle) {
+               dev_err(&dev->dev, "failed getting handle for adapter"
+                       " '%s'\n", dev->dev.of_node->full_name);
+               ret = -ENODEV;
+               goto out_free_ad;
+       }
+       adapter->pd = EHEA_PD_ID;
+       dev_set_drvdata(&dev->dev, adapter);
+       /* initialize adapter and ports */
+       /* get adapter properties */
+       ret = ehea_sense_adapter_attr(adapter);
+       if (ret) {
+               dev_err(&dev->dev, "sense_adapter_attr failed: %d\n", ret);
+               goto out_free_ad;
+       }
+       adapter->neq = ehea_create_eq(adapter,
+                                     EHEA_NEQ, EHEA_MAX_ENTRIES_EQ, 1);
+       if (!adapter->neq) {
+               ret = -EIO;
+               dev_err(&dev->dev, "NEQ creation failed\n");
+               goto out_free_ad;
+       }
+       tasklet_init(&adapter->neq_tasklet, ehea_neq_tasklet,
+                    (unsigned long)adapter);
+       ret = ibmebus_request_irq(adapter->neq->attr.ist1,
+                                 ehea_interrupt_neq, IRQF_DISABLED,
+                                 "ehea_neq", adapter);
+       if (ret) {
+               dev_err(&dev->dev, "requesting NEQ IRQ failed\n");
+               goto out_kill_eq;
+       }
+       ret = ehea_create_device_sysfs(dev);
+       if (ret)
+               goto out_free_irq;
+       ret = ehea_setup_ports(adapter);
+       if (ret) {
+               dev_err(&dev->dev, "setup_ports failed\n");
+               goto out_rem_dev_sysfs;
+       }
+       ret = 0;
+       goto out;
+ out_rem_dev_sysfs:
+       ehea_remove_device_sysfs(dev);
+ out_free_irq:
+       ibmebus_free_irq(adapter->neq->attr.ist1, adapter);
+ out_kill_eq:
+       ehea_destroy_eq(adapter->neq);
+ out_free_ad:
+       list_del(&adapter->list);
+       kfree(adapter);
+ out:
+       ehea_update_firmware_handles();
+       return ret;
+ }
+ static int __devexit ehea_remove(struct platform_device *dev)
+ {
+       struct ehea_adapter *adapter = dev_get_drvdata(&dev->dev);
+       int i;
+       for (i = 0; i < EHEA_MAX_PORTS; i++)
+               if (adapter->port[i]) {
+                       ehea_shutdown_single_port(adapter->port[i]);
+                       adapter->port[i] = NULL;
+               }
+       ehea_remove_device_sysfs(dev);
+       ibmebus_free_irq(adapter->neq->attr.ist1, adapter);
+       tasklet_kill(&adapter->neq_tasklet);
+       ehea_destroy_eq(adapter->neq);
+       ehea_remove_adapter_mr(adapter);
+       list_del(&adapter->list);
+       kfree(adapter);
+       ehea_update_firmware_handles();
+       return 0;
+ }
+ void ehea_crash_handler(void)
+ {
+       int i;
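+       /* on crash, force-free all registered firmware handles and deregister BCMC entries */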
+       if (ehea_fw_handles.arr)
+               for (i = 0; i < ehea_fw_handles.num_entries; i++)
+                       ehea_h_free_resource(ehea_fw_handles.arr[i].adh,
+                                            ehea_fw_handles.arr[i].fwh,
+                                            FORCE_FREE);
+       if (ehea_bcmc_regs.arr)
+               for (i = 0; i < ehea_bcmc_regs.num_entries; i++)
+                       ehea_h_reg_dereg_bcmc(ehea_bcmc_regs.arr[i].adh,
+                                             ehea_bcmc_regs.arr[i].port_id,
+                                             ehea_bcmc_regs.arr[i].reg_type,
+                                             ehea_bcmc_regs.arr[i].macaddr,
+                                             0, H_DEREG_BCMC);
+ }
+ static int ehea_mem_notifier(struct notifier_block *nb,
+                              unsigned long action, void *data)
+ {
+       int ret = NOTIFY_BAD;
+       struct memory_notify *arg = data;
+       mutex_lock(&dlpar_mem_lock);
+       switch (action) {
+       case MEM_CANCEL_OFFLINE:
+               pr_info("memory offlining canceled");
+               /* Readd canceled memory block */
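+               /* fall through */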
+       case MEM_ONLINE:
+               pr_info("memory is going online");
+               set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+               if (ehea_add_sect_bmap(arg->start_pfn, arg->nr_pages))
+                       goto out_unlock;
+               ehea_rereg_mrs();
+               break;
+       case MEM_GOING_OFFLINE:
+               pr_info("memory is going offline");
+               set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+               if (ehea_rem_sect_bmap(arg->start_pfn, arg->nr_pages))
+                       goto out_unlock;
+               ehea_rereg_mrs();
+               break;
+       default:
+               break;
+       }
+       ehea_update_firmware_handles();
+       ret = NOTIFY_OK;
+ out_unlock:
+       mutex_unlock(&dlpar_mem_lock);
+       return ret;
+ }
+ static struct notifier_block ehea_mem_nb = {
+       .notifier_call = ehea_mem_notifier,
+ };
+ static int ehea_reboot_notifier(struct notifier_block *nb,
+                               unsigned long action, void *unused)
+ {
+       if (action == SYS_RESTART) {
+               pr_info("Reboot: freeing all eHEA resources\n");
+               ibmebus_unregister_driver(&ehea_driver);
+       }
+       return NOTIFY_DONE;
+ }
+ static struct notifier_block ehea_reboot_nb = {
+       .notifier_call = ehea_reboot_notifier,
+ };
+ static int check_module_parm(void)
+ {
+       int ret = 0;
+       if ((rq1_entries < EHEA_MIN_ENTRIES_QP) ||
+           (rq1_entries > EHEA_MAX_ENTRIES_RQ1)) {
+               pr_info("Bad parameter: rq1_entries\n");
+               ret = -EINVAL;
+       }
+       if ((rq2_entries < EHEA_MIN_ENTRIES_QP) ||
+           (rq2_entries > EHEA_MAX_ENTRIES_RQ2)) {
+               pr_info("Bad parameter: rq2_entries\n");
+               ret = -EINVAL;
+       }
+       if ((rq3_entries < EHEA_MIN_ENTRIES_QP) ||
+           (rq3_entries > EHEA_MAX_ENTRIES_RQ3)) {
+               pr_info("Bad parameter: rq3_entries\n");
+               ret = -EINVAL;
+       }
+       if ((sq_entries < EHEA_MIN_ENTRIES_QP) ||
+           (sq_entries > EHEA_MAX_ENTRIES_SQ)) {
+               pr_info("Bad parameter: sq_entries\n");
+               ret = -EINVAL;
+       }
+       return ret;
+ }
+ static ssize_t ehea_show_capabilities(struct device_driver *drv,
+                                     char *buf)
+ {
+       return sprintf(buf, "%d", EHEA_CAPABILITIES);
+ }
+ static DRIVER_ATTR(capabilities, S_IRUSR | S_IRGRP | S_IROTH,
+                  ehea_show_capabilities, NULL);
+ int __init ehea_module_init(void)
+ {
+       int ret;
+       pr_info("IBM eHEA ethernet device driver (Release %s)\n", DRV_VERSION);
+       memset(&ehea_fw_handles, 0, sizeof(ehea_fw_handles));
+       memset(&ehea_bcmc_regs, 0, sizeof(ehea_bcmc_regs));
+       mutex_init(&ehea_fw_handles.lock);
+       spin_lock_init(&ehea_bcmc_regs.lock);
+       ret = check_module_parm();
+       if (ret)
+               goto out;
+       ret = ehea_create_busmap();
+       if (ret)
+               goto out;
+       ret = register_reboot_notifier(&ehea_reboot_nb);
+       if (ret)
+               pr_info("failed registering reboot notifier\n");
+       ret = register_memory_notifier(&ehea_mem_nb);
+       if (ret)
+               pr_info("failed registering memory remove notifier\n");
+       ret = crash_shutdown_register(ehea_crash_handler);
+       if (ret)
+               pr_info("failed registering crash handler\n");
+       ret = ibmebus_register_driver(&ehea_driver);
+       if (ret) {
+               pr_err("failed registering eHEA device driver on ebus\n");
+               goto out2;
+       }
+       ret = driver_create_file(&ehea_driver.driver,
+                                &driver_attr_capabilities);
+       if (ret) {
+               pr_err("failed to register capabilities attribute, ret=%d\n",
+                      ret);
+               goto out3;
+       }
+       return ret;
+ out3:
+       ibmebus_unregister_driver(&ehea_driver);
+ out2:
+       unregister_memory_notifier(&ehea_mem_nb);
+       unregister_reboot_notifier(&ehea_reboot_nb);
+       crash_shutdown_unregister(ehea_crash_handler);
+ out:
+       return ret;
+ }
+ static void __exit ehea_module_exit(void)
+ {
+       int ret;
+       driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities);
+       ibmebus_unregister_driver(&ehea_driver);
+       unregister_reboot_notifier(&ehea_reboot_nb);
+       ret = crash_shutdown_unregister(ehea_crash_handler);
+       if (ret)
+               pr_info("failed unregistering crash handler\n");
+       unregister_memory_notifier(&ehea_mem_nb);
+       kfree(ehea_fw_handles.arr);
+       kfree(ehea_bcmc_regs.arr);
+       ehea_destroy_busmap();
+ }
+ module_init(ehea_module_init);
+ module_exit(ehea_module_exit);
Simple merge
@@@ -440,8 -441,16 +441,16 @@@ int scsi_dh_activate(struct request_que
        struct device *dev = NULL;
  
        spin_lock_irqsave(q->queue_lock, flags);
 -      sdev = q->queuedata;
 +      sdev = scsi_device_from_queue(q);
-       if (sdev && sdev->scsi_dh_data)
+       if (!sdev) {
+               spin_unlock_irqrestore(q->queue_lock, flags);
+               err = SCSI_DH_NOSYS;
+               if (fn)
+                       fn(data, err);
+               return err;
+       }
+       if (sdev->scsi_dh_data)
                scsi_dh = sdev->scsi_dh_data->scsi_dh;
        dev = get_device(&sdev->sdev_gendev);
        if (!scsi_dh || !dev ||
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
  #include <linux/notifier.h>
  #include <linux/jiffies.h>
  
+ #include <asm/irq_regs.h>
 +#include <linux/bootsplash.h>
 +
  extern void ctrl_alt_del(void);
  
  /*
Simple merge
Simple merge
Simple merge
Simple merge
diff --cc fs/Kconfig
Simple merge
diff --cc fs/Makefile
Simple merge
diff --cc fs/ext4/ext4.h
Simple merge
diff --cc fs/ext4/file.c
Simple merge
Simple merge
diff --cc fs/ext4/inode.c
@@@ -42,9 -42,7 +42,8 @@@
  #include "ext4_jbd2.h"
  #include "xattr.h"
  #include "acl.h"
- #include "ext4_extents.h"
  #include "truncate.h"
 +#include "richacl.h"
  
  #include <trace/events/ext4.h>
  
@@@ -3415,12 -3791,9 +3792,12 @@@ struct inode *ext4_iget(struct super_bl
                inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
-       inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+       set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
  
        ext4_clear_state_flags(ei);     /* Only relevant on 32-bit archs */
 +#ifdef CONFIG_EXT4_FS_RICHACL
 +      ei->i_richacl = EXT4_RICHACL_NOT_CACHED;
 +#endif
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
diff --cc fs/ext4/namei.c
Simple merge
diff --cc fs/ext4/super.c
@@@ -3247,6 -3253,36 +3276,33 @@@ static int ext4_fill_super(struct super
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
  
+       if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+               printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+                           "with data=journal disables delayed "
+                           "allocation and O_DIRECT support!\n");
+               if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and delalloc");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DIOREAD_NOLOCK)) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "both data=journal and dioread_nolock");
+                       goto failed_mount;
+               }
+               if (test_opt(sb, DELALLOC))
+                       clear_opt(sb, DELALLOC);
+       }
+       blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+       if (test_opt(sb, DIOREAD_NOLOCK)) {
+               if (blocksize < PAGE_SIZE) {
+                       ext4_msg(sb, KERN_ERR, "can't mount with "
+                                "dioread_nolock if block size != PAGE_SIZE");
+                       goto failed_mount;
+               }
+       }
 -      sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 -              (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
 -
        if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
            (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
             EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
diff --cc fs/ext4/xattr.c
Simple merge
diff --cc fs/namei.c
Simple merge
diff --cc fs/nfs/inode.c
Simple merge
diff --cc fs/super.c
@@@ -722,8 -727,13 +722,13 @@@ static int __do_remount_sb(struct super
  
        if (sb->s_op->remount_fs) {
                retval = sb->s_op->remount_fs(sb, &flags, data);
-               if (retval)
-                       return retval;
+               if (retval) {
 -                      if (!force)
++                      if (!(rflags & REMOUNT_FORCE))
+                               return retval;
+                       /* If forced remount, go ahead despite any errors */
+                       WARN(1, "forced remount of a %s fs returned %i\n",
+                            sb->s_type->name, retval);
+               }
        }
        sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
  
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -785,95 -803,58 +803,93 @@@ extern const char *dev_driver_string(co
  
  #ifdef CONFIG_PRINTK
  
+ extern int __dev_printk(const char *level, const struct device *dev,
+                       struct va_format *vaf);
+ extern __printf(3, 4)
++
 +#if defined(KMSG_COMPONENT) && (defined(CONFIG_KMSG_IDS) || defined(__KMSG_CHECKER))
 +/* dev_printk_hash for message documentation */
 +#if defined(__KMSG_CHECKER) && defined(KMSG_COMPONENT)
 +
 +/* generate magic string for scripts/kmsg-doc to parse */
 +#define dev_printk_hash(level, dev, format, arg...) \
 +      __KMSG_DEV(level _FMT_ format _ARGS_ dev, ## arg _END_)
 +
 +#elif defined(CONFIG_KMSG_IDS) && defined(KMSG_COMPONENT)
 +
 +int printk_dev_hash(const char *, const char *, const char *, ...);
 +#define dev_printk_hash(level, dev, format, arg...) \
 +      printk_dev_hash(level "%s.%06x: ", dev_driver_string(dev), \
 +                      "%s: " format, dev_name(dev), ## arg)
 +
 +#endif
 +
 +#define dev_printk(level, dev, format, arg...)                \
 +      dev_printk_hash(level , dev, format, ## arg)
 +#define dev_emerg(dev, format, arg...)                \
 +      dev_printk_hash(KERN_EMERG , dev , format , ## arg)
 +#define dev_alert(dev, format, arg...)                \
 +      dev_printk_hash(KERN_ALERT , dev , format , ## arg)
 +#define dev_crit(dev, format, arg...)         \
 +      dev_printk_hash(KERN_CRIT , dev , format , ## arg)
 +#define dev_err(dev, format, arg...)          \
 +      dev_printk_hash(KERN_ERR , dev , format , ## arg)
 +#define dev_warn(dev, format, arg...)         \
 +      dev_printk_hash(KERN_WARNING , dev , format , ## arg)
 +#define dev_notice(dev, format, arg...)               \
 +      dev_printk_hash(KERN_NOTICE , dev , format , ## arg)
 +#define _dev_info(dev, format, arg...)                \
 +      dev_printk_hash(KERN_INFO , dev , format , ## arg)
 +#else
- extern int dev_printk(const char *level, const struct device *dev,
-                     const char *fmt, ...)
-       __attribute__ ((format (printf, 3, 4)));
- extern int dev_emerg(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int dev_alert(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int dev_crit(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int dev_err(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int dev_warn(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int dev_notice(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- extern int _dev_info(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
+ int dev_printk(const char *level, const struct device *dev,
+              const char *fmt, ...)
+       ;
+ extern __printf(2, 3)
+ int dev_emerg(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_alert(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_crit(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_err(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_warn(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_notice(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int _dev_info(const struct device *dev, const char *fmt, ...);
 -
 +#endif
  #else
  
- static inline int dev_printk(const char *level, const struct device *dev,
-                     const char *fmt, ...)
-       __attribute__ ((format (printf, 3, 4)));
- static inline int dev_printk(const char *level, const struct device *dev,
-                     const char *fmt, ...)
-        { return 0; }
- static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int dev_crit(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_crit(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int dev_alert(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_alert(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int dev_err(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_err(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int dev_warn(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_warn(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int dev_notice(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int dev_notice(const struct device *dev, const char *fmt, ...)
-       { return 0; }
- static inline int _dev_info(const struct device *dev, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
- static inline int _dev_info(const struct device *dev, const char *fmt, ...)
-       { return 0; }
+ static inline int __dev_printk(const char *level, const struct device *dev,
+                              struct va_format *vaf)
+ { return 0; }
+ static inline __printf(3, 4)
+ int dev_printk(const char *level, const struct device *dev,
+              const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_emerg(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_crit(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_alert(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_err(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_warn(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_notice(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int _dev_info(const struct device *dev, const char *fmt, ...)
+ { return 0; }
  
  #endif
  
Simple merge
@@@ -371,16 -371,8 +373,17 @@@ extern enum system_states 
  #define TAINT_WARN                    9
  #define TAINT_CRAP                    10
  #define TAINT_FIRMWARE_WORKAROUND     11
+ #define TAINT_OOT_MODULE              12
  
 +#ifdef CONFIG_ENTERPRISE_SUPPORT
 +/*
 + * Take the upper bits to hopefully allow them
 + * to stay the same for more than one release.
 + */
 +#define TAINT_NO_SUPPORT              30
 +#define TAINT_EXTERNAL_SUPPORT                31
 +#endif
 +
  extern const char hex_asc[];
  #define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
  #define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
Simple merge
Simple merge
@@@ -230,10 -229,7 +229,11 @@@ struct nfs_inode 
  #define NFS_INO_COMMIT                (7)             /* inode is committing unstable writes */
  #define NFS_INO_PNFS_COMMIT   (8)             /* use pnfs code for commit */
  #define NFS_INO_LAYOUTCOMMIT  (9)             /* layoutcommit required */
- #define NFS_INO_SEEN_GETATTR  (10)            /* flag to track if app is calling
+ #define NFS_INO_LAYOUTCOMMITTING (10)         /* layoutcommit inflight */
++#define NFS_INO_SEEN_GETATTR  (11)            /* flag to track if app is calling
 +                                               * getattr in a directory during
 +                                               * readdir
 +                                               */
  
  static inline struct nfs_inode *NFS_I(const struct inode *inode)
  {
Simple merge
diff --cc init/Kconfig
Simple merge
diff --cc init/main.c
Simple merge
Simple merge
diff --cc kernel/Makefile
Simple merge
diff --cc kernel/ksysfs.c
Simple merge
diff --cc kernel/module.c
Simple merge
diff --cc kernel/panic.c
@@@ -177,10 -177,7 +177,11 @@@ static const struct tnt tnts[] = 
        { TAINT_WARN,                   'W', ' ' },
        { TAINT_CRAP,                   'C', ' ' },
        { TAINT_FIRMWARE_WORKAROUND,    'I', ' ' },
+       { TAINT_OOT_MODULE,             'O', ' ' },
 +#ifdef CONFIG_ENTERPRISE_SUPPORT
 +      { TAINT_NO_SUPPORT,             'N', ' ' },
 +      { TAINT_EXTERNAL_SUPPORT,       'X', ' ' },
 +#endif
  };
  
  /**
   *  'W' - Taint on warning.
   *  'C' - modules from drivers/staging are loaded.
   *  'I' - Working around severe firmware bug.
+  *  'O' - Out-of-tree module has been loaded.
 + *  'N' - Unsupported modules loaded.
 + *  'X' - Modules with external support loaded.
   *
   *    The string is overwritten by the next call to print_tainted().
   */
diff --cc kernel/printk.c
Simple merge
diff --cc kernel/sysctl.c
Simple merge
Simple merge
@@@ -1099,8 -1093,7 +1111,8 @@@ config FAULT_INJECTION_STACKTRACE_FILTE
        depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
        depends on !X86_64
        select STACKTRACE
-       select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !X86
 -      select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
++      select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
 +      select UNWIND_INFO if X86 && !FRAME_POINTER
        help
          Provide stacktrace filter for fault-injection capabilities
  
@@@ -1110,8 -1103,7 +1122,8 @@@ config LATENCYTO
        depends on DEBUG_KERNEL
        depends on STACKTRACE_SUPPORT
        depends on PROC_FS
-       select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86
 -      select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
++      select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
 +      select UNWIND_INFO if X86 && !FRAME_POINTER
        select KALLSYMS
        select KALLSYMS_ALL
        select STACKTRACE
diff --cc mm/page_alloc.c
@@@ -1791,14 -1785,8 +1785,14 @@@ void warn_alloc_failed(gfp_t gfp_mask, 
                va_end(args);
        }
  
 -      pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
 +      if (!(gfp_mask & __GFP_WAIT)) {
 +              pr_info("The following is only a harmless informational message.\n");
 +              pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
 +              pr_info("everything is working fine. Allocations from irqs cannot be\n");
 +              pr_info("perfectly reliable and the kernel is designed to handle that.\n");
 +      }
 +      pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
-                  current->comm, order, gfp_mask);
+               current->comm, order, gfp_mask);
  
        dump_stack();
        if (!should_suppress_show_mem())
diff --cc mm/thrash.c
Simple merge
diff --cc mm/truncate.c
Simple merge
Simple merge
Simple merge
Simple merge
@@@ -40,10 -40,11 +40,11 @@@ static struct symbol *symtab[HASH_BUCKE
  static FILE *debugfile;
  
  int cur_line = 1;
- char *cur_filename;
+ char *cur_filename, *source_file;
+ int in_source_file;
  
  static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types,
 -         flag_preserve, flag_warnings;
 +         flag_override, flag_preserve, flag_warnings;
  static const char *arch = "";
  static const char *mod_prefix = "";
  
Simple merge
@@@ -2231,8 -2175,8 +2237,9 @@@ int main(int argc, char **argv
                buf.pos = 0;
  
                add_header(&buf, mod);
+               add_intree_flag(&buf, !external_module);
                add_staging_flag(&buf, mod->name);
 +              add_supported_flag(&buf, mod);
                err |= add_versions(&buf, mod);
                add_depends(&buf, mod, modules);
                add_moddevtable(&buf, mod);
Simple merge
Simple merge
Simple merge