}
return 0;
}
- #endif
+/*
+ * DMI match callback: make ACPI use the RSDT on the matched system.
+ *
+ * When acpi_force is set the blacklist entry is overruled and only a
+ * notice is printed; otherwise acpi_rsdt_forced is set to 1.
+ * Always returns 0.
+ */
+static int __init force_acpi_rsdt(const struct dmi_system_id *d)
+{
+	if (acpi_force) {
+		printk(KERN_NOTICE
+		       "Warning: acpi=force overrules DMI blacklist: "
+		       "acpi=rsdt\n");
+		return 0;
+	}
+
+	printk(KERN_NOTICE "%s detected: force use of acpi=rsdt\n",
+	       d->ident);
+	acpi_rsdt_forced = 1;
+	return 0;
+}
+
/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact linux-acpi@vger.kernel.org
CFI_ENDPROC
END(call_softirq)
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running) /* fill the register frame at (%rdi), then jump to the address passed in %rsi */
+ CFI_STARTPROC
+ movq %r15, R15(%rdi) /* r12-r15, rbp and rbx are stored with their current values */
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx /* argument rotation, step 1: %rdx = incoming %rsi, %rsi = incoming %rdx */
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax /* %rax = 0; written below to every slot that is not captured */
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %r9 /* value on top of the stack (our return address) */
+ xchgq %rdx, %rcx /* step 2: %rcx = incoming %rsi (jump target), %rdx = incoming %rcx */
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %r9, RIP(%rdi) /* recorded RIP = return address loaded above */
+ leaq 8(%rsp), %r9 /* %rsp as it will be once the return address is popped */
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %r9, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rcx /* tail-jump; %rdi is unchanged, %rsi/%rdx hold the incoming %rdx/%rcx */
+ CFI_ENDPROC
+END(arch_unwind_init_running)
+#endif
+
- #ifdef CONFIG_PARAVIRT_XEN
+ #ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
/*
case MSR_VM_HSAVE_PA:
case MSR_AMD64_PATCH_LOADER:
break;
- case 0xe2:
++ case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
case 0x200 ... 0x2ff:
return set_msr_mtrr(vcpu, msr, data);
case MSR_IA32_APICBASE:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
- case 0xe2:
++ case MSR_NHM_SNB_PKG_CST_CFG_CTL: /* 0xe2 */
data = 0;
break;
+ case MSR_IA32_UCODE_REV:
+ data = 0x100000000ULL;
+ break;
case MSR_MTRRcap:
data = 0x500 | KVM_NR_VAR_MTRR;
break;
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/debugfs.h>
+ #include <linux/module.h>
+#include <linux/uaccess.h>
#include "internal.h"
MODULE_AUTHOR("Thomas Renninger <trenn@suse.de>");
priv->pkt_type = SYN_MODEL_NEWABS(priv->model_id) ? SYN_NEWABS : SYN_OLDABS;
- printk(KERN_INFO "Synaptics Touchpad, model: %ld, fw: %ld.%ld, id: %#lx, caps: %#lx/%#lx/%#lx\n",
- SYN_ID_MODEL(priv->identity),
- SYN_ID_MAJOR(priv->identity), SYN_ID_MINOR(priv->identity),
- priv->model_id, priv->capabilities, priv->ext_cap, priv->ext_cap_0c);
+ psmouse_info(psmouse,
+ "Touchpad model: %ld, fw: %ld.%ld, id: %#lx, caps: %#lx/%#lx/%#lx\n",
+ SYN_ID_MODEL(priv->identity),
+ SYN_ID_MAJOR(priv->identity), SYN_ID_MINOR(priv->identity),
+ priv->model_id,
+ priv->capabilities, priv->ext_cap, priv->ext_cap_0c);
+ if (synaptics_init_led(psmouse) < 0)
+ goto init_fail;
+
set_input_params(psmouse->dev, priv);
/*
unsigned int down:1;
unsigned char ext_buttons;
signed char scroll;
+
+ /* As reported in last AGM-CONTACT packets */
+ struct synaptics_mt_state mt_state;
};
+struct synaptics_led;
+
struct synaptics_data {
/* Data read from the touchpad */
unsigned long int model_id; /* Model-ID */
struct serio *pt_port; /* Pass-through serio port */
- struct synaptics_hw_state mt; /* current gesture packet */
+ struct synaptics_mt_state mt_state; /* Current mt finger state */
+ bool mt_state_lost; /* mt_state may be incorrect */
+
+ /*
+ * Last received Advanced Gesture Mode (AGM) packet. An AGM packet
+ * contains position data for a second contact, at half resolution.
+ */
+ struct synaptics_hw_state agm;
+ bool agm_pending; /* new AGM packet received */
+ struct synaptics_led *led;
};
void synaptics_module_init(void);
obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
+ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
+obj-$(CONFIG_DM_RAID45) += dm-raid45.o dm-log.o dm-memcache.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
--- /dev/null
+/*
+ * (C) Copyright 2008 Hewlett-Packard Development Company, L.P
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
++#include <linux/module.h>
+
+#define DM_MSG_PREFIX "multipath least-pending"
+
+/*-----------------------------------------------------------------
+* Path-handling code, paths are held in lists
+*---------------------------------------------------------------*/
+struct path_info {
+ struct list_head list; /* Link in selector valid_paths or invalid_paths */
+ struct dm_path *path; /* Path this entry was created for in lpp_add_path() */
+ unsigned repeat_count; /* Handed back through lpp_select_path()'s repeat_count */
+ atomic_t io_count; /* Incremented in lpp_select_path(), decremented in lpp_end_io() */
+};
+
+/*
+ * Unlink and kfree() every path_info queued on @paths; the list is
+ * empty afterwards.
+ */
+static void free_paths(struct list_head *paths)
+{
+	while (!list_empty(paths)) {
+		struct path_info *entry =
+			list_entry(paths->next, struct path_info, list);
+
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Least-pending selector
+ *---------------------------------------------------------------*/
+
+#define LPP_MIN_IO 1 /* repeat_count used by lpp_add_path() when no argument is given */
+
+struct selector {
+ struct list_head valid_paths; /* Paths lpp_select_path() chooses from */
+ struct list_head invalid_paths; /* Paths moved here by lpp_fail_path() */
+};
+
+/*
+ * Allocate a selector whose two path lists start out empty.
+ * Returns NULL when the allocation fails.
+ */
+static struct selector *alloc_selector(void)
+{
+	struct selector *sel;
+
+	sel = kmalloc(sizeof(*sel), GFP_KERNEL);
+	if (!sel)
+		return NULL;
+
+	INIT_LIST_HEAD(&sel->valid_paths);
+	INIT_LIST_HEAD(&sel->invalid_paths);
+	return sel;
+}
+
+/*
+ * Path selector constructor: hang a freshly allocated selector off
+ * ps->context.  @argc and @argv are not used.
+ *
+ * Returns 0 on success, -ENOMEM if the selector cannot be allocated.
+ */
+static int lpp_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *sel = alloc_selector();
+
+	if (sel == NULL)
+		return -ENOMEM;
+
+	ps->context = sel;
+	return 0;
+}
+
+/*
+ * Path selector destructor: release all path_info entries on both
+ * lists, free the selector itself and clear ps->context.
+ */
+static void lpp_destroy(struct path_selector *ps)
+{
+	struct selector *sel = ps->context;
+
+	free_paths(&sel->invalid_paths);
+	free_paths(&sel->valid_paths);
+
+	ps->context = NULL;
+	kfree(sel);
+}
+
+/*
+ * Emit selector status (@path == NULL) or per-path status into @result.
+ *
+ * Selector: "1 " for STATUSTYPE_INFO, "0 " for STATUSTYPE_TABLE.
+ * Path:     "<repeat_count>:<io_count> " for STATUSTYPE_INFO, nothing
+ *           for STATUSTYPE_TABLE.
+ *
+ * Returns sz; sz, result and maxlen are presumably what DMEMIT() works
+ * on (the macro is defined elsewhere) -- confirm.
+ */
+static int lpp_status(struct path_selector *ps, struct dm_path *path,
+		      status_type_t type, char *result, unsigned int maxlen)
+{
+	struct path_info *pi;
+	int sz = 0;
+
+	if (path) {
+		pi = path->pscontext;
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%u:%u ", pi->repeat_count,
+			       atomic_read(&pi->io_count));
+			break;
+		case STATUSTYPE_TABLE:
+			break;
+		}
+
+		return sz;
+	}
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("1 ");
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT("0 ");
+		break;
+	}
+
+	return sz;
+}
+
+/*
+ * Called during initialisation to register each path with an
+ * optional repeat_count.
+ *
+ * At most one argument is accepted: the repeat count, which defaults to
+ * LPP_MIN_IO.  The new path_info is stored in path->pscontext and queued
+ * on the selector's valid_paths list.
+ *
+ * Returns 0 on success.  On failure *error points to a message and
+ * -EINVAL (bad arguments) or -ENOMEM (allocation failure) is returned.
+ */
+static int lpp_add_path(struct path_selector *ps, struct dm_path *path,
+			int argc, char **argv, char **error)
+{
+	unsigned repeat_count = LPP_MIN_IO;
+	struct selector *sel = ps->context;
+	struct path_info *info;
+
+	if (argc > 1) {
+		*error = "least-pending ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	/* First path argument is number of I/Os before switching path */
+	if (argc == 1) {
+		if (sscanf(argv[0], "%u", &repeat_count) != 1) {
+			*error = "least-pending ps: invalid repeat count";
+			return -EINVAL;
+		}
+	}
+
+	/* allocate the path */
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL) {
+		*error = "least-pending ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	atomic_set(&info->io_count, 0);
+	info->repeat_count = repeat_count;
+	info->path = path;
+
+	path->pscontext = info;
+	list_add(&info->list, &sel->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Mark path @p as failed: reset its pending I/O counter and move it to
+ * the selector's invalid_paths list.  Does nothing when the path has no
+ * selector context.
+ */
+static void lpp_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+	struct selector *sel = ps->context;
+	struct path_info *info = p->pscontext;
+
+	if (info) {
+		atomic_set(&info->io_count, 0);
+		list_move(&info->list, &sel->invalid_paths);
+	}
+}
+
+/*
+ * Move path @p back onto the selector's valid_paths list.
+ *
+ * Returns 0 on success, 1 when the path has no selector context.
+ */
+static int lpp_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+	struct selector *sel = ps->context;
+	struct path_info *info = p->pscontext;
+
+	if (info == NULL)
+		return 1;
+
+	list_move(&info->list, &sel->valid_paths);
+	return 0;
+}
+
+/*
+ * Pick the valid path with the fewest I/Os currently pending.
+ *
+ * The chosen path's io_count is incremented here; lpp_end_io() drops it
+ * again.  *repeat_count is set to the chosen path's repeat count.
+ * @nr_bytes is not used.
+ *
+ * Returns the selected path, or NULL when there is no valid path.
+ */
+static struct dm_path *lpp_select_path(struct path_selector *ps,
+				       unsigned *repeat_count,
+				       size_t nr_bytes)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi, *least_io_path = NULL;
+
+	/* Nothing is removed while walking, so the plain iterator suffices. */
+	list_for_each_entry(pi, &s->valid_paths, list) {
+		/*
+		 * Keep the candidate with the smaller pending count.  The
+		 * operands used to be the other way round, which selected
+		 * the busiest path instead of the least loaded one.
+		 */
+		if (!least_io_path ||
+		    atomic_read(&pi->io_count) <
+		    atomic_read(&least_io_path->io_count))
+			least_io_path = pi;
+
+		/* An idle path cannot be beaten, stop searching. */
+		if (!atomic_read(&least_io_path->io_count))
+			break;
+	}
+
+	/* Empty valid_paths list: the loop body never ran. */
+	if (!least_io_path)
+		return NULL;
+
+	atomic_inc(&least_io_path->io_count);
+	*repeat_count = least_io_path->repeat_count;
+
+	return least_io_path->path;
+}
+
+/*
+ * I/O completion hook: drop the pending I/O count of @path that was
+ * taken in lpp_select_path().  @ps and @nr_bytes are not used.
+ *
+ * Returns 0 on success, 1 when the path has no selector context.
+ */
+static int lpp_end_io(struct path_selector *ps, struct dm_path *path,
+		      size_t nr_bytes)
+{
+	struct path_info *info = path->pscontext;
+
+	if (info == NULL)
+		return 1;
+
+	atomic_dec(&info->io_count);
+	return 0;
+}
+
+static struct path_selector_type lpp_ps = { /* Registered in dm_lpp_init(), unregistered in dm_lpp_exit() */
+ .name = "least-pending",
+ .module = THIS_MODULE,
+ .table_args = 1, /* NOTE(review): lpp_status() emits no per-path token for STATUSTYPE_TABLE - confirm this count */
+ .info_args = 0, /* NOTE(review): lpp_status() emits one "%u:%u" token per path for STATUSTYPE_INFO - confirm this count */
+ .create = lpp_create,
+ .destroy = lpp_destroy,
+ .status = lpp_status,
+ .add_path = lpp_add_path,
+ .fail_path = lpp_fail_path,
+ .reinstate_path = lpp_reinstate_path,
+ .select_path = lpp_select_path,
+ .end_io = lpp_end_io,
+};
+
+/*
+ * Module init: register the least-pending path selector.
+ *
+ * Returns the result of dm_register_path_selector().  The "loaded"
+ * message is only printed when registration succeeded; it used to be
+ * printed after a failure as well.
+ */
+static int __init dm_lpp_init(void)
+{
+	int r = dm_register_path_selector(&lpp_ps);
+
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		return r;
+	}
+
+	DMINFO("version 1.0.0 loaded");
+
+	return r;
+}
+
+/*
+ * Module exit: unregister the least-pending path selector and log an
+ * error if that fails.
+ */
+static void __exit dm_lpp_exit(void)
+{
+	int r;
+
+	r = dm_unregister_path_selector(&lpp_ps);
+	if (r >= 0)
+		return;
+
+	DMERR("unregister failed %d", r);
+}
+
+module_init(dm_lpp_init);
+module_exit(dm_lpp_exit);
+
+MODULE_DESCRIPTION(DM_NAME " least-pending multipath path selector");
+MODULE_AUTHOR("Sakshi Chaitanya Veni <vsakshi@hp.com>");
+MODULE_LICENSE("GPL");
+
--- /dev/null
+/*
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * Device-mapper memory object handling:
+ *
+ * o allocate/free total_pages in a per client page pool.
+ *
+ * o allocate/free memory objects with chunks (1..n) of
+ * pages_per_chunk pages hanging off.
+ *
+ * This file is released under the GPL.
+ */
+
+#define DM_MEM_CACHE_VERSION "0.2"
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include <linux/dm-io.h>
+#include <linux/slab.h>
++#include <linux/module.h>
+
+struct dm_mem_cache_client {
+ spinlock_t lock; /* Taken around free_list and the page/object counters below */
+ mempool_t *objs_pool; /* Pool of object arrays (chunks entries of struct dm_mem_cache_object each) */
+ struct page_list *free_list; /* Singly linked list of pages not handed out */
+ unsigned objects; /* # of objects the cache is sized for */
+ unsigned chunks; /* # of chunks per object */
+ unsigned pages_per_chunk; /* # of pages per chunk */
+ unsigned free_pages; /* Pages not reserved by dm_mem_cache_alloc() */
+ unsigned total_pages; /* All pages owned by this client */
+};
+
+/*
+ * Release a chain of page_list elements: every element must carry a
+ * page (BUG otherwise); the page is freed and then the element itself.
+ */
+static void free_cache_pages(struct page_list *list)
+{
+	struct page_list *cur, *following;
+
+	for (cur = list; cur; cur = following) {
+		following = cur->next;
+		BUG_ON(!cur->page);
+		__free_page(cur->page);
+		kfree(cur);
+	}
+}
+
+/*
+ * Allocate @pages pages, each wrapped in a page_list element, and
+ * return them as one chain.
+ *
+ * Returns NULL if any allocation fails (everything allocated so far is
+ * released again) and also when @pages is 0.
+ */
+static struct page_list *alloc_cache_pages(unsigned pages)
+{
+	struct page_list *head = NULL;
+
+	for (; pages; pages--) {
+		struct page_list *node;
+		struct page *page = alloc_page(GFP_NOIO);
+
+		if (!page)
+			goto fail;
+
+		node = kmalloc(sizeof(*node), GFP_NOIO);
+		if (!node) {
+			__free_page(page);
+			goto fail;
+		}
+
+		/* Push the new element onto the front of the chain. */
+		node->page = page;
+		node->next = head;
+		head = node;
+	}
+
+	return head;
+
+fail:
+	free_cache_pages(head);
+	return NULL;
+}
+
+/*
+ * Allocate page_list elements from the pool to chunks of the memory object.
+ *
+ * Each of the cl->chunks entries of @obj receives cl->pages_per_chunk
+ * elements taken off the client's free list.  The free list must hold
+ * enough elements (BUG otherwise); cl->free_pages is not touched here.
+ */
+static void alloc_chunks(struct dm_mem_cache_client *cl,
+			 struct dm_mem_cache_object *obj)
+{
+	unsigned chunks = cl->chunks;
+	unsigned long flags;
+
+	/*
+	 * local_irq_save() already disables interrupts, so the extra
+	 * local_irq_disable() that used to follow it was redundant.
+	 */
+	local_irq_save(flags);
+	while (chunks--) {
+		unsigned p = cl->pages_per_chunk;
+
+		obj[chunks].pl = NULL;
+
+		while (p--) {
+			struct page_list *pl;
+
+			/* Take next element from free list */
+			spin_lock(&cl->lock);
+			pl = cl->free_list;
+			BUG_ON(!pl);
+			cl->free_list = pl->next;
+			spin_unlock(&cl->lock);
+
+			pl->next = obj[chunks].pl;
+			obj[chunks].pl = pl;
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Free page_list elements putting them back onto free list
+ *
+ * Walks every chunk of @obj, pushes each element onto the client's free
+ * list and counts it in cl->free_pages.
+ */
+static void free_chunks(struct dm_mem_cache_client *cl,
+			struct dm_mem_cache_object *obj)
+{
+	unsigned chunks = cl->chunks;
+	unsigned long flags;
+	struct page_list *next, *pl;
+
+	/*
+	 * local_irq_save() already disables interrupts, so the extra
+	 * local_irq_disable() that used to follow it was redundant.
+	 */
+	local_irq_save(flags);
+	while (chunks--) {
+		for (pl = obj[chunks].pl; pl; pl = next) {
+			/* Remember the successor before relinking pl. */
+			next = pl->next;
+
+			spin_lock(&cl->lock);
+			pl->next = cl->free_list;
+			cl->free_list = pl;
+			cl->free_pages++;
+			spin_unlock(&cl->lock);
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+/*
+ * Create/destroy dm memory cache client resources.
+ *
+ * dm_mem_cache_client_create() sets up a client holding
+ * objects * chunks * pages_per_chunk pages (that product must not be 0,
+ * BUG otherwise) and an object pool with @objects reserved entries.
+ *
+ * Returns the new client or ERR_PTR(-ENOMEM).
+ */
+struct dm_mem_cache_client *
+dm_mem_cache_client_create(unsigned objects, unsigned chunks,
+			   unsigned pages_per_chunk)
+{
+	struct dm_mem_cache_client *cl;
+	unsigned nr_pages = objects * chunks * pages_per_chunk;
+
+	BUG_ON(!nr_pages);
+
+	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
+	if (!cl)
+		return ERR_PTR(-ENOMEM);
+
+	cl->objs_pool = mempool_create_kmalloc_pool(objects,
+				chunks * sizeof(struct dm_mem_cache_object));
+	if (!cl->objs_pool)
+		goto free_client;
+
+	cl->free_list = alloc_cache_pages(nr_pages);
+	if (!cl->free_list)
+		goto destroy_pool;
+
+	spin_lock_init(&cl->lock);
+	cl->objects = objects;
+	cl->chunks = chunks;
+	cl->pages_per_chunk = pages_per_chunk;
+	cl->total_pages = nr_pages;
+	cl->free_pages = nr_pages;
+	return cl;
+
+destroy_pool:
+	mempool_destroy(cl->objs_pool);
+free_client:
+	kfree(cl);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_create);
+
+/*
+ * Tear down a client.  All pages must be back on the free list
+ * (free_pages == total_pages), otherwise this BUGs.
+ */
+void dm_mem_cache_client_destroy(struct dm_mem_cache_client *cl)
+{
+	BUG_ON(cl->total_pages != cl->free_pages);
+
+	mempool_destroy(cl->objs_pool);
+	free_cache_pages(cl->free_list);
+	kfree(cl);
+}
+EXPORT_SYMBOL(dm_mem_cache_client_destroy);
+
+/*
+ * Grow a client's cache by the pages needed for @objects more objects.
+ *
+ * Don't call from interrupt context!
+ *
+ * Returns 0 on success or -ENOMEM if the pages cannot be allocated.
+ * The result of mempool_resize() is not checked.
+ */
+int dm_mem_cache_grow(struct dm_mem_cache_client *cl, unsigned objects)
+{
+	unsigned nr_pages = objects * cl->chunks * cl->pages_per_chunk;
+	struct page_list *head, *tail;
+
+	BUG_ON(!nr_pages);
+
+	head = alloc_cache_pages(nr_pages);
+	if (!head)
+		return -ENOMEM;
+
+	/* Find the end of the new chain so it can go in front of the list. */
+	for (tail = head; tail->next; tail = tail->next)
+		;
+
+	spin_lock_irq(&cl->lock);
+	tail->next = cl->free_list;
+	cl->free_list = head;
+	cl->free_pages += nr_pages;
+	cl->total_pages += nr_pages;
+	cl->objects += objects;
+	spin_unlock_irq(&cl->lock);
+
+	mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+	return 0;
+}
+EXPORT_SYMBOL(dm_mem_cache_grow);
+
+/*
+ * Shrink a client's cache by the pages backing @objects objects.
+ *
+ * The pages are cut off the head of the free list under the client lock
+ * and released afterwards.  Returns 0 on success, or -ENOMEM when the
+ * free list does not hold more than the requested number of pages (one
+ * element always stays on the list); nothing is changed in that case.
+ * The result of mempool_resize() is not checked.
+ */
+int dm_mem_cache_shrink(struct dm_mem_cache_client *cl, unsigned objects)
+{
+	int r;
+	unsigned pages = objects * cl->chunks * cl->pages_per_chunk, p = pages;
+	unsigned long flags;
+	struct page_list *last = NULL, *pl, *pos;
+
+	BUG_ON(!pages);
+
+	spin_lock_irqsave(&cl->lock, flags);
+	pl = pos = cl->free_list;
+	/*
+	 * The free list is empty (NULL) while all pages are handed out;
+	 * stop instead of dereferencing a NULL pos.
+	 */
+	while (p-- && pos && pos->next) {
+		last = pos;
+		pos = pos->next;
+	}
+
+	/* p wrapped around to "-1" only if all @pages steps were taken. */
+	if (++p)
+		r = -ENOMEM;
+	else {
+		r = 0;
+		cl->free_list = pos;
+		cl->free_pages -= pages;
+		cl->total_pages -= pages;
+		cl->objects -= objects;
+		/* Terminate the chain that is about to be freed. */
+		last->next = NULL;
+	}
+	spin_unlock_irqrestore(&cl->lock, flags);
+
+	if (!r) {
+		free_cache_pages(pl);
+		mempool_resize(cl->objs_pool, cl->objects, GFP_NOIO);
+	}
+
+	return r;
+}
+EXPORT_SYMBOL(dm_mem_cache_shrink);
+
+/*
+ * Allocate/free a memory object
+ *
+ * Can be called from interrupt context
+ * NOTE(review): dm_mem_cache_alloc() passes GFP_NOIO to mempool_alloc(),
+ * which looks at odds with the interrupt context statement - confirm.
+ *
+ * dm_mem_cache_alloc() reserves chunks * pages_per_chunk free pages and
+ * attaches them to a new object.  Returns the object or
+ * ERR_PTR(-ENOMEM) when no object or not enough free pages are available.
+ */
+struct dm_mem_cache_object *dm_mem_cache_alloc(struct dm_mem_cache_client *cl)
+{
+	struct dm_mem_cache_object *obj;
+	unsigned needed = cl->chunks * cl->pages_per_chunk;
+	unsigned long flags;
+	int enough;
+
+	obj = mempool_alloc(cl->objs_pool, GFP_NOIO);
+	if (!obj)
+		return ERR_PTR(-ENOMEM);
+
+	/* Reserve the pages first; alloc_chunks() unlinks them afterwards. */
+	spin_lock_irqsave(&cl->lock, flags);
+	enough = needed <= cl->free_pages;
+	if (enough)
+		cl->free_pages -= needed;
+	spin_unlock_irqrestore(&cl->lock, flags);
+
+	if (!enough) {
+		mempool_free(obj, cl->objs_pool);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	alloc_chunks(cl, obj);
+	return obj;
+}
+EXPORT_SYMBOL(dm_mem_cache_alloc);
+
+/*
+ * Give a memory object back: its pages return to the client's free
+ * list and the object itself to the object pool.
+ */
+void dm_mem_cache_free(struct dm_mem_cache_client *cl,
+		       struct dm_mem_cache_object *obj)
+{
+	/* Pages first, the object array is still needed to find them. */
+	free_chunks(cl, obj);
+
+	mempool_free(obj, cl->objs_pool);
+}
+EXPORT_SYMBOL(dm_mem_cache_free);
+
+MODULE_DESCRIPTION(DM_NAME " dm memory cache");
+MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
+ *
+ * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
+ *
+ * This file is released under the GPL.
+ *
+ *
+ * Linux 2.6 Device Mapper RAID4 and RAID5 target.
+ *
+ * Tested-by: Intel; Marcin.Labun@intel.com, krzysztof.wojcik@intel.com
+ *
+ *
+ * Supports the following ATARAID vendor solutions (and SNIA DDF):
+ *
+ * Adaptec HostRAID ASR
+ * SNIA DDF1
+ * Highpoint 37x
+ * Highpoint 45x
+ * Intel IMSM
+ * Jmicron ATARAID
+ * LSI Logic MegaRAID
+ * NVidia RAID
+ * Promise FastTrack
+ * Silicon Image Medley
+ * VIA Software RAID
+ *
+ * via the dmraid application.
+ *
+ *
+ * Features:
+ *
+ * o RAID4 with dedicated and selectable parity device
+ * o RAID5 with rotating parity (left+right, symmetric+asymmetric)
+ * o recovery of out of sync device for initial
+ * RAID set creation or after dead drive replacement
+ * o run time optimization of xor algorithm used to calculate parity
+ *
+ *
+ * Thanks to MD for:
+ * o the raid address calculation algorithm
+ * o the base of the biovec <-> page list copier.
+ *
+ *
+ * Uses region hash to keep track of how many writes are in flight to
+ * regions in order to use dirty log to keep state of regions to recover:
+ *
+ * o clean regions (those which are synchronized
+ * and don't have write io in flight)
+ * o dirty regions (those with write io in flight)
+ *
+ *
+ * On startup, any dirty regions are migrated to the
+ * 'nosync' state and are subject to recovery by the daemon.
+ *
+ * See raid_ctr() for table definition.
+ *
+ * ANALYZEME: recovery bandwidth
+ */
+
+static const char *version = "v0.2597k";
+
+#include "dm.h"
+#include "dm-memcache.h"
+#include "dm-raid45.h"
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/raid/xor.h>
+#include <linux/slab.h>
++#include <linux/module.h>
+
+#include <linux/bio.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/dm-region-hash.h>
+
+
+/*
+ * Configurable parameters
+ */
+
+/* Minimum/maximum and default # of selectable stripes. */
+#define STRIPES_MIN 8
+#define STRIPES_MAX 16384
+#define STRIPES_DEFAULT 80
+
+/* Minimum, maximum and default chunk size in sectors if not set in constructor. */
+#define CHUNK_SIZE_MIN 8
+#define CHUNK_SIZE_MAX 16384
+#define CHUNK_SIZE_DEFAULT 64
+
+/* Default io size in sectors if not set in constructor. */
+#define IO_SIZE_MIN CHUNK_SIZE_MIN
+#define IO_SIZE_DEFAULT IO_SIZE_MIN
+
+/* Recover io size default in sectors. */
+#define RECOVER_IO_SIZE_MIN 64
+#define RECOVER_IO_SIZE_DEFAULT 256
+
+/* Default, minimum and maximum percentage of recover io bandwidth. */
+#define BANDWIDTH_DEFAULT 10
+#define BANDWIDTH_MIN 1
+#define BANDWIDTH_MAX 100
+
+/* # of parallel recovered regions */
+#define RECOVERY_STRIPES_MIN 1
+#define RECOVERY_STRIPES_MAX 64
+#define RECOVERY_STRIPES_DEFAULT RECOVERY_STRIPES_MIN
+/*
+ * END Configurable parameters
+ */
+
+#define TARGET "dm-raid45"
+#define DAEMON "kraid45d"
+#define DM_MSG_PREFIX TARGET
+
+#define SECTORS_PER_PAGE (PAGE_SIZE >> SECTOR_SHIFT)
+
+/* Amount/size for __xor(). */
+#define XOR_SIZE PAGE_SIZE
+
+/* Ticks to run xor_speed() test for. */
+#define XOR_SPEED_TICKS 5
+
+/* Check value in range (note: i is evaluated twice). */
+#define range_ok(i, min, max) ((i) >= (min) && (i) <= (max))
+
+/*
+ * Structure access macros.
+ *
+ * All macro arguments are parenthesized so that expressions passed in
+ * (e.g. "a + b" or "x & mask") keep their intended precedence.
+ */
+/* Derive raid_set from stripe_cache pointer. */
+#define RS(x) container_of(x, struct raid_set, sc)
+
+/* Page reference. */
+#define PAGE(stripe, p) ((stripe)->obj[(p)].pl->page)
+
+/* Stripe chunk reference. */
+#define CHUNK(stripe, p) ((stripe)->chunk + (p))
+
+/* Bio list reference. */
+#define BL(stripe, p, rw) ((stripe)->chunk[(p)].bl + (rw))
+#define BL_CHUNK(chunk, rw) ((chunk)->bl + (rw))
+
+/* Page list reference. */
+#define PL(stripe, p) ((stripe)->obj[(p)].pl)
+/* END: structure access macros. */
+
+/*
+ * Put @bio at the head of @bl; it also becomes the tail when the list
+ * had no tail yet.
+ *
+ * Factor out to dm-bio-list.h
+ */
+static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
+{
+	struct bio *old_head = bl->head;
+
+	bio->bi_next = old_head;
+	bl->head = bio;
+
+	if (bl->tail == NULL)
+		bl->tail = bio;
+}
+
+/*
+ * Factor out to dm.h
+ *
+ * Both macros store str in ti->error (a variable named ti must be in
+ * scope at the call site) and return from the calling function;
+ * TI_ERR() returns -EINVAL.
+ *
+ * NOTE(review): the ';' after "while (0)" makes "TI_ERR(...);" expand to
+ * two statements, so it cannot be used in an unbraced if/else. Check the
+ * callers before dropping it.
+ */
+#define TI_ERR_RET(str, ret) \
+ do { ti->error = str; return ret; } while (0);
+#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
+
+/*
+ * Macro to define inline access functions for IO flags.
+ *
+ * BITOPS(name, what, var, flag) generates, for a struct var *v:
+ *   TestClear<name><what>(v), TestSet<name><what>(v),
+ *   Clear<name><what>(v), Set<name><what>(v) and <name><what>(v),
+ * each operating on bit 'flag' of v->io.flags.
+ */
+#define BITOPS(name, what, var, flag) \
+static inline int TestClear ## name ## what(struct var *v) \
+{ return test_and_clear_bit(flag, &v->io.flags); } \
+static inline int TestSet ## name ## what(struct var *v) \
+{ return test_and_set_bit(flag, &v->io.flags); } \
+static inline void Clear ## name ## what(struct var *v) \
+{ clear_bit(flag, &v->io.flags); } \
+static inline void Set ## name ## what(struct var *v) \
+{ set_bit(flag, &v->io.flags); } \
+static inline int name ## what(struct var *v) \
+{ return test_bit(flag, &v->io.flags); }
+
+/*-----------------------------------------------------------------
+ * Stripe cache
+ *
+ * Cache for all reads and writes to raid sets (operational or degraded)
+ *
+ * We need to run all data to and from a RAID set through this cache,
+ * because parity chunks need to get calculated from data chunks
+ * or, in the degraded/resynchronization case, missing chunks need
+ * to be reconstructed using the other chunks of the stripe.
+ *---------------------------------------------------------------*/
+/* Unique kmem cache name suffix # counter; initialised to -1. */
+static atomic_t _stripe_sc_nr = ATOMIC_INIT(-1); /* kmem cache # counter. */
+
+/* A chunk within a stripe (holds bios hanging off). */
+/*
+ * IO status flags for chunks of a stripe.
+ *
+ * The values are bit numbers in stripe_chunk.io.flags and are accessed
+ * through the Chunk*() helpers generated with BITOPS().
+ */
+enum chunk_flags {
+ CHUNK_DIRTY, /* Pages of chunk dirty; need writing. */
+ CHUNK_ERROR, /* IO error on any chunk page. */
+ CHUNK_IO, /* Allow/prohibit IO on chunk pages. */
+ CHUNK_LOCKED, /* Chunk pages locked during IO. */
+ CHUNK_MUST_IO, /* Chunk must io. */
+ CHUNK_UNLOCK, /* Enforce chunk unlock. */
+ CHUNK_UPTODATE, /* Chunk pages are uptodate. */
+};
+
+enum bl_type { /* Additional indexes into stripe_chunk.bl[] beyond WRITE */
+ WRITE_QUEUED = WRITE + 1, /* First index following WRITE. */
+ WRITE_MERGED, /* Presumably the "writes merged" list named in struct stripe_chunk - confirm. */
+ NR_BL_TYPES, /* Must be last one! */
+};
+struct stripe_chunk {
+ atomic_t cnt; /* Reference count. */
+ struct stripe *stripe; /* Backpointer to stripe for endio(). */
+ /* Bio lists for reads, writes, and writes merged (NR_BL_TYPES lists, see enum bl_type). */
+ struct bio_list bl[NR_BL_TYPES];
+ struct {
+ unsigned long flags; /* IO status flags (bit numbers from enum chunk_flags). */
+ } io;
+};
+
+/* Define chunk bit operations (e.g. ChunkDirty(), SetChunkDirty(), ...). */
+BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
+BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
+BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
+BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
+BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
+BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
+BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
+
+/*
+ * Stripe linked list indexes. Keep order, because the stripe
+ * and the stripe cache rely on the first 3!
+ *
+ * SC_NR_LISTS sizes stripe_cache.lists[] and STRIPE_NR_LISTS sizes
+ * stripe.lists[]. LIST_HASH and LIST_RECOVER have the same value and
+ * therefore share one slot of stripe.lists[].
+ */
+enum list_types {
+ LIST_FLUSH, /* Stripes to flush for io. */
+ LIST_ENDIO, /* Stripes to endio. */
+ LIST_LRU, /* Least recently used stripes. */
+ SC_NR_LISTS, /* # of lists in stripe cache. */
+ LIST_HASH = SC_NR_LISTS, /* Hashed stripes. */
+ LIST_RECOVER = LIST_HASH, /* For recovery type stripes only. */
+ STRIPE_NR_LISTS,/* To size array in struct stripe. */
+};
+
+/* Addressing region recovery. */
+struct recover_addr {
+ struct dm_region *reg; /* Actual region to recover. */
+ sector_t pos; /* Position within region to recover. */
+ sector_t end; /* End of region to recover. */
+};
+
+/* A stripe: the io object to handle all reads and writes to a RAID set. */
+struct stripe {
+ atomic_t cnt; /* Reference count. */
+ struct stripe_cache *sc; /* Backpointer to stripe cache. */
+
+ /*
+ * 4 linked lists:
+ * o io list to flush io
+ * o endio list
+ * o LRU list to put stripes w/o reference count on
+ * o stripe cache hash
+ *
+ * Indexed by enum list_types; the hash slot is the same slot
+ * as LIST_RECOVER.
+ */
+ struct list_head lists[STRIPE_NR_LISTS];
+
+ sector_t key; /* Hash key. */
+ region_t region; /* Region stripe is mapped to. */
+
+ struct {
+ unsigned long flags; /* Stripe state flags (see below). */
+
+ /*
+ * Pending ios in flight:
+ *
+ * used to control move of stripe to endio list
+ */
+ atomic_t pending;
+
+ /* Sectors to read and write for multi page stripe sets. */
+ unsigned size;
+ } io;
+
+ /* Address region recovery. */
+ struct recover_addr *recover;
+
+ /* Lock on stripe (Future: for clustering). */
+ void *lock;
+
+ struct {
+ unsigned short parity; /* Parity chunk index. */
+ short recover; /* Recovery chunk index. */
+ } idx;
+
+ /*
+ * This stripe's memory cache object (dm-mem-cache);
+ * i.e. the io chunk pages.
+ */
+ struct dm_mem_cache_object *obj;
+
+ /* Array of stripe chunks (dynamically allocated); must stay the last member. */
+ struct stripe_chunk chunk[0];
+};
+
+/*
+ * States stripes can be in (flags field).
+ *
+ * The values are bit numbers in stripe.io.flags.
+ */
+enum stripe_states {
+ STRIPE_ERROR, /* io error on stripe. */
+ STRIPE_MERGED, /* Writes got merged to be written. */
+ STRIPE_RBW, /* Read-before-write stripe. */
+ STRIPE_RECONSTRUCT, /* Reconstruct of a missing chunk required. */
+ STRIPE_RECONSTRUCTED, /* Reconstructed of a missing chunk. */
+ STRIPE_RECOVER, /* Stripe used for RAID set recovery. */
+};
+
+/* Define stripe bit operations (e.g. StripeError(), SetStripeError(), ...). */
+BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
+BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
+BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
+BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
+BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
+BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)
+
+/*
+ * A stripe hash.
+ *
+ * NOTE(review): the code using these fields is not visible here; the
+ * per-field notes below are assumptions to be confirmed.
+ */
+struct stripe_hash {
+ struct list_head *hash; /* presumably the array of hash bucket list heads */
+ unsigned buckets; /* presumably the # of entries in hash */
+ unsigned mask;
+ unsigned prime;
+ unsigned shift;
+};
+
+enum sc_lock_types { /* Indexes into stripe_cache.locks[] */
+ LOCK_ENDIO, /* Protect endio list. */
+ NR_LOCKS, /* To size array in struct stripe_cache. */
+};
+
+/* A stripe cache. */
+struct stripe_cache {
+ /* Stripe hash. */
+ struct stripe_hash hash;
+
+ spinlock_t locks[NR_LOCKS]; /* Locks to protect lists. */
+
+ /* Stripes with io to flush, stripes to endio and LRU lists. */
+ struct list_head lists[SC_NR_LISTS];
+
+ /* Slab cache to allocate stripes from. */
+ struct {
+ struct kmem_cache *cache; /* Cache itself. */
+ char name[32]; /* Unique name. */
+ } kc;
+
+ struct dm_io_client *dm_io_client; /* dm-io client resource context. */
+
+ /* dm-mem-cache client resource context. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ int stripes_parm; /* # stripes parameter from constructor. */
+ atomic_t stripes; /* actual # of stripes in cache. */
+ atomic_t stripes_to_set; /* # of stripes to resize cache to. */
+ atomic_t stripes_last; /* last # of stripes in cache. */
+ atomic_t active_stripes; /* actual # of active stripes in cache. */
+
+ /* REMOVEME: */
+ atomic_t active_stripes_max; /* NOTE(review): presumably the maximum # of active stripes seen - confirm. */
+};
+
+/*
+ * Flag specs for raid_dev.
+ *
+ * The values are bit numbers in raid_dev.io.flags.  (A stray ';' that
+ * followed this comment, an empty file-scope declaration, was dropped.)
+ */
+enum raid_dev_flags {
+	DEV_FAILED,	/* Device failed. */
+	DEV_IO_QUEUED,	/* Io got queued to device. */
+};
+
+/* The raid device in a set. */
+struct raid_dev {
+ struct dm_dev *dev; /* Device-mapper device handle. */
+ sector_t start; /* Offset to map to. */
+ struct { /* Using struct to be able to BITOPS(). */
+ unsigned long flags; /* raid_dev_flags. */
+ } io;
+};
+
+BITOPS(Dev, Failed, raid_dev, DEV_FAILED) /* DevFailed(), SetDevFailed(), ... */
+BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED) /* DevIoQueued(), SetDevIoQueued(), ... */
+
+/*
+ * Flags spec for raid_set.
+ *
+ * The values are bit numbers in raid_set.io.flags.
+ */
+enum raid_set_flags {
+ RS_CHECK_OVERWRITE, /* Check for chunk overwrites. */
+ RS_DEAD, /* RAID set inoperational. */
+ RS_DEAD_ENDIO_MESSAGE, /* RAID set dead endio one-off message. */
+ RS_DEGRADED, /* Io errors on RAID device. */
+ RS_DEVEL_STATS, /* REMOVEME: display status information. */
+ RS_ENFORCE_PARITY_CREATION,/* Enforce parity creation. */
+ RS_PROHIBIT_WRITES, /* Prohibit writes on device failure. */
+ RS_RECOVER, /* Do recovery. */
+ RS_RECOVERY_BANDWIDTH, /* Allow recovery bandwidth (delayed bios). */
+ RS_SC_BUSY, /* Stripe cache busy -> send an event. */
+ RS_SUSPEND, /* Suspend RAID set. */
+};
+
+/*
+ * REMOVEME: devel stats counters.
+ *
+ * S_NR_STATS sizes the raid_set.stats[] counter array.
+ */
+enum stats_types {
+ S_BIOS_READ,
+ S_BIOS_ADDED_READ,
+ S_BIOS_ENDIO_READ,
+ S_BIOS_WRITE,
+ S_BIOS_ADDED_WRITE,
+ S_BIOS_ENDIO_WRITE,
+ S_CAN_MERGE,
+ S_CANT_MERGE,
+ S_CONGESTED,
+ S_DM_IO_READ,
+ S_DM_IO_WRITE,
+ S_BANDWIDTH,
+ S_BARRIER,
+ S_BIO_COPY_PL_NEXT,
+ S_DEGRADED,
+ S_DELAYED_BIOS,
+ S_FLUSHS,
+ S_HITS_1ST,
+ S_IOS_POST,
+ S_INSCACHE,
+ S_MAX_LOOKUP,
+ S_CHUNK_LOCKED,
+ S_NO_BANDWIDTH,
+ S_NOT_CONGESTED,
+ S_NO_RW,
+ S_NOSYNC,
+ S_OVERWRITE,
+ S_PROHIBITCHUNKIO,
+ S_RECONSTRUCT_EI,
+ S_RECONSTRUCT_DEV,
+ S_RECONSTRUCT_SET,
+ S_RECONSTRUCTED,
+ S_REQUEUE,
+ S_STRIPE_ERROR,
+ S_SUM_DELAYED_BIOS,
+ S_XORS,
+ S_NR_STATS, /* # of stats counters. Must be last! */
+};
+
+/*
+ * Status type -> string mappings.
+ *
+ * The table below has one entry per counter of enum stats_types, but in
+ * a different order than the enum, so it must be searched or walked
+ * rather than indexed by counter type. The strings are presumably the
+ * labels printed in front of each counter value - confirm at the user.
+ */
+struct stats_map {
+ const enum stats_types type;
+ const char *str;
+};
+
+static struct stats_map stats_map[] = {
+ { S_BIOS_READ, "r=" },
+ { S_BIOS_ADDED_READ, "/" },
+ { S_BIOS_ENDIO_READ, "/" },
+ { S_BIOS_WRITE, " w=" },
+ { S_BIOS_ADDED_WRITE, "/" },
+ { S_BIOS_ENDIO_WRITE, "/" },
+ { S_DM_IO_READ, " rc=" },
+ { S_DM_IO_WRITE, " wc=" },
+ { S_BANDWIDTH, "\nbw=" },
+ { S_NO_BANDWIDTH, " no_bw=" },
+ { S_BARRIER, "\nbarrier=" },
+ { S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
+ { S_CAN_MERGE, "\nmerge=" },
+ { S_CANT_MERGE, "/no_merge=" },
+ { S_CHUNK_LOCKED, "\nchunk_locked=" },
+ { S_CONGESTED, "\ncgst=" },
+ { S_NOT_CONGESTED, "/not_cgst=" },
+ { S_DEGRADED, "\ndegraded=" },
+ { S_DELAYED_BIOS, "\ndel_bios=" },
+ { S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
+ { S_FLUSHS, "\nflushs=" },
+ { S_HITS_1ST, "\nhits_1st=" },
+ { S_IOS_POST, " ios_post=" },
+ { S_INSCACHE, " inscache=" },
+ { S_MAX_LOOKUP, " maxlookup=" },
+ { S_NO_RW, "\nno_rw=" },
+ { S_NOSYNC, " nosync=" },
+ { S_OVERWRITE, " ovr=" },
+ { S_PROHIBITCHUNKIO, " prhbt_io=" },
+ { S_RECONSTRUCT_EI, "\nrec_ei=" },
+ { S_RECONSTRUCT_DEV, " rec_dev=" },
+ { S_RECONSTRUCT_SET, " rec_set=" },
+ { S_RECONSTRUCTED, " rec=" },
+ { S_REQUEUE, " requeue=" },
+ { S_STRIPE_ERROR, " stripe_err=" },
+ { S_XORS, " xors=" },
+};
+
+/*
+ * A RAID set.
+ */
+#define dm_rh_client dm_region_hash
+enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
+typedef void (*xor_function_t)(unsigned count, unsigned long **data);
+struct raid_set {
+ struct dm_target *ti; /* Target pointer. */
+
+ struct {
+ unsigned long flags; /* State flags. */
+ struct mutex in_lock; /* Protects central input list below. */
+ struct mutex xor_lock; /* Protects xor algorithm set. */
+ struct bio_list in; /* Pending ios (central input list). */
+ struct bio_list work; /* ios work set. */
+ wait_queue_head_t suspendq; /* suspend synchronization. */
+ atomic_t in_process; /* counter of queued bios (suspendq). */
+ atomic_t in_process_max;/* counter of queued bios max. */
+
+ /* io work. */
+ struct workqueue_struct *wq;
+ struct delayed_work dws_do_raid; /* For main worker. */
+ struct work_struct ws_do_table_event; /* For event worker. */
+ } io;
+
+ /* Stripe locking abstraction. */
+ struct dm_raid45_locking_type *locking;
+
+ struct stripe_cache sc; /* Stripe cache for this set. */
+
+ /* Xor optimization. */
+ struct {
+ struct xor_func *f;
+ unsigned chunks;
+ unsigned speed;
+ } xor;
+
+ /* Recovery parameters. */
+ struct recover {
+ struct dm_dirty_log *dl; /* Dirty log. */
+ struct dm_rh_client *rh; /* Region hash. */
+
+ struct dm_io_client *dm_io_client; /* recovery dm-io client. */
+ /* dm-mem-cache client resource context for recovery stripes. */
+ struct dm_mem_cache_client *mem_cache_client;
+
+ struct list_head stripes; /* List of recovery stripes. */
+
+ region_t nr_regions;
+ region_t nr_regions_to_recover;
+ region_t nr_regions_recovered;
+ unsigned long start_jiffies;
+ unsigned long end_jiffies;
+
+ unsigned bandwidth; /* Recovery bandwidth [%]. */
+ unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
+ unsigned bandwidth_parm; /* " constructor parm. */
+ unsigned io_size; /* recovery io size <= region size. */
+ unsigned io_size_parm; /* recovery io size ctr parameter. */
+ unsigned recovery; /* Recovery allowed/prohibited. */
+ unsigned recovery_stripes; /* # of parallel recovery stripes. */
+
+ /* recovery io throttling. */
+ atomic_t io_count[IO_NR_COUNT]; /* counter recover/regular io.*/
+ unsigned long last_jiffies;
+ } recover;
+
+ /* RAID set parameters. */
+ struct {
+ struct raid_type *raid_type; /* RAID type (eg, RAID4). */
+ unsigned raid_parms; /* # variable raid parameters. */
+
+ unsigned chunk_size; /* Sectors per chunk. */
+ unsigned chunk_size_parm;
+ unsigned chunk_shift; /* rsector chunk size shift. */
+
+ unsigned io_size; /* Sectors per io. */
+ unsigned io_size_parm;
+ unsigned io_mask; /* Mask for bio_copy_page_list(). */
+ unsigned io_inv_mask; /* Mask for raid_address(). */
+
+ sector_t sectors_per_dev; /* Sectors per device. */
+
+ atomic_t failed_devs; /* Amount of devices failed. */
+
+ /* Index of device to initialize. */
+ int dev_to_init;
+ int dev_to_init_parm;
+
+ /* Raid devices dynamically allocated. */
+ unsigned raid_devs; /* # of RAID devices below. */
+ unsigned data_devs; /* # of RAID data devices. */
+
+ int ei; /* index of failed RAID device. */
+
+ /* Index of dedicated parity device (i.e. RAID4). */
+ int pi;
+ int pi_parm; /* constructor parm for status output. */
+ } set;
+
+ /* REMOVEME: devel stats counters. */
+ atomic_t stats[S_NR_STATS];
+
+ /* Dynamically allocated temporary pointers for xor(). */
+ unsigned long **data;
+
+ /* Dynamically allocated RAID devices. Alignment? */
+ struct raid_dev dev[0];
+};
+
+/*
+ * Define RAID set bit operations.
+ *
+ * Each BITOPS() line instantiates accessors for one RS_* bit of a RAID set;
+ * the rest of this file uses them under names such as RSSuspend(),
+ * SetRSProhibitWrites() and TestSetRSDead().
+ * NOTE(review): the exact set of generated helpers depends on the BITOPS()
+ * macro, which is defined outside this excerpt - confirm there.
+ */
+BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
+BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
+BITOPS(RS, Dead, raid_set, RS_DEAD)
+BITOPS(RS, DeadEndioMessage, raid_set, RS_DEAD_ENDIO_MESSAGE)
+BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
+BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
+BITOPS(RS, EnforceParityCreation, raid_set, RS_ENFORCE_PARITY_CREATION)
+BITOPS(RS, ProhibitWrites, raid_set, RS_PROHIBIT_WRITES)
+BITOPS(RS, Recover, raid_set, RS_RECOVER)
+BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
+BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
+/* The generator macro is not needed past this point. */
+#undef BITOPS
+
+/*-----------------------------------------------------------------
+ * Raid-4/5 set structures.
+ *---------------------------------------------------------------*/
+/* RAID level definitions (stored in struct raid_type.level). */
+enum raid_level {
+ raid4, /* Level of the "raid4" entry in raid_types[]. */
+ raid5, /* Level of all "raid5_*" entries in raid_types[]. */
+};
+
+/*
+ * Symmetric/Asymmetric, Left/Right parity rotating algorithms.
+ *
+ * Selects the branch taken in raid_address() for RAID5 sets.
+ */
+enum raid_algorithm {
+ none, /* Placeholder used by RAID4 (dedicated parity device). */
+ left_asym, /* Left asymmetric. */
+ right_asym, /* Right asymmetric. */
+ left_sym, /* Left symmetric. */
+ right_sym, /* Right symmetric. */
+};
+
+/* Static description of one supported RAID type (see raid_types[]). */
+struct raid_type {
+ const char *name; /* RAID type name (e.g. "raid5_la"). */
+ const char *descr; /* Descriptor text for logging. */
+ const unsigned parity_devs; /* # of parity devices. */
+ const unsigned minimal_devs; /* minimal # of devices in set. */
+ const enum raid_level level; /* RAID level. */
+ const enum raid_algorithm algorithm; /* RAID algorithm. */
+};
+
+/*
+ * Table of supported RAID types and their properties.
+ *
+ * Every type has one parity device and needs at least three devices.
+ */
+static struct raid_type raid_types[] = {
+	{
+		.name = "raid4",
+		.descr = "RAID4 (dedicated parity disk)",
+		.parity_devs = 1,
+		.minimal_devs = 3,
+		.level = raid4,
+		.algorithm = none,
+	}, {
+		.name = "raid5_la",
+		.descr = "RAID5 (left asymmetric)",
+		.parity_devs = 1,
+		.minimal_devs = 3,
+		.level = raid5,
+		.algorithm = left_asym,
+	}, {
+		.name = "raid5_ra",
+		.descr = "RAID5 (right asymmetric)",
+		.parity_devs = 1,
+		.minimal_devs = 3,
+		.level = raid5,
+		.algorithm = right_asym,
+	}, {
+		.name = "raid5_ls",
+		.descr = "RAID5 (left symmetric)",
+		.parity_devs = 1,
+		.minimal_devs = 3,
+		.level = raid5,
+		.algorithm = left_sym,
+	}, {
+		.name = "raid5_rs",
+		.descr = "RAID5 (right symmetric)",
+		.parity_devs = 1,
+		.minimal_devs = 3,
+		.level = raid5,
+		.algorithm = right_sym,
+	},
+};
+
+/*
+ * Address as calculated by raid_address().
+ *
+ * raid_address() fills in all three members for a given set sector.
+ */
+struct raid_address {
+ sector_t key; /* Hash key (address of stripe % chunk_size). */
+ unsigned di, pi; /* Data and parity disks index. */
+};
+
+/* REMOVEME: zero all S_NR_STATS devel statistics counters of a RAID set. */
+static void stats_reset(struct raid_set *rs)
+{
+	unsigned i;
+
+	for (i = 0; i < S_NR_STATS; i++)
+		atomic_set(&rs->stats[i], 0);
+}
+
+/*----------------------------------------------------------------
+ * RAID set management routines.
+ *--------------------------------------------------------------*/
+/*
+ * Begin small helper functions.
+ */
+/*
+ * Wakeup callback that deliberately does nothing.
+ *
+ * No need to be called from region hash indirectly at dm_rh_dec().
+ */
+static void wake_dummy(void *context)
+{
+}
+
+/* Return the number of io references currently held on the set. */
+static int io_ref(struct raid_set *rs)
+{
+	atomic_t *in_process = &rs->io.in_process;
+
+	return atomic_read(in_process);
+}
+
+/* Take an io reference and keep track of the highest count seen. */
+static void io_get(struct raid_set *rs)
+{
+	atomic_t *max = &rs->io.in_process_max;
+	int now = atomic_inc_return(&rs->io.in_process);
+
+	/* REMOVEME: max. */
+	if (atomic_read(max) < now)
+		atomic_set(max, now);
+}
+
+/*
+ * Drop an io reference.
+ *
+ * Waiters on the suspend queue are woken when the last reference goes;
+ * otherwise the remaining count is sanity checked.
+ */
+static void io_put(struct raid_set *rs)
+{
+	/* Intel: rebuild data corrupter? */
+	if (!atomic_dec_and_test(&rs->io.in_process)) {
+		BUG_ON(io_ref(rs) < 0);
+		return;
+	}
+
+	wake_up(&rs->io.suspendq);
+}
+
+/* Sleep on the suspend queue until no io reference is held any more. */
+static void wait_ios(struct raid_set *rs)
+{
+	wait_event(rs->io.suspendq, io_ref(rs) == 0);
+}
+
+/* Queue the main io worker on the set's workqueue after @delay. */
+static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
+{
+	struct delayed_work *dws = &rs->io.dws_do_raid;
+
+	queue_delayed_work(rs->io.wq, dws, delay);
+}
+
+/*
+ * Queue io work immediately (called from region hash too).
+ *
+ * @context is the RAID set whose main worker is to be queued.
+ */
+static void wake_do_raid(void *context)
+{
+	struct raid_set *rs = context;
+	struct work_struct *work = &rs->io.dws_do_raid.work;
+
+	queue_work(rs->io.wq, work);
+}
+
+/* Calculate device sector offset: bio start sector / # of data devices. */
+static sector_t _sector(struct raid_set *rs, struct bio *bio)
+{
+	sector_t dev_sector = bio->bi_sector;
+
+	/* sector_div() divides in place; its return value is not needed. */
+	sector_div(dev_sector, rs->set.data_devs);
+	return dev_sector;
+}
+
+/* Return the stripe cache's current count of active stripes. */
+static int sc_active(struct stripe_cache *sc)
+{
+	atomic_t *active = &sc->active_stripes;
+
+	return atomic_read(active);
+}
+
+/*
+ * Stripe cache busy indicator.
+ *
+ * True once the active stripes exceed the total # of stripes
+ * minus half of STRIPES_MIN.
+ */
+static int sc_busy(struct raid_set *rs)
+{
+	struct stripe_cache *sc = &rs->sc;
+
+	return sc_active(sc) >
+	       atomic_read(&sc->stripes) - (STRIPES_MIN / 2);
+}
+
+/* Chunk state selector for chunk_set(). */
+enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
+
+/*
+ * Set chunk state.
+ *
+ * ERROR flags both the chunk and its stripe as being in error and
+ * changes nothing else. CLEAN/DIRTY clear/set the dirty flag and then
+ * mark the chunk uptodate, allowed for io and free of errors.
+ */
+static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
+{
+	if (type == ERROR) {
+		SetChunkError(chunk);
+		SetStripeError(chunk->stripe);
+		return;
+	}
+
+	if (type == CLEAN)
+		ClearChunkDirty(chunk);
+	else if (type == DIRTY)
+		SetChunkDirty(chunk);
+	else
+		BUG();
+
+	SetChunkUptodate(chunk);
+	SetChunkIo(chunk);
+	ClearChunkError(chunk);
+}
+
+/*
+ * Return 1 if the region holding @sector is in any of the states
+ * selected by the @state mask, 0 otherwise.
+ */
+static int region_state(struct raid_set *rs, sector_t sector,
+			enum dm_rh_region_states state)
+{
+	struct dm_rh_client *rh = rs->recover.rh;
+	region_t region = dm_rh_sector_to_region(rh, sector);
+
+	if (dm_rh_get_state(rh, region, 1) & state)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Return true in case a chunk should be read/written
+ *
+ * Conditions to read/write:
+ * o chunk not uptodate
+ * o chunk dirty
+ *
+ * Conditions to avoid io:
+ * o io already ongoing on chunk
+ * o io explicitly prohibited
+ *
+ * A positive decision is remembered in the chunk's MustIo flag, so the
+ * next call returns 1 straight away and clears that flag again.
+ */
+static int chunk_io(struct stripe_chunk *chunk)
+{
+	/* 2nd run optimization (flag set below on first run). */
+	if (TestClearChunkMustIo(chunk))
+		return 1;
+
+	/* Avoid io if prohibited or a locked chunk. */
+	if (!ChunkIo(chunk) || ChunkLocked(chunk))
+		return 0;
+
+	/* Uptodate and clean: nothing to read or write. */
+	if (ChunkUptodate(chunk) && !ChunkDirty(chunk))
+		return 0;
+
+	SetChunkMustIo(chunk); /* 2nd run optimization. */
+	return 1;
+}
+
+/*
+ * Call @f_io on each chunk needing io unless its device failed.
+ *
+ * Returns the number of chunks @f_io has been called for.
+ */
+static unsigned for_each_io_dev(struct stripe *stripe,
+				void (*f_io)(struct stripe *stripe, unsigned p))
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned dev, called = 0;
+
+	for (dev = 0; dev < rs->set.raid_devs; dev++) {
+		/* chunk_io() has to run first: it updates chunk flags. */
+		if (!chunk_io(CHUNK(stripe, dev)) || DevFailed(rs->dev + dev))
+			continue;
+
+		f_io(stripe, dev);
+		called++;
+	}
+
+	return called;
+}
+
+/*
+ * Index of device to calculate parity on.
+ *
+ * Either the parity device index *or* the selected
+ * device to init after a spare replacement.
+ *
+ * *sync is set to true if the stripe's region is neither
+ * nosync nor recovering. Returns -1 for RAID5 recovery stripes.
+ */
+static int dev_for_parity(struct stripe *stripe, int *sync)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	int nosync = region_state(rs, stripe->key,
+				  DM_RH_NOSYNC | DM_RH_RECOVERING);
+
+	*sync = !nosync;
+
+	/* Reconstruct a particular device ?. */
+	if (nosync && rs->set.dev_to_init > -1)
+		return rs->set.dev_to_init;
+
+	if (rs->set.raid_type->level == raid4)
+		return rs->set.pi;
+
+	if (StripeRecover(stripe))
+		return -1;
+
+	return stripe->idx.parity;
+}
+
+/*
+ * RAID set congested function.
+ *
+ * The set counts as congested if the stripe cache is busy, the set is
+ * suspended or writes are prohibited; otherwise if any component
+ * device reports congestion for @bdi_bits.
+ */
+static int rs_congested(void *congested_data, int bdi_bits)
+{
+	struct raid_set *rs = congested_data;
+	unsigned p = rs->set.raid_devs;
+	int r = 0;
+
+	if (sc_busy(rs) || RSSuspend(rs) || RSProhibitWrites(rs)) {
+		r = 1;
+	} else {
+		/* If any of our component devices are overloaded. */
+		while (!r && p--) {
+			struct request_queue *q =
+				bdev_get_queue(rs->dev[p].dev->bdev);
+
+			r = bdi_congested(&q->backing_dev_info, bdi_bits);
+		}
+	}
+
+	/* REMOVEME: statistics. */
+	atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
+	return r;
+}
+
+/*
+ * RAID device degrade check.
+ *
+ * Marks device @p of the set as failed; nothing else happens if it was
+ * marked failed before. Logs a FATAL message (once) when more devices
+ * failed than the RAID type has parity devices, logs the first member
+ * error and records its index in rs->set.ei, then prohibits writes and
+ * schedules the table event worker.
+ *
+ * @stripe may be NULL, in which case the error is reported as "FAKED"
+ * with region 0.
+ */
+static void rs_check_degrade_dev(struct raid_set *rs,
+				 struct stripe *stripe, unsigned p)
+{
+	char buf[BDEVNAME_SIZE];
+
+	if (TestSetDevFailed(rs->dev + p))
+		return;
+
+	/* Throw an event in case of member device errors. */
+	if ((atomic_inc_return(&rs->set.failed_devs) >
+	     rs->set.raid_type->parity_devs) &&
+	    !TestSetRSDead(rs)) {
+		/*
+		 * Display RAID set dead message once.
+		 *
+		 * Uses its own index so that parameter 'p' is not shadowed.
+		 */
+		unsigned i;
+
+		DMERR("FATAL: too many devices failed -> RAID set broken");
+		for (i = 0; i < rs->set.raid_devs; i++) {
+			if (DevFailed(rs->dev + i))
+				DMERR("device /dev/%s failed",
+				      bdevname(rs->dev[i].dev->bdev, buf));
+		}
+	}
+
+	/* Only log the first member error. */
+	if (!TestSetRSDegraded(rs)) {
+		/* Store index for recovery. */
+		rs->set.ei = p;
+		DMERR("CRITICAL: %sio error on device /dev/%s "
+		      "in region=%llu; DEGRADING RAID set\n",
+		      stripe ? "" : "FAKED ",
+		      bdevname(rs->dev[p].dev->bdev, buf),
+		      (unsigned long long) (stripe ? stripe->key : 0));
+		DMERR("further device error messages suppressed");
+	}
+
+	/* Prohibit further writes to allow for userspace to update metadata. */
+	SetRSProhibitWrites(rs);
+	schedule_work(&rs->io.ws_do_table_event);
+}
+
+/*
+ * RAID set degrade check.
+ *
+ * Runs the device degrade check for every chunk of @stripe flagged
+ * with an error, walking the devices from the last one down to 0.
+ */
+static void rs_check_degrade(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned p;
+
+	for (p = rs->set.raid_devs; p--; ) {
+		if (!ChunkError(CHUNK(stripe, p)))
+			continue;
+
+		rs_check_degrade_dev(rs, stripe, p);
+	}
+}
+
+/*
+ * Lookup a RAID device by its block device number (bd_dev).
+ *
+ * Returns the index of the matching device in rs->dev[] or -ENODEV.
+ */
+static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
+{
+	unsigned p;
+	struct raid_dev *dev;
+
+	/*
+	 * Must be an incremental loop, because the device array
+	 * can have empty slots still on calls from raid_ctr()
+	 *
+	 * The bounds check has to come first: with all slots populated,
+	 * testing dev->dev first would read past the end of the array.
+	 */
+	for (dev = rs->dev, p = 0;
+	     p < rs->set.raid_devs && dev->dev;
+	     dev++, p++) {
+		if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
+			return p;
+	}
+
+	return -ENODEV;
+}
+/*
+ * End small helper functions.
+ */
+
+/*
+ * Stripe hash functions
+ */
+/*
+ * Initialize a stripe hash sized for @stripes stripes.
+ *
+ * The bucket count is (stripes / 2) rounded up to a power of two.
+ * Returns 0 on success or -ENOMEM if the bucket array can't be allocated.
+ */
+static int hash_init(struct stripe_hash *hash, unsigned stripes)
+{
+	unsigned buckets = roundup_pow_of_two(stripes >> 1);
+	static const unsigned hash_primes[] = {
+		/*
+		 * Table of primes for hash_fn/table size optimization.
+		 * NOTE(review): 27 is not a prime - confirm intended value.
+		 */
+		1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
+		1543, 3079, 6151, 12289, 24593, 49157, 98317,
+	};
+
+	/* Allocate stripe hash buckets. */
+	hash->hash = vmalloc(buckets * sizeof(*hash->hash));
+	if (!hash->hash)
+		return -ENOMEM;
+
+	hash->buckets = buckets;
+	hash->mask = buckets - 1;
+	hash->shift = ffs(buckets);
+	/* Clamp to the last table entry; '>=' keeps the index in bounds. */
+	if (hash->shift >= ARRAY_SIZE(hash_primes))
+		hash->shift = ARRAY_SIZE(hash_primes) - 1;
+
+	BUG_ON(hash->shift < 2);
+	hash->prime = hash_primes[hash->shift];
+
+	/* Initialize buckets. */
+	while (buckets--)
+		INIT_LIST_HEAD(hash->hash + buckets);
+	return 0;
+}
+
+/* Free the bucket array of a stripe hash, if it has one. */
+static void hash_exit(struct stripe_hash *hash)
+{
+	if (!hash->hash)
+		return;
+
+	vfree(hash->hash);
+	hash->hash = NULL;
+}
+
+/*
+ * Map a stripe key to a bucket index: multiply by the hash's prime,
+ * shift right by the hash's shift and mask down to the bucket range.
+ */
+static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
+{
+ return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
+}
+
+/* Return the hash bucket (list head) responsible for @key. */
+static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
+{
+	unsigned idx = hash_fn(hash, key);
+
+	return &hash->hash[idx];
+}
+
+/* Insert a stripe at the head of the hash bucket selected by its key. */
+static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
+{
+	struct list_head *bucket = hash_bucket(hash, stripe->key);
+
+	list_add(stripe->lists + LIST_HASH, bucket);
+}
+
+/*
+ * Lookup an entry in the stripe hash.
+ *
+ * Returns the stripe with a matching key or NULL if there is none.
+ */
+static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
+{
+	struct list_head *bucket = hash_bucket(&sc->hash, key);
+	struct stripe *stripe;
+	unsigned look = 0;
+
+	list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
+		look++;
+
+		if (stripe->key != key)
+			continue;
+
+		/* REMOVEME: statistics (longest bucket walk so far). */
+		if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
+			atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);
+
+		return stripe;
+	}
+
+	return NULL;
+}
+
+/*
+ * Resize the stripe cache hash on size changes.
+ *
+ * A new hash is built for the current # of stripes, all entries of the
+ * old one (if any) are moved across and the old bucket array is freed.
+ * Returns 0 on success or if nothing changed, else hash_init()'s error.
+ */
+static int sc_hash_resize(struct stripe_cache *sc)
+{
+	int r;
+	unsigned b;
+	struct stripe_hash new_hash;
+	struct list_head *pos, *tmp;
+
+	/* Resize indicated ? */
+	if (atomic_read(&sc->stripes) == atomic_read(&sc->stripes_last))
+		return 0;
+
+	r = hash_init(&new_hash, atomic_read(&sc->stripes));
+	if (r)
+		return r;
+
+	/* Walk old buckets and insert into new. */
+	if (sc->hash.hash) {
+		for (b = sc->hash.buckets; b--; ) {
+			list_for_each_safe(pos, tmp, sc->hash.hash + b) {
+				struct stripe *stripe =
+					list_entry(pos, struct stripe,
+						   lists[LIST_HASH]);
+
+				stripe_insert(&new_hash, stripe);
+			}
+		}
+	}
+
+	hash_exit(&sc->hash);
+	memcpy(&sc->hash, &new_hash, sizeof(sc->hash));
+	atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
+	return 0;
+}
+/* End hash stripe hash function. */
+
+/* List add, delete, push and pop functions. */
+/* Add stripe to flush list. */
+#define DEL_LIST(lh) \
+ if (!list_empty(lh)) \
+ list_del_init(lh);
+
+/* Take a stripe off its hash bucket if it is hashed. */
+static void stripe_hash_del(struct stripe *stripe)
+{
+	struct list_head *entry = stripe->lists + LIST_HASH;
+
+	if (!list_empty(entry))
+		list_del_init(entry);
+}
+
+/* Return the current reference count of a stripe. */
+static inline int stripe_ref(struct stripe *stripe)
+{
+	atomic_t *cnt = &stripe->cnt;
+
+	return atomic_read(cnt);
+}
+
+/*
+ * Add stripe to the tail of the flush list, unless it is flagged
+ * for reconstruction or is on that list already.
+ */
+static void stripe_flush_add(struct stripe *stripe)
+{
+	struct list_head *entry = stripe->lists + LIST_FLUSH;
+
+	if (StripeReconstruct(stripe) || !list_empty(entry))
+		return;
+
+	list_add_tail(entry, stripe->sc->lists + LIST_FLUSH);
+}
+
+/*
+ * Add stripe to LRU (inactive) list.
+ *
+ * Recovery stripes are never put on the LRU list and a stripe
+ * that is on it already stays where it is.
+ *
+ * NOTE(review): the original comment asks for a lock because of
+ * concurrent access from the message interface, but none is taken
+ * here - confirm that callers serialize access.
+ */
+static void stripe_lru_add(struct stripe *stripe)
+{
+	struct list_head *entry;
+
+	if (StripeRecover(stripe))
+		return;
+
+	entry = stripe->lists + LIST_LRU;
+	if (list_empty(entry))
+		list_add_tail(entry, stripe->sc->lists + LIST_LRU);
+}
+
+/*
+ * Pop the first stripe off stripe cache list 'list'.
+ *
+ * Relies on two variables of the expanding function: 'sc' (the stripe
+ * cache) and 'stripe', which receives the popped stripe or NULL if the
+ * list is empty.
+ *
+ * NOTE(review): the expansion ends in "while (0);" including the
+ * semicolon, so POP_LIST(x); yields an extra empty statement and the
+ * macro is not safe in an unbraced if/else.
+ */
+#define POP_LIST(list) \
+ do { \
+ if (list_empty(sc->lists + (list))) \
+ stripe = NULL; \
+ else { \
+ stripe = list_first_entry(sc->lists + (list), \
+ struct stripe, \
+ lists[(list)]); \
+ list_del_init(stripe->lists + (list)); \
+ } \
+ } while (0);
+
+/* Pop the first stripe off the LRU list; NULL if the list is empty. */
+static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
+{
+	struct list_head *head = sc->lists + LIST_LRU;
+	struct stripe *stripe;
+
+	if (list_empty(head))
+		return NULL;
+
+	stripe = list_first_entry(head, struct stripe, lists[LIST_LRU]);
+	list_del_init(stripe->lists + LIST_LRU);
+	return stripe;
+}
+
+/* Pop the first stripe off the flush (io) list; NULL if it is empty. */
+static struct stripe *stripe_io_pop(struct stripe_cache *sc)
+{
+	struct list_head *head = sc->lists + LIST_FLUSH;
+	struct stripe *stripe;
+
+	if (list_empty(head))
+		return NULL;
+
+	stripe = list_first_entry(head, struct stripe, lists[LIST_FLUSH]);
+	list_del_init(stripe->lists + LIST_FLUSH);
+	return stripe;
+}
+
+/*
+ * Push a stripe safely onto the endio list to be handled by do_endios().
+ *
+ * The list is protected by the endio spinlock with interrupts saved
+ * and restored; the worker is woken in any case.
+ */
+static void stripe_endio_push(struct stripe *stripe)
+{
+	struct stripe_cache *sc = stripe->sc;
+	spinlock_t *lock = sc->locks + LOCK_ENDIO;
+	struct list_head *entry = stripe->lists + LIST_ENDIO;
+	unsigned long flags;
+
+	/* This runs in parallel with do_endios(). */
+	spin_lock_irqsave(lock, flags);
+	if (list_empty(entry))
+		list_add_tail(entry, sc->lists + LIST_ENDIO);
+	spin_unlock_irqrestore(lock, flags);
+
+	wake_do_raid(RS(sc)); /* Wake myself. */
+}
+
+/*
+ * Pop a stripe safely off the endio list.
+ *
+ * Returns the first stripe on the list or NULL if the list is empty.
+ */
+static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
+{
+	struct stripe *stripe;
+	spinlock_t *lock = sc->locks + LOCK_ENDIO;
+
+	/* This runs in parallel with endio(). */
+	spin_lock_irq(lock);
+	/* Terminated like any other statement, as the other pop helpers do. */
+	POP_LIST(LIST_ENDIO);
+	spin_unlock_irq(lock);
+	return stripe;
+}
+#undef POP_LIST
+
+/*
+ * Stripe cache locking functions
+ */
+/*
+ * Dummy lock function for single host RAID4+5.
+ *
+ * Ignores its arguments and returns a non-NULL handle (its own
+ * address), which stripe_lock() treats as success.
+ */
+static void *no_lock(sector_t key, enum dm_lock_type type)
+{
+ return &no_lock;
+}
+
+/* Dummy unlock function for single host RAID4+5; does nothing. */
+static void no_unlock(void *lock_handle)
+{
+}
+
+/* No locking (for single host RAID 4+5): both methods are dummies. */
+static struct dm_raid45_locking_type locking_none = {
+ .lock = no_lock,
+ .unlock = no_unlock,
+};
+
+/*
+ * Lock a stripe (for clustering).
+ *
+ * Reads take a shared lock, anything else an exclusive one.
+ * Returns 0 on success or -EPERM if no lock handle was obtained.
+ */
+static int
+stripe_lock(struct stripe *stripe, int rw, sector_t key)
+{
+	struct dm_raid45_locking_type *locking = RS(stripe->sc)->locking;
+
+	if (rw == READ)
+		stripe->lock = locking->lock(key, DM_RAID45_SHARED);
+	else
+		stripe->lock = locking->lock(key, DM_RAID45_EX);
+
+	if (!stripe->lock)
+		return -EPERM;
+
+	return 0;
+}
+
+/* Unlock a stripe (for clustering) and forget its lock handle. */
+static void stripe_unlock(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+
+	rs->locking->unlock(stripe->lock);
+	stripe->lock = NULL;
+}
+
+/* Return the number of ios pending on a stripe. */
+static int stripe_io_ref(struct stripe *stripe)
+{
+	atomic_t *pending = &stripe->io.pending;
+
+	return atomic_read(pending);
+}
+
+/*
+ * Take an io reference on a stripe.
+ *
+ * The first reference also counts the stripe as active in its cache.
+ */
+static void stripe_io_get(struct stripe *stripe)
+{
+	if (atomic_inc_return(&stripe->io.pending) != 1) {
+		BUG_ON(stripe_io_ref(stripe) < 0);
+		return;
+	}
+
+	/* REMOVEME: statistics */
+	atomic_inc(&stripe->sc->active_stripes);
+}
+
+/*
+ * Drop an io reference on a stripe.
+ *
+ * When the last one goes, the worker is woken (recovery stripes) or
+ * the stripe is pushed onto the endio list (regular stripes), and the
+ * cache's active stripes count is decremented.
+ */
+static void stripe_io_put(struct stripe *stripe)
+{
+	if (!atomic_dec_and_test(&stripe->io.pending)) {
+		BUG_ON(stripe_io_ref(stripe) < 0);
+		return;
+	}
+
+	if (unlikely(StripeRecover(stripe))) {
+		/* Don't put recovery stripe on endio list. */
+		wake_do_raid(RS(stripe->sc));
+	} else {
+		/* Add regular stripe to endio list and wake daemon. */
+		stripe_endio_push(stripe);
+	}
+
+	/* REMOVEME: statistics */
+	atomic_dec(&stripe->sc->active_stripes);
+}
+
+/*
+ * Take stripe reference out.
+ *
+ * Removes the stripe from the LRU list if it is on it. The first
+ * reference locks the stripe for writing; the result of that lock
+ * attempt is returned, 0 otherwise.
+ */
+static int stripe_get(struct stripe *stripe)
+{
+	struct list_head *lru = stripe->lists + LIST_LRU;
+
+	/* Delete stripe from LRU (inactive) list if on. */
+	if (!list_empty(lru))
+		list_del_init(lru);
+
+	BUG_ON(stripe_ref(stripe) < 0);
+
+	/* Lock stripe on first reference */
+	if (atomic_inc_return(&stripe->cnt) != 1)
+		return 0;
+
+	return stripe_lock(stripe, WRITE, stripe->key);
+}
+#undef DEL_LIST
+
+/* Return the current reference count of a chunk. */
+static int chunk_ref(struct stripe_chunk *chunk)
+{
+	atomic_t *cnt = &chunk->cnt;
+
+	return atomic_read(cnt);
+}
+
+/* Take out a reference on a chunk and return the new count. */
+static int chunk_get(struct stripe_chunk *chunk)
+{
+	atomic_t *cnt = &chunk->cnt;
+
+	return atomic_inc_return(cnt);
+}
+
+/* Drop a reference on a chunk; the count must never go negative. */
+static void chunk_put(struct stripe_chunk *chunk)
+{
+	int left = atomic_dec_return(&chunk->cnt);
+
+	BUG_ON(left < 0);
+}
+
+/*
+ * Drop reference on a stripe.
+ *
+ * Dropping the last reference unlocks the stripe, which must not have
+ * any io pending at that point. This function does not touch the LRU
+ * list itself.
+ */
+static void stripe_put(struct stripe *stripe)
+{
+	if (!atomic_dec_and_test(&stripe->cnt)) {
+		BUG_ON(stripe_ref(stripe) < 0);
+		return;
+	}
+
+	BUG_ON(stripe_io_ref(stripe));
+	stripe_unlock(stripe);
+}
+
+/*
+ * Helper needed by for_each_io_dev().
+ *
+ * Takes a global io reference, a stripe reference (in order to
+ * protect vs. LRU list moves) and a stripe io reference.
+ * @p is unused; it is only there to match the callback signature.
+ */
+static void stripe_get_references(struct stripe *stripe, unsigned p)
+{
+	struct raid_set *rs = RS(stripe->sc);
+
+	io_get(rs);		/* Global io references. */
+	stripe_get(stripe);
+	stripe_io_get(stripe);	/* One for each chunk io. */
+}
+
+/*
+ * Helper for endio() to put all taken references: the stripe io
+ * reference, the stripe reference and the global io reference.
+ */
+static void stripe_put_references(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+
+	stripe_io_put(stripe);	/* One for each chunk io. */
+	stripe_put(stripe);
+	io_put(rs);
+}
+
+/*
+ * Stripe cache functions.
+ */
+/*
+ * Invalidate a single chunk (i.e. its pages) of a stripe
+ * by clearing all of the chunk's io flags.
+ *
+ * I only keep state for the whole chunk.
+ */
+static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
+{
+ chunk->io.flags = 0;
+}
+
+/* Invalidate every chunk of a stripe (one per RAID device). */
+static void
+stripe_chunks_invalidate(struct stripe *stripe)
+{
+	unsigned p, devs = RS(stripe->sc)->set.raid_devs;
+
+	for (p = 0; p < devs; p++)
+		stripe_chunk_invalidate(CHUNK(stripe, p));
+}
+
+/*
+ * Prepare stripe for (re)use: clear the stripe's io flags, reset the
+ * parity and recover indexes to -1 and invalidate all of its chunks.
+ */
+static void stripe_invalidate(struct stripe *stripe)
+{
+ stripe->io.flags = 0;
+ stripe->idx.parity = stripe->idx.recover = -1;
+ stripe_chunks_invalidate(stripe);
+}
+
+/*
+ * Allow io on all chunks of a stripe.
+ * If not set, IO will not occur; i.e. it's prohibited.
+ *
+ * Actual IO submission for allowed chunks depends
+ * on their !uptodate or dirty state.
+ */
+static void stripe_allow_io(struct stripe *stripe)
+{
+	unsigned p, devs = RS(stripe->sc)->set.raid_devs;
+
+	for (p = 0; p < devs; p++)
+		SetChunkIo(CHUNK(stripe, p));
+}
+
+/*
+ * Initialize a stripe.
+ *
+ * Resets all chunks (reference count, back pointer, bio lists), binds
+ * the stripe to its cache, initializes its list heads, io size and
+ * counters and finally invalidates it.
+ */
+static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
+{
+	struct raid_set *rs = RS(sc);
+	unsigned i, p;
+
+	/* Work all io chunks. */
+	for (p = 0; p < rs->set.raid_devs; p++) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+		atomic_set(&chunk->cnt, 0);
+		chunk->stripe = stripe;
+
+		for (i = 0; i < ARRAY_SIZE(chunk->bl); i++)
+			bio_list_init(chunk->bl + i);
+	}
+
+	stripe->sc = sc;
+
+	for (i = 0; i < ARRAY_SIZE(stripe->lists); i++)
+		INIT_LIST_HEAD(stripe->lists + i);
+
+	stripe->io.size = rs->set.io_size;
+	atomic_set(&stripe->cnt, 0);
+	atomic_set(&stripe->io.pending, 0);
+	stripe_invalidate(stripe);
+}
+
+/*
+ * Number of pages per chunk: @sectors divided by SECTORS_PER_PAGE via
+ * dm_div_up().
+ * NOTE(review): dm_div_up() presumably rounds up - defined elsewhere.
+ */
+static inline unsigned chunk_pages(unsigned sectors)
+{
+ return dm_div_up(sectors, SECTORS_PER_PAGE);
+}
+
+/* Number of pages per stripe: pages per chunk times # of RAID devices. */
+static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
+{
+	unsigned per_chunk = chunk_pages(io_size);
+
+	return per_chunk * rs->set.raid_devs;
+}
+
+/*
+ * Initialize part of page_list (recovery).
+ *
+ * Zeroes whole pages of chunk @p, beginning with the page holding
+ * sector offset @start and covering @count sectors (or up to the end
+ * of the page list, whichever comes first).
+ */
+static void stripe_zero_pl_part(struct stripe *stripe, int p,
+				unsigned start, unsigned count)
+{
+	unsigned first = start / SECTORS_PER_PAGE;
+	unsigned pages = chunk_pages(count);
+	/* Get offset into the page_list. */
+	struct page_list *pl = pl_elem(PL(stripe, p), first);
+
+	BUG_ON(!pl);
+	for (; pl && pages; pages--, pl = pl->next) {
+		BUG_ON(!pl->page);
+		memset(page_address(pl->page), 0, PAGE_SIZE);
+	}
+}
+
+/*
+ * Zero chunk @p of a stripe over the stripe's whole io size.
+ * A negative @p (no chunk selected) is silently ignored.
+ */
+static void stripe_zero_chunk(struct stripe *stripe, int p)
+{
+	if (p <= -1)
+		return;
+
+	stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
+}
+
+/*
+ * Return dynamic stripe structure size: the stripe header
+ * plus one stripe_chunk per RAID device.
+ */
+static size_t stripe_size(struct raid_set *rs)
+{
+	size_t chunks = rs->set.raid_devs * sizeof(struct stripe_chunk);
+
+	return sizeof(struct stripe) + chunks;
+}
+
+/*
+ * Allocate a stripe and its memory object.
+ *
+ * With SC_GROW, memory cache @mc is grown by one object first (and
+ * shrunk again should the object allocation fail); with SC_KEEP the
+ * object is taken from what the cache holds already.
+ * Returns the initialized stripe or NULL on any failure.
+ */
+/* XXX adjust to cope with stripe cache and recovery stripe caches. */
+enum grow { SC_GROW, SC_KEEP };
+static struct stripe *stripe_alloc(struct stripe_cache *sc,
+				   struct dm_mem_cache_client *mc,
+				   enum grow grow)
+{
+	struct stripe *stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
+
+	if (!stripe)
+		return NULL;
+
+	/* Grow the dm-mem-cache by one object. */
+	if (grow == SC_GROW && dm_mem_cache_grow(mc, 1))
+		goto err_free;
+
+	stripe->obj = dm_mem_cache_alloc(mc);
+	if (IS_ERR(stripe->obj))
+		goto err_shrink;
+
+	stripe_init(sc, stripe);
+	return stripe;
+
+err_shrink:
+	if (grow == SC_GROW)
+		dm_mem_cache_shrink(mc, 1);
+err_free:
+	kmem_cache_free(sc->kc.cache, stripe);
+	return NULL;
+}
+
+/*
+ * Free a stripe's memory object, shrink the memory
+ * cache by one object and free the stripe itself.
+ */
+static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
+{
+	struct stripe_cache *sc = stripe->sc;
+
+	dm_mem_cache_free(mc, stripe->obj);
+	dm_mem_cache_shrink(mc, 1);
+	kmem_cache_free(sc->kc.cache, stripe);
+}
+
+/*
+ * Free the recovery stripes and the recovery clients.
+ *
+ * Frees all stripes on the recovery list together with their recover
+ * structures, then destroys the recovery memory cache client and the
+ * recovery dm-io client.
+ *
+ * sc_init() can leave either client pointer NULL or holding an error
+ * pointer when it fails half way through, so both are checked before
+ * being destroyed (the same way sc_exit() checks its own clients).
+ */
+static void stripe_recover_free(struct raid_set *rs)
+{
+	struct recover *rec = &rs->recover;
+	struct dm_mem_cache_client *mc;
+
+	mc = rec->mem_cache_client;
+	rec->mem_cache_client = NULL;
+	if (mc && !IS_ERR(mc)) {
+		struct stripe *stripe;
+
+		while (!list_empty(&rec->stripes)) {
+			stripe = list_first_entry(&rec->stripes, struct stripe,
+						  lists[LIST_RECOVER]);
+			list_del(stripe->lists + LIST_RECOVER);
+			kfree(stripe->recover);
+			stripe_free(stripe, mc);
+		}
+
+		dm_mem_cache_client_destroy(mc);
+	}
+
+	if (rec->dm_io_client && !IS_ERR(rec->dm_io_client))
+		dm_io_client_destroy(rec->dm_io_client);
+
+	rec->dm_io_client = NULL;
+}
+
+/*
+ * Grow stripe cache by @stripes stripes.
+ *
+ * Each new stripe goes onto the LRU list. Returns -ENOMEM as soon as
+ * one allocation fails (stripes added before that stay in the cache),
+ * else the result of resizing the stripe hash.
+ */
+static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
+{
+	/* Try to allocate this many (additional) stripes. */
+	while (stripes--) {
+		struct stripe *stripe =
+			stripe_alloc(sc, sc->mem_cache_client, grow);
+
+		if (unlikely(!stripe))
+			return -ENOMEM;
+
+		stripe_lru_add(stripe);
+		atomic_inc(&sc->stripes);
+	}
+
+	return sc_hash_resize(sc);
+}
+
+/*
+ * Shrink stripe cache by @stripes stripes.
+ *
+ * Only unused stripes (those on the LRU list) can be freed; -ENOENT is
+ * returned once the list runs dry. On success the stripe hash gets
+ * resized, unless no stripes are left at all.
+ */
+static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
+{
+	int r = 0;
+	struct stripe *stripe;
+
+	/* Try to get unused stripe from LRU list. */
+	for (; stripes; stripes--) {
+		stripe = stripe_lru_pop(sc);
+		if (!stripe) {
+			r = -ENOENT;
+			break;
+		}
+
+		/* An LRU stripe may never have ios pending! */
+		BUG_ON(stripe_io_ref(stripe));
+		BUG_ON(stripe_ref(stripe));
+		atomic_dec(&sc->stripes);
+		/* Remove from hash if on before deletion. */
+		stripe_hash_del(stripe);
+		stripe_free(stripe, sc->mem_cache_client);
+	}
+
+	/* Check if stats are still sane. */
+	if (atomic_read(&sc->active_stripes_max) >
+	    atomic_read(&sc->stripes))
+		atomic_set(&sc->active_stripes_max, 0);
+
+	/* Failed, or nothing left which would need a hash. */
+	if (r || !atomic_read(&sc->stripes))
+		return r;
+
+	return sc_hash_resize(sc);
+}
+
+/*
+ * Create stripe cache and recovery.
+ *
+ * Sets up the cache's lists, locks and counters, creates the stripe
+ * kmem cache, the memory cache and dm-io clients for both regular and
+ * recovery stripes, allocates the recovery stripes and finally grows
+ * the cache to @stripes stripes.
+ *
+ * Returns 0 on success or a negative errno. Nothing is released here
+ * on failure.
+ * NOTE(review): presumably the caller runs sc_exit() to clean up after
+ * a failure - confirm against the constructor.
+ */
+static int sc_init(struct raid_set *rs, unsigned stripes)
+{
+	int r; /* Signed: carries negative errno values. */
+	unsigned i, rstripes;
+	struct stripe_cache *sc = &rs->sc;
+	struct stripe *stripe;
+	struct recover *rec = &rs->recover;
+	struct mapped_device *md;
+	struct gendisk *disk;
+
+	/* Initialize lists and locks. */
+	i = ARRAY_SIZE(sc->lists);
+	while (i--)
+		INIT_LIST_HEAD(sc->lists + i);
+
+	INIT_LIST_HEAD(&rec->stripes);
+
+	/* Initialize endio and LRU list locks. */
+	i = NR_LOCKS;
+	while (i--)
+		spin_lock_init(sc->locks + i);
+
+	/* Initialize atomic variables. */
+	atomic_set(&sc->stripes, 0);
+	atomic_set(&sc->stripes_to_set, 0);
+	atomic_set(&sc->active_stripes, 0);
+	atomic_set(&sc->active_stripes_max, 0); /* REMOVEME: statistics. */
+
+	/*
+	 * We need a runtime unique # to suffix the kmem cache name
+	 * because we'll have one for each active RAID set.
+	 */
+	md = dm_table_get_md(rs->ti->table);
+	disk = dm_disk(md);
+	snprintf(sc->kc.name, sizeof(sc->kc.name), "%s-%d.%d", TARGET,
+		 disk->first_minor, atomic_inc_return(&_stripe_sc_nr));
+	sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
+					 0, 0, NULL);
+	if (!sc->kc.cache)
+		return -ENOMEM;
+
+	/* Create memory cache client context for RAID stripe cache. */
+	sc->mem_cache_client =
+		dm_mem_cache_client_create(stripes, rs->set.raid_devs,
+					   chunk_pages(rs->set.io_size));
+	if (IS_ERR(sc->mem_cache_client))
+		return PTR_ERR(sc->mem_cache_client);
+
+	/* Create memory cache client context for RAID recovery stripe(s). */
+	rstripes = rec->recovery_stripes;
+	rec->mem_cache_client =
+		dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
+					   chunk_pages(rec->io_size));
+	if (IS_ERR(rec->mem_cache_client))
+		return PTR_ERR(rec->mem_cache_client);
+
+	/* Create dm-io client context for IO stripes. */
+	sc->dm_io_client = dm_io_client_create();
+	if (IS_ERR(sc->dm_io_client))
+		return PTR_ERR(sc->dm_io_client);
+
+	/* FIXME: intermingled with stripe cache initialization. */
+	/* Create dm-io client context for recovery stripes. */
+	rec->dm_io_client = dm_io_client_create();
+	if (IS_ERR(rec->dm_io_client))
+		return PTR_ERR(rec->dm_io_client);
+
+	/* Allocate stripes for set recovery. */
+	while (rstripes--) {
+		stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
+		if (!stripe)
+			return -ENOMEM;
+
+		stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
+		if (!stripe->recover) {
+			stripe_free(stripe, rec->mem_cache_client);
+			return -ENOMEM;
+		}
+
+		SetStripeRecover(stripe);
+		stripe->io.size = rec->io_size;
+		list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
+		/* Don't add recovery stripes to LRU list! */
+	}
+
+	/*
+	 * Allocate the stripe objects from the
+	 * cache and add them to the LRU list.
+	 */
+	r = sc_grow(sc, stripes, SC_KEEP);
+	if (!r)
+		atomic_set(&sc->stripes_last, stripes);
+
+	return r;
+}
+
+/*
+ * Destroy the stripe cache.
+ *
+ * Does nothing unless the stripe kmem cache exists. Frees the recovery
+ * stripes, shrinks the cache down to zero stripes (which must not
+ * fail), destroys the kmem cache, any valid clients and the hash.
+ */
+static void sc_exit(struct stripe_cache *sc)
+{
+	struct raid_set *rs = RS(sc);
+	int r;
+
+	if (!sc->kc.cache)
+		return;
+
+	stripe_recover_free(rs);
+
+	r = sc_shrink(sc, atomic_read(&sc->stripes));
+	BUG_ON(r);
+
+	kmem_cache_destroy(sc->kc.cache);
+	sc->kc.cache = NULL;
+
+	if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
+		dm_mem_cache_client_destroy(sc->mem_cache_client);
+
+	if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
+		dm_io_client_destroy(sc->dm_io_client);
+
+	hash_exit(&sc->hash);
+}
+
+/*
+ * Calculate RAID address
+ *
+ * Delivers tuple with the index of the data disk holding the chunk
+ * in the set, the parity disks index and the start of the stripe
+ * within the address space of the set (used as the stripe cache hash key).
+ *
+ * The result is stored in @addr, which is also the return value.
+ */
+/* thx MD. */
+static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
+ struct raid_address *addr)
+{
+ sector_t stripe, tmp;
+
+ /*
+ * chunk_number = sector / chunk_size
+ * stripe_number = chunk_number / data_devs
+ * di = chunk_number % data_devs;
+ *
+ * sector_div() leaves the quotient (the stripe number) in 'stripe'
+ * and returns the remainder.
+ */
+ stripe = sector >> rs->set.chunk_shift;
+ addr->di = sector_div(stripe, rs->set.data_devs);
+
+ switch (rs->set.raid_type->level) {
+ case raid4:
+ /* Fixed parity device; only the data index needs adjusting. */
+ addr->pi = rs->set.pi;
+ goto check_shift_di;
+ case raid5:
+ /* Parity rotates with the stripe number ('tmp' is scratch). */
+ tmp = stripe;
+ addr->pi = sector_div(tmp, rs->set.raid_devs);
+
+ switch (rs->set.raid_type->algorithm) {
+ case left_asym: /* Left asymmetric. */
+ addr->pi = rs->set.data_devs - addr->pi;
+ /* Fall through. */
+ case right_asym: /* Right asymmetric. */
+check_shift_di:
+ /* Skip over the parity device. */
+ if (addr->di >= addr->pi)
+ addr->di++;
+ break;
+ case left_sym: /* Left symmetric. */
+ addr->pi = rs->set.data_devs - addr->pi;
+ /* Fall through. */
+ case right_sym: /* Right symmetric. */
+ addr->di = (addr->pi + addr->di + 1) %
+ rs->set.raid_devs;
+ break;
+ case none: /* Ain't happen: RAID4 algorithm placeholder. */
+ BUG();
+ }
+ }
+
+ /*
+ * Start offset of the stripes chunk on any single device of the RAID
+ * set, adjusted in case io size differs from chunk size.
+ */
+ addr->key = (stripe << rs->set.chunk_shift) +
+ (sector & rs->set.io_inv_mask);
+ return addr;
+}
+
+/*
+ * Copy data across between stripe pages and bio vectors.
+ *
+ * Pay attention to data alignment in stripe and bio pages.
+ *
+ * @rw == READ copies from the stripe's pages into the bio, any other
+ * value copies from the bio into the stripe's pages. @pl is the head
+ * of the chunk's page list; the start page and the offset into it are
+ * derived from the bio's start sector.
+ */
+static void bio_copy_page_list(int rw, struct stripe *stripe,
+ struct page_list *pl, struct bio *bio)
+{
+ unsigned i, page_offset;
+ void *page_addr;
+ struct raid_set *rs = RS(stripe->sc);
+ struct bio_vec *bv;
+
+ /* Get start page in page list for this sector. */
+ i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
+ pl = pl_elem(pl, i);
+ BUG_ON(!pl);
+ BUG_ON(!pl->page);
+
+ page_addr = page_address(pl->page);
+ page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));
+
+ /* Walk all segments and copy data across between bio_vecs and pages. */
+ bio_for_each_segment(bv, bio, i) {
+ int len = bv->bv_len, size;
+ unsigned bio_offset = 0;
+ void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);
+redo:
+ /* Copy no further than the end of the current stripe page. */
+ size = (page_offset + len > PAGE_SIZE) ?
+ PAGE_SIZE - page_offset : len;
+
+ if (rw == READ)
+ memcpy(bio_addr + bio_offset,
+ page_addr + page_offset, size);
+ else
+ memcpy(page_addr + page_offset,
+ bio_addr + bio_offset, size);
+
+ page_offset += size;
+ if (page_offset == PAGE_SIZE) {
+ /*
+ * We reached the end of the chunk page ->
+ * need to refer to the next one to copy more data.
+ *
+ * If this segment is used up as well, the page switch is
+ * left to the next segment: its first pass copies 0 bytes
+ * and ends up here again with len != 0.
+ */
+ len -= size;
+ if (len) {
+ /* Get next page. */
+ pl = pl->next;
+ BUG_ON(!pl);
+ BUG_ON(!pl->page);
+ page_addr = page_address(pl->page);
+ page_offset = 0;
+ bio_offset += size;
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
+ goto redo;
+ }
+ }
+
+ __bio_kunmap_atomic(bio_addr, KM_USER0);
+ }
+}
+
+/*
+ * Xor optimization macros.
+ */
+/*
+ * Xor data pointer declaration and initialization macros.
+ *
+ * DECLARE_<n> declares d0..d<n-1>, initialized from data[0..n-1];
+ * 'data' has to be in scope where the macro is expanded.
+ */
+#define DECLARE_2 unsigned long *d0 = data[0], *d1 = data[1]
+#define DECLARE_3 DECLARE_2, *d2 = data[2]
+#define DECLARE_4 DECLARE_3, *d3 = data[3]
+#define DECLARE_5 DECLARE_4, *d4 = data[4]
+#define DECLARE_6 DECLARE_5, *d5 = data[5]
+#define DECLARE_7 DECLARE_6, *d6 = data[6]
+#define DECLARE_8 DECLARE_7, *d7 = data[7]
+
+/*
+ * Xor unroll macros.
+ *
+ * D<n>(i) expands to one assignment xoring word i of d1..d<n-1> into
+ * d0[i]; each macro textually appends one more "^ dX[i]" term to the
+ * previous one.
+ */
+#define D2(n) d0[n] = d0[n] ^ d1[n]
+#define D3(n) D2(n) ^ d2[n]
+#define D4(n) D3(n) ^ d3[n]
+#define D5(n) D4(n) ^ d4[n]
+#define D6(n) D5(n) ^ d5[n]
+#define D7(n) D6(n) ^ d6[n]
+#define D8(n) D7(n) ^ d7[n]
+
+/* X_<k>(macro, offset) expands macro() k times for offset .. offset+k-1. */
+#define X_2(macro, offset) macro(offset); macro(offset + 1);
+#define X_4(macro, offset) X_2(macro, offset); X_2(macro, offset + 2);
+#define X_8(macro, offset) X_4(macro, offset); X_4(macro, offset + 4);
+#define X_16(macro, offset) X_8(macro, offset); X_8(macro, offset + 8);
+#define X_32(macro, offset) X_16(macro, offset); X_16(macro, offset + 16);
+#define X_64(macro, offset) X_32(macro, offset); X_32(macro, offset + 32);
+
+/* Define a _xor_#chunks_#xors_per_run() function. */
+#define _XOR(chunks, xors_per_run) \
+static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
+{ \
+ unsigned end = XOR_SIZE / sizeof(data[0]), i; \
+ DECLARE_ ## chunks; \
+\
+ for (i = 0; i < end; i += xors_per_run) { \
+ X_ ## xors_per_run(D ## chunks, i); \
+ } \
+}
+
+/*
+ * Define the xor functions for 2 - 8 chunks with the given number of
+ * xors per loop run.
+ *
+ * _XOR() expands to a complete function definition, hence the expansions
+ * are not separated by semicolons: a semicolon following a function
+ * definition is a stray empty declaration at file scope.
+ */
+#define MAKE_XOR_PER_RUN(xors_per_run) \
+	_XOR(2, xors_per_run) _XOR(3, xors_per_run) \
+	_XOR(4, xors_per_run) _XOR(5, xors_per_run) \
+	_XOR(6, xors_per_run) _XOR(7, xors_per_run) \
+	_XOR(8, xors_per_run)
+
+MAKE_XOR_PER_RUN(8) /* Define _xor2_8() - _xor8_8() functions. */
+MAKE_XOR_PER_RUN(16) /* Define _xor2_16() - _xor8_16() functions. */
+MAKE_XOR_PER_RUN(32) /* Define _xor2_32() - _xor8_32() functions. */
+MAKE_XOR_PER_RUN(64) /* Define _xor2_64() - _xor8_64() functions. */
+
+/*
+ * Define a dispatch table xor_funcs<xors_per_run>[] and a function
+ * xor_<xors_per_run>(n, data), which calls the generated xor function
+ * for @n chunks through that table.
+ *
+ * The table is indexed directly by the number of chunks; entries 0 and 1
+ * are NULL, so callers must pass 2 <= n <= 8.
+ *
+ * The storage class specifier leads the declaration; putting 'static'
+ * after the struct body is an obsolescent form compilers warn about.
+ */
+#define MAKE_XOR(xors_per_run) \
+static struct { \
+	void (*f)(unsigned long **); \
+} xor_funcs ## xors_per_run[] = { \
+	{ NULL }, /* NULL pointers to optimize indexing in xor(). */ \
+	{ NULL }, \
+	{ _xor2_ ## xors_per_run }, \
+	{ _xor3_ ## xors_per_run }, \
+	{ _xor4_ ## xors_per_run }, \
+	{ _xor5_ ## xors_per_run }, \
+	{ _xor6_ ## xors_per_run }, \
+	{ _xor7_ ## xors_per_run }, \
+	{ _xor8_ ## xors_per_run }, \
+}; \
+\
+static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
+{ \
+	/* Call respective function for amount of chunks. */ \
+	xor_funcs ## xors_per_run[n].f(data); \
+}
+
+/* Define xor_8() - xor_64() functions. */
+MAKE_XOR(8)
+MAKE_XOR(16)
+MAKE_XOR(32)
+MAKE_XOR(64)
+/*
+ * END xor optimization macros.
+ */
+
+/*
+ * Maximum number of chunks, which can be xor'ed in one go.
+ *
+ * Derived from the size of the xor_funcs8 dispatch table, i.e. it is
+ * the highest valid index into that table.
+ */
+#define XOR_CHUNKS_MAX (ARRAY_SIZE(xor_funcs8) - 1)
+
+/*
+ * Adapter giving xor_blocks() the same (n, data) calling convention as
+ * the generated xor_<n>() functions: data[0] is the destination and
+ * data[1] .. data[n - 1] are the sources.
+ */
+static void xor_blocks_wrapper(unsigned n, unsigned long **data)
+{
+	void *dest;
+	void **srcs;
+
+	/* Need a destination plus 1 .. MAX_XOR_BLOCKS sources. */
+	BUG_ON(n < 2);
+	BUG_ON(n > MAX_XOR_BLOCKS + 1);
+
+	dest = (void *) data[0];
+	srcs = (void **) data + 1;
+	xor_blocks(n - 1, XOR_SIZE, dest, srcs);
+}
+
+/*
+ * Table of the available xor implementations together with their names.
+ *
+ * 'static' leads the declaration; placing it after the struct body is
+ * an obsolescent form compilers warn about.
+ */
+static struct xor_func {
+	xor_function_t f;
+	const char *name;
+} xor_funcs[] = {
+	{ xor_64, "xor_64" },
+	{ xor_32, "xor_32" },
+	{ xor_16, "xor_16" },
+	{ xor_8, "xor_8" },
+	{ xor_blocks_wrapper, "xor_blocks" },
+};
+
+/*
+ * Tell whether an uptodate chunk takes part in an xor run.
+ *
+ * Returns 1 for an uptodate chunk, which
+ *
+ * o has writes queued
+ * o has writes merged
+ * o belongs to a stripe to be reconstructed
+ * o belongs to a recovery stripe
+ *
+ * and 0 otherwise. A chunk must never have queued and merged
+ * writes at the same time.
+ */
+static inline int chunk_must_xor(struct stripe_chunk *chunk)
+{
+	int queued, merged;
+
+	if (!ChunkUptodate(chunk))
+		return 0;
+
+	queued = !bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED));
+	merged = !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED));
+	BUG_ON(queued && merged);
+
+	if (queued || merged)
+		return 1;
+
+	return StripeReconstruct(chunk->stripe) ||
+	       StripeRecover(chunk->stripe);
+}
+
+/*
+ * Xor the pages at one page offset of a stripe's chunks into a
+ * target chunk.
+ *
+ * This indexes into the chunks of a stripe and their pages.
+ *
+ * @stripe: stripe to operate on
+ * @pi: index of the chunk being xored into
+ * @sector: sector offset into the chunks; divided by SECTORS_PER_PAGE
+ *          to get the offset into the page lists
+ *
+ * All chunks selected by chunk_must_xor() will be xored into the
+ * indexed (@pi) chunk in maximum groups of xor.chunks.
+ *
+ */
+static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned max_chunks = rs->xor.chunks, n = 1,
+ o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
+ p = rs->set.raid_devs;
+ unsigned long **d = rs->data;
+ xor_function_t xor_f = rs->xor.f->f;
+
+ BUG_ON(sector > stripe->io.size);
+
+ /* Address of parity page to xor into. */
+ d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);
+
+ /*
+ * NOTE(review): d points at rs->data, which is filled in here
+ * outside of xor_lock; only the xor_f() calls run under the lock.
+ * Presumably only one context ever runs this - confirm.
+ */
+ while (p--) {
+ /* Preset pointers to data pages. */
+ if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
+ d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);
+
+ /* If max chunks -> xor. */
+ if (n == max_chunks) {
+ mutex_lock(&rs->io.xor_lock);
+ xor_f(n, d);
+ mutex_unlock(&rs->io.xor_lock);
+ /* Keep d[0] (the target page) for the next group. */
+ n = 1;
+ }
+ }
+
+ /* If chunks -> xor. */
+ if (n > 1) {
+ mutex_lock(&rs->io.xor_lock);
+ xor_f(n, d);
+ mutex_unlock(&rs->io.xor_lock);
+ }
+}
+
+/*
+ * Common xor loop through all stripe page lists.
+ *
+ * Xors @count sectors, starting at sector offset @off into the chunks,
+ * into chunk @pi; one xor() call per page (SECTORS_PER_PAGE sectors).
+ * Afterwards chunk @pi is flagged clean.
+ *
+ * The loop runs up to @off + @count. Comparing against @count alone
+ * turned every call with @off >= @count into a no-op, which is what all
+ * but the first call from parity_xor() are for a stripe larger than its
+ * xor size.
+ */
+static void common_xor(struct stripe *stripe, sector_t count,
+		       unsigned off, unsigned pi)
+{
+	unsigned sector;
+	sector_t end = (sector_t) off + count;
+
+	BUG_ON(!count);
+	for (sector = off; sector < end; sector += SECTORS_PER_PAGE)
+		xor(stripe, pi, sector);
+
+	/* Set parity page uptodate and clean. */
+	chunk_set(CHUNK(stripe, pi), CLEAN);
+	atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
+}
+
+/*
+ * Calculate parity sectors on intact stripes.
+ *
+ * The stripe is processed in steps of the smaller of the set's chunk
+ * size and the stripe's io size. A stripe whose io size differs from
+ * the set's io size (the recover stripe) carries no precalculated
+ * parity index, so the RAID address is calculated per step to find it.
+ */
+static void parity_xor(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned io_sectors = stripe->io.size, step = rs->set.chunk_size;
+	int lookup_parity = stripe->io.size != rs->set.io_size;
+	sector_t off;
+
+	/* Never xor more than the stripe holds. */
+	if (step > io_sectors)
+		step = io_sectors;
+
+	for (off = 0; off < io_sectors; off += step) {
+		if (unlikely(lookup_parity)) {
+			struct raid_address addr;
+
+			/* Find and zero this step's parity part. */
+			raid_address(rs, (stripe->key + off) *
+					 rs->set.data_devs, &addr);
+			stripe->idx.parity = addr.pi;
+			stripe_zero_pl_part(stripe, addr.pi, off, step);
+		}
+
+		common_xor(stripe, step, off, stripe->idx.parity);
+		chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
+	}
+}
+
+/*
+ * Reconstruct the missing chunk (stripe->idx.recover) of a stripe by
+ * zeroing it and xoring all other chunks into it.
+ */
+static void stripe_reconstruct(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	int p, pr = stripe->idx.recover;
+
+	BUG_ON(pr < 0);
+
+	/* Every chunk but the one to rebuild has to be uptodate. */
+	for (p = rs->set.raid_devs; p--; ) {
+		if (p != pr)
+			BUG_ON(!ChunkUptodate(CHUNK(stripe, p)));
+	}
+
+	/* REMOVEME: statistics. */
+	if (RSDegraded(rs))
+		atomic_inc(rs->stats + S_RECONSTRUCT_EI);
+	else
+		atomic_inc(rs->stats + S_RECONSTRUCT_DEV);
+
+	/* Start from zeroes, then xor the remaining chunks in. */
+	stripe_zero_chunk(stripe, pr);
+	common_xor(stripe, stripe->io.size, 0, pr);
+}
+
+/*
+ * Recovery io throttling
+ */
+/*
+ * Conditionally reset io counters.
+ *
+ * Resets the work and recovery io counters once more than HZ jiffies
+ * (one second) have passed since the last reset.
+ *
+ * Returns 1 if the counters were reset, 0 otherwise.
+ *
+ * time_after() compares wrap-safe. The previous open-coded test
+ * (j > last + HZ || j < last) misfired on every call while
+ * 'last + HZ' was wrapped around and jiffies was not yet.
+ */
+static int recover_io_reset(struct raid_set *rs)
+{
+	unsigned long j = jiffies;
+
+	if (!time_after(j, rs->recover.last_jiffies + HZ))
+		return 0;
+
+	atomic_set(rs->recover.io_count + IO_WORK, 0);
+	atomic_set(rs->recover.io_count + IO_RECOVER, 0);
+	rs->recover.last_jiffies = j;
+	return 1;
+}
+
+/* Account one io as recovery or as regular work io. */
+static void recover_io_count(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+
+	if (StripeRecover(stripe))
+		atomic_inc(rs->recover.io_count + IO_RECOVER);
+	else
+		atomic_inc(rs->recover.io_count + IO_WORK);
+}
+
+/*
+ * Get a stripe for @addr: look it up in the stripe hash first and, if it
+ * isn't there, recycle one from the LRU list.
+ *
+ * Returns the stripe, or NULL if there was neither a hashed nor an LRU
+ * stripe or if stripe_get() failed.
+ */
+static struct stripe *stripe_find(struct raid_set *rs,
+				  struct raid_address *addr)
+{
+	struct stripe_cache *sc = &rs->sc;
+	struct stripe *stripe = stripe_lookup(sc, addr->key);
+
+	if (stripe) {
+		/* Found in hash. */
+		if (stripe_get(stripe))
+			goto get_lock_failed;
+
+		atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
+		return stripe;
+	}
+
+	/* Not in hash -> try to get an LRU stripe. */
+	stripe = stripe_lru_pop(sc);
+	if (!stripe)
+		return NULL;
+
+	/*
+	 * An LRU stripe may not be referenced
+	 * and may never have ios pending!
+	 */
+	BUG_ON(stripe_ref(stripe));
+	BUG_ON(stripe_io_ref(stripe));
+
+	/* Unhash and invalidate before reuse with a changed key. */
+	stripe_hash_del(stripe);
+	stripe_invalidate(stripe);
+
+	stripe->key = addr->key;
+	stripe->region = dm_rh_sector_to_region(rs->recover.rh, addr->key);
+	stripe->idx.parity = addr->pi;
+	if (stripe_get(stripe))
+		goto get_lock_failed;
+
+	/* Insert stripe into the stripe hash. */
+	stripe_insert(&sc->hash, stripe);
+	atomic_inc(rs->stats + S_INSCACHE); /* REMOVEME: statistics. */
+	return stripe;
+
+get_lock_failed:
+	stripe_put(stripe);
+	return NULL;
+}
+
+/*
+ * Process end io
+ *
+ * I need to do it here because I can't in interrupt
+ */
+/*
+ * End io all bios on a bio list.
+ *
+ * @stripe: stripe the bios are attached to
+ * @bl: bio list to drain
+ * @p: index of the chunk the list belongs to
+ * @error: completion status handed to bio_endio()
+ *
+ * For each bio: writes drop the pending count on the stripe's region,
+ * successful reads get their data copied from the chunk's page list;
+ * then the bio is ended and the chunk, stripe and io references taken
+ * for it are dropped.
+ */
+static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
+			   int p, int error)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct bio *bio;
+	struct page_list *pl = PL(stripe, p);
+	struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+	while ((bio = bio_list_pop(bl))) {
+		/*
+		 * Sample the direction up front: the bio must not
+		 * be touched any more once bio_endio() got called.
+		 */
+		int write = bio_data_dir(bio) == WRITE;
+
+		if (write)
+			/* Drop io pending count for any writes. */
+			dm_rh_dec(rs->recover.rh, stripe->region);
+		else if (!error)
+			/* Copy data across. */
+			bio_copy_page_list(READ, stripe, pl, bio);
+
+		bio_endio(bio, error);
+
+		/* REMOVEME: statistics. */
+		atomic_inc(rs->stats + (write ? S_BIOS_ENDIO_WRITE :
+						S_BIOS_ENDIO_READ));
+
+		chunk_put(chunk);
+		stripe_put(stripe);
+		io_put(rs); /* Wake any suspend waiters on last bio. */
+	}
+}
+
+/*
+ * End io all reads or writes (selected by @rw) on a stripe, copying
+ * read data across from stripe to bios and decrementing region
+ * counters for writes.
+ *
+ * Processing of ios depending on state:
+ * o chunk without error and uptodate -> endio ok
+ * o chunk with error or not uptodate:
+ *   - set dead -> endio failed (-EIO)
+ *   - else write -> endio ok
+ *   - else read -> bios are left on the list
+ */
+static void stripe_endio(int rw, struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned p;
+
+	for (p = rs->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+		struct bio_list *bl;
+
+		BUG_ON(ChunkLocked(chunk));
+
+		bl = BL_CHUNK(chunk, rw);
+		if (bio_list_empty(bl))
+			continue;
+
+		if (likely(!ChunkError(chunk) && ChunkUptodate(chunk))) {
+			/* Good chunk -> end io ok. */
+			BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
+			bio_list_endio(stripe, bl, p, 0);
+			continue;
+		}
+
+		if (unlikely(RSDead(rs)))
+			/* RAID set dead. */
+			bio_list_endio(stripe, bl, p, -EIO);
+		else if (rw != READ)
+			/* RAID set degraded. */
+			bio_list_endio(stripe, bl, p, 0);
+	}
+}
+
+/*
+ * Fail all ios hanging off all bio lists of a stripe with -EIO.
+ *
+ * Once all bios are ended, the stripe may neither
+ * have io references nor references left.
+ */
+static void stripe_fail_io(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned p;
+
+	for (p = rs->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+		int i;
+
+		/* Walk every bio list of the chunk, last one first. */
+		for (i = ARRAY_SIZE(chunk->bl); i--; ) {
+			struct bio_list *bl = &chunk->bl[i];
+
+			if (bio_list_empty(bl))
+				continue;
+
+			bio_list_endio(stripe, bl, p, -EIO);
+		}
+	}
+
+	/* All references must be gone by now. */
+	BUG_ON(stripe_io_ref(stripe));
+	BUG_ON(stripe_ref(stripe));
+}
+
+/* Clear the locked flag on all chunks flagged to be unlocked. */
+static void stripe_chunks_unlock(struct stripe *stripe)
+{
+	unsigned p;
+
+	for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+		if (!TestClearChunkUnlock(chunk))
+			continue;
+
+		ClearChunkLocked(chunk);
+	}
+}
+
+/*
+ * Queue reads and writes to a stripe by hanging
+ * their bios off the stripesets read/write lists.
+ *
+ * A bio for which no stripe could be got is added to @reject instead.
+ *
+ * Returns 1 if a write got queued, 0 otherwise.
+ */
+static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
+			    struct bio_list *reject)
+{
+	struct raid_address addr;
+	struct stripe *stripe;
+	int rw, write_queued = 0;
+
+	stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
+	if (!stripe) {
+		/* No stripe from cache or failed to lock it -> reject bio. */
+		bio_list_add(reject, bio);
+		atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
+		return 0;
+	}
+
+	/* Reads and writes go on separate lists. */
+	rw = bio_data_dir(bio);
+	bio_list_add(BL(stripe, addr.di, rw), bio);
+
+	if (rw == READ) {
+		/* REMOVEME: statistics. */
+		atomic_inc(rs->stats + S_BIOS_ADDED_READ);
+	} else {
+		/* Increment pending write count on region. */
+		dm_rh_inc(rs->recover.rh, stripe->region);
+		write_queued = 1;
+
+		/* REMOVEME: statistics. */
+		atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
+	}
+
+	/* First bio queued to the chunk -> put stripe on io (flush) list. */
+	if (chunk_get(CHUNK(stripe, addr.di)) == 1)
+		stripe_flush_add(stripe);
+
+	return write_queued;
+}
+
+/*
+ * Handle all stripes by handing them to the daemon, because we can't
+ * map their chunk pages to copy the data in interrupt context.
+ *
+ * We don't want to handle them here either, while interrupts are disabled.
+ */
+
+/*
+ * Read/write endio function for dm-io (interrupt context).
+ *
+ * @error: non-zero if the io failed
+ * @context: the stripe chunk the io was submitted for
+ */
+static void endio(unsigned long error, void *context)
+{
+	struct stripe_chunk *chunk = context;
+	struct stripe *stripe = chunk->stripe;
+
+	if (likely(!error))
+		chunk_set(chunk, CLEAN);
+	else {
+		chunk_set(chunk, ERROR);
+		/* REMOVEME: statistics. */
+		atomic_inc(RS(stripe->sc)->stats + S_STRIPE_ERROR);
+	}
+
+	/*
+	 * Recovery stripes aren't processed in do_endios(), so
+	 * their chunks get unlocked right here; any other chunk
+	 * is only flagged to be unlocked.
+	 */
+	if (unlikely(StripeRecover(stripe)))
+		ClearChunkLocked(chunk);
+	else
+		SetChunkUnlock(chunk);
+
+	/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
+	stripe_put_references(stripe);
+}
+
+/*
+ * Read/Write chunk @p of a stripe asynchronously via dm-io.
+ *
+ * A dirty chunk is written, any other chunk is read; endio() is
+ * called with the chunk as its context on completion.
+ */
+static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
+{
+	int r;
+	struct stripe_cache *sc = stripe->sc;
+	struct raid_set *rs = RS(sc);
+	struct stripe_chunk *chunk = CHUNK(stripe, p);
+	struct raid_dev *dev = &rs->dev[p];
+	struct dm_io_region region = {
+		.bdev = dev->dev->bdev,
+		.sector = stripe->key,
+		.count = stripe->io.size,
+	};
+	struct dm_io_request req = {
+		.bi_rw = ChunkDirty(chunk) ? WRITE : READ,
+		.mem = {
+			.type = DM_IO_PAGE_LIST,
+			.ptr.pl = stripe->obj[p].pl,
+			.offset = 0,
+		},
+		.notify = {
+			.fn = endio,
+			.context = chunk,
+		},
+		.client = StripeRecover(stripe) ? rs->recover.dm_io_client :
+						  sc->dm_io_client,
+	};
+
+	/* Chunk has to be idle and either invalid or uptodate + dirty. */
+	BUG_ON(ChunkLocked(chunk));
+	BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
+	BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
+
+	/*
+	 * Clip io at the end of the device, because typically
+	 * sectors_per_dev isn't divisible by io_size.
+	 */
+	if (unlikely(region.sector + region.count > rs->set.sectors_per_dev))
+		region.count = rs->set.sectors_per_dev - region.sector;
+
+	BUG_ON(!region.count);
+	region.sector += dev->start; /* Add <offset>. */
+	if (RSRecover(rs))
+		recover_io_count(stripe); /* Recovery io accounting. */
+
+	/* REMOVEME: statistics. */
+	if (ChunkDirty(chunk))
+		atomic_inc(rs->stats + S_DM_IO_WRITE);
+	else
+		atomic_inc(rs->stats + S_DM_IO_READ);
+
+	SetChunkLocked(chunk);
+	SetDevIoQueued(dev);
+
+	/* Submitting the io must not fail. */
+	r = dm_io(&req, 1, &region, NULL);
+	BUG_ON(r);
+}
+
+/*
+ * Write dirty or read not uptodate page lists of a stripe.
+ *
+ * Returns the result of taking the io references, i.e.
+ * non-zero if io was submitted and 0 if none was needed.
+ */
+static int stripe_chunks_rw(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct stripe_cache *sc;
+	int max; /* REMOVEME: */
+	int r;
+
+	/*
+	 * Increment the pending count on the stripe
+	 * first, so that we don't race in endio().
+	 *
+	 * An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
+	 *
+	 * o not uptodate
+	 * o dirtied by writes merged
+	 * o dirtied by parity calculations
+	 */
+	r = for_each_io_dev(stripe, stripe_get_references);
+	if (!r)
+		return r;
+
+	/* Io needed: chunks are either not uptodate or dirty. */
+	sc = &rs->sc;
+
+	/* Submit actual io. */
+	for_each_io_dev(stripe, stripe_chunk_rw);
+
+	/* REMOVEME: statistics */
+	max = sc_active(sc);
+	if (atomic_read(&sc->active_stripes_max) < max)
+		atomic_set(&sc->active_stripes_max, max);
+
+	atomic_inc(rs->stats + S_FLUSHS);
+	/* END REMOVEME: statistics */
+
+	return r;
+}
+
+/*
+ * Merge in all queued writes: copy their data into the chunks' page
+ * lists, move the bios to the merged list and flag the chunks dirty.
+ */
+static void stripe_merge_writes(struct stripe *stripe)
+{
+	unsigned p;
+
+	for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+		struct bio_list *queued = BL_CHUNK(chunk, WRITE_QUEUED);
+		struct page_list *pl;
+		struct bio *bio;
+
+		if (bio_list_empty(queued))
+			continue;
+
+		pl = stripe->obj[p].pl;
+
+		/*
+		 * We can play with the lists without holding a lock,
+		 * because it is just us accessing them anyway.
+		 */
+		bio_list_for_each(bio, queued)
+			bio_copy_page_list(WRITE, stripe, pl, bio);
+
+		bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), queued);
+		bio_list_init(queued);
+		chunk_set(chunk, DIRTY);
+	}
+}
+
+/*
+ * Queue all writes to get merged: move the bios of each chunk's write
+ * list onto its queued list and allow io on those chunks.
+ *
+ * Returns 1 if any writes got queued, else 0.
+ */
+static int stripe_queue_writes(struct stripe *stripe)
+{
+	int queued = 0;
+	unsigned p;
+
+	for (p = RS(stripe->sc)->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk = CHUNK(stripe, p);
+		struct bio_list *write = BL_CHUNK(chunk, WRITE);
+
+		if (bio_list_empty(write))
+			continue;
+
+		bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
+		bio_list_init(write);
+		SetChunkIo(chunk);
+		queued = 1;
+	}
+
+	return queued;
+}
+
+
+/*
+ * Check, if a chunk gets completely overwritten, i.e. if the
+ * sectors of all its queued writes add up to the set's io size.
+ */
+static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct bio_list *queued = BL(stripe, p, WRITE_QUEUED);
+	struct bio *bio;
+	unsigned total = 0;
+
+	bio_list_for_each(bio, queued)
+		total += bio_sectors(bio);
+
+	BUG_ON(total > rs->set.io_size);
+	return total == rs->set.io_size;
+}
+
+/*
+ * Avoid io on broken/reconstructed drive in order to
+ * reconstruct data on endio.
+ *
+ * Invalidates chunk @pr, records it as the chunk to recover and
+ * flags the stripe for reconstruction.
+ *
+ * (*1*) We set StripeReconstruct() in here, so that _do_endios()
+ * will trigger a reconstruct call before resetting it.
+ *
+ * Always returns -EPERM.
+ */
+static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
+{
+	struct raid_set *rs = RS(stripe->sc);
+
+	/*
+	 * Io stays allowed on all chunks but the indexed one,
+	 * because we're either degraded or prohibit it
+	 * on the one for later reconstruction.
+	 */
+	/* Includes ClearChunkIo(), ClearChunkUptodate(). */
+	stripe_chunk_invalidate(CHUNK(stripe, pr));
+	stripe->idx.recover = pr;
+	SetStripeReconstruct(stripe);
+
+	/* REMOVEME: statistics. */
+	atomic_inc(rs->stats + S_PROHIBITCHUNKIO);
+	return -EPERM;
+}
+
+/*
+ * Chunk locked/uptodate and device failed tests.
+ *
+ * Returns chunk @p if it is neither locked, in error, on a failed
+ * device nor uptodate; NULL otherwise. *@chunks_uptodate is
+ * incremented for an accessible chunk, which is uptodate.
+ */
+static struct stripe_chunk *
+stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct stripe_chunk *chunk = CHUNK(stripe, p);
+
+	if (ChunkLocked(chunk)) {
+		/* Active chunk -> hands off. REMOVEME: statistics. */
+		atomic_inc(rs->stats + S_CHUNK_LOCKED);
+		return NULL;
+	}
+
+	/* Broken chunk/device -> hands off. */
+	if (ChunkError(chunk) || DevFailed(&rs->dev[p]))
+		return NULL;
+
+	/* Hand out chunks, which aren't uptodate. */
+	if (!ChunkUptodate(chunk))
+		return chunk;
+
+	/* Uptodate chunks just get counted. */
+	++*chunks_uptodate;
+	return NULL;
+}
+
+/*
+ * Degraded/reconstruction mode.
+ *
+ * Check stripe state to figure which chunks don't need IO.
+ *
+ * Returns:
+ * o 0 for a dead set, an already reconstructed stripe or a
+ *   fully operational one
+ * o -EBUSY if the stripe is already flagged for reconstruction
+ * o -EPERM for degraded/resynchronizing (the result of
+ *   stripe_chunk_set_io_flags())
+ */
+static int stripe_check_reconstruct(struct stripe *stripe)
+{
+ struct raid_set *rs = RS(stripe->sc);
+
+ /* Dead set: drop any reconstruction state and allow io on all chunks. */
+ if (RSDead(rs)) {
+ ClearStripeReconstruct(stripe);
+ ClearStripeReconstructed(stripe);
+ stripe_allow_io(stripe);
+ return 0;
+ }
+
+ /* Avoid further reconstruction setting, when already set. */
+ if (StripeReconstruct(stripe)) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_RECONSTRUCT_SET);
+ return -EBUSY;
+ }
+
+ /* Initially allow io on all chunks. */
+ stripe_allow_io(stripe);
+
+ /* Return if stripe is already reconstructed. */
+ if (StripeReconstructed(stripe)) {
+ atomic_inc(rs->stats + S_RECONSTRUCTED);
+ return 0;
+ }
+
+ /*
+ * Degraded/reconstruction mode (device failed) ->
+ * avoid io on the failed device.
+ */
+ if (unlikely(RSDegraded(rs))) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_DEGRADED);
+ /* Allow IO on all devices but the dead one. */
+ BUG_ON(rs->set.ei < 0);
+ return stripe_chunk_set_io_flags(stripe, rs->set.ei);
+ } else {
+ int sync, pi = dev_for_parity(stripe, &sync);
+
+ /*
+ * Reconstruction mode (ie. a particular (replaced) device or
+ * some (rotating) parity chunk is being resynchronized) ->
+ * o make sure all needed chunks are read in
+ * o cope with 3/4 disk array special case where it
+ * doesn't make a difference to read in parity
+ * to xor data in/out
+ */
+ if (RSEnforceParityCreation(rs) || !sync) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_NOSYNC);
+ /* Allow IO on all devs but the one to reconstruct. */
+ return stripe_chunk_set_io_flags(stripe, pi);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Check, if stripe is ready to merge writes.
+ * I.e. if all chunks present to allow to merge bios.
+ *
+ * We prohibit io on:
+ *
+ * o chunks on failed devices
+ * o chunks without bios
+ *
+ * Chunks, which get completely written over, are counted and
+ * keep their io allowed.
+ *
+ * @stripe: stripe to check
+ * @nosync: if non-zero, chunks which aren't uptodate are skipped
+ *          without changing their io flag
+ *
+ * Returns 0 if the writes can be merged, -EPERM if not enough
+ * chunks are available yet.
+ *
+ * Side effects on success: either the parity chunk is zeroed and
+ * StripeReconstruct is set to enforce a full xor in the caller,
+ * or parity_xor() is called.
+ */
+static int stripe_merge_possible(struct stripe *stripe, int nosync)
+{
+ struct raid_set *rs = RS(stripe->sc);
+ unsigned chunks_overwrite = 0, chunks_prohibited = 0,
+ chunks_uptodate = 0, p = rs->set.raid_devs;
+
+ /* Walk all chunks. */
+ while (p--) {
+ struct stripe_chunk *chunk;
+
+ /* Prohibit io on broken devices. */
+ if (DevFailed(rs->dev + p)) {
+ chunk = CHUNK(stripe, p);
+ goto prohibit_io;
+ }
+
+ /* We can't optimize any further if no chunk. */
+ chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
+ if (!chunk || nosync)
+ continue;
+
+ /*
+ * We have a chunk, which is not uptodate.
+ *
+ * If this is not parity and we don't have
+ * reads queued, we can optimize further.
+ */
+ if (p != stripe->idx.parity &&
+ bio_list_empty(BL_CHUNK(chunk, READ)) &&
+ bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
+ if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
+ goto prohibit_io;
+ else if (RSCheckOverwrite(rs) &&
+ stripe_check_chunk_overwrite(stripe, p))
+ /* Completely overwritten chunk. */
+ chunks_overwrite++;
+ }
+
+ /* Allow io for chunks with bios and overwritten ones. */
+ SetChunkIo(chunk);
+ continue;
+
+ /* Only reached via goto; the 'continue' above skips over it. */
+prohibit_io:
+ /* No io for broken devices or for chunks w/o bios. */
+ ClearChunkIo(chunk);
+ chunks_prohibited++;
+ /* REMOVEME: statistics. */
+ atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
+ }
+
+ /* All data chunks will get written over. */
+ if (chunks_overwrite == rs->set.data_devs)
+ atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
+ else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
+ /* We don't have enough chunks to merge. */
+ atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
+ return -EPERM;
+ }
+
+ /*
+ * If we have all chunks up to date or overwrite them, we
+ * just zero the parity chunk and let stripe_rw() recreate it.
+ */
+ if (chunks_uptodate == rs->set.raid_devs ||
+ chunks_overwrite == rs->set.data_devs) {
+ stripe_zero_chunk(stripe, stripe->idx.parity);
+ BUG_ON(StripeReconstruct(stripe));
+ SetStripeReconstruct(stripe); /* Enforce xor in caller. */
+ } else {
+ /*
+ * With less chunks, we xor parity out.
+ *
+ * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
+ * so that only chunks with queued or merged writes
+ * are being xored.
+ */
+ parity_xor(stripe);
+ }
+
+ /*
+ * We do have enough chunks to merge.
+ * All chunks are uptodate or get written over.
+ */
+ atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
+ return 0;
+}
+
+/*
+ * Avoid reading chunks in case we're fully operational.
+ *
+ * Of the chunks stripe_chunk_check() hands out, io is allowed on the
+ * parity chunk and on those with references and prohibited on the rest.
+ */
+static void stripe_avoid_reads(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	unsigned uptodate_unused = 0, p;
+
+	for (p = rs->set.raid_devs; p--; ) {
+		struct stripe_chunk *chunk =
+			stripe_chunk_check(stripe, p, &uptodate_unused);
+
+		if (!chunk)
+			continue;
+
+		if (chunk_ref(chunk) || p == stripe->idx.parity) {
+			/* Parity or any bios pending -> allow io. */
+			SetChunkIo(chunk);
+			continue;
+		}
+
+		ClearChunkIo(chunk);
+		/* REMOVEME: statistics. */
+		atomic_inc(rs->stats + S_PROHIBITCHUNKIO);
+	}
+}
+
+/*
+ * Read/write a stripe.
+ *
+ * All stripe read/write activity goes through this function
+ * unless recovery, which has to call stripe_chunk_rw() directly.
+ *
+ * Make sure we don't try already merged stripes in order
+ * to avoid data corruption.
+ *
+ * Check the state of the RAID set and if degraded (or
+ * resynchronizing for reads), read in all other chunks but
+ * the one on the dead/resynchronizing device in order to be
+ * able to reconstruct the missing one in _do_endios().
+ *
+ * Can be called on active stripes in order
+ * to dispatch new io on inactive chunks.
+ *
+ * States to cover:
+ * o stripe to read and/or write
+ * o stripe with error to reconstruct
+ *
+ * Returns 0 if the stripe is waiting for a reconstruction to finish
+ * or if no io got submitted (the stripe is pushed to the endio list
+ * in the latter case), else the non-zero result of stripe_chunks_rw().
+ */
+static int stripe_rw(struct stripe *stripe)
+{
+ int nosync, r;
+ struct raid_set *rs = RS(stripe->sc);
+
+ /*
+ * Check, if a chunk needs to be reconstructed
+ * because of a degraded set or a region out of sync.
+ */
+ nosync = stripe_check_reconstruct(stripe);
+ switch (nosync) {
+ case -EBUSY:
+ return 0; /* Wait for stripe reconstruction to finish. */
+ case -EPERM:
+ goto io;
+ }
+
+ /*
+ * NOTE(review): stripe_check_reconstruct() returns 0, -EBUSY or
+ * -EPERM only, so nosync is always 0 from here on; the nosync
+ * argument and the !nosync test below look redundant - confirm.
+ */
+
+ /*
+ * If we don't have merged writes pending, we can schedule
+ * queued writes to be merged next without corrupting data.
+ */
+ if (!StripeMerged(stripe)) {
+ r = stripe_queue_writes(stripe);
+ if (r)
+ /* Writes got queued -> flag RBW. */
+ SetStripeRBW(stripe);
+ }
+
+ /*
+ * Merge all writes hanging off uptodate/overwritten
+ * chunks of the stripe.
+ */
+ if (StripeRBW(stripe)) {
+ r = stripe_merge_possible(stripe, nosync);
+ if (!r) { /* Merge possible. */
+ struct stripe_chunk *chunk;
+
+ /*
+ * I rely on valid parity in order
+ * to xor a fraction of chunks out
+ * of parity and back in.
+ */
+ stripe_merge_writes(stripe); /* Merge writes in. */
+ parity_xor(stripe); /* Update parity. */
+ ClearStripeReconstruct(stripe); /* Reset xor enforce. */
+ SetStripeMerged(stripe); /* Writes merged. */
+ ClearStripeRBW(stripe); /* Disable RBW. */
+
+ /*
+ * REMOVEME: sanity check on parity chunk
+ * states after writes got merged.
+ */
+ chunk = CHUNK(stripe, stripe->idx.parity);
+ BUG_ON(ChunkLocked(chunk));
+ BUG_ON(!ChunkUptodate(chunk));
+ BUG_ON(!ChunkDirty(chunk));
+ BUG_ON(!ChunkIo(chunk));
+ }
+ } else if (!nosync && !StripeMerged(stripe))
+ /* Read avoidance if not degraded/resynchronizing/merged. */
+ stripe_avoid_reads(stripe);
+
+io:
+ /* Now submit any reads/writes for non-uptodate or dirty chunks. */
+ r = stripe_chunks_rw(stripe);
+ if (!r) {
+ /*
+ * No io submitted because of chunk io
+ * prohibited or locked chunks/failed devices
+ * -> push to end io list for processing.
+ */
+ stripe_endio_push(stripe);
+ atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
+ }
+
+ return r;
+}
+
+/*
+ * Recovery functions
+ */
+/*
+ * Read a stripe off a raid set for recovery.
+ *
+ * @pi: index of a chunk not to read, or negative to read all chunks
+ *
+ * Returns the result of stripe_chunks_rw().
+ */
+static int stripe_recover_read(struct stripe *stripe, int pi)
+{
+	BUG_ON(stripe_io_ref(stripe));
+
+	/* Invalidate all chunks and allow io, so that they get read in. */
+	stripe_chunks_invalidate(stripe);
+	stripe_allow_io(stripe);
+
+	/*
+	 * When reconstructing a particular device, reading its
+	 * chunk can be skipped, because it gets rebuilt anyway.
+	 *
+	 * That's not possible for resynchronization of rotating
+	 * parity, because the recovery stripe chunk size is
+	 * typically larger than the sets chunk size.
+	 */
+	if (pi >= 0)
+		ClearChunkIo(CHUNK(stripe, pi));
+
+	return stripe_chunks_rw(stripe);
+}
+
+/*
+ * Write a stripe to a raid set for recovery.
+ *
+ * @pi: index of the chunk to reconstruct, or negative
+ *      to create parity instead
+ *
+ * Returns the result of stripe_chunks_rw().
+ */
+static int stripe_recover_write(struct stripe *stripe, int pi)
+{
+	BUG_ON(stripe_io_ref(stripe));
+
+	if (pi < 0) {
+		/* No particular device -> create parity chunk. */
+		parity_xor(stripe);
+	} else {
+		/* Reconstruct the chunk of the particular device. */
+		stripe_zero_chunk(stripe, pi);
+		common_xor(stripe, stripe->io.size, 0, pi);
+		chunk_set(CHUNK(stripe, pi), DIRTY);
+	}
+
+	return stripe_chunks_rw(stripe);
+}
+
+/*
+ * Read/write a recovery stripe.
+ *
+ * Alternates between reading (RBW flag set) and writing (Merged
+ * flag set) the stripe. Returns 0 if neither flag was set, else
+ * the non-zero result of the read or write.
+ */
+static int stripe_recover_rw(struct stripe *stripe)
+{
+	int pi, r = 0, sync = 0;
+
+	if (TestClearStripeRBW(stripe)) {
+		/* Read pass; flag the write pass to come next. */
+		SetStripeMerged(stripe);
+		stripe->key = stripe->recover->pos;
+		pi = dev_for_parity(stripe, &sync);
+		r = stripe_recover_read(stripe, pi);
+		BUG_ON(!r);
+	} else if (TestClearStripeMerged(stripe)) {
+		/* Write pass. */
+		pi = dev_for_parity(stripe, &sync);
+		r = stripe_recover_write(stripe, pi);
+		BUG_ON(!r);
+	}
+
+	BUG_ON(sync);
+	return r;
+}
+
+/*
+ * Recover bandwidth available ?
+ *
+ * Returns 1 if recovery io is allowed, 0 if recovery already
+ * used up its share of the work io.
+ */
+static int recover_bandwidth(struct raid_set *rs)
+{
+	int work_ios, recover_ios;
+
+	/* On reset or when bios delayed -> allow recovery. */
+	if (recover_io_reset(rs) || RSBandwidth(rs))
+		goto allow;
+
+	work_ios = atomic_read(rs->recover.io_count + IO_WORK);
+	if (!work_ios)
+		goto allow;
+
+	/* Scale recovery ios by the larger recover stripe size. */
+	recover_ios = atomic_read(rs->recover.io_count + IO_RECOVER) *
+		      rs->recover.io_size / rs->set.io_size;
+
+	/*
+	 * Don't use more than given bandwidth
+	 * of the work io for recovery.
+	 */
+	if (recover_ios > work_ios / rs->recover.bandwidth_work) {
+		/* REMOVEME: statistics. */
+		atomic_inc(rs->stats + S_NO_BANDWIDTH);
+		return 0;
+	}
+
+allow:
+	atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
+	return 1;
+}
+
+/*
+ * Try to get a region to recover.
+ *
+ * Returns:
+ * o 1 if the stripe already has a region in the works
+ * o 0 if a new region got started; one global io reference is held
+ *   for it until the region is done with
+ * o -EPERM if the set is being suspended
+ * o -ENOENT if the dirty log reports all regions in sync
+ * o -EAGAIN if there's no bandwidth or no quiesced region yet
+ */
+static int stripe_recover_get_region(struct stripe *stripe)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct recover *rec = &rs->recover;
+	struct recover_addr *addr = stripe->recover;
+	struct dm_dirty_log *log = rec->dl;
+	struct dm_rh_client *rh = rec->rh;
+	sector_t start;
+
+	BUG_ON(!log);
+	BUG_ON(!rh);
+
+	/* Report a region in the works first to finish it during suspension. */
+	if (addr->reg)
+		return 1;
+
+	if (RSSuspend(rs))
+		return -EPERM;
+
+	if (log->type->get_sync_count(log) >= rec->nr_regions)
+		return -ENOENT;
+
+	/* If we don't have enough bandwidth, we don't proceed recovering. */
+	if (!recover_bandwidth(rs))
+		return -EAGAIN;
+
+	/* Start quiescing a region. */
+	dm_rh_recovery_prepare(rh);
+	addr->reg = dm_rh_recovery_start(rh);
+	if (!addr->reg)
+		return -EAGAIN;
+
+	/* Sector range of the region. */
+	start = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
+	addr->pos = start;
+	addr->end = addr->pos + dm_rh_get_region_size(rh);
+
+	/*
+	 * Take one global io reference out for the
+	 * whole region, which is going to be released
+	 * when the region is completely done with.
+	 */
+	io_get(rs);
+	return 0;
+}
+
+/*
+ * Update region hash state: end recovery of the stripe's region with
+ * the given result, forget the region and release the io reference
+ * held for it.
+ */
+enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
+static void recover_rh_update(struct stripe *stripe, enum recover_type success)
+{
+	struct raid_set *rs = RS(stripe->sc);
+	struct recover_addr *addr = stripe->recover;
+
+	if (!addr->reg) {
+		DMERR("%s- Called w/o region", __func__);
+		return;
+	}
+
+	dm_rh_recovery_end(addr->reg, success);
+	if (success != REC_FAILURE)
+		rs->recover.nr_regions_recovered++;
+
+	addr->reg = NULL;
+
+	/*
+	 * Completely done with this region ->
+	 * release the 1st io reference.
+	 */
+	io_put(rs);
+}
+
+/* Set start of recovery state: stamp the start time, clear the end time. */
+static void set_start_recovery(struct raid_set *rs)
+{
+	struct recover *rec = &rs->recover;
+
+	rec->start_jiffies = jiffies;
+	rec->end_jiffies = 0;
+}
+
+/* Set end of recovery state. */
+static void set_end_recovery(struct raid_set *rs)
+{
+	struct recover *rec = &rs->recover;
+
+	ClearRSRecover(rs);
+	/*
+	 * NOTE(review), translated from the author's German remark:
+	 * "Caution: don't reset any more -> 'i' stays in the status
+	 * output and userspace might rely on it disappearing!"
+	 * Unclear whether the reset below is still wanted - confirm.
+	 */
+	rs->set.dev_to_init = -1;
+
+	/* Stamp the end time; check for jiffies overrun. */
+	rec->end_jiffies = jiffies;
+	if (rec->end_jiffies < rec->start_jiffies)
+		rec->end_jiffies = ~0;
+}
+
+/*
+ * Handle recovery on one recovery stripe.
+ *
+ * Returns 1 while recovery io is active, got initiated, is postponed
+ * (no bandwidth/region yet, suspend) or another round got scheduled.
+ * Returns 0 if there are no more regions to recover or if recovery
+ * got stopped because of an error or a degraded set.
+ */
+static int _do_recovery(struct stripe *stripe)
+{
+ int r;
+ struct raid_set *rs = RS(stripe->sc);
+ struct recover_addr *addr = stripe->recover;
+
+ /* If recovery is active -> return. */
+ if (stripe_io_ref(stripe))
+ return 1;
+
+ /* IO error is fatal for recovery -> stop it. */
+ if (unlikely(StripeError(stripe)))
+ goto err;
+
+ /* Recovery end required. */
+ if (unlikely(RSDegraded(rs)))
+ goto err;
+
+ /* Get a region to recover. */
+ r = stripe_recover_get_region(stripe);
+ switch (r) {
+ case 0: /* Got a new region: flag initial read before write. */
+ SetStripeRBW(stripe);
+ /* Fall through. */
+ case 1: /* Have a region in the works. */
+ break;
+ case -EAGAIN:
+ /* No bandwidth/quiesced region yet, try later. */
+ if (!io_ref(rs))
+ wake_do_raid_delayed(rs, HZ / 4);
+ /* Fall through. */
+ case -EPERM:
+ /* Suspend. */
+ return 1;
+ case -ENOENT: /* No more regions to recover. */
+ schedule_work(&rs->io.ws_do_table_event);
+ return 0;
+ default:
+ BUG();
+ }
+
+ /* Read/write a recover stripe. */
+ r = stripe_recover_rw(stripe);
+ if (r)
+ /* IO initiated. */
+ return 1;
+
+ /* Read and write finished-> update recovery position within region. */
+ addr->pos += stripe->io.size;
+
+ /* If we're at end of region, update region hash. */
+ if (addr->pos >= addr->end ||
+ addr->pos >= rs->set.sectors_per_dev)
+ recover_rh_update(stripe, REC_SUCCESS);
+ else
+ /* Prepare to read next region segment. */
+ SetStripeRBW(stripe);
+
+ /* Schedule myself for another round... */
+ wake_do_raid(rs);
+ return 1;
+
+err:
+ /* FIXME: rather try recovering other regions on error? */
+ rs_check_degrade(stripe);
+ recover_rh_update(stripe, REC_FAILURE);
+
+ /* Check state of partially recovered array. */
+ if (RSDegraded(rs) && !RSDead(rs) &&
+ rs->set.dev_to_init != -1 &&
+ rs->set.ei != rs->set.dev_to_init) {
+ /* Broken drive != drive to recover -> FATAL. */
+ SetRSDead(rs);
+ DMERR("FATAL: failed device != device to initialize -> "
+ "RAID set broken");
+ }
+
+ if (StripeError(stripe) || RSDegraded(rs)) {
+ char buf[BDEVNAME_SIZE];
+
+ /*
+ * NOTE(review): rs->set.ei is used as an array index without
+ * a check; presumably rs_check_degrade() has set it to a
+ * valid device index by now - confirm.
+ */
+ DMERR("stopping recovery due to "
+ "ERROR on /dev/%s, stripe at offset %llu",
+ bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
+ (unsigned long long) stripe->key);
+
+ }
+
+ /*
+ * Make sure, that all quiesced regions get released.
+ *
+ * NOTE(review): recover_rh_update() above leaves addr->reg NULL on
+ * both of its paths, so this loop body doesn't look reachable. Also,
+ * -EIO is passed where recover_rh_update() passes a success flag to
+ * dm_rh_recovery_end() - confirm the intended argument.
+ */
+ while (addr->reg) {
+ dm_rh_recovery_end(addr->reg, -EIO);
+ addr->reg = dm_rh_recovery_start(rs->recover.rh);
+ }
+
+ return 0;
+}
+
+/*
+ * Called by main io daemon to recover regions.
+ *
+ * Runs every recovery stripe once and returns the number of stripes
+ * still busy; ends recovery and frees the recovery stripes once none
+ * of them has work left. Returns 0 if recovery is not flagged at all.
+ */
+static int do_recovery(struct raid_set *rs)
+{
+	int busy = 0;
+	struct stripe *stripe;
+
+	if (!RSRecover(rs))
+		return 0;
+
+	list_for_each_entry(stripe, &rs->recover.stripes, lists[LIST_RECOVER])
+		busy += _do_recovery(stripe);
+
+	/* At least one recovery stripe still has work pending. */
+	if (busy)
+		return busy;
+
+	/* All recovery stripes are finished -> wind recovery down. */
+	set_end_recovery(rs);
+	stripe_recover_free(rs);
+	return 0;
+}
+
+/*
+ * END recovery functions
+ */
+
+/*
+ * End io process one stripe handed in by the endio() callback.
+ *
+ * @rs:         RAID set the stripe belongs to
+ * @stripe:     stripe popped off the endio list
+ * @flush_list: stripes still referenced after processing are queued
+ *              here for another io round; all others go onto the LRU list
+ */
+static void _do_endios(struct raid_set *rs, struct stripe *stripe,
+ struct list_head *flush_list)
+{
+ /* First unlock all required chunks. */
+ stripe_chunks_unlock(stripe);
+
+ /*
+ * If an io error on a stripe occurred, degrade the RAID set
+ * and try to endio as many bios as possible. If any bios can't
+ * be endio processed, requeue the stripe (stripe_ref() != 0).
+ */
+ if (TestClearStripeError(stripe)) {
+ /*
+ * FIXME: if read, rewrite the failed chunk after reconstruction
+ * in order to trigger disk bad sector relocation.
+ */
+ rs_check_degrade(stripe); /* Resets ChunkError(). */
+ ClearStripeReconstruct(stripe);
+ ClearStripeReconstructed(stripe);
+
+ /*
+ * FIXME: if write, don't endio writes in flight and don't
+ * allow for new writes until userspace has updated
+ * its metadata.
+ */
+ }
+
+ /* Got to reconstruct a missing chunk. */
+ if (StripeReconstruct(stripe)) {
+ /*
+ * (*2*) We use StripeReconstruct() to allow for
+ * all chunks to be xored into the reconstructed
+ * one (see chunk_must_xor()).
+ */
+ stripe_reconstruct(stripe);
+
+ /*
+ * (*3*) Now we reset StripeReconstruct() and flag
+ * StripeReconstructed() to show to stripe_rw(),
+ * that we have reconstructed a missing chunk.
+ */
+ ClearStripeReconstruct(stripe);
+ SetStripeReconstructed(stripe);
+
+ /* FIXME: reschedule to be written in case of read. */
+ /* if (!RSDead && RSDegraded(rs) !StripeRBW(stripe)) {
+ chunk_set(CHUNK(stripe, stripe->idx.recover), DIRTY);
+ stripe_chunks_rw(stripe);
+ } */
+
+ /* No chunk left to recover on this stripe. */
+ stripe->idx.recover = -1;
+ }
+
+ /*
+ * Now that we eventually got a complete stripe, we
+ * can process the rest of the end ios on reads.
+ */
+ stripe_endio(READ, stripe);
+
+ /* End io all merged writes if not prohibited. */
+ if (!RSProhibitWrites(rs) && StripeMerged(stripe)) {
+ ClearStripeMerged(stripe);
+ stripe_endio(WRITE_MERGED, stripe);
+ }
+
+ /* If RAID set is dead -> fail any ios to dead drives. */
+ if (RSDead(rs)) {
+ /* Log the message only once per RAID set. */
+ if (!TestSetRSDeadEndioMessage(rs))
+ DMERR("RAID set dead: failing ios to dead devices");
+
+ stripe_fail_io(stripe);
+ }
+
+ /*
+ * We have stripe references still,
+ * because of read before writes or IO errors ->
+ * got to put on flush list for processing.
+ */
+ if (stripe_ref(stripe)) {
+ BUG_ON(!list_empty(stripe->lists + LIST_LRU));
+ list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
+ atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
+ } else
+ stripe_lru_add(stripe);
+}
+
+/*
+ * Drain the endio list: end io process every popped stripe which has no
+ * io in flight and requeue the ones still referenced afterwards.
+ */
+static void do_endios(struct raid_set *rs)
+{
+	struct stripe_cache *sc = &rs->sc;
+	struct stripe *stripe;
+	/* Collects requeued stripes in processing order. */
+	struct list_head requeue;
+
+	INIT_LIST_HEAD(&requeue);
+
+	for (;;) {
+		stripe = stripe_endio_pop(sc);
+		if (!stripe)
+			break;
+
+		/* Stripes with newly io'ed chunks are not end io'ed. */
+		if (stripe_io_ref(stripe))
+			continue;
+
+		_do_endios(rs, stripe, &requeue);
+	}
+
+	/*
+	 * Requeued stripes go to the beginning of
+	 * the io (flush) list, keeping their order.
+	 */
+	list_splice(&requeue, sc->lists + LIST_FLUSH);
+}
+
+/*
+ * Read/write every stripe queued on the io list.
+ *
+ * Returns the sum of the stripe_rw() results.
+ */
+static int do_flush(struct raid_set *rs)
+{
+	int sum = 0;
+	struct stripe_cache *sc = &rs->sc;
+	struct stripe *stripe;
+
+	for (stripe = stripe_io_pop(sc); stripe; stripe = stripe_io_pop(sc))
+		sum += stripe_rw(stripe);
+
+	return sum;
+}
+
+/*
+ * Grow or shrink the stripe cache to a requested number of stripes.
+ *
+ * A request is pending as long as sc.stripes_to_set is non-zero; it
+ * is only cleared once growing/shrinking reported success.
+ */
+static void do_sc_resize(struct raid_set *rs)
+{
+	int r;
+	struct stripe_cache *sc = &rs->sc;
+	unsigned have, want = atomic_read(&sc->stripes_to_set);
+
+	/* No resize requested. */
+	if (!want)
+		return;
+
+	have = atomic_read(&sc->stripes);
+	if (want > have)
+		r = sc_grow(sc, want - have, SC_GROW);
+	else
+		r = sc_shrink(sc, have - want);
+
+	/* Keep the request pending on failure to retry next round. */
+	if (r == 0)
+		atomic_set(&sc->stripes_to_set, 0);
+}
+
+/*
+ * Process all ios
+ *
+ * We do different things with the io depending
+ * on the state of the region that it is in:
+ *
+ * o reads: hang off stripe cache or postpone if full
+ *
+ * o writes:
+ *
+ * CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
+ * In case stripe cache is full or busy, postpone the io.
+ *
+ * RECOVERING: delay the io until recovery of the region completes.
+ *
+ * @rs:  RAID set to queue the bios to
+ * @ios: work list of bios; bios which could not be processed in this
+ *       run are put back at its head before returning
+ */
+static void do_ios(struct raid_set *rs, struct bio_list *ios)
+{
+ int r;
+ unsigned flush = 0, delay = 0;
+ sector_t sector;
+ struct dm_rh_client *rh = rs->recover.rh;
+ struct bio *bio;
+ struct bio_list reject;
+
+ /* Bios which have to be retried later are collected here. */
+ bio_list_init(&reject);
+
+ /*
+ * Classify each io:
+ * o delay writes to recovering regions (let reads go through)
+ * o queue io to all other regions
+ */
+ while ((bio = bio_list_pop(ios))) {
+ /*
+ * In case we get a barrier bio, push it back onto
+ * the input queue unless all work queues are empty
+ * and the stripe cache is inactive.
+ */
+ if (bio->bi_rw & REQ_FLUSH) {
+ /* REMOVEME: statistics. */
+ atomic_inc(rs->stats + S_BARRIER);
+ if (delay ||
+ !list_empty(rs->sc.lists + LIST_FLUSH) ||
+ !bio_list_empty(&reject) ||
+ sc_active(&rs->sc)) {
+ bio_list_push(ios, bio);
+ /* Stop here: nothing may pass the barrier. */
+ break;
+ }
+ }
+
+ /* If writes prohibited because of failures -> postpone. */
+ if (RSProhibitWrites(rs) && bio_data_dir(bio) == WRITE) {
+ bio_list_add(&reject, bio);
+ continue;
+ }
+
+ /* Check for recovering regions. */
+ sector = _sector(rs, bio);
+ r = region_state(rs, sector, DM_RH_RECOVERING);
+ if (unlikely(r)) {
+ delay++;
+ /* Wait writing to recovering regions. */
+ dm_rh_delay_by_region(rh, bio,
+ dm_rh_sector_to_region(rh,
+ sector));
+ /* REMOVEME: statistics.*/
+ atomic_inc(rs->stats + S_DELAYED_BIOS);
+ atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
+
+ /* Force bandwidth tests in recovery. */
+ SetRSBandwidth(rs);
+ } else {
+ /*
+ * Process ios to non-recovering regions by queueing
+ * them to stripes (does dm_rh_inc()) for writes).
+ */
+ flush += stripe_queue_bio(rs, bio, &reject);
+ }
+ }
+
+ if (flush) {
+ /* FIXME: better error handling. */
+ r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
+ if (r)
+ DMERR_LIMIT("dirty log flush");
+ }
+
+ /* Merge any rejected bios back to the head of the input list. */
+ bio_list_merge_head(ios, &reject);
+}
+
+/*
+ * Throw a table event when the stripe cache turns busy.
+ *
+ * The RSScBusy flag makes sure that the event is scheduled just once
+ * per busy period; it gets cleared as soon as the cache isn't busy.
+ */
+static void do_busy_event(struct raid_set *rs)
+{
+	if (!sc_busy(rs)) {
+		ClearRSScBusy(rs);
+		return;
+	}
+
+	if (!TestSetRSScBusy(rs))
+		schedule_work(&rs->io.ws_do_table_event);
+}
+
+/* Work function: throw a table event for the RAID set owning @ws. */
+static void do_table_event(struct work_struct *ws)
+{
+	struct raid_set *rs;
+
+	rs = container_of(ws, struct raid_set, io.ws_do_table_event);
+	dm_table_event(rs->ti->table);
+}
+
+
+/*-----------------------------------------------------------------
+ * RAID daemon
+ *---------------------------------------------------------------*/
+/*
+ * Work function of the RAID set's io daemon; on each run it does:
+ *
+ * o belabour all end ios
+ * o update the region hash states
+ * o optionally resize the stripe cache
+ * o optionally do recovery
+ * o grab the input queue
+ * o work on all requeued or new ios
+ * o flush any stripes on the io list
+ * o check, if the stripe cache gets too busy and throw an event if so
+ */
+static void do_raid(struct work_struct *ws)
+{
+	struct raid_set *rs = container_of(ws, struct raid_set,
+					   io.dws_do_raid.work);
+	struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
+
+	/*
+	 * We always need to end io, so that ios can get errored in
+	 * case the set failed and the region counters get decremented
+	 * before we update region hash states and go any further.
+	 */
+	do_endios(rs);
+	dm_rh_update_states(rs->recover.rh, 1);
+
+	/*
+	 * Now that we've end io'd, which may have put stripes on the LRU list
+	 * to allow for shrinking, we resize the stripe cache if requested.
+	 */
+	do_sc_resize(rs);
+
+	/* Try to recover regions; the busy count is not needed here. */
+	do_recovery(rs);
+
+	/* Quickly grab all new ios queued and add them to the work list. */
+	mutex_lock(&rs->io.in_lock);
+	bio_list_merge(ios, ios_in);
+	bio_list_init(ios_in);
+	mutex_unlock(&rs->io.in_lock);
+
+	if (!bio_list_empty(ios))
+		do_ios(rs, ios); /* Got ios to work into the cache. */
+
+	/* Flush any stripes on io list; the io count is not needed here. */
+	do_flush(rs);
+
+	do_busy_event(rs); /* Check if we got too busy. */
+}
+
+/*
+ * Region hash callback (gets called via dm_rh_update_states()):
+ * hand bios, which got delayed on regions under recovery,
+ * back to the head of the RAID set's work list.
+ */
+static void dispatch_delayed_bios(void *context, struct bio_list *bl)
+{
+	struct bio *delayed;
+	struct raid_set *rs = context;
+
+	/* REMOVEME: statistics; one pending delayed bio less per bio. */
+	bio_list_for_each(delayed, bl)
+		atomic_dec(rs->stats + S_DELAYED_BIOS);
+
+	/* Move the region hash private list over and leave it empty. */
+	bio_list_merge_head(&rs->io.work, bl);
+	bio_list_init(bl);
+
+	ClearRSBandwidth(rs);
+}
+
+/*************************************************************
+ * Constructor helpers
+ *************************************************************/
+/*
+ * Calculate MB/sec.
+ *
+ * Scales rs->xor.speed (the value xor_optimize() returned, i.e. the best
+ * count of xor runs xor_speed() achieved within one tick) by the number
+ * of data devices, @io_size and HZ / XOR_SPEED_TICKS; the result is
+ * converted with to_bytes() and shifted down twice by 10 bits.
+ */
+static unsigned mbpers(struct raid_set *rs, unsigned io_size)
+{
+ return to_bytes((rs->xor.speed * rs->set.data_devs *
+ io_size * HZ / XOR_SPEED_TICKS) >> 10) >> 10;
+}
+
+/*
+ * Discover fastest xor algorithm and # of chunks combination.
+ */
+/*
+ * Calculate speed of particular algorithm and # of chunks.
+ *
+ * Busy-loops for XOR_SPEED_TICKS jiffies, counting the common_xor()
+ * calls completed within each tick, and returns the highest per-tick
+ * count seen.
+ */
+static unsigned xor_speed(struct stripe *stripe)
+{
+ int ticks = XOR_SPEED_TICKS;
+ unsigned p = RS(stripe->sc)->set.raid_devs, r = 0;
+ unsigned long j;
+
+ /* Set uptodate so that common_xor()->xor() will belabour chunks. */
+ while (p--)
+ SetChunkUptodate(CHUNK(stripe, p));
+
+ /* Wait for next tick. */
+ for (j = jiffies; j == jiffies; );
+
+ /* Do xors for a few ticks. */
+ while (ticks--) {
+ unsigned xors = 0;
+
+ /* Count xor runs until jiffies moves on to the next tick. */
+ for (j = jiffies; j == jiffies; ) {
+ mb();
+ common_xor(stripe, stripe->io.size, 0, 0);
+ mb();
+ xors++;
+ mb();
+ }
+
+ /* Remember the best tick. */
+ if (xors > r)
+ r = xors;
+ }
+
+ return r;
+}
+
+/* Define for xor multi recovery stripe optimization runs. */
+#define DMRAID45_XOR_TEST
+
+/*
+ * Optimize xor algorithm for this RAID set.
+ *
+ * Benchmarks every entry of xor_funcs[] with every chunk count from the
+ * function's maximum (capped at the number of raid devices) down to 2
+ * using xor_speed() and memorizes the fastest function/chunks pair in
+ * rs->xor. With DMRAID45_XOR_TEST defined, the benchmark is repeated
+ * on every recovery stripe and a statistics line gets logged.
+ *
+ * Returns the best speed measured (see xor_speed() for the unit).
+ * Needs at least one recovery stripe (BUG_ON() otherwise).
+ */
+static unsigned xor_optimize(struct raid_set *rs)
+{
+ unsigned chunks_max = 2, speed_max = 0;
+ struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
+ struct stripe *stripe;
+ unsigned io_size = 0, speed_hm = 0, speed_min = ~0, speed_xor_blocks = 0;
+
+ BUG_ON(list_empty(&rs->recover.stripes));
+#ifndef DMRAID45_XOR_TEST
+ stripe = list_first_entry(&rs->recover.stripes, struct stripe,
+ lists[LIST_RECOVER]);
+#endif
+
+ /* Try all xor functions. */
+ while (f-- > xor_funcs) {
+ unsigned speed;
+
+#ifdef DMRAID45_XOR_TEST
+ list_for_each_entry(stripe, &rs->recover.stripes,
+ lists[LIST_RECOVER]) {
+ io_size = stripe->io.size;
+#endif
+
+ /* Set actual xor function for common_xor(). */
+ rs->xor.f = f;
+ rs->xor.chunks = (f->f == xor_blocks_wrapper ?
+ (MAX_XOR_BLOCKS + 1) :
+ XOR_CHUNKS_MAX);
+ /* Never xor more chunks than the set has devices. */
+ if (rs->xor.chunks > rs->set.raid_devs)
+ rs->xor.chunks = rs->set.raid_devs;
+
+ for ( ; rs->xor.chunks > 1; rs->xor.chunks--) {
+ speed = xor_speed(stripe);
+
+#ifdef DMRAID45_XOR_TEST
+ /* Track best xor_blocks_wrapper vs. best other function. */
+ if (f->f == xor_blocks_wrapper) {
+ if (speed > speed_xor_blocks)
+ speed_xor_blocks = speed;
+ } else if (speed > speed_hm)
+ speed_hm = speed;
+
+ if (speed < speed_min)
+ speed_min = speed;
+#endif
+
+ if (speed > speed_max) {
+ speed_max = speed;
+ chunks_max = rs->xor.chunks;
+ f_max = f;
+ }
+ }
+#ifdef DMRAID45_XOR_TEST
+ }
+#endif
+ }
+
+ /* Memorize optimal parameters. */
+ /* NOTE(review): f_max stays NULL if no run measured a speed > 0. */
+ rs->xor.f = f_max;
+ rs->xor.chunks = chunks_max;
+#ifdef DMRAID45_XOR_TEST
+ DMINFO("%s stripes=%u/size=%u min=%u xor_blocks=%u hm=%u max=%u",
+ speed_max == speed_hm ? "HM" : "NB",
+ rs->recover.recovery_stripes, io_size, speed_min,
+ speed_xor_blocks, speed_hm, speed_max);
+#endif
+ return speed_max;
+}
+
+/*
+ * Allocate a RAID context (a RAID set)
+ */
+/*
+ * Structure for variable RAID parameters.
+ *
+ * For every tunable there is a pair of members: the "*_parm" one keeps
+ * the value exactly as given on the table line (may be -1 to request
+ * the default), the other one keeps the value actually being used
+ * (preset to the default and only overwritten by values != -1).
+ */
+struct variable_parms {
+ int bandwidth;
+ int bandwidth_parm;
+ int chunk_size;
+ int chunk_size_parm;
+ int io_size;
+ int io_size_parm;
+ int stripes;
+ int stripes_parm;
+ int recover_io_size;
+ int recover_io_size_parm;
+ int raid_parms; /* Number of variable parameters given. */
+ int recovery; /* 1 (preset) or 0 if "nosync" was given. */
+ int recovery_stripes;
+ int recovery_stripes_parm;
+};
+
+/*
+ * Create the dirty log, allocate and initialize a RAID set including
+ * its region hash and stripe cache.
+ *
+ * @raid_type:       RAID type descriptor of the set
+ * @p:               parsed variable RAID parameters
+ * @raid_devs:       number of devices in the set
+ * @sectors_per_dev: size used on each device in sectors
+ * @ti:              target being constructed; ti->error is set on failure
+ * @dl_parms:        number of dirty log parameters
+ * @argv:            constructor arguments starting at the dirty log type
+ *
+ * Returns the RAID set or an ERR_PTR(): -EINVAL for size constraint
+ * violations, -ENOMEM for all other failures (including region hash and
+ * stripe cache creation errors). Everything created so far is torn down
+ * again on failure.
+ */
+static struct raid_set *
+context_alloc(struct raid_type *raid_type, struct variable_parms *p,
+ unsigned raid_devs, sector_t sectors_per_dev,
+ struct dm_target *ti, unsigned dl_parms, char **argv)
+{
+ int r;
+ size_t len;
+ sector_t region_size, ti_len;
+ struct raid_set *rs = NULL;
+ struct dm_dirty_log *dl;
+ struct recover *rec;
+
+ /*
+ * Create the dirty log
+ *
+ * We need to change length for the dirty log constructor,
+ * because we want an amount of regions for all stripes derived
+ * from the single device size, so that we can keep region
+ * size = 2^^n independent of the number of devices
+ */
+ ti_len = ti->len;
+ ti->len = sectors_per_dev;
+ dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
+ ti->len = ti_len;
+ if (!dl)
+ goto bad_dirty_log;
+
+ /* Chunk size must *not* be larger than region size. */
+ region_size = dl->type->get_region_size(dl);
+ if (p->chunk_size > region_size)
+ goto bad_chunk_size;
+
+ /* Recover io size must *not* be larger than region size either. */
+ if (p->recover_io_size > region_size)
+ goto bad_recover_io_size;
+
+ /*
+ * Size and allocate the RAID set structure: the structure itself
+ * gets followed by one rs->dev and one rs->data element per device.
+ */
+ len = sizeof(*rs->data) + sizeof(*rs->dev);
+ if (dm_array_too_big(sizeof(*rs), len, raid_devs))
+ goto bad_array;
+
+ len = sizeof(*rs) + raid_devs * len;
+ rs = kzalloc(len, GFP_KERNEL);
+ if (!rs)
+ goto bad_alloc;
+
+ rec = &rs->recover;
+ atomic_set(&rs->io.in_process, 0);
+ atomic_set(&rs->io.in_process_max, 0);
+ rec->io_size = p->recover_io_size;
+
+ /* Pointer to data array (located right behind the device array). */
+ rs->data = (unsigned long **)
+ ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
+ rec->dl = dl;
+ rs->set.raid_devs = raid_devs;
+ rs->set.data_devs = raid_devs - raid_type->parity_devs;
+ rs->set.raid_type = raid_type;
+
+ /* Keep the parameters as given on the table line. */
+ rs->set.raid_parms = p->raid_parms;
+ rs->set.chunk_size_parm = p->chunk_size_parm;
+ rs->set.io_size_parm = p->io_size_parm;
+ rs->sc.stripes_parm = p->stripes_parm;
+ rec->io_size_parm = p->recover_io_size_parm;
+ rec->bandwidth_parm = p->bandwidth_parm;
+ rec->recovery = p->recovery;
+ rec->recovery_stripes = p->recovery_stripes;
+
+ /*
+ * Set chunk and io size and respective shifts
+ * (used to avoid divisions)
+ */
+ rs->set.chunk_size = p->chunk_size;
+ rs->set.chunk_shift = ffs(p->chunk_size) - 1;
+
+ rs->set.io_size = p->io_size;
+ rs->set.io_mask = p->io_size - 1;
+ /* Mask to adjust address key in case io_size != chunk_size. */
+ rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
+
+ rs->set.sectors_per_dev = sectors_per_dev;
+
+ rs->set.ei = -1; /* Indicate no failed device. */
+ atomic_set(&rs->set.failed_devs, 0);
+
+ rs->ti = ti;
+
+ atomic_set(rec->io_count + IO_WORK, 0);
+ atomic_set(rec->io_count + IO_RECOVER, 0);
+
+ /* Initialize io lock and queues. */
+ mutex_init(&rs->io.in_lock);
+ mutex_init(&rs->io.xor_lock);
+ bio_list_init(&rs->io.in);
+ bio_list_init(&rs->io.work);
+
+ init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
+
+ rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
+ rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
+ wake_dummy, wake_do_raid, 0, p->recovery_stripes,
+ dl, region_size, rec->nr_regions);
+ if (IS_ERR(rec->rh))
+ goto bad_rh;
+
+ /* Initialize stripe cache. */
+ r = sc_init(rs, p->stripes);
+ if (r)
+ goto bad_sc;
+
+ /* REMOVEME: statistics. */
+ stats_reset(rs);
+ ClearRSDevelStats(rs); /* Disable development status. */
+ return rs;
+
+ /*
+ * Error paths: TI_ERR_RET() is used to return from within each
+ * of the following labels up to bad_alloc.
+ */
+bad_dirty_log:
+ TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
+
+bad_chunk_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
+
+bad_recover_io_size:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Recover stripe io size larger than region size",
+ ERR_PTR(-EINVAL));
+
+bad_array:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Arry too big", ERR_PTR(-EINVAL));
+
+bad_alloc:
+ dm_dirty_log_destroy(dl);
+ TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
+
+bad_rh:
+ dm_dirty_log_destroy(dl);
+ ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
+ goto free_rs;
+
+bad_sc:
+ dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
+ sc_exit(&rs->sc);
+ ti->error = DM_MSG_PREFIX "Error creating stripe cache";
+free_rs:
+ kfree(rs);
+ /* NOTE(review): the error codes of rec->rh and sc_init() get lost. */
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Free a RAID context (a RAID set).
+ *
+ * @p: number of devices to put, i.e. rs->dev[0] .. rs->dev[p - 1];
+ *     they get released in descending order.
+ */
+static void context_free(struct raid_set *rs, unsigned p)
+{
+	unsigned dev = p;
+
+	while (dev > 0) {
+		dev--;
+		dm_put_device(rs->ti, rs->dev[dev].dev);
+	}
+
+	sc_exit(&rs->sc);
+	/* Takes the dirty log down together with the region hash. */
+	dm_region_hash_destroy(rs->recover.rh);
+	kfree(rs);
+}
+
+/*
+ * Create the RAID set's work queue and set up its work items.
+ *
+ * Returns 0 on success or -ENOMEM (with the target error message
+ * set) if the work queue can't be created.
+ */
+static int rs_workqueue_init(struct raid_set *rs)
+{
+	struct dm_target *ti = rs->ti;
+
+	rs->io.wq = create_singlethread_workqueue(DAEMON);
+	if (rs->io.wq) {
+		INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
+		INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
+		return 0;
+	}
+
+	TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
+}
+
+/*
+ * Look up the raid_types[] entry named @name.
+ *
+ * The table gets searched from its end towards its beginning;
+ * returns NULL if no entry carries that name.
+ */
+static struct raid_type *get_raid_type(char *name)
+{
+	struct raid_type *type;
+
+	for (type = ARRAY_END(raid_types); type > raid_types; ) {
+		type--;
+		if (strcmp(type->name, name) == 0)
+			return type;
+	}
+
+	return NULL;
+}
+
+/*
+ * FIXME: factor out to dm core.
+ *
+ * Store the quotient a / b in *n and return non-zero
+ * if the quotient multiplied by @b gives @a again.
+ */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+	sector_t quotient = a;
+
+	/* sector_div() divides in place. */
+	sector_div(quotient, b);
+	*n = quotient;
+
+	if (quotient * b != a)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Log the RAID set's devices and properties to the kernel log.
+ *
+ * @io_size: io size used to work out the logged MB/s figure
+ */
+static void rs_log(struct raid_set *rs, unsigned io_size)
+{
+	char name[BDEVNAME_SIZE];
+	unsigned dev = 0;
+
+	/* One line per component device, flagging the parity one. */
+	while (dev < rs->set.raid_devs) {
+		DMINFO("/dev/%s is raid disk %u%s",
+		       bdevname(rs->dev[dev].dev->bdev, name), dev,
+		       (dev == rs->set.pi) ? " (parity)" : "");
+		dev++;
+	}
+
+	/* Sizes, stripe cache, xor algorithm and set type summary. */
+	DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
+	       "algorithm \"%s\", %u chunks with %uMB/s\n"
+	       "%s set with net %u/%u devices",
+	       rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
+	       atomic_read(&rs->sc.stripes),
+	       rs->xor.f->name, rs->xor.chunks, mbpers(rs, io_size),
+	       rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
+}
+
+/*
+ * Get all devices and offsets.
+ *
+ * @rs:   RAID set to fill rs->dev[] of
+ * @argv: <dev_path> <offset> pairs, one per raid device
+ * @p:    cursor into rs->dev[]; on return it holds the number of
+ *        devices the caller has to dm_put_device() again on error
+ *
+ * Returns 0 on success or a negative error code with ti->error set.
+ */
+static int dev_parms(struct raid_set *rs, char **argv, int *p)
+{
+ struct dm_target *ti = rs->ti;
+
+/* NOTE(review): looks like a leftover debugging message. */
+DMINFO("rs->set.sectors_per_dev=%llu", (unsigned long long) rs->set.sectors_per_dev);
+ for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
+ int r;
+ unsigned long long tmp;
+ struct raid_dev *dev = rs->dev + *p;
+
+ /* Get offset and device. */
+ if (sscanf(argv[1], "%llu", &tmp) != 1 ||
+ tmp > rs->set.sectors_per_dev)
+ TI_ERR("Invalid RAID device offset parameter");
+
+ dev->start = tmp;
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &dev->dev);
+ if (r)
+ TI_ERR_RET("RAID device lookup failure", r);
+
+ /*
+ * Reject a device which is already in use at a lower index
+ * (presumably raid_dev_lookup() returns the index of the first
+ * matching device or -ENODEV -- TODO confirm).
+ */
+ r = raid_dev_lookup(rs, dev);
+ if (r != -ENODEV && r < *p) {
+ (*p)++; /* Ensure dm_put_device() on actual device. */
+ TI_ERR_RET("Duplicate RAID device", -ENXIO);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Set the recovery bandwidth percentage and the work
+ * value derived from it (100 / bandwidth, integer division).
+ */
+static void
+recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
+{
+	struct recover *rec = &rs->recover;
+
+	rec->bandwidth = bandwidth;
+	rec->bandwidth_work = 100 / bandwidth;
+}
+
+/*
+ * Handle variable number of RAID parameters.
+ *
+ * @ti:   target being constructed (ti->error is set on failure)
+ * @argv: points at the parameter count, followed by the parameters
+ * @vp:   receives the count, the values as given and the values to use
+ *
+ * All values are preset to their defaults first; a parameter given as
+ * -1 keeps the default. Returns 0 on success; TI_ERR() is used to bail
+ * out on invalid input.
+ *
+ * NOTE(review): there is no check that @argv actually holds as many
+ * arguments as the count claims.
+ */
+static int get_raid_variable_parms(struct dm_target *ti, char **argv,
+ struct variable_parms *vp)
+{
+ int p, value;
+ /* Constraints table: one entry per parameter in table line order. */
+ struct {
+ int action; /* -1: skip, 0: no power2 check, 1: power2 check */
+ char *errmsg;
+ int min, max;
+ /* var: value as given; var2/var3: values to use (optional). */
+ int *var, *var2, *var3;
+ } argctr[] = {
+ { 1,
+ "Invalid chunk size; must be -1 or 2^^n and <= 16384",
+ IO_SIZE_MIN, CHUNK_SIZE_MAX,
+ &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
+ { 0,
+ "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
+ STRIPES_MIN, STRIPES_MAX,
+ &vp->stripes_parm, &vp->stripes, NULL },
+ { 1,
+ "Invalid io size; must -1 or >= 8, 2^^n and less equal "
+ "min(BIO_MAX_SECTORS/2, chunk size)",
+ IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
+ &vp->io_size_parm, &vp->io_size, NULL },
+ { 1,
+ "Invalid recovery io size; must be -1 or "
+ "2^^n and less equal BIO_MAX_SECTORS/2",
+ RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
+ &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
+ { 0,
+ "Invalid recovery bandwidth percentage; "
+ "must be -1 or > 0 and <= 100",
+ BANDWIDTH_MIN, BANDWIDTH_MAX,
+ &vp->bandwidth_parm, &vp->bandwidth, NULL },
+ /* Handle sync argument separately in loop. */
+ { -1,
+ "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
+ { 0,
+ "Invalid number of recovery stripes;"
+ "must be -1, > 0 and <= 64",
+ RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
+ &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
+ }, *varp;
+
+ /* Fetch # of variable raid parameters. */
+ if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
+ !range_ok(vp->raid_parms, 0, 7))
+ TI_ERR("Bad variable raid parameters number");
+
+ /* Preset variable RAID parameters. */
+ vp->chunk_size = CHUNK_SIZE_DEFAULT;
+ vp->io_size = IO_SIZE_DEFAULT;
+ vp->stripes = STRIPES_DEFAULT;
+ vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
+ vp->bandwidth = BANDWIDTH_DEFAULT;
+ vp->recovery = 1;
+ vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
+
+ /* Walk the array of argument constraints for all given ones. */
+ for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
+ BUG_ON(varp >= ARRAY_END(argctr));
+
+ /* Special case for "[no]sync" string argument. */
+ if (varp->action < 0) {
+ if (!strcmp(*argv, "sync"))
+ ;
+ else if (!strcmp(*argv, "nosync"))
+ vp->recovery = 0;
+ else
+ TI_ERR(varp->errmsg);
+
+ argv++;
+ continue;
+ }
+
+ /*
+ * Special case for io_size depending
+ * on previously set chunk size.
+ */
+ if (p == 2)
+ varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
+
+ /* -1 is always valid; else check power of 2 (if asked) and range. */
+ if (sscanf(*(argv++), "%d", &value) != 1 ||
+ (value != -1 &&
+ ((varp->action && !is_power_of_2(value)) ||
+ !range_ok(value, varp->min, varp->max))))
+ TI_ERR(varp->errmsg);
+
+ /* Remember value as given; only values != -1 replace defaults. */
+ *varp->var = value;
+ if (value != -1) {
+ if (varp->var2)
+ *varp->var2 = value;
+ if (varp->var3)
+ *varp->var3 = value;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Parse optional locking parameters.
+ *
+ * @ti:            target being constructed (not used here)
+ * @argv:          arguments starting where 'locking <type>' may show up
+ * @locking_parms: receives the number of arguments consumed: 2 if
+ *                 'locking <type>' was given, 0 otherwise
+ * @locking_type:  receives the locking type to use
+ *
+ * Both keywords are compared case insensitively with the comparison
+ * bounded by the length of the given argument, so abbreviations of
+ * the keywords match as well.
+ *
+ * Returns 0 on success or -EINVAL for an unsupported locking type.
+ */
+static int get_raid_locking_parms(struct dm_target *ti, char **argv,
+				  int *locking_parms,
+				  struct dm_raid45_locking_type **locking_type)
+{
+	/* Default: no locking arguments given -> no locking. */
+	*locking_parms = 0;
+	*locking_type = &locking_none;
+
+	if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
+		char *lckstr = argv[1];
+		size_t lcksz = strlen(lckstr);
+
+		if (!strnicmp(lckstr, "none", lcksz)) {
+			/*
+			 * Report the two consumed arguments to the caller;
+			 * this used to get reset to 0 by falling through
+			 * to the defaults.
+			 */
+			*locking_parms = 2;
+			return 0;
+		}
+
+		if (!strnicmp(lckstr, "cluster", lcksz))
+			DMERR("locking type \"%s\" not yet implemented",
+			      lckstr);
+		else
+			DMERR("unknown locking type \"%s\"", lckstr);
+
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Set backing device read ahead properties of RAID set.
+ *
+ * @sectors: read ahead per component device in sectors
+ * @stripes: number of stripes to read ahead on the RAID set
+ *
+ * Nothing gets changed if @sectors rounds up to zero pages.
+ */
+static void rs_set_read_ahead(struct raid_set *rs,
+			      unsigned sectors, unsigned stripes)
+{
+	unsigned dev, ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
+	struct mapped_device *md = dm_table_get_md(rs->ti->table);
+	struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
+
+	if (!ra_pages)
+		return;
+
+	/* Read ahead of the RAID set itself... */
+	bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
+
+	/* ...and of each of its component devices. */
+	for (dev = rs->set.raid_devs; dev > 0; dev--) {
+		struct request_queue *q =
+			bdev_get_queue(rs->dev[dev - 1].dev->bdev);
+
+		q->backing_dev_info.ra_pages = ra_pages;
+	}
+}
+
+/*
+ * Register rs_congested() with @rs as its data as the
+ * congested function of the mapped device's backing device info.
+ */
+static void rs_set_congested_fn(struct raid_set *rs)
+{
+	struct backing_dev_info *bdi;
+	struct mapped_device *md = dm_table_get_md(rs->ti->table);
+
+	bdi = &dm_disk(md)->queue->backing_dev_info;
+	bdi->congested_fn = rs_congested;
+	bdi->congested_data = rs;
+}
+
+/*
+ * Construct a RAID4/5 mapping:
+ *
+ * log_type #log_params <log_params> \
+ * raid_type [#parity_dev] #raid_variable_params <raid_params> \
+ * [locking "none"/"cluster"]
+ * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
+ *
+ * log_type = "core"/"disk",
+ * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
+ * log_params = [dirty_log_path] region_size [[no]sync])
+ *
+ * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
+ *
+ * #parity_dev = N if raid_type = "raid4"
+ * o N = -1: pick default = last device
+ * o N >= 0 and < #raid_devs: parity device index
+ *
+ * #raid_variable_params = 0-7; raid_params (-1 = default):
+ * [chunk_size [#stripes [io_size [recover_io_size \
+ * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
+ * o chunk_size (unit to calculate drive addresses; must be 2^^n, > 8
+ * and <= CHUNK_SIZE_MAX)
+ * o #stripes is number of stripes allocated to stripe cache
+ * (must be > 1 and < STRIPES_MAX)
+ * o io_size (io unit size per device in sectors; must be 2^^n and > 8)
+ * o recover_io_size (io unit size per device for recovery in sectors;
+ must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
+ * o %recovery_bandwith is the maximum amount spend for recovery during
+ * application io (1-100%)
+ * o recovery switch = [sync|nosync]
+ * o #recovery_stripes is the number of recovery stripes used for
+ * parallel recovery of the RAID set
+ * If raid_variable_params = 0, defaults will be used.
+ * Any raid_variable_param can be set to -1 to apply a default
+ *
+ * #raid_devs = N (N >= 3)
+ *
+ * #dev_to_initialize = N
+ * -1: initialize parity on all devices
+ * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
+ * of a failed devices content after replacement
+ *
+ * <dev_path> = device_path (eg, /dev/sdd1)
+ * <offset> = begin at offset on <dev_path>
+ *
+ */
+#define MIN_PARMS 13
+static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ int dev_to_init, dl_parms, i, locking_parms,
+ parity_parm, pi = -1, r, raid_devs;
+ sector_t tmp, sectors_per_dev;
+ struct dm_raid45_locking_type *locking;
+ struct raid_set *rs;
+ struct raid_type *raid_type;
+ struct variable_parms parms;
+
+ /* Ensure minimum number of parameters. */
+ if (argc < MIN_PARMS)
+ TI_ERR("Not enough parameters");
+
+ /* Fetch # of dirty log parameters. */
+ if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
+ !range_ok(dl_parms, 1, 4711)) /* ;-) */
+ TI_ERR("Bad dirty log parameters number");
+
+ /* Check raid_type. */
+ raid_type = get_raid_type(argv[dl_parms + 2]);
+ if (!raid_type)
+ TI_ERR("Bad raid type");
+
+ /* In case of RAID4, parity drive is selectable. */
+ parity_parm = !!(raid_type->level == raid4);
+
+ /* Handle variable number of RAID parameters. */
+ r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
+ &parms);
+ if (r)
+ return r;
+
+ /* Handle any locking parameters. */
+ r = get_raid_locking_parms(ti,
+ argv + dl_parms + parity_parm +
+ parms.raid_parms + 4,
+ &locking_parms, &locking);
+ if (r)
+ return r;
+
+ /* # of raid devices. */
+ i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
+ if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
+ raid_devs < raid_type->minimal_devs)
+ TI_ERR("Invalid number of raid devices");
+
+ /* In case of RAID4, check parity drive index is in limits. */
+ if (raid_type->level == raid4) {
+ /* Fetch index of parity device. */
+ if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
+ (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
+ TI_ERR("Invalid RAID4 parity device index");
+ }
+
+ /*
+ * Index of device to initialize starts at 0
+ *
+ * o -1 -> don't initialize a selected device;
+ * initialize parity conforming to algorithm
+ * o 0..raid_devs-1 -> initialize respective device
+ * (used for reconstruction of a replaced device)
+ */
+ if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
+ locking_parms + 5], "%d", &dev_to_init) != 1 ||
+ !range_ok(dev_to_init, -1, raid_devs - 1))
+ TI_ERR("Invalid number for raid device to initialize");
+
+ /* Check # of raid device arguments. */
+ if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
+ 2 * raid_devs)
+ TI_ERR("Wrong number of raid device/offset arguments");
+
+ /*
+ * Check that the table length is devisable
+ * w/o rest by (raid_devs - parity_devs)
+ */
+ if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
+ §ors_per_dev))
+ TI_ERR("Target length not divisible by number of data devices");
+
+ /*
+ * Check that the device size is
+ * devisable w/o rest by chunk size
+ */
+ if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
+ TI_ERR("Device length not divisible by chunk_size");
+
+ /****************************************************************
+ * Now that we checked the constructor arguments ->
+ * let's allocate the RAID set
+ ****************************************************************/
+ rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
+ ti, dl_parms, argv);
+ if (IS_ERR(rs))
+ return PTR_ERR(rs);
+
+
+ rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
+ rs->set.pi = rs->set.pi_parm = pi;
+
+ /* Set RAID4 parity drive index. */
+ if (raid_type->level == raid4)
+ rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
+
+ recover_set_bandwidth(rs, parms.bandwidth);
+
+ /* Use locking type to lock stripe access. */
+ rs->locking = locking;
+
+ /* Get the device/offset tupels. */
+ argv += dl_parms + 6 + parity_parm + parms.raid_parms;
+ r = dev_parms(rs, argv, &i);
+ if (r)
+ goto err;
+
+ /* Set backing device information (eg. read ahead). */
+ rs_set_read_ahead(rs, 2 * rs->set.chunk_size /* sectors per device */,
+ 2 /* # of stripes */);
+ rs_set_congested_fn(rs); /* Set congested function. */
+ SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
+ rs->xor.speed = xor_optimize(rs); /* Select best xor algorithm. */
+
+ /* Set for recovery of any nosync regions. */
+ if (parms.recovery)
+ SetRSRecover(rs);
+ else {
+ /*
+ * Need to free recovery stripe(s) here in case
+ * of nosync, because xor_optimize uses one.
+ */
+ set_start_recovery(rs);
+ set_end_recovery(rs);
+ stripe_recover_free(rs);
+ }
+
+ /*
+ * Enable parity chunk creation enforcement for
+ * small numbers of array members where it doesn't
+ * gain us performance to xor parity out and back in as
+ * with larger array member numbers.
+ */
+ if (rs->set.raid_devs <= rs->set.raid_type->minimal_devs + 1)
+ SetRSEnforceParityCreation(rs);
+
+ /*
+ * Make sure that dm core only hands maximum io size
+ * length down and pays attention to io boundaries.
+ */
+ ti->split_io = rs->set.io_size;
+ ti->private = rs;
+
+ /* Initialize work queue to handle this RAID set's io. */
+ r = rs_workqueue_init(rs);
+ if (r)
+ goto err;
+
+ rs_log(rs, rs->recover.io_size); /* Log information about RAID set. */
+ return 0;
+
+err:
+ context_free(rs, i);
+ return r;
+}
+
+/*
+ * Destruct a raid mapping: destroy the set's work queue first, then
+ * free the context, passing the full raid device count.
+ */
+static void raid_dtr(struct dm_target *ti)
+{
+	struct raid_set *set = ti->private;
+
+	/* Stop the io work queue before the context goes away. */
+	destroy_workqueue(set->io.wq);
+
+	context_free(set, set->set.raid_devs);
+}
+
+/*
+ * Raid mapping function.
+ *
+ * Read-ahead bios are refused with -EIO.  Every other bio takes an io
+ * reference, has its sector remapped by subtracting ti->begin, is
+ * appended to the set's input list under io.in_lock and is handled
+ * later by the daemon; DM_MAPIO_SUBMITTED is returned in that case.
+ */
+static int raid_map(struct dm_target *ti, struct bio *bio,
+		    union map_info *map_context)
+{
+	struct raid_set *rs = ti->private;
+	int stat;
+
+	/* I don't want to waste stripe cache capacity. */
+	if (bio_rw(bio) == READA)
+		return -EIO;
+
+	/*
+	 * Sample the direction before queueing: once the bio sits on
+	 * the input list and the daemon got woken, it may be completed
+	 * at any time and must not be touched here any more.
+	 */
+	stat = bio_data_dir(bio) == READ ? S_BIOS_READ : S_BIOS_WRITE;
+
+	/*
+	 * Get io reference to be waiting for to drop
+	 * to zero on device suspension/destruction.
+	 */
+	io_get(rs);
+	bio->bi_sector -= ti->begin;	/* Remap sector. */
+
+	/* Queue io to RAID set. */
+	mutex_lock(&rs->io.in_lock);
+	bio_list_add(&rs->io.in, bio);
+	mutex_unlock(&rs->io.in_lock);
+
+	/* Wake daemon to process input list. */
+	wake_do_raid(rs);
+
+	/* REMOVEME: statistics. */
+	atomic_inc(rs->stats + stat);
+	return DM_MAPIO_SUBMITTED;	/* Handle later. */
+}
+
+/*
+ * Device suspend: flag the set as suspending, stop region recovery if
+ * it is enabled, quiesce the worker, wait for in-flight ios and finally
+ * give the dirty log its presuspend callback.
+ */
+static void raid_presuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	struct dm_dirty_log *log = rs->recover.dl;
+
+	SetRSSuspend(rs);
+
+	if (RSRecover(rs))
+		dm_rh_stop_recovery(rs->recover.rh);
+
+	cancel_delayed_work(&rs->io.dws_do_raid);
+	flush_workqueue(rs->io.wq);
+
+	/* Wait for completion of all ios being processed. */
+	wait_ios(rs);
+
+	/* FIXME: need better error handling. */
+	if (log->type->presuspend) {
+		if (log->type->presuspend(log))
+			DMWARN("log presuspend failed");
+	}
+}
+
+/* Hand the postsuspend event on to the dirty log, if it implements it. */
+static void raid_postsuspend(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	struct dm_dirty_log *log = rs->recover.dl;
+
+	if (!log->type->postsuspend)
+		return;
+
+	/* FIXME: need better error handling. */
+	if (log->type->postsuspend(log))
+		DMWARN("log postsuspend failed");
+}
+
+/*
+ * Device resume: resume the dirty log, recompute the number of regions
+ * still to recover from the log's sync count, restart recovery if it is
+ * enabled and clear the suspend flag.
+ */
+static void raid_resume(struct dm_target *ti)
+{
+	struct raid_set *rs = ti->private;
+	struct recover *rec = &rs->recover;
+	struct dm_dirty_log *log = rec->dl;
+
+	DMINFO("%s...", __func__);
+
+	/* Resume dirty log. */
+	/* FIXME: need better error handling. */
+	if (log->type->resume) {
+		if (log->type->resume(log))
+			DMWARN("log resume failed");
+	}
+
+	rec->nr_regions_to_recover =
+		rec->nr_regions - log->type->get_sync_count(log);
+
+	/* Restart any unfinished recovery. */
+	if (RSRecover(rs)) {
+		set_start_recovery(rs);
+		dm_rh_start_recovery(rec->rh);
+	}
+
+	ClearRSSuspend(rs);
+}
+
+/*
+ * Return stripe cache size.
+ *
+ * A byte count is computed as the current number of stripes times a
+ * per-stripe figure and then converted with to_sector().  The
+ * per-stripe figure sums sizeof(struct stripe), sizeof(struct
+ * stripe_chunk), sizeof(struct page_list), io_size in bytes times
+ * raid_devs and - only while recover.end_jiffies is still zero - the
+ * recovery stripes' buffers (recovery_stripes times the bytes of
+ * raid_devs * recover.io_size).
+ *
+ * NOTE(review): as parenthesized, only one stripe_chunk/page_list is
+ * counted per stripe and the recovery term is multiplied by the stripe
+ * count as well - confirm that this is intended.
+ */
+static unsigned sc_size(struct raid_set *rs)
+{
+ return to_sector(atomic_read(&rs->sc.stripes) *
+ (sizeof(struct stripe) +
+ (sizeof(struct stripe_chunk) +
+ (sizeof(struct page_list) +
+ to_bytes(rs->set.io_size) *
+ rs->set.raid_devs)) +
+ (rs->recover.end_jiffies ?
+ 0 : rs->recover.recovery_stripes *
+ to_bytes(rs->set.raid_devs * rs->recover.io_size))));
+}
+
+/*
+ * REMOVEME: status output for development.
+ *
+ * Appends version/xor information, io and stripe high-water marks, all
+ * counters listed in stats_map, stripe cache geometry and recovery
+ * progress to result via DMEMIT.  *size is the output offset on entry
+ * and is updated to the new offset on return.
+ */
+static void raid_devel_stats(struct dm_target *ti, char *result,
+			     unsigned *size, unsigned maxlen)
+{
+	unsigned sz = *size;
+	unsigned long j;
+	char buf[BDEVNAME_SIZE];
+	struct stats_map *sm;
+	struct raid_set *rs = ti->private;
+	struct recover *rec = &rs->recover;
+	struct timespec ts;
+
+	DMEMIT("%s %s=%u bw=%u\n",
+	       version, rs->xor.f->name, rs->xor.chunks, rs->recover.bandwidth);
+	DMEMIT("act_ios=%d ", io_ref(rs));
+	DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
+	DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
+	DMEMIT("act_stripes_max=%d\n",
+	       atomic_read(&rs->sc.active_stripes_max));
+
+	for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
+		DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
+
+	DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
+	DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
+	       atomic_read(&rs->sc.stripes), rs->set.io_size,
+	       rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
+	       sc_size(rs));
+
+	/* Elapsed recovery time; still running if end_jiffies is unset. */
+	j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
+	    rec->start_jiffies;
+	jiffies_to_timespec(j, &ts);
+
+	/*
+	 * Print seconds with two decimals.  The nanoseconds have to be
+	 * scaled to hundredths and zero padded; printing them unpadded
+	 * and cutting the string after two digits would turn e.g. 5ms
+	 * into ".50".
+	 */
+	snprintf(buf, sizeof(buf), "%ld.%02ld",
+		 (long) ts.tv_sec, (long) ts.tv_nsec / 10000000);
+
+	DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
+	       (unsigned long long) rec->nr_regions_recovered,
+	       (unsigned long long) rec->nr_regions_to_recover,
+	       (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
+
+	*size = sz;
+}
+
+/*
+ * Status callback; output is appended to result with DMEMIT.
+ *
+ * STATUSTYPE_INFO: optional development statistics, the number of raid
+ * devices, each device's dev_t, a per-device state string ('A' unless
+ * the device is flagged failed, then 'D'; 'p' marks the parity index,
+ * 'i' the device to initialize), the synced/total region counts and
+ * the dirty log's own status.
+ *
+ * STATUSTYPE_TABLE: the dirty log's table line followed by raid type
+ * name, the raid parameters, device count, device to initialize and
+ * the device/offset pairs.
+ *
+ * Always returns 0.
+ */
+static int raid_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ unsigned p, sz = 0;
+ char buf[BDEVNAME_SIZE];
+ struct raid_set *rs = ti->private;
+ struct dm_dirty_log *dl = rs->recover.dl;
+ /*
+ * Parameters in table order; the -2 entry is a placeholder which
+ * is printed as "sync"/"nosync" instead of a number below.
+ */
+ int raid_parms[] = {
+ rs->set.chunk_size_parm,
+ rs->sc.stripes_parm,
+ rs->set.io_size_parm,
+ rs->recover.io_size_parm,
+ rs->recover.bandwidth_parm,
+ -2,
+ rs->recover.recovery_stripes,
+ };
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ /* REMOVEME: statistics. */
+ if (RSDevelStats(rs))
+ raid_devel_stats(ti, result, &sz, maxlen);
+
+ DMEMIT("%u ", rs->set.raid_devs);
+
+ for (p = 0; p < rs->set.raid_devs; p++)
+ DMEMIT("%s ",
+ format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
+
+ DMEMIT("2 ");
+ /* One state character per device plus optional 'p'/'i' markers. */
+ for (p = 0; p < rs->set.raid_devs; p++) {
+ DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
+
+ if (p == rs->set.pi)
+ DMEMIT("p");
+
+ if (p == rs->set.dev_to_init)
+ DMEMIT("i");
+ }
+
+ DMEMIT(" %llu/%llu ",
+ (unsigned long long) dl->type->get_sync_count(dl),
+ (unsigned long long) rs->recover.nr_regions);
+
+ /* Let the dirty log append its status behind ours. */
+ sz += dl->type->status(dl, type, result+sz, maxlen-sz);
+ break;
+ case STATUSTYPE_TABLE:
+ /* The dirty log's table output comes first. */
+ sz = rs->recover.dl->type->status(rs->recover.dl, type,
+ result, maxlen);
+ DMEMIT("%s %u ", rs->set.raid_type->name, rs->set.raid_parms);
+
+ /* Only the first rs->set.raid_parms entries are emitted. */
+ for (p = 0; p < rs->set.raid_parms; p++) {
+ if (raid_parms[p] > -2)
+ DMEMIT("%d ", raid_parms[p]);
+ else
+ DMEMIT("%s ", rs->recover.recovery ?
+ "sync" : "nosync");
+ }
+
+ DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
+
+ for (p = 0; p < rs->set.raid_devs; p++)
+ DMEMIT("%s %llu ",
+ format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
+ (unsigned long long) rs->dev[p].start);
+ }
+
+ return 0;
+}
+
+/*
+ * Message interface
+ */
+/*
+ * Turn a delta into an absolute value.
+ *
+ * @action selects the operation and may be abbreviated to a prefix of
+ * at least two characters: "set" returns @r unchanged, "grow" returns
+ * @act + @r and "shrink" returns @act - @r.  Any other action yields
+ * -EINVAL.
+ */
+static int _absolute(char *action, int act, int r)
+{
+	size_t n = strlen(action);
+
+	/* Compare at least two characters, even for shorter input. */
+	if (n < 2)
+		n = 2;
+
+	if (strncmp("set", action, n) == 0)
+		return r;
+
+	if (strncmp("grow", action, n) == 0)
+		return r + act;
+
+	if (strncmp("shrink", action, n) == 0)
+		return act - r;
+
+	return -EINVAL;
+}
+
+/*
+ * Change recovery io bandwidth.
+ *
+ * Expects two arguments: the action (set/grow/shrink) and a number.
+ * Both the number itself and the resulting absolute bandwidth have to
+ * lie within BANDWIDTH_MIN..BANDWIDTH_MAX.  Returns 0 on success and
+ * -EINVAL otherwise.  @flag is unused.
+ */
+static int bandwidth_change(struct raid_set *rs, int argc, char **argv,
+			    enum raid_set_flags flag)
+{
+	int current_bw = rs->recover.bandwidth, value;
+
+	if (argc != 2)
+		return -EINVAL;
+
+	if (sscanf(argv[1], "%d", &value) != 1 ||
+	    !range_ok(value, BANDWIDTH_MIN, BANDWIDTH_MAX))
+		return -EINVAL;
+
+	/* Make delta bandwidth absolute. */
+	value = _absolute(argv[0], current_bw, value);
+
+	/* Check range. */
+	if (!range_ok(value, BANDWIDTH_MIN, BANDWIDTH_MAX))
+		return -EINVAL;
+
+	recover_set_bandwidth(rs, value);
+	return 0;
+}
+
+/*
+ * Set/reset development feature flags.
+ *
+ * Takes exactly one argument, which may be abbreviated to at least two
+ * characters:
+ *   "on"    - set @flag; -EPERM if it was set already
+ *   "off"   - clear @flag; -EPERM if it was clear already
+ *   "reset" - for RS_DEVEL_STATS reset the statistics (-EPERM unless
+ *             the flag is set); for any other flag set it
+ * Anything else returns -EINVAL.
+ */
+static int devel_flags(struct raid_set *rs, int argc, char **argv,
+		       enum raid_set_flags flag)
+{
+	size_t n;
+
+	if (argc != 1)
+		return -EINVAL;
+
+	n = strlen(argv[0]);
+	if (n < 2)
+		n = 2;
+
+	if (strncmp(argv[0], "on", n) == 0) {
+		if (test_and_set_bit(flag, &rs->io.flags))
+			return -EPERM;
+
+		return 0;
+	}
+
+	if (strncmp(argv[0], "off", n) == 0) {
+		if (test_and_clear_bit(flag, &rs->io.flags))
+			return 0;
+
+		return -EPERM;
+	}
+
+	if (strncmp(argv[0], "reset", n) != 0)
+		return -EINVAL;
+
+	if (flag != RS_DEVEL_STATS) {
+		set_bit(flag, &rs->io.flags);
+		return 0;
+	}
+
+	if (!test_bit(flag, &rs->io.flags))
+		return -EPERM;
+
+	stats_reset(rs);
+	return 0;
+}
+
+/*
+ * Resize the stripe cache.
+ *
+ * Expects the action (set/grow/shrink) and a positive number.  Returns
+ * -EPERM while a previously requested resize is still pending, -EINVAL
+ * on bad arguments, an out-of-range result or an unchanged size, and 0
+ * once the new size has been handed to the worker.  @flag is unused.
+ */
+static int sc_resize(struct raid_set *rs, int argc, char **argv,
+		     enum raid_set_flags flag)
+{
+	int current_stripes, stripes;
+
+	if (argc != 2)
+		return -EINVAL;
+
+	/* Deny permission in case the daemon is still resizing!. */
+	if (atomic_read(&rs->sc.stripes_to_set))
+		return -EPERM;
+
+	if (sscanf(argv[1], "%d", &stripes) != 1 || stripes <= 0)
+		return -EINVAL;
+
+	current_stripes = atomic_read(&rs->sc.stripes);
+
+	/* Make delta stripes absolute. */
+	stripes = _absolute(argv[0], current_stripes, stripes);
+
+	/*
+	 * Check range and that the # of stripes changes.
+	 * We leave the resizing to the worker.
+	 */
+	if (!range_ok(stripes, STRIPES_MIN, STRIPES_MAX) ||
+	    stripes == atomic_read(&rs->sc.stripes))
+		return -EINVAL;
+
+	atomic_set(&rs->sc.stripes_to_set, stripes);
+	wake_do_raid(rs);
+	return 0;
+}
+
+/*
+ * Change xor algorithm and number of chunks.
+ *
+ * Expects the algorithm name (matched exactly against xor_funcs) and
+ * the number of chunks, which has to be within 2..XOR_CHUNKS_MAX and
+ * not larger than the number of raid devices.  On a match the new
+ * function/chunk count are installed under io.xor_lock and, if a
+ * temporary stripe can be allocated, the xor speed is measured with it.
+ *
+ * Returns 0 on success, -EINVAL on any invalid argument.  @flag is
+ * unused.
+ */
+static int xor_set(struct raid_set *rs, int argc, char **argv,
+		   enum raid_set_flags flag)
+{
+	int chunks;
+	char *algorithm;
+	struct xor_func *f = ARRAY_END(xor_funcs);
+
+	if (argc != 2)
+		return -EINVAL;
+
+	algorithm = argv[0];
+	if (sscanf(argv[1], "%d", &chunks) != 1 ||
+	    !range_ok(chunks, 2, XOR_CHUNKS_MAX) ||
+	    chunks > rs->set.raid_devs)
+		return -EINVAL;
+
+	while (f-- > xor_funcs) {
+		unsigned io_size = 0;
+		struct stripe *stripe;
+
+		if (strcmp(algorithm, f->name))
+			continue;
+
+		DMINFO("xor: %s", f->name);
+		if (f->f == xor_blocks_wrapper &&
+		    chunks > MAX_XOR_BLOCKS + 1) {
+			DMERR("chunks > MAX_XOR_BLOCKS"
+			      " + 1");
+			break;
+		}
+
+		/*
+		 * Allocate the measurement stripe only after the request
+		 * passed all checks, so that the rejection path above
+		 * has nothing to free (it used to leak the stripe).
+		 */
+		stripe = stripe_alloc(&rs->sc, rs->sc.mem_cache_client,
+				      SC_GROW);
+
+		mutex_lock(&rs->io.xor_lock);
+		rs->xor.f = f;
+		rs->xor.chunks = chunks;
+		rs->xor.speed = 0;
+		mutex_unlock(&rs->io.xor_lock);
+
+		/* Without a stripe the speed stays 0 and io_size is 0. */
+		if (stripe) {
+			rs->xor.speed = xor_speed(stripe);
+			io_size = stripe->io.size;
+			stripe_free(stripe, rs->sc.mem_cache_client);
+		}
+
+		rs_log(rs, io_size);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Allow writes after they got prohibited because of a device failure.
+ *
+ * This needs to be called after userspace updated metadata state
+ * based on an event being thrown during device failure processing.
+ *
+ * Returns 0 if the prohibit-writes flag was set (it is cleared and the
+ * daemon is woken), -EPERM if it was not set.  argc, argv and flag are
+ * unused.
+ */
+static int allow_writes(struct raid_set *rs, int argc, char **argv,
+			enum raid_set_flags flag)
+{
+	if (!TestClearRSProhibitWrites(rs))
+		return -EPERM;
+
+	DMINFO("%s waking", __func__);
+	wake_do_raid(rs);
+	return 0;
+}
+
+/* Parse the RAID message. */
+/*
+ * The message name may be abbreviated to a prefix of at least 3
+ * characters; the set/grow/shrink and on/off/reset arguments may be
+ * abbreviated to a prefix of at least 2 characters.
+ *
+ * 'all[ow_writes]'
+ * 'ban[dwidth] {se[t],gr[ow],sh[rink]} #' # e.g 'ban se 50'
+ * 'ove[rwrite] {on,of[f],re[set]}' # e.g. 'ove of'
+ * 'sta[tistics] {on,of[f],re[set]}' # e.g. 'stat of'
+ * 'str[ipe_cache] {se[t],gr[ow],sh[rink]} #' # e.g. 'stripe set 1024'
+ * 'xor algorithm #chunks' # e.g. 'xor xor_8 5'
+ *
+ * Returns the handler's result, or -EINVAL if there is no argument or
+ * no message name matches.
+ */
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ if (argc) {
+ size_t len = strlen(argv[0]);
+ struct raid_set *rs = ti->private;
+ /* Message name, handler and the flag passed on to the handler. */
+ struct {
+ const char *name;
+ int (*f) (struct raid_set *rs, int argc, char **argv,
+ enum raid_set_flags flag);
+ enum raid_set_flags flag;
+ } msg_descr[] = {
+ { "allow_writes", allow_writes, 0 },
+ { "bandwidth", bandwidth_change, 0 },
+ { "overwrite", devel_flags, RS_CHECK_OVERWRITE },
+ { "statistics", devel_flags, RS_DEVEL_STATS },
+ { "stripe_cache", sc_resize, 0 },
+ { "xor", xor_set, 0 },
+ }, *m = ARRAY_END(msg_descr);
+
+ /* Compare at least 3 characters, even for shorter input. */
+ if (len < 3)
+ len = 3;
+
+ /* Walk the table from its end; the handler gets the remaining arguments. */
+ while (m-- > msg_descr) {
+ if (!strncmp(argv[0], m->name, len))
+ return m->f(rs, argc - 1, argv + 1, m->flag);
+ }
+
+ }
+
+ return -EINVAL;
+}
+/*
+ * END message interface
+ */
+
+/*
+ * Provide io hints: the minimum io size is one chunk, the optimal io
+ * size is one chunk per data device.
+ */
+static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct raid_set *set = ti->private;
+
+	blk_limits_io_min(limits, set->set.chunk_size);
+	blk_limits_io_opt(limits,
+			  set->set.chunk_size * set->set.data_devs);
+}
+
+/*
+ * Target type descriptor wiring the callbacks defined in this file to
+ * the target name "raid45"; it is passed to dm_register_target() and
+ * dm_unregister_target() by the module init/exit functions.
+ */
+static struct target_type raid_target = {
+ .name = "raid45",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = raid_ctr,
+ .dtr = raid_dtr,
+ .map = raid_map,
+ .presuspend = raid_presuspend,
+ .postsuspend = raid_postsuspend,
+ .resume = raid_resume,
+ .status = raid_status,
+ .message = raid_message,
+ .io_hints = raid_io_hints,
+};
+
+/*
+ * Log the outcome of a (un)registration: with a non-zero @r an error
+ * built from @bad_msg and @r, otherwise @good_msg and the version.
+ */
+static void init_exit(const char *bad_msg, const char *good_msg, int r)
+{
+	if (!r) {
+		DMINFO("%s %s", good_msg, version);
+		return;
+	}
+
+	DMERR("Failed to %sregister target [%d]", bad_msg, r);
+}
+
+/* Module init: register the target and log the outcome. */
+static int __init dm_raid_init(void)
+{
+	int ret;
+
+	ret = dm_register_target(&raid_target);
+	init_exit("", "initialized", ret);
+
+	return ret;
+}
+
+/* Module exit: unregister the target and log the exit message. */
+static void __exit dm_raid_exit(void)
+{
+	dm_unregister_target(&raid_target);
+
+	/* Unregistering yields no status here, so always log success. */
+	init_exit("un", "exit", 0);
+}
+
+/* Module hooks: entry/exit points, description, author and license. */
+module_init(dm_raid_init);
+module_exit(dm_raid_exit);
+
+MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
+MODULE_AUTHOR("Heinz Mauelshagen <heinzm@redhat.com>");
+MODULE_LICENSE("GPL");
+/* Alias names under which the module can be requested as well. */
+MODULE_ALIAS("dm-raid4");
+MODULE_ALIAS("dm-raid5");
--- /dev/null
+ /* tulip_core.c: A DEC 21x4x-family ethernet driver for Linux.
+
+ Copyright 2000,2001 The Linux Kernel Team
+ Written/copyright 1994-2001 by Donald Becker.
+
+ This software may be used and distributed according to the terms
+ of the GNU General Public License, incorporated herein by reference.
+
+ Please submit bugs to http://bugzilla.kernel.org/ .
+ */
+
+ /* Prefix prepended to the format string of pr_*() messages. */
+ #define pr_fmt(fmt) "tulip: " fmt
+
+ /* Driver name, version (with a -NAPI suffix for NAPI builds) and date. */
+ #define DRV_NAME "tulip"
+ #ifdef CONFIG_TULIP_NAPI
+ #define DRV_VERSION "1.1.15-NAPI" /* Keep at least for test */
+ #else
+ #define DRV_VERSION "1.1.15"
+ #endif
+ #define DRV_RELDATE "Feb 27, 2007"
+
+
+ #include <linux/module.h>
+ #include <linux/pci.h>
+ #include <linux/slab.h>
+ #include "tulip.h"
+ #include <linux/init.h>
+ #include <linux/interrupt.h>
+ #include <linux/etherdevice.h>
+ #include <linux/delay.h>
+ #include <linux/mii.h>
+ #include <linux/crc32.h>
+ #include <asm/unaligned.h>
+ #include <asm/uaccess.h>
+
+ #ifdef CONFIG_SPARC
+ #include <asm/prom.h>
+ #endif
+
+ /* Version banner assembled from DRV_VERSION and DRV_RELDATE. */
+ static char version[] __devinitdata =
+ "Linux Tulip driver version " DRV_VERSION " (" DRV_RELDATE ")\n";
+
+ /* A few user-configurable values. */
+
+ /* Maximum events (Rx packets, etc.) to handle at each interrupt. */
+ static unsigned int max_interrupt_work = 25;
+
+ /* Size of the per-unit option arrays below. */
+ #define MAX_UNITS 8
+ /* Used to pass the full-duplex flag, etc. */
+ static int full_duplex[MAX_UNITS];
+ static int options[MAX_UNITS];
+ static int mtu[MAX_UNITS]; /* Jumbo MTU for interfaces. */
+
+ /*
+ * The possible media types that can be set in options[] are listed
+ * below; the table is indexed by media number (e.g. dev->if_port) when
+ * the selected media is reported.
+ */
+ const char * const medianame[32] = {
+ "10baseT", "10base2", "AUI", "100baseTx",
+ "10baseT-FDX", "100baseTx-FDX", "100baseT4", "100baseFx",
+ "100baseFx-FDX", "MII 10baseT", "MII 10baseT-FDX", "MII",
+ "10baseT(forced)", "MII 100baseTx", "MII 100baseTx-FDX", "MII 100baseT4",
+ "MII 100baseFx-HDX", "MII 100baseFx-FDX", "Home-PNA 1Mbps", "Invalid-19",
+ "","","","", "","","","", "","","","Transceiver reset",
+ };
+
+ /*
+ * Set the copy breakpoint for the copy-only-tiny-buffer Rx structure.
+ * The architectures listed in the first branch default to 1518, all
+ * others to 100.
+ */
+ #if defined(__alpha__) || defined(__arm__) || defined(__hppa__) || \
+ defined(CONFIG_SPARC) || defined(__ia64__) || \
+ defined(__sh__) || defined(__mips__)
+ static int rx_copybreak = 1518;
+ #else
+ static int rx_copybreak = 100;
+ #endif
+
+ /*
+ Set the bus performance register.
+ Typical: Set 16 longword cache alignment, no burst limit.
+ Cache alignment, bits 15:14:
+ 0000 No alignment
+ 4000 8 longwords
+ 8000 16 longwords
+ C000 32 longwords
+ Burst length, bits 13:8:
+ 0x00000000 unlimited
+ 0100 1 longword
+ 0200 2 longwords
+ 0400 4 longwords
+ 0800 8 longwords
+ 1000 16 longwords
+ 2000 32 longwords
+ Warning: many older 486 systems are broken and require setting 0x00A04800
+ 8 longword cache alignment, 8 longword burst.
+ ToDo: Non-Intel setting could be better.
+ */
+
+ /* Per-architecture default; an unknown architecture triggers a warning. */
+ #if defined(__alpha__) || defined(__ia64__)
+ static int csr0 = 0x01A00000 | 0xE000;
+ #elif defined(__i386__) || defined(__powerpc__) || defined(__x86_64__)
+ static int csr0 = 0x01A00000 | 0x8000;
+ #elif defined(CONFIG_SPARC) || defined(__hppa__)
+ /* The UltraSparc PCI controllers will disconnect at every 64-byte
+ * crossing anyways so it makes no sense to tell Tulip to burst
+ * any more than that.
+ */
+ static int csr0 = 0x01A00000 | 0x9000;
+ #elif defined(__arm__) || defined(__sh__)
+ static int csr0 = 0x01A00000 | 0x4800;
+ #elif defined(__mips__)
+ static int csr0 = 0x00200000 | 0x4000;
+ #else
+ #warning Processor architecture undefined!
+ static int csr0 = 0x00A00000 | 0x4800;
+ #endif
+
+ /* Operational parameters that usually are not changed. */
+ /* Time in jiffies before concluding the transmitter is hung. */
+ #define TX_TIMEOUT (4*HZ)
+
+
+ MODULE_AUTHOR("The Linux Kernel Team");
+ MODULE_DESCRIPTION("Digital 21*4* Tulip ethernet driver");
+ MODULE_LICENSE("GPL");
+ MODULE_VERSION(DRV_VERSION);
+ /*
+ * Module parameters; all are registered with permission 0.
+ * NOTE(review): tulip_debug is named here ahead of its definition
+ * below - presumably it is declared in tulip.h; confirm.
+ */
+ module_param(tulip_debug, int, 0);
+ module_param(max_interrupt_work, int, 0);
+ module_param(rx_copybreak, int, 0);
+ module_param(csr0, int, 0);
+ module_param_array(options, int, NULL, 0);
+ module_param_array(full_duplex, int, NULL, 0);
+
+ /* Debug message level: TULIP_DEBUG if defined at build time, else 1. */
+ #ifdef TULIP_DEBUG
+ int tulip_debug = TULIP_DEBUG;
+ #else
+ int tulip_debug = 1;
+ #endif
+
+ /*
+  * Timer callback: @data carries the net_device; while the interface
+  * is running, the media work item is scheduled.
+  */
+ static void tulip_timer(unsigned long data)
+ {
+ 	struct net_device *netdev = (struct net_device *)data;
+ 	struct tulip_private *priv;
+
+ 	if (!netif_running(netdev))
+ 		return;
+
+ 	priv = netdev_priv(netdev);
+ 	schedule_work(&priv->media_work);
+ }
+
+ /*
+ * This table is used during operation for capabilities and the media
+ * timer.
+ *
+ * It is indexed via the values in 'enum chips'; the driver looks up
+ * e.g. tulip_tbl[tp->chip_id].valid_intrs when enabling interrupts.
+ * Each entry lists a chip name, two numbers, the capability flags, the
+ * timer function and, for some chips, a media task.
+ * NOTE(review): the second number is presumably the valid_intrs mask;
+ * confirm against struct tulip_chip_table.
+ */
+
+ struct tulip_chip_table tulip_tbl[] = {
+ { }, /* placeholder for array, slot unused currently */
+ { }, /* placeholder for array, slot unused currently */
+
+ /* DC21140 */
+ { "Digital DS21140 Tulip", 128, 0x0001ebef,
+ HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | HAS_PCI_MWI, tulip_timer,
+ tulip_media_task },
+
+ /* DC21142, DC21143 */
+ { "Digital DS21142/43 Tulip", 128, 0x0801fbff,
+ HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_ACPI | HAS_NWAY
+ | HAS_INTR_MITIGATION | HAS_PCI_MWI, tulip_timer, t21142_media_task },
+
+ /* LC82C168 */
+ { "Lite-On 82c168 PNIC", 256, 0x0001fbef,
+ HAS_MII | HAS_PNICNWAY, pnic_timer, },
+
+ /* MX98713 */
+ { "Macronix 98713 PMAC", 128, 0x0001ebef,
+ HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM, mxic_timer, },
+
+ /* MX98715 */
+ { "Macronix 98715 PMAC", 256, 0x0001ebef,
+ HAS_MEDIA_TABLE, mxic_timer, },
+
+ /* MX98725 */
+ { "Macronix 98725 PMAC", 256, 0x0001ebef,
+ HAS_MEDIA_TABLE, mxic_timer, },
+
+ /* AX88140 */
+ { "ASIX AX88140", 128, 0x0001fbff,
+ HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | MC_HASH_ONLY
+ | IS_ASIX, tulip_timer, tulip_media_task },
+
+ /* PNIC2 */
+ { "Lite-On PNIC-II", 256, 0x0801fbff,
+ HAS_MII | HAS_NWAY | HAS_8023X | HAS_PCI_MWI, pnic2_timer, },
+
+ /* COMET */
+ { "ADMtek Comet", 256, 0x0001abef,
+ HAS_MII | MC_HASH_ONLY | COMET_MAC_ADDR, comet_timer, },
+
+ /* COMPEX9881 */
+ { "Compex 9881 PMAC", 128, 0x0001ebef,
+ HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM, mxic_timer, },
+
+ /* I21145 */
+ { "Intel DS21145 Tulip", 128, 0x0801fbff,
+ HAS_MII | HAS_MEDIA_TABLE | ALWAYS_CHECK_MII | HAS_ACPI
+ | HAS_NWAY | HAS_PCI_MWI, tulip_timer, tulip_media_task },
+
+ /* DM910X: the slot is kept (as an empty entry) even when unsupported. */
+ #ifdef CONFIG_TULIP_DM910X
+ { "Davicom DM9102/DM9102A", 128, 0x0001ebef,
+ HAS_MII | HAS_MEDIA_TABLE | CSR12_IN_SROM | HAS_ACPI,
+ tulip_timer, tulip_media_task },
+ #else
+ { NULL },
+ #endif
+
+ /* RS7112 */
+ { "Conexant LANfinity", 256, 0x0001ebef,
+ HAS_MII | HAS_ACPI, tulip_timer, tulip_media_task },
+
+ };
+
+
+ /*
+ * PCI IDs handled by this driver.  Each entry gives vendor and device
+ * ID, matches any subsystem IDs, and carries the chip type as its last
+ * field.  The DM910X entries exist only with CONFIG_TULIP_DM910X.
+ */
+ static DEFINE_PCI_DEVICE_TABLE(tulip_pci_tbl) = {
+ { 0x1011, 0x0009, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DC21140 },
+ { 0x1011, 0x0019, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DC21143 },
+ { 0x11AD, 0x0002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, LC82C168 },
+ { 0x10d9, 0x0512, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98713 },
+ { 0x10d9, 0x0531, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98715 },
+ /* { 0x10d9, 0x0531, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98725 },*/
+ { 0x125B, 0x1400, PCI_ANY_ID, PCI_ANY_ID, 0, 0, AX88140 },
+ { 0x11AD, 0xc115, PCI_ANY_ID, PCI_ANY_ID, 0, 0, PNIC2 },
+ { 0x1317, 0x0981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1317, 0x0985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1317, 0x1985, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1317, 0x9511, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x13D1, 0xAB02, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x13D1, 0xAB03, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x13D1, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x104A, 0x0981, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x104A, 0x2774, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1259, 0xa120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x11F6, 0x9881, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMPEX9881 },
+ { 0x8086, 0x0039, PCI_ANY_ID, PCI_ANY_ID, 0, 0, I21145 },
+ #ifdef CONFIG_TULIP_DM910X
+ { 0x1282, 0x9100, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DM910X },
+ { 0x1282, 0x9102, PCI_ANY_ID, PCI_ANY_ID, 0, 0, DM910X },
+ #endif
+ { 0x1113, 0x1216, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1113, 0x1217, PCI_ANY_ID, PCI_ANY_ID, 0, 0, MX98715 },
+ { 0x1113, 0x9511, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1186, 0x1541, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1186, 0x1561, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1186, 0x1591, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x14f1, 0x1803, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CONEXANT },
+ { 0x1626, 0x8410, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1737, 0xAB09, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x1737, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x17B3, 0xAB08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { 0x10b7, 0x9300, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* 3Com 3CSOHO100B-TX */
+ { 0x14ea, 0xab08, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* Planex FNW-3602-TX */
+ { 0x1414, 0x0001, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET }, /* Microsoft MN-120 */
+ { 0x1414, 0x0002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, COMET },
+ { } /* terminate list */
+ };
+ MODULE_DEVICE_TABLE(pci, tulip_pci_tbl);
+
+
+ /*
+ * A full-duplex map for media types: per-media capability bits, indexed
+ * by media number and tested against masks such as MediaIsMII and
+ * MediaAlwaysFD.  Only the first 20 entries are given; the rest of the
+ * 32 are zero.
+ */
+ const char tulip_media_cap[32] =
+ {0,0,0,16, 3,19,16,24, 27,4,7,5, 0,20,23,20, 28,31,0,0, };
+
+ /* Forward declarations of this file's static driver functions. */
+ static void tulip_tx_timeout(struct net_device *dev);
+ static void tulip_init_ring(struct net_device *dev);
+ static void tulip_free_ring(struct net_device *dev);
+ static netdev_tx_t tulip_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+ static int tulip_open(struct net_device *dev);
+ static int tulip_close(struct net_device *dev);
+ static void tulip_up(struct net_device *dev);
+ static void tulip_down(struct net_device *dev);
+ static struct net_device_stats *tulip_get_stats(struct net_device *dev);
+ static int private_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+ static void set_rx_mode(struct net_device *dev);
+ static void tulip_set_wolopts(struct pci_dev *pdev, u32 wolopts);
+ /* The poll hook is only declared with CONFIG_NET_POLL_CONTROLLER. */
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ static void poll_tulip(struct net_device *dev);
+ #endif
+
+ /*
+  * Update the sleep/snooze bits in the CFDD config register of chips
+  * flagged HAS_ACPI.  @sleep takes precedence over @snooze; with both
+  * zero the bits are cleared.  The register is only written when its
+  * value actually changes.
+  */
+ static void tulip_set_power_state (struct tulip_private *tp,
+ 				   int sleep, int snooze)
+ {
+ 	u32 current_cfdd, wanted_cfdd;
+
+ 	if (!(tp->flags & HAS_ACPI))
+ 		return;
+
+ 	pci_read_config_dword (tp->pdev, CFDD, &current_cfdd);
+
+ 	wanted_cfdd = current_cfdd & ~(CFDD_Sleep | CFDD_Snooze);
+ 	if (sleep)
+ 		wanted_cfdd |= CFDD_Sleep;
+ 	else if (snooze)
+ 		wanted_cfdd |= CFDD_Snooze;
+
+ 	if (wanted_cfdd != current_cfdd)
+ 		pci_write_config_dword (tp->pdev, CFDD, wanted_cfdd);
+ }
+
+
+ /*
+ * Bring the chip up: wake it from sleep/snooze, disable wake-on-LAN,
+ * reset it, program the descriptor ring addresses, set the station
+ * address (directly for MC_HASH_ONLY chips, via a setup frame on the
+ * Tx ring otherwise), pick the media, start Tx/Rx, unmask interrupts
+ * and arm the media timer.
+ */
+ static void tulip_up(struct net_device *dev)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->base_addr;
+ int next_tick = 3*HZ;
+ u32 reg;
+ int i;
+
+ #ifdef CONFIG_TULIP_NAPI
+ napi_enable(&tp->napi);
+ #endif
+
+ /* Wake the chip from sleep/snooze mode. */
+ tulip_set_power_state (tp, 0, 0);
+
+ /* Disable all WOL events */
+ pci_enable_wake(tp->pdev, PCI_D3hot, 0);
+ pci_enable_wake(tp->pdev, PCI_D3cold, 0);
+ tulip_set_wolopts(tp->pdev, 0);
+
+ /* On some chip revs we must set the MII/SYM port before the reset!? */
+ if (tp->mii_cnt || (tp->mtable && tp->mtable->has_mii))
+ iowrite32(0x00040000, ioaddr + CSR6);
+
+ /* Reset the chip, holding bit 0 set at least 50 PCI cycles. */
+ iowrite32(0x00000001, ioaddr + CSR0);
+ /* The value read into reg is not used; the read only flushes the write. */
+ pci_read_config_dword(tp->pdev, PCI_COMMAND, &reg); /* flush write */
+ udelay(100);
+
+ /* Deassert reset.
+ Wait the specified 50 PCI cycles after a reset by initializing
+ Tx and Rx queues and the address filter list. */
+ iowrite32(tp->csr0, ioaddr + CSR0);
+ pci_read_config_dword(tp->pdev, PCI_COMMAND, &reg); /* flush write */
+ udelay(100);
+
+ if (tulip_debug > 1)
+ netdev_dbg(dev, "tulip_up(), irq==%d\n", dev->irq);
+
+ iowrite32(tp->rx_ring_dma, ioaddr + CSR3);
+ iowrite32(tp->tx_ring_dma, ioaddr + CSR4);
+ tp->cur_rx = tp->cur_tx = 0;
+ tp->dirty_rx = tp->dirty_tx = 0;
+
+ if (tp->flags & MC_HASH_ONLY) {
+ /* Station address is written to chip registers directly. */
+ u32 addr_low = get_unaligned_le32(dev->dev_addr);
+ u32 addr_high = get_unaligned_le16(dev->dev_addr + 4);
+ if (tp->chip_id == AX88140) {
+ iowrite32(0, ioaddr + CSR13);
+ iowrite32(addr_low, ioaddr + CSR14);
+ iowrite32(1, ioaddr + CSR13);
+ iowrite32(addr_high, ioaddr + CSR14);
+ } else if (tp->flags & COMET_MAC_ADDR) {
+ iowrite32(addr_low, ioaddr + 0xA4);
+ iowrite32(addr_high, ioaddr + 0xA8);
+ iowrite32(0, ioaddr + CSR27);
+ iowrite32(0, ioaddr + CSR28);
+ }
+ } else {
+ /* This is set_rx_mode(), but without starting the transmitter. */
+ u16 *eaddrs = (u16 *)dev->dev_addr;
+ u16 *setup_frm = &tp->setup_frame[15*6];
+ dma_addr_t mapping;
+
+ /* 21140 bug: you must add the broadcast address. */
+ memset(tp->setup_frame, 0xff, sizeof(tp->setup_frame));
+ /* Fill the final entry of the table with our physical address. */
+ *setup_frm++ = eaddrs[0]; *setup_frm++ = eaddrs[0];
+ *setup_frm++ = eaddrs[1]; *setup_frm++ = eaddrs[1];
+ *setup_frm++ = eaddrs[2]; *setup_frm++ = eaddrs[2];
+
+ mapping = pci_map_single(tp->pdev, tp->setup_frame,
+ sizeof(tp->setup_frame),
+ PCI_DMA_TODEVICE);
+ tp->tx_buffers[tp->cur_tx].skb = NULL;
+ tp->tx_buffers[tp->cur_tx].mapping = mapping;
+
+ /* Put the setup frame on the Tx list. */
+ tp->tx_ring[tp->cur_tx].length = cpu_to_le32(0x08000000 | 192);
+ tp->tx_ring[tp->cur_tx].buffer1 = cpu_to_le32(mapping);
+ tp->tx_ring[tp->cur_tx].status = cpu_to_le32(DescOwned);
+
+ tp->cur_tx++;
+ }
+
+ tp->saved_if_port = dev->if_port;
+ if (dev->if_port == 0)
+ dev->if_port = tp->default_port;
+
+ /* Allow selecting a default media. */
+ i = 0;
+ if (tp->mtable == NULL)
+ goto media_picked;
+ if (dev->if_port) {
+ int looking_for = tulip_media_cap[dev->if_port] & MediaIsMII ? 11 :
+ (dev->if_port == 12 ? 0 : dev->if_port);
+ for (i = 0; i < tp->mtable->leafcount; i++)
+ if (tp->mtable->mleaf[i].media == looking_for) {
+ dev_info(&dev->dev,
+ "Using user-specified media %s\n",
+ medianame[dev->if_port]);
+ goto media_picked;
+ }
+ }
+ if ((tp->mtable->defaultmedia & 0x0800) == 0) {
+ int looking_for = tp->mtable->defaultmedia & MEDIA_MASK;
+ for (i = 0; i < tp->mtable->leafcount; i++)
+ if (tp->mtable->mleaf[i].media == looking_for) {
+ dev_info(&dev->dev,
+ "Using EEPROM-set media %s\n",
+ medianame[looking_for]);
+ goto media_picked;
+ }
+ }
+ /* Start sensing first non-full-duplex media. */
+ for (i = tp->mtable->leafcount - 1;
+ (tulip_media_cap[tp->mtable->mleaf[i].media] & MediaAlwaysFD) && i > 0; i--)
+ ;
+ media_picked:
+
+ /* i is the index of the chosen media table leaf (0 without a table). */
+ tp->csr6 = 0;
+ tp->cur_index = i;
+ tp->nwayset = 0;
+
+ if (dev->if_port) {
+ if (tp->chip_id == DC21143 &&
+ (tulip_media_cap[dev->if_port] & MediaIsMII)) {
+ /* We must reset the media CSRs when we force-select MII mode. */
+ iowrite32(0x0000, ioaddr + CSR13);
+ iowrite32(0x0000, ioaddr + CSR14);
+ iowrite32(0x0008, ioaddr + CSR15);
+ }
+ tulip_select_media(dev, 1);
+ } else if (tp->chip_id == DC21142) {
+ if (tp->mii_cnt) {
+ tulip_select_media(dev, 1);
+ if (tulip_debug > 1)
+ dev_info(&dev->dev,
+ "Using MII transceiver %d, status %04x\n",
+ tp->phys[0],
+ tulip_mdio_read(dev, tp->phys[0], 1));
+ iowrite32(csr6_mask_defstate, ioaddr + CSR6);
+ tp->csr6 = csr6_mask_hdcap;
+ dev->if_port = 11;
+ iowrite32(0x0000, ioaddr + CSR13);
+ iowrite32(0x0000, ioaddr + CSR14);
+ } else
+ t21142_start_nway(dev);
+ } else if (tp->chip_id == PNIC2) {
+ /* for initial startup advertise 10/100 Full and Half */
+ tp->sym_advertise = 0x01E0;
+ /* enable autonegotiate end interrupt */
+ iowrite32(ioread32(ioaddr+CSR5)| 0x00008010, ioaddr + CSR5);
+ iowrite32(ioread32(ioaddr+CSR7)| 0x00008010, ioaddr + CSR7);
+ pnic2_start_nway(dev);
+ } else if (tp->chip_id == LC82C168 && ! tp->medialock) {
+ if (tp->mii_cnt) {
+ dev->if_port = 11;
+ tp->csr6 = 0x814C0000 | (tp->full_duplex ? 0x0200 : 0);
+ iowrite32(0x0001, ioaddr + CSR15);
+ } else if (ioread32(ioaddr + CSR5) & TPLnkPass)
+ pnic_do_nway(dev);
+ else {
+ /* Start with 10mbps to do autonegotiation. */
+ iowrite32(0x32, ioaddr + CSR12);
+ tp->csr6 = 0x00420000;
+ iowrite32(0x0001B078, ioaddr + 0xB8);
+ iowrite32(0x0201B078, ioaddr + 0xB8);
+ next_tick = 1*HZ;
+ }
+ } else if ((tp->chip_id == MX98713 || tp->chip_id == COMPEX9881) &&
+ ! tp->medialock) {
+ dev->if_port = 0;
+ tp->csr6 = 0x01880000 | (tp->full_duplex ? 0x0200 : 0);
+ iowrite32(0x0f370000 | ioread16(ioaddr + 0x80), ioaddr + 0x80);
+ } else if (tp->chip_id == MX98715 || tp->chip_id == MX98725) {
+ /* Provided by BOLO, Macronix - 12/10/1998. */
+ dev->if_port = 0;
+ tp->csr6 = 0x01a80200;
+ iowrite32(0x0f370000 | ioread16(ioaddr + 0x80), ioaddr + 0x80);
+ iowrite32(0x11000 | ioread16(ioaddr + 0xa0), ioaddr + 0xa0);
+ } else if (tp->chip_id == COMET || tp->chip_id == CONEXANT) {
+ /* Enable automatic Tx underrun recovery. */
+ iowrite32(ioread32(ioaddr + 0x88) | 1, ioaddr + 0x88);
+ dev->if_port = tp->mii_cnt ? 11 : 0;
+ tp->csr6 = 0x00040000;
+ } else if (tp->chip_id == AX88140) {
+ tp->csr6 = tp->mii_cnt ? 0x00040100 : 0x00000100;
+ } else
+ tulip_select_media(dev, 1);
+
+ /* Start the chip's Tx to process setup frame. */
+ tulip_stop_rxtx(tp);
+ barrier();
+ udelay(5);
+ iowrite32(tp->csr6 | TxOn, ioaddr + CSR6);
+
+ /* Enable interrupts by setting the interrupt mask. */
+ iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR5);
+ iowrite32(tulip_tbl[tp->chip_id].valid_intrs, ioaddr + CSR7);
+ tulip_start_rxtx(tp);
+ iowrite32(0, ioaddr + CSR2); /* Rx poll demand */
+
+ if (tulip_debug > 2) {
+ netdev_dbg(dev, "Done tulip_up(), CSR0 %08x, CSR5 %08x CSR6 %08x\n",
+ ioread32(ioaddr + CSR0),
+ ioread32(ioaddr + CSR5),
+ ioread32(ioaddr + CSR6));
+ }
+
+ /* Set the timer to switch to check for link beat and perhaps switch
+ to an alternate media type. */
+ tp->timer.expires = RUN_AT(next_tick);
+ add_timer(&tp->timer);
+ #ifdef CONFIG_TULIP_NAPI
+ init_timer(&tp->oom_timer);
+ tp->oom_timer.data = (unsigned long)dev;
+ tp->oom_timer.function = oom_timer;
+ #endif
+ }
+
+ /*
+  * Open the interface: set up the descriptor rings, request the shared
+  * interrupt, bring the chip up and start the transmit queue.  If the
+  * interrupt cannot be obtained, the rings are freed again and the
+  * request_irq() error is returned; otherwise 0.
+  */
+ static int
+ tulip_open(struct net_device *dev)
+ {
+ 	int err;
+
+ 	tulip_init_ring (dev);
+
+ 	err = request_irq(dev->irq, tulip_interrupt, IRQF_SHARED, dev->name, dev);
+ 	if (err) {
+ 		tulip_free_ring (dev);
+ 		return err;
+ 	}
+
+ 	tulip_up (dev);
+ 	netif_start_queue (dev);
+
+ 	return 0;
+ }
+
+
+ /*
+  * Transmit watchdog handler.
+  *
+  * Runs under tp->lock.  Depending on the media type and chip it either
+  * does nothing beyond an optional warning (MII media), schedules
+  * tp->media_work with tp->timeout_recovery set (21140-class chips, which
+  * skips tulip_tx_timeout_complete()), or logs the relevant CSRs and lets
+  * tulip_tx_timeout_complete() finish the recovery.  In every case the
+  * transmit timestamp is refreshed and the queue is woken before returning.
+  */
+ static void tulip_tx_timeout(struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	void __iomem *ioaddr = tp->base_addr;
+ 	unsigned long flags;
+
+ 	spin_lock_irqsave (&tp->lock, flags);
+
+ 	if (tulip_media_cap[dev->if_port] & MediaIsMII) {
+ 		/* Do nothing -- the media monitor should handle this. */
+ 		if (tulip_debug > 1)
+ 			dev_warn(&dev->dev,
+ 				 "Transmit timeout using MII device\n");
+ 	} else if (tp->chip_id == DC21140 || tp->chip_id == DC21142 ||
+ 		   tp->chip_id == MX98713 || tp->chip_id == COMPEX9881 ||
+ 		   tp->chip_id == DM910X) {
+ 		dev_warn(&dev->dev,
+ 			 "21140 transmit timed out, status %08x, SIA %08x %08x %08x %08x, resetting...\n",
+ 			 ioread32(ioaddr + CSR5), ioread32(ioaddr + CSR12),
+ 			 ioread32(ioaddr + CSR13), ioread32(ioaddr + CSR14),
+ 			 ioread32(ioaddr + CSR15));
+ 		/* Recovery is deferred to the media work item. */
+ 		tp->timeout_recovery = 1;
+ 		schedule_work(&tp->media_work);
+ 		goto out_unlock;
+ 	} else if (tp->chip_id == PNIC2) {
+ 		dev_warn(&dev->dev,
+ 			 "PNIC2 transmit timed out, status %08x, CSR6/7 %08x / %08x CSR12 %08x, resetting...\n",
+ 			 (int)ioread32(ioaddr + CSR5),
+ 			 (int)ioread32(ioaddr + CSR6),
+ 			 (int)ioread32(ioaddr + CSR7),
+ 			 (int)ioread32(ioaddr + CSR12));
+ 	} else {
+ 		dev_warn(&dev->dev,
+ 			 "Transmit timed out, status %08x, CSR12 %08x, resetting...\n",
+ 			 ioread32(ioaddr + CSR5), ioread32(ioaddr + CSR12));
+ 		dev->if_port = 0;
+ 	}
+
+ #if defined(way_too_many_messages)
+ 	if (tulip_debug > 3) {
+ 		int i;
+ 		for (i = 0; i < RX_RING_SIZE; i++) {
+ 			u8 *buf = (u8 *)(tp->rx_ring[i].buffer1);
+ 			int j;
+ 			printk(KERN_DEBUG
+ 			       "%2d: %08x %08x %08x %08x %02x %02x %02x\n",
+ 			       i,
+ 			       (unsigned int)tp->rx_ring[i].status,
+ 			       (unsigned int)tp->rx_ring[i].length,
+ 			       (unsigned int)tp->rx_ring[i].buffer1,
+ 			       (unsigned int)tp->rx_ring[i].buffer2,
+ 			       buf[0], buf[1], buf[2]);
+ 			/* Bound check first so buf[1600] is never read. */
+ 			for (j = 0; j < 1600 && buf[j] != 0xee; j++)
+ 				if (j < 100)
+ 					pr_cont(" %02x", buf[j]);
+ 			pr_cont(" j=%d\n", j);
+ 		}
+ 		printk(KERN_DEBUG " Rx ring %p: ", tp->rx_ring);
+ 		for (i = 0; i < RX_RING_SIZE; i++)
+ 			pr_cont(" %08x", (unsigned int)tp->rx_ring[i].status);
+ 		printk(KERN_DEBUG " Tx ring %p: ", tp->tx_ring);
+ 		for (i = 0; i < TX_RING_SIZE; i++)
+ 			pr_cont(" %08x", (unsigned int)tp->tx_ring[i].status);
+ 		pr_cont("\n");
+ 	}
+ #endif
+
+ 	tulip_tx_timeout_complete(tp, ioaddr);
+
+ out_unlock:
+ 	spin_unlock_irqrestore (&tp->lock, flags);
+ 	dev->trans_start = jiffies; /* prevent tx timeout */
+ 	netif_wake_queue (dev);
+ }
+
+
+ /*
+  * Reset the driver's ring bookkeeping and build fresh Rx and Tx
+  * descriptor rings.  Rx descriptors are chained through buffer2 and each
+  * gets a freshly allocated, DMA-mapped skb; if an allocation fails the
+  * remaining slots stay empty and tp->dirty_rx records how many are missing.
+  * Tx descriptors are only chained and cleared.
+  */
+ static void tulip_init_ring(struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	int n;
+
+ 	tp->susp_rx = 0;
+ 	tp->ttimer = 0;
+ 	tp->nir = 0;
+
+ 	/* Pass 1: empty Rx descriptors, each pointing at its successor. */
+ 	for (n = 0; n < RX_RING_SIZE; n++) {
+ 		struct tulip_rx_desc *desc = &tp->rx_ring[n];
+
+ 		desc->status = 0x00000000;
+ 		desc->length = cpu_to_le32(PKT_BUF_SZ);
+ 		desc->buffer2 = cpu_to_le32(tp->rx_ring_dma +
+ 					    sizeof(struct tulip_rx_desc) * (n + 1));
+ 		tp->rx_buffers[n].skb = NULL;
+ 		tp->rx_buffers[n].mapping = 0;
+ 	}
+ 	/* The final descriptor wraps back to the start of the ring. */
+ 	tp->rx_ring[n - 1].length = cpu_to_le32(PKT_BUF_SZ | DESC_RING_WRAP);
+ 	tp->rx_ring[n - 1].buffer2 = cpu_to_le32(tp->rx_ring_dma);
+
+ 	/* Pass 2: attach a receive buffer to every descriptor we can. */
+ 	for (n = 0; n < RX_RING_SIZE; n++) {
+ 		/* Note the receive buffer must be longword aligned.
+ 		   dev_alloc_skb() provides 16 byte alignment.  But do *not*
+ 		   use skb_reserve() to align the IP header! */
+ 		struct sk_buff *skb = dev_alloc_skb(PKT_BUF_SZ);
+ 		dma_addr_t dma;
+
+ 		tp->rx_buffers[n].skb = skb;
+ 		if (!skb)
+ 			break;
+ 		dma = pci_map_single(tp->pdev, skb->data,
+ 				     PKT_BUF_SZ, PCI_DMA_FROMDEVICE);
+ 		tp->rx_buffers[n].mapping = dma;
+ 		skb->dev = dev;			/* Mark as being used by this device. */
+ 		tp->rx_ring[n].status = cpu_to_le32(DescOwned);	/* Owned by Tulip chip */
+ 		tp->rx_ring[n].buffer1 = cpu_to_le32(dma);
+ 	}
+ 	tp->dirty_rx = (unsigned int)(n - RX_RING_SIZE);
+
+ 	/* The Tx buffer descriptor is filled in as needed, but we
+ 	   do need to clear the ownership bit. */
+ 	for (n = 0; n < TX_RING_SIZE; n++) {
+ 		struct tulip_tx_desc *desc = &tp->tx_ring[n];
+
+ 		tp->tx_buffers[n].skb = NULL;
+ 		tp->tx_buffers[n].mapping = 0;
+ 		desc->status = 0x00000000;
+ 		desc->buffer2 = cpu_to_le32(tp->tx_ring_dma +
+ 					    sizeof(struct tulip_tx_desc) * (n + 1));
+ 	}
+ 	tp->tx_ring[n - 1].buffer2 = cpu_to_le32(tp->tx_ring_dma);
+ }
+
+ /*
+  * Queue one packet for transmission.
+  *
+  * Under tp->lock: picks the next Tx slot from tp->cur_tx, DMA-maps the skb
+  * data, fills in the descriptor, hands it to the chip by setting DescOwned,
+  * advances tp->cur_tx and writes CSR1 to demand a transmit poll.
+  * Always returns NETDEV_TX_OK.
+  *
+  * NOTE(review): the pci_map_single() result is used without a mapping-error
+  * check -- confirm whether that is acceptable on the target platforms.
+  */
+ static netdev_tx_t
+ tulip_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ int entry;
+ u32 flag;
+ dma_addr_t mapping;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tp->lock, flags);
+
+ /* Calculate the next Tx descriptor entry. */
+ entry = tp->cur_tx % TX_RING_SIZE;
+
+ tp->tx_buffers[entry].skb = skb;
+ mapping = pci_map_single(tp->pdev, skb->data,
+ skb->len, PCI_DMA_TODEVICE);
+ tp->tx_buffers[entry].mapping = mapping;
+ tp->tx_ring[entry].buffer1 = cpu_to_le32(mapping);
+
+ /* Choose the descriptor flag from the number of in-flight entries
+ * (cur_tx - dirty_tx); the queue is stopped once fewer than two
+ * slots would remain. */
+ if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE/2) {/* Typical path */
+ flag = 0x60000000; /* No interrupt */
+ } else if (tp->cur_tx - tp->dirty_tx == TX_RING_SIZE/2) {
+ flag = 0xe0000000; /* Tx-done intr. */
+ } else if (tp->cur_tx - tp->dirty_tx < TX_RING_SIZE - 2) {
+ flag = 0x60000000; /* No Tx-done intr. */
+ } else { /* Leave room for set_rx_mode() to fill entries. */
+ flag = 0xe0000000; /* Tx-done intr. */
+ netif_stop_queue(dev);
+ }
+ /* The last slot of the ring overrides the choice above and also
+ * carries the ring-wrap marker. */
+ if (entry == TX_RING_SIZE-1)
+ flag = 0xe0000000 | DESC_RING_WRAP;
+
+ tp->tx_ring[entry].length = cpu_to_le32(skb->len | flag);
+ /* if we were using Transmit Automatic Polling, we would need a
+ * wmb() here. */
+ tp->tx_ring[entry].status = cpu_to_le32(DescOwned);
+ /* Barrier between the descriptor writes and the CSR1 poll demand. */
+ wmb();
+
+ tp->cur_tx++;
+
+ /* Trigger an immediate transmit demand. */
+ iowrite32(0, tp->base_addr + CSR1);
+
+ spin_unlock_irqrestore(&tp->lock, flags);
+
+ return NETDEV_TX_OK;
+ }
+
+ /*
+  * Walk every Tx slot from tp->dirty_tx up to (but excluding) tp->cur_tx
+  * and release whatever is attached to it.  Slots still flagged by the chip
+  * (negative status) are counted as tx_errors and cleared.  Slots without
+  * an skb are setup/dummy frames: only a non-zero mapping is unmapped.
+  * Slots with an skb are unmapped, freed and reset.
+  */
+ static void tulip_clean_tx_ring(struct tulip_private *tp)
+ {
+ 	unsigned int pos = tp->dirty_tx;
+
+ 	while (tp->cur_tx - pos > 0) {
+ 		int slot = pos % TX_RING_SIZE;
+ 		int status = le32_to_cpu(tp->tx_ring[slot].status);
+ 		struct sk_buff *skb;
+
+ 		pos++;
+
+ 		if (status < 0) {
+ 			tp->dev->stats.tx_errors++;	/* It wasn't Txed */
+ 			tp->tx_ring[slot].status = 0;
+ 		}
+
+ 		skb = tp->tx_buffers[slot].skb;
+ 		if (skb != NULL) {
+ 			pci_unmap_single(tp->pdev, tp->tx_buffers[slot].mapping,
+ 					 skb->len,
+ 					 PCI_DMA_TODEVICE);
+
+ 			/* Free the original skb. */
+ 			dev_kfree_skb_irq(skb);
+ 			tp->tx_buffers[slot].skb = NULL;
+ 			tp->tx_buffers[slot].mapping = 0;
+ 		} else if (tp->tx_buffers[slot].mapping) {
+ 			/* Tx filter setup frame; dummy frames are not mapped. */
+ 			pci_unmap_single(tp->pdev,
+ 					 tp->tx_buffers[slot].mapping,
+ 					 sizeof(tp->setup_frame),
+ 					 PCI_DMA_TODEVICE);
+ 		}
+ 	}
+ }
+
+ /*
+  * Quiesce the chip and the driver's deferred work.
+  *
+  * Sequence: cancel the media work item, disable NAPI and delete the timers
+  * (NAPI parts only with CONFIG_TULIP_NAPI), then under tp->lock mask all
+  * interrupts via CSR7, stop Rx/Tx, refill the Rx ring, release pending Tx
+  * buffers and fold the CSR8 missed-frame count into the stats.  Afterwards
+  * the media timer is re-initialised, dev->if_port is restored from
+  * tp->saved_if_port and the power state is set via tulip_set_power_state().
+  */
+ static void tulip_down (struct net_device *dev)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->base_addr;
+ unsigned long flags;
+
+ cancel_work_sync(&tp->media_work);
+
+ #ifdef CONFIG_TULIP_NAPI
+ napi_disable(&tp->napi);
+ #endif
+
+ del_timer_sync (&tp->timer);
+ #ifdef CONFIG_TULIP_NAPI
+ del_timer_sync (&tp->oom_timer);
+ #endif
+ spin_lock_irqsave (&tp->lock, flags);
+
+ /* Disable interrupts by clearing the interrupt mask. */
+ iowrite32 (0x00000000, ioaddr + CSR7);
+
+ /* Stop the Tx and Rx processes. */
+ tulip_stop_rxtx(tp);
+
+ /* prepare receive buffers */
+ tulip_refill_rx(dev);
+
+ /* release any unconsumed transmit buffers */
+ tulip_clean_tx_ring(tp);
+
+ /* Skip the counter read when CSR6 reads back as all ones
+ * (presumably the device is no longer reachable -- TODO confirm). */
+ if (ioread32(ioaddr + CSR6) != 0xffffffff)
+ dev->stats.rx_missed_errors += ioread32(ioaddr + CSR8) & 0xffff;
+
+ spin_unlock_irqrestore (&tp->lock, flags);
+
+ /* Re-arm the media timer structure for the next tulip_up(). */
+ init_timer(&tp->timer);
+ tp->timer.data = (unsigned long)dev;
+ tp->timer.function = tulip_tbl[tp->chip_id].media_timer;
+
+ dev->if_port = tp->saved_if_port;
+
+ /* Leave the driver in snooze, not sleep, mode. */
+ tulip_set_power_state (tp, 0, 1);
+ }
+
+ /*
+  * Tear down both descriptor rings: detach, unmap and free every skb still
+  * attached to an Rx or Tx slot, and leave the Rx descriptors pointing at a
+  * deliberately invalid buffer address.
+  */
+ static void tulip_free_ring (struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	int n;
+
+ 	/* Free all the skbuffs in the Rx queue. */
+ 	for (n = 0; n < RX_RING_SIZE; n++) {
+ 		struct sk_buff *rx_skb = tp->rx_buffers[n].skb;
+ 		dma_addr_t rx_dma = tp->rx_buffers[n].mapping;
+
+ 		tp->rx_buffers[n].skb = NULL;
+ 		tp->rx_buffers[n].mapping = 0;
+
+ 		tp->rx_ring[n].status = 0;	/* Not owned by Tulip chip. */
+ 		tp->rx_ring[n].length = 0;
+ 		/* An invalid address. */
+ 		tp->rx_ring[n].buffer1 = cpu_to_le32(0xBADF00D0);
+
+ 		if (!rx_skb)
+ 			continue;
+ 		pci_unmap_single(tp->pdev, rx_dma, PKT_BUF_SZ,
+ 				 PCI_DMA_FROMDEVICE);
+ 		dev_kfree_skb (rx_skb);
+ 	}
+
+ 	/* Then whatever is still queued for transmission. */
+ 	for (n = 0; n < TX_RING_SIZE; n++) {
+ 		struct sk_buff *tx_skb = tp->tx_buffers[n].skb;
+
+ 		if (tx_skb) {
+ 			pci_unmap_single(tp->pdev, tp->tx_buffers[n].mapping,
+ 					 tx_skb->len, PCI_DMA_TODEVICE);
+ 			dev_kfree_skb (tx_skb);
+ 		}
+ 		tp->tx_buffers[n].skb = NULL;
+ 		tp->tx_buffers[n].mapping = 0;
+ 	}
+ }
+
+ /*
+  * Take the interface down: stop the transmit queue, quiesce the chip,
+  * release the interrupt line and free the descriptor rings.
+  * Always returns 0.
+  */
+ static int tulip_close (struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	void __iomem *regs = tp->base_addr;
+
+ 	netif_stop_queue(dev);
+ 	tulip_down(dev);
+
+ 	if (tulip_debug > 1) {
+ 		netdev_dbg(dev, "Shutting down ethercard, status was %02x\n",
+ 			   ioread32(regs + CSR5));
+ 	}
+
+ 	free_irq(dev->irq, dev);
+ 	tulip_free_ring(dev);
+
+ 	return 0;
+ }
+
+ /*
+  * Return the device statistics.  While the interface is running the
+  * low 16 bits of CSR8 are first added to rx_missed_errors under tp->lock.
+  */
+ static struct net_device_stats *tulip_get_stats(struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	void __iomem *regs = tp->base_addr;
+ 	unsigned long irqflags;
+
+ 	if (!netif_running(dev))
+ 		return &dev->stats;
+
+ 	spin_lock_irqsave(&tp->lock, irqflags);
+ 	dev->stats.rx_missed_errors += ioread32(regs + CSR8) & 0xffff;
+ 	spin_unlock_irqrestore(&tp->lock, irqflags);
+
+ 	return &dev->stats;
+ }
+
+
+ /*
+  * ethtool get_drvinfo handler: report the driver name, driver version and
+  * PCI bus address of the device.
+  *
+  * The destination fields are fixed-size arrays inside *info, so each copy
+  * is bounded by the size of its destination and always NUL-terminated.
+  */
+ static void tulip_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+ {
+ 	struct tulip_private *np = netdev_priv(dev);
+
+ 	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+ 	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+ 	strlcpy(info->bus_info, pci_name(np->pdev), sizeof(info->bus_info));
+ }
+
+
+ /*
+  * ethtool set_wol handler.  Rejects any request containing a wake option
+  * outside tp->wolinfo.supported with -EOPNOTSUPP; otherwise stores the
+  * requested options and updates the device wakeup-enable state to match.
+  */
+ static int tulip_ethtool_set_wol(struct net_device *dev,
+ 				  struct ethtool_wolinfo *wolinfo)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+
+ 	/* Every requested bit must also be a supported bit. */
+ 	if ((wolinfo->wolopts & tp->wolinfo.supported) != wolinfo->wolopts)
+ 		return -EOPNOTSUPP;
+
+ 	tp->wolinfo.wolopts = wolinfo->wolopts;
+ 	device_set_wakeup_enable(&tp->pdev->dev, tp->wolinfo.wolopts);
+
+ 	return 0;
+ }
+
+ /*
+  * ethtool get_wol handler: copy the supported and currently selected
+  * wake-on-LAN options from the private state into *wolinfo.
+  */
+ static void tulip_ethtool_get_wol(struct net_device *dev,
+ 				   struct ethtool_wolinfo *wolinfo)
+ {
+ 	const struct tulip_private *tp = netdev_priv(dev);
+
+ 	wolinfo->supported = tp->wolinfo.supported;
+ 	wolinfo->wolopts = tp->wolinfo.wolopts;
+ }
+
+
+ /* ethtool entry points implemented by this driver: driver info and
+ * wake-on-LAN get/set. */
+ static const struct ethtool_ops ops = {
+ .get_drvinfo = tulip_get_drvinfo,
+ .set_wol = tulip_ethtool_set_wol,
+ .get_wol = tulip_ethtool_get_wol,
+ };
+
+ /*
+ * Provide ioctl() calls to examine the MII xcvr state.
+ *
+ * Handles SIOCGMIIPHY, SIOCGMIIREG and SIOCSMIIREG; any other command
+ * returns -EOPNOTSUPP.  SIOCGMIIPHY returns -ENODEV when no PHY address can
+ * be chosen, and SIOCSMIIREG returns -EINVAL for register numbers above 31.
+ *
+ * PHY address 32 on chips flagged HAS_NWAY is not a real MII PHY: reads are
+ * synthesized from CSR6/CSR12/CSR14 and tp->lpar, and writes only restart
+ * autonegotiation or update tp->sym_advertise.
+ */
+ static int private_ioctl (struct net_device *dev, struct ifreq *rq, int cmd)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->base_addr;
+ struct mii_ioctl_data *data = if_mii(rq);
+ const unsigned int phy_idx = 0;
+ int phy = tp->phys[phy_idx] & 0x1f;
+ unsigned int regnum = data->reg_num;
+
+ switch (cmd) {
+ case SIOCGMIIPHY: /* Get address of MII PHY in use. */
+ if (tp->mii_cnt)
+ data->phy_id = phy;
+ else if (tp->flags & HAS_NWAY)
+ data->phy_id = 32;
+ else if (tp->chip_id == COMET)
+ data->phy_id = 1;
+ else
+ return -ENODEV;
+
+ /* No break above: execution continues into SIOCGMIIREG, so the
+ * register read is also performed for the PHY just selected
+ * (presumably intentional -- confirm before adding a break). */
+ case SIOCGMIIREG: /* Read MII PHY register. */
+ if (data->phy_id == 32 && (tp->flags & HAS_NWAY)) {
+ int csr12 = ioread32 (ioaddr + CSR12);
+ int csr14 = ioread32 (ioaddr + CSR14);
+ switch (regnum) {
+ case 0:
+ if (((csr14<<5) & 0x1000) ||
+ (dev->if_port == 5 && tp->nwayset))
+ data->val_out = 0x1000;
+ else
+ data->val_out = (tulip_media_cap[dev->if_port]&MediaIs100 ? 0x2000 : 0)
+ | (tulip_media_cap[dev->if_port]&MediaIsFD ? 0x0100 : 0);
+ break;
+ case 1:
+ data->val_out =
+ 0x1848 +
+ ((csr12&0x7000) == 0x5000 ? 0x20 : 0) +
+ ((csr12&0x06) == 6 ? 0 : 4);
+ data->val_out |= 0x6048;
+ break;
+ case 4:
+ /* Advertised value, bogus 10baseTx-FD value from CSR6. */
+ data->val_out =
+ ((ioread32(ioaddr + CSR6) >> 3) & 0x0040) +
+ ((csr14 >> 1) & 0x20) + 1;
+ data->val_out |= ((csr14 >> 9) & 0x03C0);
+ break;
+ case 5: data->val_out = tp->lpar; break;
+ default: data->val_out = 0; break;
+ }
+ } else {
+ data->val_out = tulip_mdio_read (dev, data->phy_id & 0x1f, regnum);
+ }
+ return 0;
+
+ case SIOCSMIIREG: /* Write MII PHY register. */
+ if (regnum & ~0x1f)
+ return -EINVAL;
+ /* Mirror writes to the driver's own PHY into the cached
+ * duplex and advertising state. */
+ if (data->phy_id == phy) {
+ u16 value = data->val_in;
+ switch (regnum) {
+ case 0: /* Check for autonegotiation on or reset. */
+ tp->full_duplex_lock = (value & 0x9000) ? 0 : 1;
+ if (tp->full_duplex_lock)
+ tp->full_duplex = (value & 0x0100) ? 1 : 0;
+ break;
+ case 4:
+ tp->advertising[phy_idx] =
+ tp->mii_advertise = data->val_in;
+ break;
+ }
+ }
+ if (data->phy_id == 32 && (tp->flags & HAS_NWAY)) {
+ u16 value = data->val_in;
+ if (regnum == 0) {
+ if ((value & 0x1200) == 0x1200) {
+ if (tp->chip_id == PNIC2) {
+ pnic2_start_nway (dev);
+ } else {
+ t21142_start_nway (dev);
+ }
+ }
+ } else if (regnum == 4)
+ tp->sym_advertise = value;
+ } else {
+ tulip_mdio_write (dev, data->phy_id & 0x1f, regnum, data->val_in);
+ }
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* Not reached: every switch arm above returns. */
+ return -EOPNOTSUPP;
+ }
+
+
+ /* Set or clear the multicast filter for this adaptor.
+ Note that we only use exclusion around actually queueing the
+ new frame, not around filling tp->setup_frame. This is non-deterministic
+ when re-entered but still correct. */
+
+ /* Set bit i in the byte array at p, counting bits from the least
+ * significant bit of byte 0 upward.  Any earlier definition of the name
+ * is dropped first.  Both arguments are evaluated more than once. */
+ #undef set_bit_le
+ #define set_bit_le(i,p) do { ((char *)(p))[(i)/8] |= (1<<((i)%8)); } while(0)
+
+ /*
+  * Build a hash-filter setup frame.
+  *
+  * A 512-bit hash table (broadcast bit 255 plus one bit per multicast
+  * address, indexed by the low 9 bits of the little-endian CRC) is written
+  * to setup_frm with every 16-bit word stored twice.  The station address is
+  * then written, also doubled, at word offset 13*6 of tp->setup_frame.
+  */
+ static void build_setup_frame_hash(u16 *setup_frm, struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	struct netdev_hw_addr *ha;
+ 	u16 hash_table[32];
+ 	u16 *station;
+ 	u16 *mac;
+ 	int w;
+
+ 	memset(hash_table, 0, sizeof(hash_table));
+ 	set_bit_le(255, hash_table);			/* Broadcast entry */
+ 	/* This should work on big-endian machines as well. */
+ 	netdev_for_each_mc_addr(ha, dev) {
+ 		int index = ether_crc_le(ETH_ALEN, ha->addr) & 0x1ff;
+
+ 		set_bit_le(index, hash_table);
+ 	}
+
+ 	/* Each table word is emitted twice in a row. */
+ 	for (w = 0; w < 32; w++) {
+ 		setup_frm[2 * w] = hash_table[w];
+ 		setup_frm[2 * w + 1] = hash_table[w];
+ 	}
+
+ 	/* Fill the final entry with our physical address. */
+ 	station = &tp->setup_frame[13*6];
+ 	mac = (u16 *)dev->dev_addr;
+ 	for (w = 0; w < 3; w++) {
+ 		station[2 * w] = mac[w];
+ 		station[2 * w + 1] = mac[w];
+ 	}
+ }
+
+ /*
+  * Build a perfect-filter setup frame.
+  *
+  * Every multicast address is written to setup_frm as three 16-bit words,
+  * each stored twice.  The space for the remaining entries (up to 15 in
+  * total) is filled with 0xff, i.e. the broadcast address, and the station
+  * address is written, doubled, at word offset 15*6 of tp->setup_frame.
+  */
+ static void build_setup_frame_perfect(u16 *setup_frm, struct net_device *dev)
+ {
+ 	struct tulip_private *tp = netdev_priv(dev);
+ 	struct netdev_hw_addr *ha;
+ 	u16 *out = setup_frm;
+ 	u16 *words;
+ 	int w;
+
+ 	/* We have <= 14 addresses so we can use the wonderful
+ 	   16 address perfect filtering of the Tulip. */
+ 	netdev_for_each_mc_addr(ha, dev) {
+ 		words = (u16 *) ha->addr;
+ 		for (w = 0; w < 3; w++) {
+ 			*out++ = words[w];
+ 			*out++ = words[w];
+ 		}
+ 	}
+ 	/* Fill the unused entries with the broadcast address. */
+ 	memset(out, 0xff, (15 - netdev_mc_count(dev)) * 12);
+
+ 	/* Fill the final entry with our physical address. */
+ 	out = &tp->setup_frame[15*6];
+ 	words = (u16 *)dev->dev_addr;
+ 	for (w = 0; w < 3; w++) {
+ 		*out++ = words[w];
+ 		*out++ = words[w];
+ 	}
+ }
+
+
+ /*
+ * Program the receive filter from dev->flags and the multicast list.
+ *
+ * Four cases, checked in order:
+ *  - IFF_PROMISC: accept all physical and multicast frames.
+ *  - more than 1000 multicast addresses or IFF_ALLMULTI: accept all
+ *    multicast.
+ *  - chips flagged MC_HASH_ONLY: compute a 64-bit hash filter and write it
+ *    to the chip registers if it changed (or accept all multicast above 64
+ *    addresses).
+ *  - otherwise: build a setup frame (hash form above 14 addresses, perfect
+ *    form otherwise) and queue it on the Tx ring under tp->lock.
+ * The receive-mode bits (mask 0x00D5) are cleared in both the value read
+ * from CSR6 and the cached tp->csr6; the resulting value is written back
+ * to CSR6 at the end.
+ */
+ static void set_rx_mode(struct net_device *dev)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->base_addr;
+ int csr6;
+
+ csr6 = ioread32(ioaddr + CSR6) & ~0x00D5;
+
+ tp->csr6 &= ~0x00D5;
+ if (dev->flags & IFF_PROMISC) { /* Set promiscuous. */
+ tp->csr6 |= AcceptAllMulticast | AcceptAllPhys;
+ csr6 |= AcceptAllMulticast | AcceptAllPhys;
+ } else if ((netdev_mc_count(dev) > 1000) ||
+ (dev->flags & IFF_ALLMULTI)) {
+ /* Too many to filter well -- accept all multicasts. */
+ tp->csr6 |= AcceptAllMulticast;
+ csr6 |= AcceptAllMulticast;
+ } else if (tp->flags & MC_HASH_ONLY) {
+ /* Some work-alikes have only a 64-entry hash filter table. */
+ /* Should verify correctness on big-endian/__powerpc__ */
+ struct netdev_hw_addr *ha;
+ if (netdev_mc_count(dev) > 64) {
+ /* Arbitrary non-effective limit. */
+ tp->csr6 |= AcceptAllMulticast;
+ csr6 |= AcceptAllMulticast;
+ } else {
+ u32 mc_filter[2] = {0, 0}; /* Multicast hash filter */
+ int filterbit;
+ netdev_for_each_mc_addr(ha, dev) {
+ /* The hash is the low 6 bits of the little-endian CRC on
+ * COMET_MAC_ADDR chips, else the top 6 bits of ether_crc(). */
+ if (tp->flags & COMET_MAC_ADDR)
+ filterbit = ether_crc_le(ETH_ALEN,
+ ha->addr);
+ else
+ filterbit = ether_crc(ETH_ALEN,
+ ha->addr) >> 26;
+ filterbit &= 0x3f;
+ /* NOTE(review): for bit 31 this shifts a signed 1 into the
+ * sign bit; consider 1U if this is ever touched. */
+ mc_filter[filterbit >> 5] |= 1 << (filterbit & 31);
+ if (tulip_debug > 2)
+ dev_info(&dev->dev,
+ "Added filter for %pM %08x bit %d\n",
+ ha->addr,
+ ether_crc(ETH_ALEN, ha->addr),
+ filterbit);
+ }
+ if (mc_filter[0] == tp->mc_filter[0] &&
+ mc_filter[1] == tp->mc_filter[1])
+ ; /* No change. */
+ else if (tp->flags & IS_ASIX) {
+ /* ASIX: each filter word is written through CSR14 after
+ * writing 2 or 3 to CSR13. */
+ iowrite32(2, ioaddr + CSR13);
+ iowrite32(mc_filter[0], ioaddr + CSR14);
+ iowrite32(3, ioaddr + CSR13);
+ iowrite32(mc_filter[1], ioaddr + CSR14);
+ } else if (tp->flags & COMET_MAC_ADDR) {
+ iowrite32(mc_filter[0], ioaddr + CSR27);
+ iowrite32(mc_filter[1], ioaddr + CSR28);
+ }
+ tp->mc_filter[0] = mc_filter[0];
+ tp->mc_filter[1] = mc_filter[1];
+ }
+ } else {
+ unsigned long flags;
+ u32 tx_flags = 0x08000000 | 192;
+
+ /* Note that only the low-address shortword of setup_frame is valid!
+ The values are doubled for big-endian architectures. */
+ if (netdev_mc_count(dev) > 14) {
+ /* Must use a multicast hash table. */
+ build_setup_frame_hash(tp->setup_frame, dev);
+ tx_flags = 0x08400000 | 192;
+ } else {
+ build_setup_frame_perfect(tp->setup_frame, dev);
+ }
+
+ spin_lock_irqsave(&tp->lock, flags);
+
+ if (tp->cur_tx - tp->dirty_tx > TX_RING_SIZE - 2) {
+ /* Same setup recently queued, we need not add it. */
+ } else {
+ unsigned int entry;
+ int dummy = -1;
+
+ /* Now add this frame to the Tx list. */
+
+ entry = tp->cur_tx++ % TX_RING_SIZE;
+
+ if (entry != 0) {
+ /* Avoid a chip errata by prefixing a dummy entry. */
+ tp->tx_buffers[entry].skb = NULL;
+ tp->tx_buffers[entry].mapping = 0;
+ tp->tx_ring[entry].length =
+ (entry == TX_RING_SIZE-1) ? cpu_to_le32(DESC_RING_WRAP) : 0;
+ tp->tx_ring[entry].buffer1 = 0;
+ /* Must set DescOwned later to avoid race with chip */
+ dummy = entry;
+ entry = tp->cur_tx++ % TX_RING_SIZE;
+
+ }
+
+ /* skb stays NULL for setup frames; tulip_clean_tx_ring() uses
+ * that to unmap sizeof(tp->setup_frame) instead of skb->len. */
+ tp->tx_buffers[entry].skb = NULL;
+ tp->tx_buffers[entry].mapping =
+ pci_map_single(tp->pdev, tp->setup_frame,
+ sizeof(tp->setup_frame),
+ PCI_DMA_TODEVICE);
+ /* Put the setup frame on the Tx list. */
+ if (entry == TX_RING_SIZE-1)
+ tx_flags |= DESC_RING_WRAP; /* Wrap ring. */
+ tp->tx_ring[entry].length = cpu_to_le32(tx_flags);
+ tp->tx_ring[entry].buffer1 =
+ cpu_to_le32(tp->tx_buffers[entry].mapping);
+ tp->tx_ring[entry].status = cpu_to_le32(DescOwned);
+ /* The dummy descriptor is handed over only after the real one. */
+ if (dummy >= 0)
+ tp->tx_ring[dummy].status = cpu_to_le32(DescOwned);
+ if (tp->cur_tx - tp->dirty_tx >= TX_RING_SIZE - 2)
+ netif_stop_queue(dev);
+
+ /* Trigger an immediate transmit demand. */
+ iowrite32(0, ioaddr + CSR1);
+ }
+
+ spin_unlock_irqrestore(&tp->lock, flags);
+ }
+
+ iowrite32(csr6, ioaddr + CSR6);
+ }
+
+ #ifdef CONFIG_TULIP_MWI
+ /*
+ * Work out the CSR0 bus-mode value (MRM/MWI/MRL, cache alignment and burst
+ * length) for this device and store it in tp->csr0.
+ *
+ * MWI is requested via pci_try_set_mwi() and dropped again if the PCI
+ * command register does not show it enabled or the cache line size reads
+ * as 0.  Cache line sizes 8, 16 and 32 get dedicated alignment/burst
+ * settings; any other size falls back to the defaults with MWI disabled.
+ */
+ static void __devinit tulip_mwi_config (struct pci_dev *pdev,
+ struct net_device *dev)
+ {
+ struct tulip_private *tp = netdev_priv(dev);
+ u8 cache;
+ u16 pci_command;
+ u32 csr0;
+
+ if (tulip_debug > 3)
+ netdev_dbg(dev, "tulip_mwi_config()\n");
+
+ tp->csr0 = csr0 = 0;
+
+ /* if we have any cache line size at all, we can do MRM and MWI */
+ csr0 |= MRM | MWI;
+
+ /* Enable MWI in the standard PCI command bit.
+ * Check for the case where MWI is desired but not available
+ */
+ pci_try_set_mwi(pdev);
+
+ /* read result from hardware (in case bit refused to enable) */
+ pci_read_config_word(pdev, PCI_COMMAND, &pci_command);
+ if ((csr0 & MWI) && (!(pci_command & PCI_COMMAND_INVALIDATE)))
+ csr0 &= ~MWI;
+
+ /* if cache line size hardwired to zero, no MWI */
+ pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache);
+ if ((csr0 & MWI) && (cache == 0)) {
+ csr0 &= ~MWI;
+ pci_clear_mwi(pdev);
+ }
+
+ /* assign per-cacheline-size cache alignment and
+ * burst length values
+ */
+ switch (cache) {
+ case 8:
+ csr0 |= MRL | (1 << CALShift) | (16 << BurstLenShift);
+ break;
+ case 16:
+ csr0 |= MRL | (2 << CALShift) | (16 << BurstLenShift);
+ break;
+ case 32:
+ csr0 |= MRL | (3 << CALShift) | (32 << BurstLenShift);
+ break;
+ default:
+ /* Unsupported size: zero 'cache' so the fallback below runs. */
+ cache = 0;
+ break;
+ }
+
+ /* if we have a good cache line size, we by now have a good
+ * csr0, so save it and exit
+ */
+ if (cache)
+ goto out;
+
+ /* we don't have a good csr0 or cache line size, disable MWI */
+ if (csr0 & MWI) {
+ pci_clear_mwi(pdev);
+ csr0 &= ~MWI;
+ }
+
+ /* sane defaults for burst length and cache alignment
+ * originally from de4x5 driver
+ */
+ csr0 |= (8 << BurstLenShift) | (1 << CALShift);
+
+ out:
+ tp->csr0 = csr0;
+ if (tulip_debug > 2)
+ netdev_dbg(dev, "MWI config cacheline=%d, csr0=%08x\n",
+ cache, csr0);
+ }
+ #endif
+
+ /*
+ * Chips that have the MRM/reserved bit quirk and the burst quirk. That
+ * is the DM910X and the on chip ULi devices
+ */
+
+ /* Returns 1 when pdev has PCI vendor ID 0x1282 and device ID 0x9102,
+  * otherwise 0. */
+ static int tulip_uli_dm_quirk(struct pci_dev *pdev)
+ {
+ 	return (pdev->vendor == 0x1282 && pdev->device == 0x9102) ? 1 : 0;
+ }
+
+ /* net_device callbacks for this driver.  MTU change, MAC address change
+ * and address validation use the generic eth_* helpers; the poll
+ * controller hook is only present with CONFIG_NET_POLL_CONTROLLER. */
+ static const struct net_device_ops tulip_netdev_ops = {
+ .ndo_open = tulip_open,
+ .ndo_start_xmit = tulip_start_xmit,
+ .ndo_tx_timeout = tulip_tx_timeout,
+ .ndo_stop = tulip_close,
+ .ndo_get_stats = tulip_get_stats,
+ .ndo_do_ioctl = private_ioctl,
+ .ndo_set_rx_mode = set_rx_mode,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = poll_tulip,
+ #endif
+ };
+
+ /* Host chipsets (Intel 82424 and SiS 496) whose presence makes
+ * tulip_init_one() force a conservative CSR0 burst/alignment setting.
+ * The empty entry terminates the table. */
+ DEFINE_PCI_DEVICE_TABLE(early_486_chipsets) = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82424) },
+ { PCI_DEVICE(PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_496) },
+ { },
+ };
+
+ static int __devinit tulip_init_one (struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+ {
+ struct tulip_private *tp;
+ /* See note below on the multiport cards. */
+ static unsigned char last_phys_addr[6] = {0x00, 'L', 'i', 'n', 'u', 'x'};
+ static int last_irq;
+ static int multiport_cnt; /* For four-port boards w/one EEPROM */
+ int i, irq;
+ unsigned short sum;
+ unsigned char *ee_data;
+ struct net_device *dev;
+ void __iomem *ioaddr;
+ static int board_idx = -1;
+ int chip_idx = ent->driver_data;
+ const char *chip_name = tulip_tbl[chip_idx].chip_name;
+ unsigned int eeprom_missing = 0;
+ unsigned int force_csr0 = 0;
+
+ #ifndef MODULE
+ if (tulip_debug > 0)
+ printk_once(KERN_INFO "%s", version);
+ #endif
+
+ board_idx++;
+
+ /*
+ * Lan media wire a tulip chip to a wan interface. Needs a very
+ * different driver (lmc driver)
+ */
+
+ if (pdev->subsystem_vendor == PCI_VENDOR_ID_LMC) {
+ pr_err("skipping LMC card\n");
+ return -ENODEV;
+ } else if (pdev->subsystem_vendor == PCI_VENDOR_ID_SBE &&
+ (pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_T3E3 ||
+ pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P0 ||
+ pdev->subsystem_device == PCI_SUBDEVICE_ID_SBE_2T3E3_P1)) {
+ pr_err("skipping SBE T3E3 port\n");
+ return -ENODEV;
+ }
+
+ /*
+ * DM910x chips should be handled by the dmfe driver, except
+ * on-board chips on SPARC systems. Also, early DM9100s need
+ * software CRC which only the dmfe driver supports.
+ */
+
+ #ifdef CONFIG_TULIP_DM910X
+ if (chip_idx == DM910X) {
+ struct device_node *dp;
+
+ if (pdev->vendor == 0x1282 && pdev->device == 0x9100 &&
+ pdev->revision < 0x30) {
+ pr_info("skipping early DM9100 with Crc bug (use dmfe)\n");
+ return -ENODEV;
+ }
+
+ dp = pci_device_to_OF_node(pdev);
+ if (!(dp && of_get_property(dp, "local-mac-address", NULL))) {
+ pr_info("skipping DM910x expansion card (use dmfe)\n");
+ return -ENODEV;
+ }
+ }
+ #endif
+
+ /*
+ * Looks for early PCI chipsets where people report hangs
+ * without the workarounds being on.
+ */
+
+ /* 1. Intel Saturn. Switch to 8 long words burst, 8 long word cache
+ aligned. Aries might need this too. The Saturn errata are not
+ pretty reading but thankfully it's an old 486 chipset.
+
+ 2. The dreaded SiS496 486 chipset. Same workaround as Intel
+ Saturn.
+ */
+
+ if (pci_dev_present(early_486_chipsets)) {
+ csr0 = MRL | MRM | (8 << BurstLenShift) | (1 << CALShift);
+ force_csr0 = 1;
+ }
+
+ /* bugfix: the ASIX must have a burst limit or horrible things happen. */
+ if (chip_idx == AX88140) {
+ if ((csr0 & 0x3f00) == 0)
+ csr0 |= 0x2000;
+ }
+
+ /* PNIC doesn't have MWI/MRL/MRM... */
+ if (chip_idx == LC82C168)
+ csr0 &= ~0xfff10000; /* zero reserved bits 31:20, 16 */
+
+ /* DM9102A has troubles with MRM & clear reserved bits 24:22, 20, 16, 7:1 */
+ if (tulip_uli_dm_quirk(pdev)) {
+ csr0 &= ~0x01f100ff;
+ #if defined(CONFIG_SPARC)
+ csr0 = (csr0 & ~0xff00) | 0xe000;
+ #endif
+ }
+ /*
+ * And back to business
+ */
+
+ i = pci_enable_device(pdev);
+ if (i) {
+ pr_err("Cannot enable tulip board #%d, aborting\n", board_idx);
+ return i;
+ }
+
+ /* The chip will fail to enter a low-power state later unless
+ * first explicitly commanded into D0 */
+ if (pci_set_power_state(pdev, PCI_D0)) {
+ pr_notice("Failed to set power state to D0\n");
+ }
+
+ irq = pdev->irq;
+
+ /* alloc_etherdev ensures aligned and zeroed private structures */
+ dev = alloc_etherdev (sizeof (*tp));
+ if (!dev) {
+ pr_err("ether device alloc failed, aborting\n");
+ return -ENOMEM;
+ }
+
+ SET_NETDEV_DEV(dev, &pdev->dev);
+ if (pci_resource_len (pdev, 0) < tulip_tbl[chip_idx].io_size) {
+ pr_err("%s: I/O region (0x%llx@0x%llx) too small, aborting\n",
+ pci_name(pdev),
+ (unsigned long long)pci_resource_len (pdev, 0),
+ (unsigned long long)pci_resource_start (pdev, 0));
+ goto err_out_free_netdev;
+ }
+
+ /* grab all resources from both PIO and MMIO regions, as we
+ * don't want anyone else messing around with our hardware */
+ if (pci_request_regions (pdev, DRV_NAME))
+ goto err_out_free_netdev;
+
+ ioaddr = pci_iomap(pdev, TULIP_BAR, tulip_tbl[chip_idx].io_size);
+
+ if (!ioaddr)
+ goto err_out_free_res;
+
+ /*
+ * initialize private data structure 'tp'
+ * it is zeroed and aligned in alloc_etherdev
+ */
+ tp = netdev_priv(dev);
+ tp->dev = dev;
+
+ tp->rx_ring = pci_alloc_consistent(pdev,
+ sizeof(struct tulip_rx_desc) * RX_RING_SIZE +
+ sizeof(struct tulip_tx_desc) * TX_RING_SIZE,
+ &tp->rx_ring_dma);
+ if (!tp->rx_ring)
+ goto err_out_mtable;
+ tp->tx_ring = (struct tulip_tx_desc *)(tp->rx_ring + RX_RING_SIZE);
+ tp->tx_ring_dma = tp->rx_ring_dma + sizeof(struct tulip_rx_desc) * RX_RING_SIZE;
+
+ tp->chip_id = chip_idx;
+ tp->flags = tulip_tbl[chip_idx].flags;
+
+ tp->wolinfo.supported = 0;
+ tp->wolinfo.wolopts = 0;
+ /* COMET: Enable power management only for AN983B */
+ if (chip_idx == COMET ) {
+ u32 sig;
+ pci_read_config_dword (pdev, 0x80, &sig);
+ if (sig == 0x09811317) {
+ tp->flags |= COMET_PM;
+ tp->wolinfo.supported = WAKE_PHY | WAKE_MAGIC;
+ pr_info("%s: Enabled WOL support for AN983B\n",
+ __func__);
+ }
+ }
+ tp->pdev = pdev;
+ tp->base_addr = ioaddr;
+ tp->revision = pdev->revision;
+ tp->csr0 = csr0;
+ spin_lock_init(&tp->lock);
+ spin_lock_init(&tp->mii_lock);
+ init_timer(&tp->timer);
+ tp->timer.data = (unsigned long)dev;
+ tp->timer.function = tulip_tbl[tp->chip_id].media_timer;
+
+ INIT_WORK(&tp->media_work, tulip_tbl[tp->chip_id].media_task);
+
+ dev->base_addr = (unsigned long)ioaddr;
+
+ #ifdef CONFIG_TULIP_MWI
+ if (!force_csr0 && (tp->flags & HAS_PCI_MWI))
+ tulip_mwi_config (pdev, dev);
+ #endif
+
+ /* Stop the chip's Tx and Rx processes. */
+ tulip_stop_rxtx(tp);
+
+ pci_set_master(pdev);
+
+ #ifdef CONFIG_GSC
+ if (pdev->subsystem_vendor == PCI_VENDOR_ID_HP) {
+ switch (pdev->subsystem_device) {
+ default:
+ break;
+ case 0x1061:
+ case 0x1062:
+ case 0x1063:
+ case 0x1098:
+ case 0x1099:
+ case 0x10EE:
+ tp->flags |= HAS_SWAPPED_SEEPROM | NEEDS_FAKE_MEDIA_TABLE;
+ chip_name = "GSC DS21140 Tulip";
+ }
+ }
+ #endif
+
+ /* Clear the missed-packet counter. */
+ ioread32(ioaddr + CSR8);
+
+ /* The station address ROM is read byte serially. The register must
+ be polled, waiting for the value to be read bit serially from the
+ EEPROM.
+ */
+ ee_data = tp->eeprom;
+ memset(ee_data, 0, sizeof(tp->eeprom));
+ sum = 0;
+ if (chip_idx == LC82C168) {
+ for (i = 0; i < 3; i++) {
+ int value, boguscnt = 100000;
+ iowrite32(0x600 | i, ioaddr + 0x98);
+ do {
+ value = ioread32(ioaddr + CSR9);
+ } while (value < 0 && --boguscnt > 0);
+ put_unaligned_le16(value, ((__le16 *)dev->dev_addr) + i);
+ sum += value & 0xffff;
+ }
+ } else if (chip_idx == COMET) {
+ /* No need to read the EEPROM. */
+ put_unaligned_le32(ioread32(ioaddr + 0xA4), dev->dev_addr);
+ put_unaligned_le16(ioread32(ioaddr + 0xA8), dev->dev_addr + 4);
+ for (i = 0; i < 6; i ++)
+ sum += dev->dev_addr[i];
+ } else {
+ /* A serial EEPROM interface, we read now and sort it out later. */
+ int sa_offset = 0;
+ int ee_addr_size = tulip_read_eeprom(dev, 0xff, 8) & 0x40000 ? 8 : 6;
+ int ee_max_addr = ((1 << ee_addr_size) - 1) * sizeof(u16);
+
+ if (ee_max_addr > sizeof(tp->eeprom))
+ ee_max_addr = sizeof(tp->eeprom);
+
+ for (i = 0; i < ee_max_addr ; i += sizeof(u16)) {
+ u16 data = tulip_read_eeprom(dev, i/2, ee_addr_size);
+ ee_data[i] = data & 0xff;
+ ee_data[i + 1] = data >> 8;
+ }
+
+ /* DEC now has a specification (see Notes) but early board makers
+ just put the address in the first EEPROM locations. */
+ /* This does memcmp(ee_data, ee_data+16, 8) */
+ for (i = 0; i < 8; i ++)
+ if (ee_data[i] != ee_data[16+i])
+ sa_offset = 20;
+ if (chip_idx == CONEXANT) {
+ /* Check that the tuple type and length is correct. */
+ if (ee_data[0x198] == 0x04 && ee_data[0x199] == 6)
+ sa_offset = 0x19A;
+ } else if (ee_data[0] == 0xff && ee_data[1] == 0xff &&
+ ee_data[2] == 0) {
+ sa_offset = 2; /* Grrr, damn Matrox boards. */
+ multiport_cnt = 4;
+ }
+ #ifdef CONFIG_MIPS_COBALT
+ if ((pdev->bus->number == 0) &&
+ ((PCI_SLOT(pdev->devfn) == 7) ||
+ (PCI_SLOT(pdev->devfn) == 12))) {
+ /* Cobalt MAC address in first EEPROM locations. */
+ sa_offset = 0;
+ /* Ensure our media table fixup get's applied */
+ memcpy(ee_data + 16, ee_data, 8);
+ }
+ #endif
+ #ifdef CONFIG_GSC
+ /* Check to see if we have a broken srom */
+ if (ee_data[0] == 0x61 && ee_data[1] == 0x10) {
+ /* pci_vendor_id and subsystem_id are swapped */
+ ee_data[0] = ee_data[2];
+ ee_data[1] = ee_data[3];
+ ee_data[2] = 0x61;
+ ee_data[3] = 0x10;
+
+ /* HSC-PCI boards need to be byte-swaped and shifted
+ * up 1 word. This shift needs to happen at the end
+ * of the MAC first because of the 2 byte overlap.
+ */
+ for (i = 4; i >= 0; i -= 2) {
+ ee_data[17 + i + 3] = ee_data[17 + i];
+ ee_data[16 + i + 5] = ee_data[16 + i];
+ }
+ }
+ #endif
+
+ for (i = 0; i < 6; i ++) {
+ dev->dev_addr[i] = ee_data[i + sa_offset];
+ sum += ee_data[i + sa_offset];
+ }
+ }
+ /* Lite-On boards have the address byte-swapped. */
+ if ((dev->dev_addr[0] == 0xA0 ||
+ dev->dev_addr[0] == 0xC0 ||
+ dev->dev_addr[0] == 0x02) &&
+ dev->dev_addr[1] == 0x00)
+ for (i = 0; i < 6; i+=2) {
+ char tmp = dev->dev_addr[i];
+ dev->dev_addr[i] = dev->dev_addr[i+1];
+ dev->dev_addr[i+1] = tmp;
+ }
+ /* On the Zynx 315 Etherarray and other multiport boards only the
+ first Tulip has an EEPROM.
+ On Sparc systems the mac address is held in the OBP property
+ "local-mac-address".
+ The addresses of the subsequent ports are derived from the first.
+ Many PCI BIOSes also incorrectly report the IRQ line, so we correct
+ that here as well. */
+ if (sum == 0 || sum == 6*0xff) {
+ #if defined(CONFIG_SPARC)
+ struct device_node *dp = pci_device_to_OF_node(pdev);
+ const unsigned char *addr;
+ int len;
+ #endif
+ eeprom_missing = 1;
+ for (i = 0; i < 5; i++)
+ dev->dev_addr[i] = last_phys_addr[i];
+ dev->dev_addr[i] = last_phys_addr[i] + 1;
+ #if defined(CONFIG_SPARC)
+ addr = of_get_property(dp, "local-mac-address", &len);
+ if (addr && len == 6)
+ memcpy(dev->dev_addr, addr, 6);
+ #endif
+ #if defined(__i386__) || defined(__x86_64__) /* Patch up x86 BIOS bug. */
+ if (last_irq)
+ irq = last_irq;
+ #endif
+ }
+
+ for (i = 0; i < 6; i++)
+ last_phys_addr[i] = dev->dev_addr[i];
+ last_irq = irq;
+ dev->irq = irq;
+
+ /* The lower four bits are the media type. */
+ if (board_idx >= 0 && board_idx < MAX_UNITS) {
+ if (options[board_idx] & MEDIA_MASK)
+ tp->default_port = options[board_idx] & MEDIA_MASK;
+ if ((options[board_idx] & FullDuplex) || full_duplex[board_idx] > 0)
+ tp->full_duplex = 1;
+ if (mtu[board_idx] > 0)
+ dev->mtu = mtu[board_idx];
+ }
+ if (dev->mem_start & MEDIA_MASK)
+ tp->default_port = dev->mem_start & MEDIA_MASK;
+ if (tp->default_port) {
+ pr_info(DRV_NAME "%d: Transceiver selection forced to %s\n",
+ board_idx, medianame[tp->default_port & MEDIA_MASK]);
+ tp->medialock = 1;
+ if (tulip_media_cap[tp->default_port] & MediaAlwaysFD)
+ tp->full_duplex = 1;
+ }
+ if (tp->full_duplex)
+ tp->full_duplex_lock = 1;
+
+ if (tulip_media_cap[tp->default_port] & MediaIsMII) {
+ static const u16 media2advert[] = {
+ 0x20, 0x40, 0x03e0, 0x60, 0x80, 0x100, 0x200
+ };
+ tp->mii_advertise = media2advert[tp->default_port - 9];
+ tp->mii_advertise |= (tp->flags & HAS_8023X); /* Matching bits! */
+ }
+
+ if (tp->flags & HAS_MEDIA_TABLE) {
+ sprintf(dev->name, DRV_NAME "%d", board_idx); /* hack */
+ tulip_parse_eeprom(dev);
+ strcpy(dev->name, "eth%d"); /* un-hack */
+ }
+
+ if ((tp->flags & ALWAYS_CHECK_MII) ||
+ (tp->mtable && tp->mtable->has_mii) ||
+ ( ! tp->mtable && (tp->flags & HAS_MII))) {
+ if (tp->mtable && tp->mtable->has_mii) {
+ for (i = 0; i < tp->mtable->leafcount; i++)
+ if (tp->mtable->mleaf[i].media == 11) {
+ tp->cur_index = i;
+ tp->saved_if_port = dev->if_port;
+ tulip_select_media(dev, 2);
+ dev->if_port = tp->saved_if_port;
+ break;
+ }
+ }
+
+ /* Find the connected MII xcvrs.
+ Doing this in open() would allow detecting external xcvrs
+ later, but takes much time. */
+ tulip_find_mii (dev, board_idx);
+ }
+
+ /* The Tulip-specific entries in the device structure. */
+ dev->netdev_ops = &tulip_netdev_ops;
+ dev->watchdog_timeo = TX_TIMEOUT;
+ #ifdef CONFIG_TULIP_NAPI
+ netif_napi_add(dev, &tp->napi, tulip_poll, 16);
+ #endif
+ SET_ETHTOOL_OPS(dev, &ops);
+
+ if (register_netdev(dev))
+ goto err_out_free_ring;
+
+ pci_set_drvdata(pdev, dev);
+
+ dev_info(&dev->dev,
+ #ifdef CONFIG_TULIP_MMIO
+ "%s rev %d at MMIO %#llx,%s %pM, IRQ %d\n",
+ #else
+ "%s rev %d at Port %#llx,%s %pM, IRQ %d\n",
+ #endif
+ chip_name, pdev->revision,
+ (unsigned long long)pci_resource_start(pdev, TULIP_BAR),
+ eeprom_missing ? " EEPROM not present," : "",
+ dev->dev_addr, irq);
+
+ if (tp->chip_id == PNIC2)
+ tp->link_change = pnic2_lnk_change;
+ else if (tp->flags & HAS_NWAY)
+ tp->link_change = t21142_lnk_change;
+ else if (tp->flags & HAS_PNICNWAY)
+ tp->link_change = pnic_lnk_change;
+
+ /* Reset the xcvr interface and turn on heartbeat. */
+ switch (chip_idx) {
+ case DC21140:
+ case DM910X:
+ default:
+ if (tp->mtable)
+ iowrite32(tp->mtable->csr12dir | 0x100, ioaddr + CSR12);
+ break;
+ case DC21142:
+ if (tp->mii_cnt || tulip_media_cap[dev->if_port] & MediaIsMII) {
+ iowrite32(csr6_mask_defstate, ioaddr + CSR6);
+ iowrite32(0x0000, ioaddr + CSR13);
+ iowrite32(0x0000, ioaddr + CSR14);
+ iowrite32(csr6_mask_hdcap, ioaddr + CSR6);
+ } else
+ t21142_start_nway(dev);
+ break;
+ case PNIC2:
+ /* just do a reset for sanity sake */
+ iowrite32(0x0000, ioaddr + CSR13);
+ iowrite32(0x0000, ioaddr + CSR14);
+ break;
+ case LC82C168:
+ if ( ! tp->mii_cnt) {
+ tp->nway = 1;
+ tp->nwayset = 0;
+ iowrite32(csr6_ttm | csr6_ca, ioaddr + CSR6);
+ iowrite32(0x30, ioaddr + CSR12);
+ iowrite32(0x0001F078, ioaddr + CSR6);
+ iowrite32(0x0201F078, ioaddr + CSR6); /* Turn on autonegotiation. */
+ }
+ break;
+ case MX98713:
+ case COMPEX9881:
+ iowrite32(0x00000000, ioaddr + CSR6);
+ iowrite32(0x000711C0, ioaddr + CSR14); /* Turn on NWay. */
+ iowrite32(0x00000001, ioaddr + CSR13);
+ break;
+ case MX98715:
+ case MX98725:
+ iowrite32(0x01a80000, ioaddr + CSR6);
+ iowrite32(0xFFFFFFFF, ioaddr + CSR14);
+ iowrite32(0x00001000, ioaddr + CSR12);
+ break;
+ case COMET:
+ /* No initialization necessary. */
+ break;
+ }
+
+ /* put the chip in snooze mode until opened */
+ tulip_set_power_state (tp, 0, 1);
+
+ return 0;
+
+ err_out_free_ring:
+ pci_free_consistent (pdev,
+ sizeof (struct tulip_rx_desc) * RX_RING_SIZE +
+ sizeof (struct tulip_tx_desc) * TX_RING_SIZE,
+ tp->rx_ring, tp->rx_ring_dma);
+
+ err_out_mtable:
+ kfree (tp->mtable);
+ pci_iounmap(pdev, ioaddr);
+
+ err_out_free_res:
+ pci_release_regions (pdev);
+
+ err_out_free_netdev:
+ free_netdev (dev);
+ return -ENODEV;
+ }
+
+
+ /*
+  * Program the wake-up related registers according to the given wolopts.
+  *
+  * Nothing is done unless tp->flags has COMET_PM set. In that case CSR18 is
+  * rewritten with the pmes_sticky, apm_mode and d3a bits cleared and the
+  * pm_mode bit set, and CSR13 is rewritten with the wake enables that match
+  * wolopts (WAKE_MAGIC and/or WAKE_PHY).
+  *
+  * @pdev:    PCI device whose drvdata is the tulip net_device
+  * @wolopts: WAKE_* bit mask; 0 disables every wake-up source handled here
+  */
+ static void tulip_set_wolopts (struct pci_dev *pdev, u32 wolopts)
+ {
+ struct net_device *dev = pci_get_drvdata(pdev);
+ struct tulip_private *tp = netdev_priv(dev);
+ void __iomem *ioaddr = tp->base_addr;
+
+ if (tp->flags & COMET_PM) {
+
+ unsigned int tmp;
+
+ tmp = ioread32(ioaddr + CSR18);
+ tmp &= ~(comet_csr18_pmes_sticky | comet_csr18_apm_mode | comet_csr18_d3a);
+ tmp |= comet_csr18_pm_mode;
+ iowrite32(tmp, ioaddr + CSR18);
+
+ /* Set the Wake-up Control/Status Register to the given WOL options */
+ tmp = ioread32(ioaddr + CSR13);
+ /* Start from "all wake enables off", then turn on what was requested. */
+ tmp &= ~(comet_csr13_linkoffe | comet_csr13_linkone | comet_csr13_wfre | comet_csr13_lsce | comet_csr13_mpre);
+ if (wolopts & WAKE_MAGIC)
+ tmp |= comet_csr13_mpre;
+ if (wolopts & WAKE_PHY)
+ tmp |= comet_csr13_linkoffe | comet_csr13_linkone | comet_csr13_lsce;
+ /* Clear the event flags */
+ tmp |= comet_csr13_wfr | comet_csr13_mpr | comet_csr13_lsc;
+ iowrite32(tmp, ioaddr + CSR13);
+ }
+ }
+
+ #ifdef CONFIG_PM
+
+
+ /*
+  * PCI suspend callback.
+  *
+  * If the interface is running it is brought down, detached and its IRQ is
+  * released. In every case the PCI state is saved, the device is disabled
+  * and it is moved to the power state chosen by pci_choose_state(). For a
+  * PM_EVENT_SUSPEND into a state other than D0 the configured wake-on-LAN
+  * options are programmed first.
+  *
+  * Returns 0, or -EINVAL when no net_device is attached to pdev.
+  */
+ static int tulip_suspend (struct pci_dev *pdev, pm_message_t state)
+ {
+ pci_power_t pstate;
+ struct net_device *dev = pci_get_drvdata(pdev);
+ struct tulip_private *tp = netdev_priv(dev);
+
+ if (!dev)
+ return -EINVAL;
+
+ /* Interface is down: there is nothing to stop, only PCI state to save. */
+ if (!netif_running(dev))
+ goto save_state;
+
+ tulip_down(dev);
+
+ netif_device_detach(dev);
+ free_irq(dev->irq, dev);
+
+ save_state:
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+ pstate = pci_choose_state(pdev, state);
+ if (state.event == PM_EVENT_SUSPEND && pstate != PCI_D0) {
+ int rc;
+
+ tulip_set_wolopts(pdev, tp->wolinfo.wolopts);
+ rc = pci_enable_wake(pdev, pstate, tp->wolinfo.wolopts);
+ /* A wake-up setup failure is only logged; suspend still proceeds. */
+ if (rc)
+ pr_err("pci_enable_wake failed (%d)\n", rc);
+ }
+ pci_set_power_state(pdev, pstate);
+
+ return 0;
+ }
+
+
+ /*
+  * PCI resume callback.
+  *
+  * Puts the device back into D0 and restores its saved PCI state. If the
+  * interface was running, the device is re-enabled, its shared IRQ is
+  * requested again, pending wake-up state is cleared on COMET_PM chips and
+  * the interface is re-attached and brought up.
+  *
+  * Returns 0 on success, -EINVAL when no net_device is attached to pdev, or
+  * the error returned by pci_enable_device()/request_irq().
+  */
+ static int tulip_resume(struct pci_dev *pdev)
+ {
+ 	struct net_device *dev = pci_get_drvdata(pdev);
+ 	struct tulip_private *tp;
+ 	void __iomem *ioaddr;
+ 	int retval;
+ 	unsigned int tmp;
+
+ 	if (!dev)
+ 		return -EINVAL;
+
+ 	/*
+ 	 * Look at the private area only after dev is known to be valid;
+ 	 * reading tp->base_addr in an initializer would dereference memory
+ 	 * derived from a NULL dev before the check above could reject it.
+ 	 */
+ 	tp = netdev_priv(dev);
+ 	ioaddr = tp->base_addr;
+
+ 	pci_set_power_state(pdev, PCI_D0);
+ 	pci_restore_state(pdev);
+
+ 	/* Interface was down at suspend time: nothing else to restart. */
+ 	if (!netif_running(dev))
+ 		return 0;
+
+ 	if ((retval = pci_enable_device(pdev))) {
+ 		pr_err("pci_enable_device failed in resume\n");
+ 		return retval;
+ 	}
+
+ 	if ((retval = request_irq(dev->irq, tulip_interrupt, IRQF_SHARED, dev->name, dev))) {
+ 		pr_err("request_irq failed in resume\n");
+ 		return retval;
+ 	}
+
+ 	if (tp->flags & COMET_PM) {
+ 		pci_enable_wake(pdev, PCI_D3hot, 0);
+ 		pci_enable_wake(pdev, PCI_D3cold, 0);
+
+ 		/* Clear the PMES flag */
+ 		tmp = ioread32(ioaddr + CSR20);
+ 		tmp |= comet_csr20_pmes;
+ 		iowrite32(tmp, ioaddr + CSR20);
+
+ 		/* Disable all wake-up events */
+ 		tulip_set_wolopts(pdev, 0);
+ 	}
+ 	netif_device_attach(dev);
+
+ 	if (netif_running(dev))
+ 		tulip_up(dev);
+
+ 	return 0;
+ }
+
+ #endif /* CONFIG_PM */
+
+
+ /*
+  * PCI remove callback: tears down one tulip device.
+  *
+  * The PCI device is disabled first, then the net_device is unregistered and
+  * the descriptor rings, media table, I/O mapping, net_device and PCI regions
+  * are released. Does nothing when no net_device is attached to pdev.
+  */
+ static void __devexit tulip_remove_one (struct pci_dev *pdev)
+ {
+ struct net_device *dev = pci_get_drvdata (pdev);
+ struct tulip_private *tp;
+
+ if (!dev)
+ return;
+
+ tp = netdev_priv(dev);
++
++ /* shoot NIC in the head before deallocating descriptors */
++ pci_disable_device(tp->pdev);
++
+ unregister_netdev(dev);
+ /* RX and TX descriptor rings are released as one coherent allocation. */
+ pci_free_consistent (pdev,
+ sizeof (struct tulip_rx_desc) * RX_RING_SIZE +
+ sizeof (struct tulip_tx_desc) * TX_RING_SIZE,
+ tp->rx_ring, tp->rx_ring_dma);
+ kfree (tp->mtable);
+ pci_iounmap(pdev, tp->base_addr);
+ /* tp lives inside dev's private area; it is not used after this call. */
+ free_netdev (dev);
+ pci_release_regions (pdev);
+ pci_set_drvdata (pdev, NULL);
+
+ /* pci_power_off (pdev, -1); */
+ }
+
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ /*
+  * Polling 'interrupt' - used by things like netconsole to send skbs
+  * without having to re-enable interrupts. It's not called while
+  * the interrupt routine is executing.
+  *
+  * The device IRQ line is disabled, tulip_interrupt() is called directly
+  * with the device as its argument, and the IRQ line is enabled again.
+  */
+
+ static void poll_tulip (struct net_device *dev)
+ {
+ /* disable_irq here is not very nice, but with the lockless
+ interrupt handler we have no other choice. */
+ disable_irq(dev->irq);
+ tulip_interrupt (dev->irq, dev);
+ enable_irq(dev->irq);
+ }
+ #endif
+
+ /*
+  * PCI driver description: binds the IDs in tulip_pci_tbl to the probe and
+  * remove callbacks, plus suspend/resume when CONFIG_PM is enabled.
+  */
+ static struct pci_driver tulip_driver = {
+ .name = DRV_NAME,
+ .id_table = tulip_pci_tbl,
+ .probe = tulip_init_one,
+ .remove = __devexit_p(tulip_remove_one),
+ #ifdef CONFIG_PM
+ .suspend = tulip_suspend,
+ .resume = tulip_resume,
+ #endif /* CONFIG_PM */
+ };
+
+
+ /*
+  * Module entry point.
+  *
+  * Prints the version banner when built as a module, publishes the
+  * rx_copybreak and max_interrupt_work parameters through their tulip_*
+  * globals and registers the PCI driver.
+  *
+  * Returns the result of pci_register_driver().
+  */
+ static int __init tulip_init (void)
+ {
+ 	int rc;
+
+ #ifdef MODULE
+ 	pr_info("%s", version);
+ #endif
+
+ 	/* make the module parameters visible to the rest of the driver */
+ 	tulip_max_interrupt_work = max_interrupt_work;
+ 	tulip_rx_copybreak = rx_copybreak;
+
+ 	/* registering the driver probes for and initialises the boards */
+ 	rc = pci_register_driver(&tulip_driver);
+ 	return rc;
+ }
+
+
+ /* Module exit point: unregisters the PCI driver registered by tulip_init(). */
+ static void __exit tulip_cleanup (void)
+ {
+ pci_unregister_driver (&tulip_driver);
+ }
+
+
+ module_init(tulip_init);
+ module_exit(tulip_cleanup);
--- /dev/null
+ /*
+ * linux/drivers/net/ehea/ehea_main.c
+ *
+ * eHEA ethernet device driver for IBM eServer System p
+ *
+ * (C) Copyright IBM Corp. 2006
+ *
+ * Authors:
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Jan-Bernd Themann <themann@de.ibm.com>
+ * Thomas Klein <tklein@de.ibm.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+ #include <linux/in.h>
+ #include <linux/ip.h>
+ #include <linux/tcp.h>
+ #include <linux/udp.h>
+ #include <linux/if.h>
+ #include <linux/list.h>
+ #include <linux/slab.h>
+ #include <linux/if_ether.h>
+ #include <linux/notifier.h>
+ #include <linux/reboot.h>
+ #include <linux/memory.h>
+ #include <asm/kexec.h>
+ #include <linux/mutex.h>
+ #include <linux/prefetch.h>
+
+ #include <net/ip.h>
+
+ #include "ehea.h"
+ #include "ehea_qmr.h"
+ #include "ehea_phyp.h"
+
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+ MODULE_DESCRIPTION("IBM eServer HEA Driver");
+ MODULE_VERSION(DRV_VERSION);
+
+
+ /*
+  * Module parameters. All of them are registered with permission 0; the
+  * MODULE_PARM_DESC() entries further down describe each one.
+  */
+ static int msg_level = -1;
+ static int rq1_entries = EHEA_DEF_ENTRIES_RQ1;
+ static int rq2_entries = EHEA_DEF_ENTRIES_RQ2;
+ static int rq3_entries = EHEA_DEF_ENTRIES_RQ3;
+ static int sq_entries = EHEA_DEF_ENTRIES_SQ;
+ static int use_mcs = 1;
+ static int prop_carrier_state;
+
+ module_param(msg_level, int, 0);
+ module_param(rq1_entries, int, 0);
+ module_param(rq2_entries, int, 0);
+ module_param(rq3_entries, int, 0);
+ module_param(sq_entries, int, 0);
+ module_param(prop_carrier_state, int, 0);
+ module_param(use_mcs, int, 0);
+
+ MODULE_PARM_DESC(msg_level, "msg_level");
+ MODULE_PARM_DESC(prop_carrier_state, "Propagate carrier state of physical "
+ "port to stack. 1:yes, 0:no. Default = 0 ");
+ MODULE_PARM_DESC(rq3_entries, "Number of entries for Receive Queue 3 "
+ "[2^x - 1], x = [6..14]. Default = "
+ __MODULE_STRING(EHEA_DEF_ENTRIES_RQ3) ")");
+ MODULE_PARM_DESC(rq2_entries, "Number of entries for Receive Queue 2 "
+ "[2^x - 1], x = [6..14]. Default = "
+ __MODULE_STRING(EHEA_DEF_ENTRIES_RQ2) ")");
+ MODULE_PARM_DESC(rq1_entries, "Number of entries for Receive Queue 1 "
+ "[2^x - 1], x = [6..14]. Default = "
+ __MODULE_STRING(EHEA_DEF_ENTRIES_RQ1) ")");
+ MODULE_PARM_DESC(sq_entries, " Number of entries for the Send Queue "
+ "[2^x - 1], x = [6..14]. Default = "
+ __MODULE_STRING(EHEA_DEF_ENTRIES_SQ) ")");
+ MODULE_PARM_DESC(use_mcs, " Multiple receive queues, 1: enable, 0: disable, "
+ "Default = 1");
+
+ /* Driver-wide state shared by the functions in this file. */
+ static int port_name_cnt;
+ /* Every adapter is linked here; the update helpers below walk this list. */
+ static LIST_HEAD(adapter_list);
+ /* Bit flags such as __EHEA_STOP_XFER, tested by the refill paths. */
+ static unsigned long ehea_driver_flags;
+ static DEFINE_MUTEX(dlpar_mem_lock);
+ /* Rebuilt by ehea_update_firmware_handles(). */
+ struct ehea_fw_handle_array ehea_fw_handles;
+ /* Rebuilt by ehea_update_bcmc_registrations(). */
+ struct ehea_bcmc_reg_array ehea_bcmc_regs;
+
+
+ static int __devinit ehea_probe_adapter(struct platform_device *dev,
+ const struct of_device_id *id);
+
+ static int __devexit ehea_remove(struct platform_device *dev);
+
+ /*
+  * Device table exported through MODULE_DEVICE_TABLE(). Besides the "lhea"
+  * entry that ehea_device_table also has, it lists the "IBM,lhea-ethernet"
+  * network nodes.
+  */
++static struct of_device_id ehea_module_device_table[] = {
++ {
++ .name = "lhea",
++ .compatible = "IBM,lhea",
++ },
++ {
++ .type = "network",
++ .compatible = "IBM,lhea-ethernet",
++ },
++ {},
++};
++MODULE_DEVICE_TABLE(of, ehea_module_device_table);
++
+ /* Match table used by ehea_driver (.of_match_table): "lhea" nodes only. */
+ static struct of_device_id ehea_device_table[] = {
+ {
+ .name = "lhea",
+ .compatible = "IBM,lhea",
+ },
+ {},
+ };
-MODULE_DEVICE_TABLE(of, ehea_device_table);
+
+ /*
+  * Platform driver description: matches ehea_device_table and dispatches to
+  * ehea_probe_adapter() / ehea_remove().
+  */
+ static struct of_platform_driver ehea_driver = {
+ .driver = {
+ .name = "ehea",
+ .owner = THIS_MODULE,
+ .of_match_table = ehea_device_table,
+ },
+ .probe = ehea_probe_adapter,
+ .remove = ehea_remove,
+ };
+
+ /*
+  * Log a memory area as rows of two 64-bit hex words.
+  *
+  * @adr: start of the area to dump
+  * @len: number of bytes to cover; one row is printed per started 16 bytes
+  * @msg: prefix printed at the start of every row
+  *
+  * Every row reads a full 16 bytes, so when len is not a multiple of 16 the
+  * last row reads beyond adr + len.
+  */
+ void ehea_dump(void *adr, int len, char *msg)
+ {
+ int x;
+ unsigned char *deb = adr;
+ for (x = 0; x < len; x += 16) {
+ pr_info("%s adr=%p ofs=%04x %016llx %016llx\n",
+ msg, deb, x, *((u64 *)&deb[0]), *((u64 *)&deb[8]));
+ deb += 16;
+ }
+ }
+
+ /*
+  * Queue the port's reset work item, unless resets are currently disabled
+  * for this port through the __EHEA_DISABLE_PORT_RESET flag bit.
+  */
+ void ehea_schedule_port_reset(struct ehea_port *port)
+ {
+ 	if (test_bit(__EHEA_DISABLE_PORT_RESET, &port->flags))
+ 		return;
+
+ 	schedule_work(&port->reset_task);
+ }
+
+ /*
+  * Rebuild the global ehea_fw_handles array from the current adapter list.
+  *
+  * Works in two passes under ehea_fw_handles.lock: the first pass counts
+  * adapters, ports in state EHEA_PORT_UP and their default QPs to size the
+  * array; the second pass records an (adapter handle, firmware handle) pair
+  * for each resource. If the new array cannot be allocated the existing one
+  * is kept. If there is nothing to record the existing array is freed and
+  * replaced by an empty one.
+  */
+ static void ehea_update_firmware_handles(void)
+ {
+ struct ehea_fw_handle_entry *arr = NULL;
+ struct ehea_adapter *adapter;
+ int num_adapters = 0;
+ int num_ports = 0;
+ int num_portres = 0;
+ int i = 0;
+ int num_fw_handles, k, l;
+
+ /* Determine number of handles */
+ mutex_lock(&ehea_fw_handles.lock);
+
+ list_for_each_entry(adapter, &adapter_list, list) {
+ num_adapters++;
+
+ for (k = 0; k < EHEA_MAX_PORTS; k++) {
+ struct ehea_port *port = adapter->port[k];
+
+ if (!port || (port->state != EHEA_PORT_UP))
+ continue;
+
+ num_ports++;
+ num_portres += port->num_def_qps;
+ }
+ }
+
+ num_fw_handles = num_adapters * EHEA_NUM_ADAPTER_FW_HANDLES +
+ num_ports * EHEA_NUM_PORT_FW_HANDLES +
+ num_portres * EHEA_NUM_PORTRES_FW_HANDLES;
+
+ if (num_fw_handles) {
+ arr = kcalloc(num_fw_handles, sizeof(*arr), GFP_KERNEL);
+ if (!arr)
+ goto out; /* Keep the existing array */
+ } else
+ goto out_update;
+
+ /*
+ * Second pass: fill the array. num_adapters and num_ports are counted
+ * back down so that no more entries are written than were counted.
+ */
+ list_for_each_entry(adapter, &adapter_list, list) {
+ if (num_adapters == 0)
+ break;
+
+ for (k = 0; k < EHEA_MAX_PORTS; k++) {
+ struct ehea_port *port = adapter->port[k];
+
+ if (!port || (port->state != EHEA_PORT_UP) ||
+ (num_ports == 0))
+ continue;
+
+ /* Six handles per port resource: QP, both CQs, EQ, both MRs. */
+ for (l = 0; l < port->num_def_qps; l++) {
+ struct ehea_port_res *pr = &port->port_res[l];
+
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->qp->fw_handle;
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->send_cq->fw_handle;
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->recv_cq->fw_handle;
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->eq->fw_handle;
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->send_mr.handle;
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = pr->recv_mr.handle;
+ }
+ /* One handle per port: its QP event queue. */
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = port->qp_eq->fw_handle;
+ num_ports--;
+ }
+
+ /* Per adapter: the notification EQ, plus the MR when it has a handle. */
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = adapter->neq->fw_handle;
+
+ if (adapter->mr.handle) {
+ arr[i].adh = adapter->handle;
+ arr[i++].fwh = adapter->mr.handle;
+ }
+ num_adapters--;
+ }
+
+ out_update:
+ /* Swap in the new array (NULL with 0 entries when nothing was counted). */
+ kfree(ehea_fw_handles.arr);
+ ehea_fw_handles.arr = arr;
+ ehea_fw_handles.num_entries = i;
+ out:
+ mutex_unlock(&ehea_fw_handles.lock);
+ }
+
+ /*
+  * Rebuild the global ehea_bcmc_regs array from the current adapter list.
+  *
+  * For every port in state EHEA_PORT_UP two broadcast entries (untagged and
+  * all-VLAN) are recorded with the port MAC address, followed by two
+  * multicast entries (untagged and all-VLAN) for every address on the
+  * port's multicast list. The whole update runs under ehea_bcmc_regs.lock
+  * with interrupts disabled, which is why the allocation uses GFP_ATOMIC.
+  * If the allocation fails the existing array is kept.
+  */
+ static void ehea_update_bcmc_registrations(void)
+ {
+ unsigned long flags;
+ struct ehea_bcmc_reg_entry *arr = NULL;
+ struct ehea_adapter *adapter;
+ struct ehea_mc_list *mc_entry;
+ int num_registrations = 0;
+ int i = 0;
+ int k;
+
+ spin_lock_irqsave(&ehea_bcmc_regs.lock, flags);
+
+ /* Determine number of registrations */
+ list_for_each_entry(adapter, &adapter_list, list)
+ for (k = 0; k < EHEA_MAX_PORTS; k++) {
+ struct ehea_port *port = adapter->port[k];
+
+ if (!port || (port->state != EHEA_PORT_UP))
+ continue;
+
+ num_registrations += 2; /* Broadcast registrations */
+
+ list_for_each_entry(mc_entry, &port->mc_list->list,list)
+ num_registrations += 2;
+ }
+
+ if (num_registrations) {
+ arr = kcalloc(num_registrations, sizeof(*arr), GFP_ATOMIC);
+ if (!arr)
+ goto out; /* Keep the existing array */
+ } else
+ goto out_update;
+
+ /*
+ * Fill pass. num_registrations is counted back down and filling stops
+ * as soon as it reaches zero, so no more entries are written than
+ * were allocated.
+ */
+ list_for_each_entry(adapter, &adapter_list, list) {
+ for (k = 0; k < EHEA_MAX_PORTS; k++) {
+ struct ehea_port *port = adapter->port[k];
+
+ if (!port || (port->state != EHEA_PORT_UP))
+ continue;
+
+ if (num_registrations == 0)
+ goto out_update;
+
+ arr[i].adh = adapter->handle;
+ arr[i].port_id = port->logical_port_id;
+ arr[i].reg_type = EHEA_BCMC_BROADCAST |
+ EHEA_BCMC_UNTAGGED;
+ arr[i++].macaddr = port->mac_addr;
+
+ arr[i].adh = adapter->handle;
+ arr[i].port_id = port->logical_port_id;
+ arr[i].reg_type = EHEA_BCMC_BROADCAST |
+ EHEA_BCMC_VLANID_ALL;
+ arr[i++].macaddr = port->mac_addr;
+ num_registrations -= 2;
+
+ list_for_each_entry(mc_entry,
+ &port->mc_list->list, list) {
+ if (num_registrations == 0)
+ goto out_update;
+
+ arr[i].adh = adapter->handle;
+ arr[i].port_id = port->logical_port_id;
+ arr[i].reg_type = EHEA_BCMC_SCOPE_ALL |
+ EHEA_BCMC_MULTICAST |
+ EHEA_BCMC_UNTAGGED;
+ arr[i++].macaddr = mc_entry->macaddr;
+
+ arr[i].adh = adapter->handle;
+ arr[i].port_id = port->logical_port_id;
+ arr[i].reg_type = EHEA_BCMC_SCOPE_ALL |
+ EHEA_BCMC_MULTICAST |
+ EHEA_BCMC_VLANID_ALL;
+ arr[i++].macaddr = mc_entry->macaddr;
+ num_registrations -= 2;
+ }
+ }
+ }
+
+ out_update:
+ /* Swap in the new array (NULL with 0 entries when nothing was counted). */
+ kfree(ehea_bcmc_regs.arr);
+ ehea_bcmc_regs.arr = arr;
+ ehea_bcmc_regs.num_entries = i;
+ out:
+ spin_unlock_irqrestore(&ehea_bcmc_regs.lock, flags);
+ }
+
+ /*
+  * Fill the caller-supplied 64-bit statistics structure for a port.
+  *
+  * Packet and byte counters are summed over the port's default QPs. The
+  * multicast and rx_errors counters are copied from port->stats, where
+  * ehea_update_stats() stores them; without that copy they never reach the
+  * caller's structure.
+  *
+  * @dev:   net_device whose private area is the ehea_port
+  * @stats: structure to fill
+  *
+  * Returns @stats, i.e. the structure that was actually filled in.
+  */
+ static struct rtnl_link_stats64 *ehea_get_stats64(struct net_device *dev,
+ 					struct rtnl_link_stats64 *stats)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	u64 rx_packets = 0, tx_packets = 0, rx_bytes = 0, tx_bytes = 0;
+ 	int i;
+
+ 	for (i = 0; i < port->num_def_qps; i++) {
+ 		rx_packets += port->port_res[i].rx_packets;
+ 		rx_bytes += port->port_res[i].rx_bytes;
+ 		tx_packets += port->port_res[i].tx_packets;
+ 		tx_bytes += port->port_res[i].tx_bytes;
+ 	}
+
+ 	stats->tx_packets = tx_packets;
+ 	stats->rx_bytes = rx_bytes;
+ 	stats->tx_bytes = tx_bytes;
+ 	stats->rx_packets = rx_packets;
+
+ 	/* Counters maintained by the periodic ehea_update_stats() work. */
+ 	stats->multicast = port->stats.multicast;
+ 	stats->rx_errors = port->stats.rx_errors;
+
+ 	return stats;
+ }
+
+ /*
+  * Delayed-work handler that refreshes port->stats.
+  *
+  * Queries the port's H_PORT_CB2 control block and copies rxmcp into
+  * stats->multicast and rxuerr into stats->rx_errors. Whether or not the
+  * query succeeded, the work re-arms itself to run again in 1000 ms.
+  */
+ static void ehea_update_stats(struct work_struct *work)
+ {
+ struct ehea_port *port =
+ container_of(work, struct ehea_port, stats_work.work);
+ struct net_device *dev = port->netdev;
+ struct rtnl_link_stats64 *stats = &port->stats;
+ struct hcp_ehea_port_cb2 *cb2;
+ u64 hret;
+
+ /* Scratch page that receives the control block from the query. */
+ cb2 = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cb2) {
+ netdev_err(dev, "No mem for cb2. Some interface statistics were not updated\n");
+ goto resched;
+ }
+
+ hret = ehea_h_query_ehea_port(port->adapter->handle,
+ port->logical_port_id,
+ H_PORT_CB2, H_PORT_CB2_ALL, cb2);
+ if (hret != H_SUCCESS) {
+ netdev_err(dev, "query_ehea_port failed\n");
+ goto out_herr;
+ }
+
+ if (netif_msg_hw(port))
+ ehea_dump(cb2, sizeof(*cb2), "net_device_stats");
+
+ stats->multicast = cb2->rxmcp;
+ stats->rx_errors = cb2->rxuerr;
+
+ out_herr:
+ free_page((unsigned long)cb2);
+ resched:
+ schedule_delayed_work(&port->stats_work, msecs_to_jiffies(1000));
+ }
+
+ /*
+  * Re-arm receive queue 1 after nr_of_wqes entries have been consumed.
+  *
+  * @pr:         port resource owning the queue
+  * @index:      skb-array slot to start from; the walk goes downwards from
+  *              here, wrapping with the array mask
+  * @nr_of_wqes: number of entries consumed since the last refill
+  *
+  * Entries that could not be refilled earlier (rq1_skba.os_skbs) are added
+  * to the request. Empty slots get a fresh skb; the doorbell is rung for the
+  * number of slots actually walked. When transfers are stopped through
+  * __EHEA_STOP_XFER, or an allocation fails, the remaining count is saved
+  * in os_skbs for a later call.
+  */
+ static void ehea_refill_rq1(struct ehea_port_res *pr, int index, int nr_of_wqes)
+ {
+ struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+ struct net_device *dev = pr->port->netdev;
+ int max_index_mask = pr->rq1_skba.len - 1;
+ int fill_wqes = pr->rq1_skba.os_skbs + nr_of_wqes;
+ int adder = 0;
+ int i;
+
+ pr->rq1_skba.os_skbs = 0;
+
+ if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+ /* Remember where to resume and how much is still outstanding. */
+ if (nr_of_wqes > 0)
+ pr->rq1_skba.index = index;
+ pr->rq1_skba.os_skbs = fill_wqes;
+ return;
+ }
+
+ for (i = 0; i < fill_wqes; i++) {
+ if (!skb_arr_rq1[index]) {
+ skb_arr_rq1[index] = netdev_alloc_skb(dev,
+ EHEA_L_PKT_SIZE);
+ if (!skb_arr_rq1[index]) {
+ netdev_info(dev, "Unable to allocate enough skb in the array\n");
+ pr->rq1_skba.os_skbs = fill_wqes - i;
+ break;
+ }
+ }
+ index--;
+ index &= max_index_mask;
+ adder++;
+ }
+
+ if (adder == 0)
+ return;
+
+ /* Ring doorbell */
+ ehea_update_rq1a(pr->qp, adder);
+ }
+
+ /*
+  * Initial fill of receive queue 1: allocate an skb for each of the first
+  * nr_rq1a array slots and ring the doorbell.
+  *
+  * Does nothing (apart from logging) when nr_rq1a exceeds the array length.
+  * Allocation stops at the first failure.
+  *
+  * NOTE(review): the doorbell is rung with i - 1, which is -1 when the very
+  * first allocation fails (or nr_rq1a is 0) - confirm that
+  * ehea_update_rq1a() tolerates that value.
+  */
+ static void ehea_init_fill_rq1(struct ehea_port_res *pr, int nr_rq1a)
+ {
+ struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+ struct net_device *dev = pr->port->netdev;
+ int i;
+
+ if (nr_rq1a > pr->rq1_skba.len) {
+ netdev_err(dev, "NR_RQ1A bigger than skb array len\n");
+ return;
+ }
+
+ for (i = 0; i < nr_rq1a; i++) {
+ skb_arr_rq1[i] = netdev_alloc_skb(dev, EHEA_L_PKT_SIZE);
+ if (!skb_arr_rq1[i]) {
+ netdev_info(dev, "Not enough memory to allocate skb array\n");
+ break;
+ }
+ }
+ /* Ring doorbell */
+ ehea_update_rq1a(pr->qp, i - 1);
+ }
+
+ /*
+  * Common refill routine for receive queues 2 and 3.
+  *
+  * @pr:          port resource owning the queue
+  * @q_skba:      skb array bookkeeping of the queue being refilled
+  * @rq_nr:       receive queue number (2 selects the RQ2 doorbell, anything
+  *               else the RQ3 doorbell)
+  * @num_wqes:    number of entries consumed since the last refill
+  * @wqe_type:    type value encoded into each work request id
+  * @packet_size: buffer size of every skb posted to the queue
+  *
+  * Entries that could not be posted earlier (q_skba->os_skbs) are added to
+  * the request. For each entry an skb is allocated, mapped and described in
+  * the next receive WQE; the doorbell is rung for the number of entries
+  * actually posted. When posting stops early the remaining count is saved
+  * in os_skbs for a later call.
+  *
+  * Returns 0, or -ENOMEM when an allocation failed while the queue was
+  * (almost) completely empty.
+  */
+ static int ehea_refill_rq_def(struct ehea_port_res *pr,
+ 			      struct ehea_q_skb_arr *q_skba, int rq_nr,
+ 			      int num_wqes, int wqe_type, int packet_size)
+ {
+ 	struct net_device *dev = pr->port->netdev;
+ 	struct ehea_qp *qp = pr->qp;
+ 	struct sk_buff **skb_arr = q_skba->arr;
+ 	struct ehea_rwqe *rwqe;
+ 	int i, index, max_index_mask, fill_wqes;
+ 	int adder = 0;
+ 	int ret = 0;
+
+ 	fill_wqes = q_skba->os_skbs + num_wqes;
+ 	q_skba->os_skbs = 0;
+
+ 	/* Transfers are stopped: only remember how much is outstanding. */
+ 	if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+ 		q_skba->os_skbs = fill_wqes;
+ 		return ret;
+ 	}
+
+ 	index = q_skba->index;
+ 	max_index_mask = q_skba->len - 1;
+ 	for (i = 0; i < fill_wqes; i++) {
+ 		u64 tmp_addr;
+ 		struct sk_buff *skb;
+
+ 		skb = netdev_alloc_skb_ip_align(dev, packet_size);
+ 		if (!skb) {
+ 			q_skba->os_skbs = fill_wqes - i;
+ 			if (q_skba->os_skbs == q_skba->len - 2) {
+ 				netdev_info(pr->port->netdev,
+ 					    "rq%i ran dry - no mem for skb\n",
+ 					    rq_nr);
+ 				ret = -ENOMEM;
+ 			}
+ 			break;
+ 		}
+
+ 		skb_arr[index] = skb;
+ 		tmp_addr = ehea_map_vaddr(skb->data);
+ 		if (tmp_addr == -1) {
+ 			dev_kfree_skb(skb);
+ 			/*
+ 			 * The slot must not keep pointing at the skb that
+ 			 * was just freed; index is not advanced, so the
+ 			 * slot is reused by the next refill.
+ 			 */
+ 			skb_arr[index] = NULL;
+ 			q_skba->os_skbs = fill_wqes - i;
+ 			ret = 0;
+ 			break;
+ 		}
+
+ 		rwqe = ehea_get_next_rwqe(qp, rq_nr);
+ 		rwqe->wr_id = EHEA_BMASK_SET(EHEA_WR_ID_TYPE, wqe_type)
+ 			    | EHEA_BMASK_SET(EHEA_WR_ID_INDEX, index);
+ 		rwqe->sg_list[0].l_key = pr->recv_mr.lkey;
+ 		rwqe->sg_list[0].vaddr = tmp_addr;
+ 		rwqe->sg_list[0].len = packet_size;
+ 		rwqe->data_segments = 1;
+
+ 		index++;
+ 		index &= max_index_mask;
+ 		adder++;
+ 	}
+
+ 	q_skba->index = index;
+ 	if (adder == 0)
+ 		goto out;
+
+ 	/* Ring doorbell */
+ 	iosync();
+ 	if (rq_nr == 2)
+ 		ehea_update_rq2a(pr->qp, adder);
+ 	else
+ 		ehea_update_rq3a(pr->qp, adder);
+ out:
+ 	return ret;
+ }
+
+
+ /*
+  * Refill receive queue 2 with nr_of_wqes buffers of EHEA_RQ2_PKT_SIZE.
+  * Returns whatever ehea_refill_rq_def() returns.
+  */
+ static int ehea_refill_rq2(struct ehea_port_res *pr, int nr_of_wqes)
+ {
+ 	struct ehea_q_skb_arr *rq2 = &pr->rq2_skba;
+
+ 	return ehea_refill_rq_def(pr, rq2, 2, nr_of_wqes,
+ 				  EHEA_RWQE2_TYPE, EHEA_RQ2_PKT_SIZE);
+ }
+
+
+ /*
+  * Refill receive queue 3 with nr_of_wqes buffers of EHEA_MAX_PACKET_SIZE.
+  * Returns whatever ehea_refill_rq_def() returns.
+  */
+ static int ehea_refill_rq3(struct ehea_port_res *pr, int nr_of_wqes)
+ {
+ 	struct ehea_q_skb_arr *rq3 = &pr->rq3_skba;
+
+ 	return ehea_refill_rq_def(pr, rq3, 3, nr_of_wqes,
+ 				  EHEA_RWQE3_TYPE, EHEA_MAX_PACKET_SIZE);
+ }
+
+ /*
+  * Classify a receive completion.
+  *
+  * Stores the receive queue number taken from the CQE type field in
+  * *rq_num. Returns 0 when the completion is usable: either no error bit
+  * is set, or the TCP error bit is set while header_length is 0. Returns
+  * -EINVAL for every other error.
+  */
+ static inline int ehea_check_cqe(struct ehea_cqe *cqe, int *rq_num)
+ {
+ 	*rq_num = (cqe->type & EHEA_CQE_TYPE_RQ) >> 5;
+
+ 	if (!(cqe->status & EHEA_CQE_STAT_ERR_MASK))
+ 		return 0;
+
+ 	/* A TCP error with a zero header length is still accepted. */
+ 	if (!(cqe->status & EHEA_CQE_STAT_ERR_TCP) || cqe->header_length != 0)
+ 		return -EINVAL;
+
+ 	return 0;
+ }
+
+ /*
+  * Finish a received skb from its completion entry: set the data length
+  * (the completion's byte count minus the 4-byte CRC), the protocol, the
+  * checksum state and the receive queue index of the port resource.
+  */
+ static inline void ehea_fill_skb(struct net_device *dev,
+ struct sk_buff *skb, struct ehea_cqe *cqe,
+ struct ehea_port_res *pr)
+ {
+ int length = cqe->num_bytes_transfered - 4; /*remove CRC */
+
+ skb_put(skb, length);
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* The packet was not an IPV4 packet so a complemented checksum was
+ calculated. The value is found in the Internet Checksum field. */
+ if (cqe->status & EHEA_CQE_BLIND_CKSUM) {
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum = csum_unfold(~cqe->inet_checksum_value);
+ } else
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* The queue index is the position of pr within the port's port_res array. */
+ skb_record_rx_queue(skb, pr - &pr->port->port_res[0]);
+ }
+
+ /*
+  * Take the skb that belongs to a completion out of an skb array.
+  *
+  * The slot index is read from the completion's work request id. Before the
+  * skb is handed out, the skb in the following slot (wrapping with
+  * arr_len - 1 as mask) and the start of its data are prefetched. The
+  * returned slot is cleared; the result may be NULL if the slot was empty.
+  */
+ static inline struct sk_buff *get_skb_by_index(struct sk_buff **skb_array,
+ int arr_len,
+ struct ehea_cqe *cqe)
+ {
+ int skb_index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, cqe->wr_id);
+ struct sk_buff *skb;
+ void *pref;
+ int x;
+
+ x = skb_index + 1;
+ x &= (arr_len - 1);
+
+ pref = skb_array[x];
+ if (pref) {
+ prefetchw(pref);
+ prefetchw(pref + EHEA_CACHE_LINE);
+
+ pref = (skb_array[x]->data);
+ prefetch(pref);
+ prefetch(pref + EHEA_CACHE_LINE);
+ prefetch(pref + EHEA_CACHE_LINE * 2);
+ prefetch(pref + EHEA_CACHE_LINE * 3);
+ }
+
+ skb = skb_array[skb_index];
+ skb_array[skb_index] = NULL;
+ return skb;
+ }
+
+ /*
+  * Variant of get_skb_by_index() used for receive queue 1, where the slot
+  * index is passed in directly instead of being read from a completion.
+  *
+  * The skb in the following slot and the start of its data are prefetched
+  * for writing. The returned slot is cleared; the result may be NULL if the
+  * slot was empty.
+  */
+ static inline struct sk_buff *get_skb_by_index_ll(struct sk_buff **skb_array,
+ int arr_len, int wqe_index)
+ {
+ struct sk_buff *skb;
+ void *pref;
+ int x;
+
+ x = wqe_index + 1;
+ x &= (arr_len - 1);
+
+ pref = skb_array[x];
+ if (pref) {
+ prefetchw(pref);
+ prefetchw(pref + EHEA_CACHE_LINE);
+
+ pref = (skb_array[x]->data);
+ prefetchw(pref);
+ prefetchw(pref + EHEA_CACHE_LINE);
+ }
+
+ skb = skb_array[wqe_index];
+ skb_array[wqe_index] = NULL;
+ return skb;
+ }
+
+ /*
+  * Handle a receive completion that ehea_check_cqe() rejected.
+  *
+  * Updates the per-queue error counters, and for completions from RQ2 or
+  * RQ3 drops the associated skb and counts the entry as processed so that
+  * it gets refilled. A fatal status schedules a port reset.
+  *
+  * Returns 1 when a port reset was requested, 0 otherwise.
+  */
+ static int ehea_treat_poll_error(struct ehea_port_res *pr, int rq,
+ struct ehea_cqe *cqe, int *processed_rq2,
+ int *processed_rq3)
+ {
+ struct sk_buff *skb;
+
+ if (cqe->status & EHEA_CQE_STAT_ERR_TCP)
+ pr->p_stats.err_tcp_cksum++;
+ if (cqe->status & EHEA_CQE_STAT_ERR_IP)
+ pr->p_stats.err_ip_cksum++;
+ if (cqe->status & EHEA_CQE_STAT_ERR_CRC)
+ pr->p_stats.err_frame_crc++;
+
+ if (rq == 2) {
+ *processed_rq2 += 1;
+ skb = get_skb_by_index(pr->rq2_skba.arr, pr->rq2_skba.len, cqe);
+ dev_kfree_skb(skb);
+ } else if (rq == 3) {
+ *processed_rq3 += 1;
+ skb = get_skb_by_index(pr->rq3_skba.arr, pr->rq3_skba.len, cqe);
+ dev_kfree_skb(skb);
+ }
+
+ if (cqe->status & EHEA_CQE_STAT_FAT_ERR_MASK) {
+ /* Only the log output is conditional; the reset is always scheduled. */
+ if (netif_msg_rx_err(pr->port)) {
+ pr_err("Critical receive error for QP %d. Resetting port.\n",
+ pr->qp->init_attr.qp_nr);
+ ehea_dump(cqe, sizeof(*cqe), "CQE");
+ }
+ ehea_schedule_port_reset(pr->port);
+ return 1;
+ }
+
+ return 0;
+ }
+
+ /*
+  * Process up to budget receive completions of one port resource.
+  *
+  * Every completion is polled from RQ1's completion path. Depending on the
+  * queue number reported by ehea_check_cqe() the packet data is either
+  * copied out of the completion itself (RQ1) or already sits in the skb
+  * posted to RQ2/RQ3. Good packets are handed to GRO; bad ones go through
+  * ehea_treat_poll_error(). Afterwards the byte/packet counters are updated
+  * and all three receive queues are refilled by the number of entries
+  * consumed from each.
+  *
+  * Returns the number of completions processed.
+  */
+ static int ehea_proc_rwqes(struct net_device *dev,
+ struct ehea_port_res *pr,
+ int budget)
+ {
+ struct ehea_port *port = pr->port;
+ struct ehea_qp *qp = pr->qp;
+ struct ehea_cqe *cqe;
+ struct sk_buff *skb;
+ struct sk_buff **skb_arr_rq1 = pr->rq1_skba.arr;
+ struct sk_buff **skb_arr_rq2 = pr->rq2_skba.arr;
+ struct sk_buff **skb_arr_rq3 = pr->rq3_skba.arr;
+ int skb_arr_rq1_len = pr->rq1_skba.len;
+ int skb_arr_rq2_len = pr->rq2_skba.len;
+ int skb_arr_rq3_len = pr->rq3_skba.len;
+ int processed, processed_rq1, processed_rq2, processed_rq3;
+ u64 processed_bytes = 0;
+ int wqe_index, last_wqe_index, rq, port_reset;
+
+ processed = processed_rq1 = processed_rq2 = processed_rq3 = 0;
+ last_wqe_index = 0;
+
+ cqe = ehea_poll_rq1(qp, &wqe_index);
+ while ((processed < budget) && cqe) {
+ ehea_inc_rq1(qp);
+ processed_rq1++;
+ processed++;
+ if (netif_msg_rx_status(port))
+ ehea_dump(cqe, sizeof(*cqe), "CQE");
+
+ last_wqe_index = wqe_index;
+ rmb();
+ if (!ehea_check_cqe(cqe, &rq)) {
+ if (rq == 1) {
+ /* LL RQ1 */
+ skb = get_skb_by_index_ll(skb_arr_rq1,
+ skb_arr_rq1_len,
+ wqe_index);
+ if (unlikely(!skb)) {
+ netif_info(port, rx_err, dev,
+ "LL rq1: skb=NULL\n");
+
+ skb = netdev_alloc_skb(dev,
+ EHEA_L_PKT_SIZE);
+ if (!skb) {
+ netdev_err(dev, "Not enough memory to allocate skb\n");
+ break;
+ }
+ }
+ /* The packet data follows the first 64 bytes of the CQE. */
+ skb_copy_to_linear_data(skb, ((char *)cqe) + 64,
+ cqe->num_bytes_transfered - 4);
+ ehea_fill_skb(dev, skb, cqe, pr);
+ } else if (rq == 2) {
+ /* RQ2 */
+ skb = get_skb_by_index(skb_arr_rq2,
+ skb_arr_rq2_len, cqe);
+ if (unlikely(!skb)) {
+ netif_err(port, rx_err, dev,
+ "rq2: skb=NULL\n");
+ break;
+ }
+ ehea_fill_skb(dev, skb, cqe, pr);
+ processed_rq2++;
+ } else {
+ /* RQ3 */
+ skb = get_skb_by_index(skb_arr_rq3,
+ skb_arr_rq3_len, cqe);
+ if (unlikely(!skb)) {
+ netif_err(port, rx_err, dev,
+ "rq3: skb=NULL\n");
+ break;
+ }
+ ehea_fill_skb(dev, skb, cqe, pr);
+ processed_rq3++;
+ }
+
+ processed_bytes += skb->len;
+
+ if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
+ __vlan_hwaccel_put_tag(skb, cqe->vlan_tag);
+
+ napi_gro_receive(&pr->napi, skb);
+ } else {
+ pr->p_stats.poll_receive_errors++;
+ port_reset = ehea_treat_poll_error(pr, rq, cqe,
+ &processed_rq2,
+ &processed_rq3);
+ if (port_reset)
+ break;
+ }
+ cqe = ehea_poll_rq1(qp, &wqe_index);
+ }
+
+ pr->rx_packets += processed;
+ pr->rx_bytes += processed_bytes;
+
+ /* Give back to each queue as many buffers as were consumed from it. */
+ ehea_refill_rq1(pr, last_wqe_index, processed_rq1);
+ ehea_refill_rq2(pr, processed_rq2);
+ ehea_refill_rq3(pr, processed_rq3);
+
+ return processed;
+ }
+
+ #define SWQE_RESTART_CHECK 0xdeadbeaff00d0000ull
+
+ /*
+  * Clear sq_restart_flag on every default QP of the port, then wake up
+  * anyone sleeping on the port's restart wait queue.
+  */
+ static void reset_sq_restart_flag(struct ehea_port *port)
+ {
+ 	int qp_idx;
+
+ 	for (qp_idx = 0; qp_idx < port->num_def_qps; qp_idx++)
+ 		port->port_res[qp_idx].sq_restart_flag = 0;
+
+ 	wake_up(&port->restart_wq);
+ }
+
+ /*
+  * Check that the send queues of a port still complete work requests.
+  *
+  * For each default QP a purge send WQE tagged with SWQE_RESTART_CHECK is
+  * posted, then the function waits up to 100 ms on the port's restart wait
+  * queue for that QP's sq_restart_flag to read 0. On timeout the queues are
+  * reported as out of sync, a port reset is scheduled and the remaining
+  * QPs are not checked.
+  *
+  * Note: the local k is assigned but never read in this function.
+  */
+ static void check_sqs(struct ehea_port *port)
+ {
+ struct ehea_swqe *swqe;
+ int swqe_index;
+ int i, k;
+
+ for (i = 0; i < port->num_def_qps; i++) {
+ struct ehea_port_res *pr = &port->port_res[i];
+ int ret;
+ k = 0;
+ swqe = ehea_get_swqe(pr->qp, &swqe_index);
+ memset(swqe, 0, SWQE_HEADER_SIZE);
+ /* The marker WQE occupies one send queue entry. */
+ atomic_dec(&pr->swqe_avail);
+
+ swqe->tx_control |= EHEA_SWQE_PURGE;
+ swqe->wr_id = SWQE_RESTART_CHECK;
+ swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+ swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT;
+ swqe->immediate_data_length = 80;
+
+ ehea_post_swqe(pr->qp, swqe);
+
+ ret = wait_event_timeout(port->restart_wq,
+ pr->sq_restart_flag == 0,
+ msecs_to_jiffies(100));
+
+ if (!ret) {
+ pr_err("HW/SW queues out of sync\n");
+ ehea_schedule_port_reset(pr->port);
+ return;
+ }
+ }
+ }
+
+
+ /*
+  * Process up to my_quota send completions of one port resource.
+  *
+  * For each completion the transmitted skb is freed (for SWQE2-type work
+  * requests) and the number of send WQEs it releases is accumulated. A
+  * completion carrying SWQE_RESTART_CHECK sets sq_restart_flag and ends the
+  * loop; an error status that matches the reset mask schedules a port reset
+  * and ends the loop as well. Afterwards the completion queue is updated,
+  * the freed WQEs are returned to swqe_avail, and a stopped TX queue is
+  * woken once enough WQEs are available again.
+  *
+  * Returns the completion the loop stopped on, or NULL when the queue was
+  * drained.
+  */
+ static struct ehea_cqe *ehea_proc_cqes(struct ehea_port_res *pr, int my_quota)
+ {
+ struct sk_buff *skb;
+ struct ehea_cq *send_cq = pr->send_cq;
+ struct ehea_cqe *cqe;
+ int quota = my_quota;
+ int cqe_counter = 0;
+ int swqe_av = 0;
+ int index;
+ /* The TX queue index is the position of pr within the port_res array. */
+ struct netdev_queue *txq = netdev_get_tx_queue(pr->port->netdev,
+ pr - &pr->port->port_res[0]);
+
+ cqe = ehea_poll_cq(send_cq);
+ while (cqe && (quota > 0)) {
+ ehea_inc_cq(send_cq);
+
+ cqe_counter++;
+ rmb();
+
+ if (cqe->wr_id == SWQE_RESTART_CHECK) {
+ pr->sq_restart_flag = 1;
+ swqe_av++;
+ break;
+ }
+
+ if (cqe->status & EHEA_CQE_STAT_ERR_MASK) {
+ pr_err("Bad send completion status=0x%04X\n",
+ cqe->status);
+
+ if (netif_msg_tx_err(pr->port))
+ ehea_dump(cqe, sizeof(*cqe), "Send CQE");
+
+ if (cqe->status & EHEA_CQE_STAT_RESET_MASK) {
+ pr_err("Resetting port\n");
+ ehea_schedule_port_reset(pr->port);
+ break;
+ }
+ }
+
+ if (netif_msg_tx_done(pr->port))
+ ehea_dump(cqe, sizeof(*cqe), "CQE");
+
+ if (likely(EHEA_BMASK_GET(EHEA_WR_ID_TYPE, cqe->wr_id)
+ == EHEA_SWQE2_TYPE)) {
+
+ index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, cqe->wr_id);
+ skb = pr->sq_skba.arr[index];
+ dev_kfree_skb(skb);
+ pr->sq_skba.arr[index] = NULL;
+ }
+
+ swqe_av += EHEA_BMASK_GET(EHEA_WR_ID_REFILL, cqe->wr_id);
+ quota--;
+
+ cqe = ehea_poll_cq(send_cq);
+ }
+
+ ehea_update_feca(send_cq, cqe_counter);
+ atomic_add(swqe_av, &pr->swqe_avail);
+
+ /* Re-check under the TX lock before waking the queue. */
+ if (unlikely(netif_tx_queue_stopped(txq) &&
+ (atomic_read(&pr->swqe_avail) >= pr->swqe_refill_th))) {
+ __netif_tx_lock(txq, smp_processor_id());
+ if (netif_tx_queue_stopped(txq) &&
+ (atomic_read(&pr->swqe_avail) >= pr->swqe_refill_th))
+ netif_tx_wake_queue(txq);
+ __netif_tx_unlock(txq);
+ }
+
+ wake_up(&pr->port->swqe_avail_wq);
+
+ return cqe;
+ }
+
+ #define EHEA_POLL_MAX_CQES 65535
+
+ /*
+  * NAPI poll callback of one port resource.
+  *
+  * Send completions are processed first (up to EHEA_POLL_MAX_CQES), then
+  * receive completions up to the remaining budget. While the budget has not
+  * been used up, polling is completed and both completion queues are
+  * re-armed; if a completion arrived in the meantime and NAPI can be
+  * rescheduled, processing continues, otherwise the function returns.
+  *
+  * Returns the number of receive completions processed.
+  */
+ static int ehea_poll(struct napi_struct *napi, int budget)
+ {
+ struct ehea_port_res *pr = container_of(napi, struct ehea_port_res,
+ napi);
+ struct net_device *dev = pr->port->netdev;
+ struct ehea_cqe *cqe;
+ struct ehea_cqe *cqe_skb = NULL;
+ int wqe_index;
+ int rx = 0;
+
+ cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
+ rx += ehea_proc_rwqes(dev, pr, budget - rx);
+
+ while (rx != budget) {
+ napi_complete(napi);
+ ehea_reset_cq_ep(pr->recv_cq);
+ ehea_reset_cq_ep(pr->send_cq);
+ ehea_reset_cq_n1(pr->recv_cq);
+ ehea_reset_cq_n1(pr->send_cq);
+ rmb();
+ /* Look for completions that arrived while the queues were re-armed. */
+ cqe = ehea_poll_rq1(pr->qp, &wqe_index);
+ cqe_skb = ehea_poll_cq(pr->send_cq);
+
+ if (!cqe && !cqe_skb)
+ return rx;
+
+ if (!napi_reschedule(napi))
+ return rx;
+
+ cqe_skb = ehea_proc_cqes(pr, EHEA_POLL_MAX_CQES);
+ rx += ehea_proc_rwqes(dev, pr, budget - rx);
+ }
+
+ return rx;
+ }
+
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ /* Netpoll hook: schedule NAPI on the port resource of every default QP. */
+ static void ehea_netpoll(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	int qp_idx = 0;
+
+ 	while (qp_idx < port->num_def_qps) {
+ 		napi_schedule(&port->port_res[qp_idx].napi);
+ 		qp_idx++;
+ 	}
+ }
+ #endif
+
+ /*
+  * Per-queue interrupt handler. @param is the struct ehea_port_res the IRQ
+  * was registered with; all work is handed over to NAPI.
+  */
+ static irqreturn_t ehea_recv_irq_handler(int irq, void *param)
+ {
+ 	struct ehea_port_res *port_res = param;
+
+ 	napi_schedule(&port_res->napi);
+ 	return IRQ_HANDLED;
+ }
+
+ /*
+  * Handler for QP affiliated-error events. @param is the struct ehea_port
+  * the IRQ was registered with. Drains the port's qp_eq, logs every entry
+  * and schedules a port reset if any entry calls for one.
+  */
+ static irqreturn_t ehea_qp_aff_irq_handler(int irq, void *param)
+ {
+ 	struct ehea_port *port = param;
+ 	struct ehea_eqe *event;
+ 	struct ehea_qp *qp;
+ 	u32 token;
+ 	u64 res_type, aer, aerr;
+ 	int need_reset = 0;
+
+ 	for (event = ehea_poll_eq(port->qp_eq); event;
+ 	     event = ehea_poll_eq(port->qp_eq)) {
+ 		token = EHEA_BMASK_GET(EHEA_EQE_QP_TOKEN, event->entry);
+ 		pr_err("QP aff_err: entry=0x%llx, token=0x%x\n",
+ 		       event->entry, token);
+
+ 		qp = port->port_res[token].qp;
+
+ 		res_type = ehea_error_data(port->adapter, qp->fw_handle,
+ 					   &aer, &aerr);
+
+ 		/*
+ 		 * A non-QP resource (CQ or EQ error) always forces a reset;
+ 		 * a QP error only does so when a reset bit is set.
+ 		 */
+ 		if (res_type != EHEA_AER_RESTYPE_QP ||
+ 		    (aer & EHEA_AER_RESET_MASK) ||
+ 		    (aerr & EHEA_AERR_RESET_MASK))
+ 			need_reset = 1;
+ 	}
+
+ 	if (need_reset) {
+ 		pr_err("Resetting port\n");
+ 		ehea_schedule_port_reset(port);
+ 	}
+
+ 	return IRQ_HANDLED;
+ }
+
+ /*
+  * Look up the port of @adapter whose logical_port_id equals @logical_port.
+  * Returns NULL when no populated slot matches.
+  */
+ static struct ehea_port *ehea_get_port(struct ehea_adapter *adapter,
+ 				       int logical_port)
+ {
+ 	struct ehea_port *candidate;
+ 	int slot;
+
+ 	for (slot = 0; slot < EHEA_MAX_PORTS; slot++) {
+ 		candidate = adapter->port[slot];
+ 		if (!candidate)
+ 			continue;
+ 		if (candidate->logical_port_id == logical_port)
+ 			return candidate;
+ 	}
+
+ 	return NULL;
+ }
+
+ /*
+  * Query port control block CB0 and cache the results in @port: MAC address,
+  * speed/duplex, and the number of default QPs. autoneg is always set to 1.
+  *
+  * Returns 0 on success, -ENOMEM if no page could be allocated, -EIO if the
+  * query hcall fails, -EADDRNOTAVAIL if the reported MAC is not a valid
+  * Ethernet address, and -EINVAL if the resulting default QP count is zero.
+  */
+ int ehea_sense_port_attr(struct ehea_port *port)
+ {
+ int ret;
+ u64 hret;
+ struct hcp_ehea_port_cb0 *cb0;
+
+ /* may be called via ehea_neq_tasklet() */
+ cb0 = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (!cb0) {
+ pr_err("no mem for cb0\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_port(port->adapter->handle,
+ port->logical_port_id, H_PORT_CB0,
+ EHEA_BMASK_SET(H_PORT_CB0_ALL, 0xFFFF),
+ cb0);
+ if (hret != H_SUCCESS) {
+ ret = -EIO;
+ goto out_free;
+ }
+
+ /* MAC address */
+ /* Inverse of the ">> 16" applied in ehea_set_mac_addr(). */
+ port->mac_addr = cb0->port_mac_addr << 16;
+
+ if (!is_valid_ether_addr((u8 *)&port->mac_addr)) {
+ ret = -EADDRNOTAVAIL;
+ goto out_free;
+ }
+
+ /* Port speed */
+ switch (cb0->port_speed) {
+ case H_SPEED_10M_H:
+ port->port_speed = EHEA_SPEED_10M;
+ port->full_duplex = 0;
+ break;
+ case H_SPEED_10M_F:
+ port->port_speed = EHEA_SPEED_10M;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_100M_H:
+ port->port_speed = EHEA_SPEED_100M;
+ port->full_duplex = 0;
+ break;
+ case H_SPEED_100M_F:
+ port->port_speed = EHEA_SPEED_100M;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_1G_F:
+ port->port_speed = EHEA_SPEED_1G;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_10G_F:
+ port->port_speed = EHEA_SPEED_10G;
+ port->full_duplex = 1;
+ break;
+ default:
+ /* Unrecognised speed code: report speed 0, half duplex. */
+ port->port_speed = 0;
+ port->full_duplex = 0;
+ break;
+ }
+
+ port->autoneg = 1;
+ port->num_mcs = cb0->num_default_qps;
+
+ /* Number of default QPs */
+ if (use_mcs)
+ port->num_def_qps = cb0->num_default_qps;
+ else
+ port->num_def_qps = 1;
+
+ if (!port->num_def_qps) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ ret = 0;
+ out_free:
+ /* Dump the control block on any failure, or when probe messages are on. */
+ if (ret || netif_msg_probe(port))
+ ehea_dump(cb0, sizeof(*cb0), "ehea_sense_port_attr");
+ free_page((unsigned long)cb0);
+ out:
+ return ret;
+ }
+
+ /*
+  * Set the port speed to @port_speed through control block CB4, then read
+  * the speed back and update port->port_speed / port->full_duplex from it.
+  *
+  * The carrier is switched off for the duration of the change and switched
+  * back on at the end unless prop_carrier_state is set and the physical
+  * link is not up.
+  *
+  * Returns 0 on success, -ENOMEM if no page could be allocated, -EPERM if
+  * the modify hcall returns H_AUTHORITY, and -EIO on any other hcall
+  * failure.
+  */
+ int ehea_set_portspeed(struct ehea_port *port, u32 port_speed)
+ {
+ struct hcp_ehea_port_cb4 *cb4;
+ u64 hret;
+ int ret = 0;
+
+ cb4 = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cb4) {
+ pr_err("no mem for cb4\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cb4->port_speed = port_speed;
+
+ netif_carrier_off(port->netdev);
+
+ hret = ehea_h_modify_ehea_port(port->adapter->handle,
+ port->logical_port_id,
+ H_PORT_CB4, H_PORT_CB4_SPEED, cb4);
+ if (hret == H_SUCCESS) {
+ port->autoneg = port_speed == EHEA_SPEED_AUTONEG ? 1 : 0;
+
+ /* Read back the speed that is now in effect. */
+ hret = ehea_h_query_ehea_port(port->adapter->handle,
+ port->logical_port_id,
+ H_PORT_CB4, H_PORT_CB4_SPEED,
+ cb4);
+ if (hret == H_SUCCESS) {
+ switch (cb4->port_speed) {
+ case H_SPEED_10M_H:
+ port->port_speed = EHEA_SPEED_10M;
+ port->full_duplex = 0;
+ break;
+ case H_SPEED_10M_F:
+ port->port_speed = EHEA_SPEED_10M;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_100M_H:
+ port->port_speed = EHEA_SPEED_100M;
+ port->full_duplex = 0;
+ break;
+ case H_SPEED_100M_F:
+ port->port_speed = EHEA_SPEED_100M;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_1G_F:
+ port->port_speed = EHEA_SPEED_1G;
+ port->full_duplex = 1;
+ break;
+ case H_SPEED_10G_F:
+ port->port_speed = EHEA_SPEED_10G;
+ port->full_duplex = 1;
+ break;
+ default:
+ /* Unrecognised speed code: report speed 0, half duplex. */
+ port->port_speed = 0;
+ port->full_duplex = 0;
+ break;
+ }
+ } else {
+ pr_err("Failed sensing port speed\n");
+ ret = -EIO;
+ }
+ } else {
+ if (hret == H_AUTHORITY) {
+ pr_info("Hypervisor denied setting port speed\n");
+ ret = -EPERM;
+ } else {
+ ret = -EIO;
+ pr_err("Failed setting port speed\n");
+ }
+ }
+ /* Restore the carrier, on the success and the failure path alike. */
+ if (!prop_carrier_state || (port->phy_link == EHEA_PHY_LINK_UP))
+ netif_carrier_on(port->netdev);
+
+ free_page((unsigned long)cb4);
+ out:
+ return ret;
+ }
+
+ /*
+  * Decode one notification event queue entry and act on it.
+  *
+  * @adapter: adapter the event was raised for
+  * @eqe:     raw 64-bit event entry; carries the event code and port number
+  *
+  * Events naming a port that is not registered on @adapter are logged and
+  * dropped. The port is validated before anything is read from it, because
+  * every event code path below uses the port's net_device.
+  */
+ static void ehea_parse_eqe(struct ehea_adapter *adapter, u64 eqe)
+ {
+ 	int ret;
+ 	u8 ec;
+ 	u8 portnum;
+ 	struct ehea_port *port;
+ 	struct net_device *dev;
+
+ 	ec = EHEA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+ 	portnum = EHEA_BMASK_GET(NEQE_PORTNUM, eqe);
+ 	port = ehea_get_port(adapter, portnum);
+ 	if (!port) {
+ 		/* No net_device to attribute the message to. */
+ 		pr_err("unknown portnum %x\n", portnum);
+ 		return;
+ 	}
+ 	dev = port->netdev;
+
+ 	switch (ec) {
+ 	case EHEA_EC_PORTSTATE_CHG:	/* port state change */
+
+ 		if (EHEA_BMASK_GET(NEQE_PORT_UP, eqe)) {
+ 			if (!netif_carrier_ok(dev)) {
+ 				ret = ehea_sense_port_attr(port);
+ 				if (ret) {
+ 					netdev_err(dev, "failed resensing port attributes\n");
+ 					break;
+ 				}
+
+ 				netif_info(port, link, dev,
+ 					   "Logical port up: %dMbps %s Duplex\n",
+ 					   port->port_speed,
+ 					   port->full_duplex == 1 ?
+ 					   "Full" : "Half");
+
+ 				netif_carrier_on(dev);
+ 				netif_wake_queue(dev);
+ 			}
+ 		} else
+ 			if (netif_carrier_ok(dev)) {
+ 				netif_info(port, link, dev,
+ 					   "Logical port down\n");
+ 				netif_carrier_off(dev);
+ 				netif_tx_disable(dev);
+ 			}
+
+ 		/* Track the physical link; it drives the carrier only when
+ 		 * prop_carrier_state is set.
+ 		 */
+ 		if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PORT_UP, eqe)) {
+ 			port->phy_link = EHEA_PHY_LINK_UP;
+ 			netif_info(port, link, dev,
+ 				   "Physical port up\n");
+ 			if (prop_carrier_state)
+ 				netif_carrier_on(dev);
+ 		} else {
+ 			port->phy_link = EHEA_PHY_LINK_DOWN;
+ 			netif_info(port, link, dev,
+ 				   "Physical port down\n");
+ 			if (prop_carrier_state)
+ 				netif_carrier_off(dev);
+ 		}
+
+ 		if (EHEA_BMASK_GET(NEQE_EXTSWITCH_PRIMARY, eqe))
+ 			netdev_info(dev,
+ 				    "External switch port is primary port\n");
+ 		else
+ 			netdev_info(dev,
+ 				    "External switch port is backup port\n");
+
+ 		break;
+ 	case EHEA_EC_ADAPTER_MALFUNC:
+ 		netdev_err(dev, "Adapter malfunction\n");
+ 		break;
+ 	case EHEA_EC_PORT_MALFUNC:
+ 		netdev_info(dev, "Port malfunction\n");
+ 		netif_carrier_off(dev);
+ 		netif_tx_disable(dev);
+ 		break;
+ 	default:
+ 		netdev_err(dev, "unknown event code %x, eqe=0x%llX\n", ec, eqe);
+ 		break;
+ 	}
+ }
+
+ /*
+  * Tasklet body for the notification event queue. @data is the adapter
+  * pointer cast to unsigned long. Every pending entry is passed to
+  * ehea_parse_eqe(), after which the port-state, adapter-malfunction and
+  * port-malfunction events are reset via ehea_h_reset_events().
+  */
+ static void ehea_neq_tasklet(unsigned long data)
+ {
+ 	struct ehea_adapter *adapter = (struct ehea_adapter *)data;
+ 	struct ehea_eqe *cur;
+ 	u64 reset_mask;
+
+ 	cur = ehea_poll_eq(adapter->neq);
+ 	pr_debug("eqe=%p\n", cur);
+
+ 	for (;;) {
+ 		if (!cur)
+ 			break;
+
+ 		pr_debug("*eqe=%lx\n", (unsigned long) cur->entry);
+ 		ehea_parse_eqe(adapter, cur->entry);
+
+ 		cur = ehea_poll_eq(adapter->neq);
+ 		pr_debug("next eqe=%p\n", cur);
+ 	}
+
+ 	reset_mask = EHEA_BMASK_SET(NELR_PORTSTATE_CHG, 1)
+ 		   | EHEA_BMASK_SET(NELR_ADAPTER_MALFUNC, 1)
+ 		   | EHEA_BMASK_SET(NELR_PORT_MALFUNC, 1);
+
+ 	ehea_h_reset_events(adapter->handle, adapter->neq->fw_handle,
+ 			    reset_mask);
+ }
+
+ /*
+  * Notification event queue interrupt. @param is the adapter; the actual
+  * work is deferred to its high-priority neq tasklet.
+  */
+ static irqreturn_t ehea_interrupt_neq(int irq, void *param)
+ {
+ 	struct ehea_adapter *owner = param;
+
+ 	tasklet_hi_schedule(&owner->neq_tasklet);
+
+ 	return IRQ_HANDLED;
+ }
+
+
+ /*
+  * Populate the three receive queues of @pr: RQ1 over the full length of
+  * its skb array, RQ2 and RQ3 with one entry fewer than their actual WQE
+  * counts. Returns the OR of the RQ2 and RQ3 refill results.
+  */
+ static int ehea_fill_port_res(struct ehea_port_res *pr)
+ {
+ 	struct ehea_qp_init_attr *attr = &pr->qp->init_attr;
+ 	int rq2_rc;
+ 	int rq3_rc;
+
+ 	ehea_init_fill_rq1(pr, pr->rq1_skba.len);
+
+ 	rq2_rc = ehea_refill_rq2(pr, attr->act_nr_rwqes_rq2 - 1);
+ 	rq3_rc = ehea_refill_rq3(pr, attr->act_nr_rwqes_rq3 - 1);
+
+ 	return rq2_rc | rq3_rc;
+ }
+
+ /*
+  * Request all interrupts of the port behind @dev: first the QP
+  * affiliated-error IRQ, then one IRQ per default QP.
+  *
+  * Returns 0 on success or the error from ibmebus_request_irq(). On failure
+  * only the IRQs that were actually requested are released again; in
+  * particular, nothing is freed when the very first request fails.
+  */
+ static int ehea_reg_interrupts(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct ehea_port_res *pr;
+ 	int i, ret;
+
+ 	snprintf(port->int_aff_name, EHEA_IRQ_NAME_SIZE - 1, "%s-aff",
+ 		 dev->name);
+
+ 	ret = ibmebus_request_irq(port->qp_eq->attr.ist1,
+ 				  ehea_qp_aff_irq_handler,
+ 				  IRQF_DISABLED, port->int_aff_name, port);
+ 	if (ret) {
+ 		netdev_err(dev, "failed registering irq for qp_aff_irq_handler:ist=%X\n",
+ 			   port->qp_eq->attr.ist1);
+ 		/* Nothing has been requested yet, so nothing must be freed. */
+ 		goto out;
+ 	}
+
+ 	netif_info(port, ifup, dev,
+ 		   "irq_handle 0x%X for function qp_aff_irq_handler registered\n",
+ 		   port->qp_eq->attr.ist1);
+
+ 	for (i = 0; i < port->num_def_qps; i++) {
+ 		pr = &port->port_res[i];
+ 		snprintf(pr->int_send_name, EHEA_IRQ_NAME_SIZE - 1,
+ 			 "%s-queue%d", dev->name, i);
+ 		ret = ibmebus_request_irq(pr->eq->attr.ist1,
+ 					  ehea_recv_irq_handler,
+ 					  IRQF_DISABLED, pr->int_send_name,
+ 					  pr);
+ 		if (ret) {
+ 			netdev_err(dev, "failed registering irq for ehea_queue port_res_nr:%d, ist=%X\n",
+ 				   i, pr->eq->attr.ist1);
+ 			goto out_free_req;
+ 		}
+ 		netif_info(port, ifup, dev,
+ 			   "irq_handle 0x%X for function ehea_queue_int %d registered\n",
+ 			   pr->eq->attr.ist1, i);
+ 	}
+ out:
+ 	return ret;
+
+ out_free_req:
+ 	/* Release the per-queue IRQs obtained before the failing one ... */
+ 	while (--i >= 0) {
+ 		u32 ist = port->port_res[i].eq->attr.ist1;
+ 		ibmebus_free_irq(ist, &port->port_res[i]);
+ 	}
+
+ 	/* ... and the affiliated-error IRQ, which was requested first. */
+ 	ibmebus_free_irq(port->qp_eq->attr.ist1, port);
+
+ 	goto out;
+ }
+
+ /*
+  * Release every interrupt of the port behind @dev: the per-queue IRQs of
+  * all default QPs first, then the QP affiliated-error IRQ.
+  */
+ static void ehea_free_interrupts(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	int q;
+
+ 	/* per-queue interrupts */
+ 	for (q = 0; q < port->num_def_qps; q++) {
+ 		struct ehea_port_res *res = &port->port_res[q];
+
+ 		ibmebus_free_irq(res->eq->attr.ist1, res);
+ 		netif_info(port, intr, dev,
+ 			   "free send irq for res %d with handle 0x%X\n",
+ 			   q, res->eq->attr.ist1);
+ 	}
+
+ 	/* affiliated-error interrupt */
+ 	ibmebus_free_irq(port->qp_eq->attr.ist1, port);
+ 	netif_info(port, intr, dev,
+ 		   "associated event interrupt for handle 0x%X freed\n",
+ 		   port->qp_eq->attr.ist1);
+ }
+
+ /*
+  * Program the receive control flags and the default QP number array of
+  * @port through control block CB0.
+  *
+  * Returns 0 on success, -ENOMEM if no page could be allocated, or -EIO if
+  * the modify hcall fails.
+  */
+ static int ehea_configure_port(struct ehea_port *port)
+ {
+ 	struct hcp_ehea_port_cb0 *cb0;
+ 	u64 hret, mask;
+ 	int ret, i;
+
+ 	cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+ 	if (!cb0)
+ 		return -ENOMEM;
+
+ 	cb0->port_rc = EHEA_BMASK_SET(PXLY_RC_VALID, 1)
+ 		     | EHEA_BMASK_SET(PXLY_RC_IP_CHKSUM, 1)
+ 		     | EHEA_BMASK_SET(PXLY_RC_TCP_UDP_CHKSUM, 1)
+ 		     | EHEA_BMASK_SET(PXLY_RC_VLAN_XTRACT, 1)
+ 		     | EHEA_BMASK_SET(PXLY_RC_VLAN_TAG_FILTER,
+ 				      PXLY_RC_VLAN_FILTER)
+ 		     | EHEA_BMASK_SET(PXLY_RC_JUMBO_FRAME, 1);
+
+ 	/* Without use_mcs every slot points at the QP of port resource 0. */
+ 	for (i = 0; i < port->num_mcs; i++) {
+ 		int src = use_mcs ? i : 0;
+
+ 		cb0->default_qpn_arr[i] =
+ 			port->port_res[src].qp->init_attr.qp_nr;
+ 	}
+
+ 	if (netif_msg_ifup(port))
+ 		ehea_dump(cb0, sizeof(*cb0), "ehea_configure_port");
+
+ 	mask = EHEA_BMASK_SET(H_PORT_CB0_PRC, 1)
+ 	     | EHEA_BMASK_SET(H_PORT_CB0_DEFQPNARRAY, 1);
+
+ 	hret = ehea_h_modify_ehea_port(port->adapter->handle,
+ 				       port->logical_port_id,
+ 				       H_PORT_CB0, mask, cb0);
+ 	if (hret == H_SUCCESS)
+ 		ret = 0;
+ 	else
+ 		ret = -EIO;
+
+ 	free_page((unsigned long)cb0);
+ 	return ret;
+ }
+
+ /*
+  * Generate the send and receive memory regions of @pr from the adapter's
+  * memory region. If the receive MR cannot be generated, the send MR that
+  * was already created is removed again.
+  *
+  * Returns 0 on success and -EIO on any failure.
+  */
+ int ehea_gen_smrs(struct ehea_port_res *pr)
+ {
+ 	struct ehea_adapter *adapter = pr->port->adapter;
+
+ 	if (ehea_gen_smr(adapter, &adapter->mr, &pr->send_mr))
+ 		goto fail;
+
+ 	if (ehea_gen_smr(adapter, &adapter->mr, &pr->recv_mr)) {
+ 		ehea_rem_mr(&pr->send_mr);
+ 		goto fail;
+ 	}
+
+ 	return 0;
+
+ fail:
+ 	pr_err("Generating SMRS failed\n");
+ 	return -EIO;
+ }
+
+ /*
+  * Remove the send and receive memory regions of @pr. Returns 0 if both
+  * removals succeed and -EIO otherwise. The receive MR is only attempted
+  * once the send MR has been removed successfully.
+  */
+ int ehea_rem_smrs(struct ehea_port_res *pr)
+ {
+ 	if (ehea_rem_mr(&pr->send_mr))
+ 		return -EIO;
+
+ 	if (ehea_rem_mr(&pr->recv_mr))
+ 		return -EIO;
+
+ 	return 0;
+ }
+
+ /*
+  * Allocate a zeroed array of @max_q_entries pointers for @q_skba and reset
+  * its bookkeeping fields. Returns 0 on success or -ENOMEM if the
+  * allocation fails, in which case q_skba->arr is left NULL.
+  */
+ static int ehea_init_q_skba(struct ehea_q_skb_arr *q_skba, int max_q_entries)
+ {
+ 	int bytes = sizeof(void *) * max_q_entries;
+
+ 	q_skba->arr = vzalloc(bytes);
+ 	if (q_skba->arr) {
+ 		q_skba->len = max_q_entries;
+ 		q_skba->index = 0;
+ 		q_skba->os_skbs = 0;
+ 		return 0;
+ 	}
+
+ 	return -ENOMEM;
+ }
+
+ /*
+  * (Re)initialise one port resource: event queue, receive and send
+  * completion queues, the queue pair, the skb bookkeeping arrays and the
+  * shared memory regions, and finally register the NAPI context.
+  *
+  * @port:        port the resource belongs to
+  * @pr:          port resource to initialise; it is zeroed here, but its
+  *               tx/rx byte and packet counters are carried over
+  * @pr_cfg:      requested queue sizes
+  * @queue_token: token stored in the QP init attributes
+  *
+  * Returns 0 on success, -ENOMEM if the QP init attributes cannot be
+  * allocated, or -EIO / a non-zero value on any other failure. On failure
+  * everything created so far is torn down again.
+  */
+ static int ehea_init_port_res(struct ehea_port *port, struct ehea_port_res *pr,
+ 			      struct port_res_cfg *pr_cfg, int queue_token)
+ {
+ 	struct ehea_adapter *adapter = port->adapter;
+ 	enum ehea_eq_type eq_type = EHEA_EQ;
+ 	struct ehea_qp_init_attr *init_attr = NULL;
+ 	int ret = -EIO;
+ 	u64 tx_bytes, rx_bytes, tx_packets, rx_packets;
+
+ 	/* Preserve the statistics across the memset() below. */
+ 	tx_bytes = pr->tx_bytes;
+ 	tx_packets = pr->tx_packets;
+ 	rx_bytes = pr->rx_bytes;
+ 	rx_packets = pr->rx_packets;
+
+ 	memset(pr, 0, sizeof(struct ehea_port_res));
+
+ 	pr->tx_bytes = tx_bytes;
+ 	pr->tx_packets = tx_packets;
+ 	pr->rx_bytes = rx_bytes;
+ 	pr->rx_packets = rx_packets;
+
+ 	pr->port = port;
+
+ 	pr->eq = ehea_create_eq(adapter, eq_type, EHEA_MAX_ENTRIES_EQ, 0);
+ 	if (!pr->eq) {
+ 		pr_err("create_eq failed (eq)\n");
+ 		goto out_free;
+ 	}
+
+ 	pr->recv_cq = ehea_create_cq(adapter, pr_cfg->max_entries_rcq,
+ 				     pr->eq->fw_handle,
+ 				     port->logical_port_id);
+ 	if (!pr->recv_cq) {
+ 		pr_err("create_cq failed (cq_recv)\n");
+ 		goto out_free;
+ 	}
+
+ 	pr->send_cq = ehea_create_cq(adapter, pr_cfg->max_entries_scq,
+ 				     pr->eq->fw_handle,
+ 				     port->logical_port_id);
+ 	if (!pr->send_cq) {
+ 		pr_err("create_cq failed (cq_send)\n");
+ 		goto out_free;
+ 	}
+
+ 	if (netif_msg_ifup(port))
+ 		pr_info("Send CQ: act_nr_cqes=%d, Recv CQ: act_nr_cqes=%d\n",
+ 			pr->send_cq->attr.act_nr_of_cqes,
+ 			pr->recv_cq->attr.act_nr_of_cqes);
+
+ 	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+ 	if (!init_attr) {
+ 		ret = -ENOMEM;
+ 		pr_err("no mem for ehea_qp_init_attr\n");
+ 		goto out_free;
+ 	}
+
+ 	init_attr->low_lat_rq1 = 1;
+ 	init_attr->signalingtype = 1;	/* generate CQE if specified in WQE */
+ 	init_attr->rq_count = 3;
+ 	init_attr->qp_token = queue_token;
+ 	init_attr->max_nr_send_wqes = pr_cfg->max_entries_sq;
+ 	init_attr->max_nr_rwqes_rq1 = pr_cfg->max_entries_rq1;
+ 	init_attr->max_nr_rwqes_rq2 = pr_cfg->max_entries_rq2;
+ 	init_attr->max_nr_rwqes_rq3 = pr_cfg->max_entries_rq3;
+ 	init_attr->wqe_size_enc_sq = EHEA_SG_SQ;
+ 	init_attr->wqe_size_enc_rq1 = EHEA_SG_RQ1;
+ 	init_attr->wqe_size_enc_rq2 = EHEA_SG_RQ2;
+ 	init_attr->wqe_size_enc_rq3 = EHEA_SG_RQ3;
+ 	init_attr->rq2_threshold = EHEA_RQ2_THRESHOLD;
+ 	init_attr->rq3_threshold = EHEA_RQ3_THRESHOLD;
+ 	init_attr->port_nr = port->logical_port_id;
+ 	init_attr->send_cq_handle = pr->send_cq->fw_handle;
+ 	init_attr->recv_cq_handle = pr->recv_cq->fw_handle;
+ 	init_attr->aff_eq_handle = port->qp_eq->fw_handle;
+
+ 	pr->qp = ehea_create_qp(adapter, adapter->pd, init_attr);
+ 	if (!pr->qp) {
+ 		pr_err("create_qp failed\n");
+ 		ret = -EIO;
+ 		goto out_free;
+ 	}
+
+ 	if (netif_msg_ifup(port))
+ 		pr_info("QP: qp_nr=%d\n act_nr_snd_wqe=%d\n nr_rwqe_rq1=%d\n nr_rwqe_rq2=%d\n nr_rwqe_rq3=%d\n",
+ 			init_attr->qp_nr,
+ 			init_attr->act_nr_send_wqes,
+ 			init_attr->act_nr_rwqes_rq1,
+ 			init_attr->act_nr_rwqes_rq2,
+ 			init_attr->act_nr_rwqes_rq3);
+
+ 	pr->sq_skba_size = init_attr->act_nr_send_wqes + 1;
+
+ 	/* Each skb array holds one slot more than the queue has WQEs. */
+ 	ret = ehea_init_q_skba(&pr->sq_skba, pr->sq_skba_size);
+ 	ret |= ehea_init_q_skba(&pr->rq1_skba, init_attr->act_nr_rwqes_rq1 + 1);
+ 	ret |= ehea_init_q_skba(&pr->rq2_skba, init_attr->act_nr_rwqes_rq2 + 1);
+ 	ret |= ehea_init_q_skba(&pr->rq3_skba, init_attr->act_nr_rwqes_rq3 + 1);
+ 	if (ret)
+ 		goto out_free;
+
+ 	pr->swqe_refill_th = init_attr->act_nr_send_wqes / 10;
+ 	if (ehea_gen_smrs(pr) != 0) {
+ 		ret = -EIO;
+ 		goto out_free;
+ 	}
+
+ 	atomic_set(&pr->swqe_avail, init_attr->act_nr_send_wqes - 1);
+
+ 	kfree(init_attr);
+
+ 	netif_napi_add(pr->port->netdev, &pr->napi, ehea_poll, 64);
+
+ 	ret = 0;
+ 	goto out;
+
+ out_free:
+ 	/* Fields not set up yet are still zero from the memset() above. */
+ 	kfree(init_attr);
+ 	vfree(pr->sq_skba.arr);
+ 	vfree(pr->rq1_skba.arr);
+ 	vfree(pr->rq2_skba.arr);
+ 	vfree(pr->rq3_skba.arr);
+ 	ehea_destroy_qp(pr->qp);
+ 	ehea_destroy_cq(pr->send_cq);
+ 	ehea_destroy_cq(pr->recv_cq);
+ 	ehea_destroy_eq(pr->eq);
+ out:
+ 	return ret;
+ }
+
+ /*
+  * Tear down one port resource: NAPI context (only when a QP exists), QP,
+  * completion and event queues, every skb still held in the four skb
+  * arrays, the arrays themselves, and finally the shared memory regions.
+  *
+  * If destroying the QP fails, nothing else is released and that error is
+  * returned; otherwise the result of ehea_rem_smrs() is returned.
+  */
+ static int ehea_clean_portres(struct ehea_port *port, struct ehea_port_res *pr)
+ {
+ 	int rc, n;
+
+ 	if (pr->qp)
+ 		netif_napi_del(&pr->napi);
+
+ 	rc = ehea_destroy_qp(pr->qp);
+ 	if (rc)
+ 		return rc;
+
+ 	ehea_destroy_cq(pr->send_cq);
+ 	ehea_destroy_cq(pr->recv_cq);
+ 	ehea_destroy_eq(pr->eq);
+
+ 	for (n = 0; n < pr->rq1_skba.len; n++) {
+ 		if (pr->rq1_skba.arr[n])
+ 			dev_kfree_skb(pr->rq1_skba.arr[n]);
+ 	}
+
+ 	for (n = 0; n < pr->rq2_skba.len; n++) {
+ 		if (pr->rq2_skba.arr[n])
+ 			dev_kfree_skb(pr->rq2_skba.arr[n]);
+ 	}
+
+ 	for (n = 0; n < pr->rq3_skba.len; n++) {
+ 		if (pr->rq3_skba.arr[n])
+ 			dev_kfree_skb(pr->rq3_skba.arr[n]);
+ 	}
+
+ 	for (n = 0; n < pr->sq_skba.len; n++) {
+ 		if (pr->sq_skba.arr[n])
+ 			dev_kfree_skb(pr->sq_skba.arr[n]);
+ 	}
+
+ 	vfree(pr->rq1_skba.arr);
+ 	vfree(pr->rq2_skba.arr);
+ 	vfree(pr->rq3_skba.arr);
+ 	vfree(pr->sq_skba.arr);
+
+ 	return ehea_rem_smrs(pr);
+ }
+
+ /*
+  * Fill the immediate-data area of a type-2 send WQE from the linear part
+  * of @skb.
+  *
+  * Up to SWQE2_MAX_IMM bytes are copied (for GSO packets: exactly the
+  * Ethernet + IP + TCP headers). Linear data beyond the copied part is
+  * described by the WQE's first scatter/gather entry using @lkey, in which
+  * case swqe->descriptors becomes 1; otherwise it stays 0.
+  */
+ static void write_swqe2_immediate(struct sk_buff *skb, struct ehea_swqe *swqe,
+ u32 lkey)
+ {
+ int skb_data_size = skb_headlen(skb);
+ u8 *imm_data = &swqe->u.immdata_desc.immediate_data[0];
+ struct ehea_vsgentry *sg1entry = &swqe->u.immdata_desc.sg_entry;
+ unsigned int immediate_len = SWQE2_MAX_IMM;
+
+ swqe->descriptors = 0;
+
+ if (skb_is_gso(skb)) {
+ swqe->tx_control |= EHEA_SWQE_TSO;
+ swqe->mss = skb_shinfo(skb)->gso_size;
+ /*
+ * For TSO packets we only copy the headers into the
+ * immediate area.
+ */
+ immediate_len = ETH_HLEN + ip_hdrlen(skb) + tcp_hdrlen(skb);
+ }
+
+ if (skb_is_gso(skb) || skb_data_size >= SWQE2_MAX_IMM) {
+ skb_copy_from_linear_data(skb, imm_data, immediate_len);
+ swqe->immediate_data_length = immediate_len;
+
+ /* Remaining linear data goes into the first SG entry. */
+ if (skb_data_size > immediate_len) {
+ sg1entry->l_key = lkey;
+ sg1entry->len = skb_data_size - immediate_len;
+ sg1entry->vaddr =
+ ehea_map_vaddr(skb->data + immediate_len);
+ swqe->descriptors++;
+ }
+ } else {
+ /* The whole linear part fits into the immediate area. */
+ skb_copy_from_linear_data(skb, imm_data, skb_data_size);
+ swqe->immediate_data_length = skb_data_size;
+ }
+ }
+
+ /*
+  * Describe the payload of @skb in a type-2 send WQE: the linear part via
+  * write_swqe2_immediate(), then one scatter/gather entry per page
+  * fragment, all using @lkey. swqe->descriptors counts the entries used.
+  *
+  * @dev is not referenced by this function.
+  */
+ static inline void write_swqe2_data(struct sk_buff *skb, struct net_device *dev,
+ struct ehea_swqe *swqe, u32 lkey)
+ {
+ struct ehea_vsgentry *sg_list, *sg1entry, *sgentry;
+ skb_frag_t *frag;
+ int nfrags, sg1entry_contains_frag_data, i;
+
+ nfrags = skb_shinfo(skb)->nr_frags;
+ sg1entry = &swqe->u.immdata_desc.sg_entry;
+ sg_list = (struct ehea_vsgentry *)&swqe->u.immdata_desc.sg_list;
+ sg1entry_contains_frag_data = 0;
+
+ write_swqe2_immediate(skb, swqe, lkey);
+
+ /* write descriptors */
+ if (nfrags > 0) {
+ if (swqe->descriptors == 0) {
+ /* sg1entry not yet used */
+ frag = &skb_shinfo(skb)->frags[0];
+
+ /* copy sg1entry data */
+ sg1entry->l_key = lkey;
+ sg1entry->len = skb_frag_size(frag);
+ sg1entry->vaddr =
+ ehea_map_vaddr(skb_frag_address(frag));
+ swqe->descriptors++;
+ sg1entry_contains_frag_data = 1;
+ }
+
+ /*
+ * Remaining fragments go into sg_list; when fragment 0 already
+ * went into sg1entry, the sg_list index trails the fragment
+ * index by one.
+ */
+ for (i = sg1entry_contains_frag_data; i < nfrags; i++) {
+
+ frag = &skb_shinfo(skb)->frags[i];
+ sgentry = &sg_list[i - sg1entry_contains_frag_data];
+
+ sgentry->l_key = lkey;
+ sgentry->len = skb_frag_size(frag);
+ sgentry->vaddr = ehea_map_vaddr(skb_frag_address(frag));
+ swqe->descriptors++;
+ }
+ }
+ }
+
+ /*
+  * Register or deregister the port's MAC address for broadcast reception,
+  * once for untagged frames and once for all VLAN IDs.
+  *
+  * @port:    port whose mac_addr is (de)registered
+  * @hcallid: H_REG_BCMC to register; any other value is reported as a
+  *           deregistration in the error messages
+  *
+  * Returns 0 on success or -EIO if either hcall fails. The VLAN step is
+  * skipped when the untagged step fails.
+  */
+ static int ehea_broadcast_reg_helper(struct ehea_port *port, u32 hcallid)
+ {
+ 	int ret = 0;
+ 	u64 hret;
+ 	u8 reg_type;
+
+ 	/* De/Register untagged packets */
+ 	reg_type = EHEA_BCMC_BROADCAST | EHEA_BCMC_UNTAGGED;
+ 	hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+ 				     port->logical_port_id,
+ 				     reg_type, port->mac_addr, 0, hcallid);
+ 	if (hret != H_SUCCESS) {
+ 		pr_err("%sregistering bc address failed (untagged)\n",
+ 		       hcallid == H_REG_BCMC ? "" : "de");
+ 		ret = -EIO;
+ 		goto out_herr;
+ 	}
+
+ 	/* De/Register VLAN packets */
+ 	reg_type = EHEA_BCMC_BROADCAST | EHEA_BCMC_VLANID_ALL;
+ 	hret = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+ 				     port->logical_port_id,
+ 				     reg_type, port->mac_addr, 0, hcallid);
+ 	if (hret != H_SUCCESS) {
+ 		pr_err("%sregistering bc address failed (vlan)\n",
+ 		       hcallid == H_REG_BCMC ? "" : "de");
+ 		ret = -EIO;
+ 	}
+ out_herr:
+ 	return ret;
+ }
+
+ /*
+  * Change the MAC address of the port behind @dev. @sa is a struct sockaddr
+  * carrying the new address in sa_data.
+  *
+  * The address is written to control block CB0, copied into dev->dev_addr,
+  * and - if the port is up - the broadcast registration is moved from the
+  * old to the new address.
+  *
+  * Returns 0 on success, -EADDRNOTAVAIL for an invalid Ethernet address,
+  * -ENOMEM if no page could be allocated, -EIO if the modify hcall fails,
+  * or the error from ehea_broadcast_reg_helper().
+  */
+ static int ehea_set_mac_addr(struct net_device *dev, void *sa)
+ {
+ struct ehea_port *port = netdev_priv(dev);
+ struct sockaddr *mac_addr = sa;
+ struct hcp_ehea_port_cb0 *cb0;
+ int ret;
+ u64 hret;
+
+ if (!is_valid_ether_addr(mac_addr->sa_data)) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cb0) {
+ pr_err("no mem for cb0\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(&(cb0->port_mac_addr), &(mac_addr->sa_data[0]), ETH_ALEN);
+
+ /* The copied bytes are shifted down by 16 bits before the hcall. */
+ cb0->port_mac_addr = cb0->port_mac_addr >> 16;
+
+ hret = ehea_h_modify_ehea_port(port->adapter->handle,
+ port->logical_port_id, H_PORT_CB0,
+ EHEA_BMASK_SET(H_PORT_CB0_MAC, 1), cb0);
+ if (hret != H_SUCCESS) {
+ ret = -EIO;
+ goto out_free;
+ }
+
+ memcpy(dev->dev_addr, mac_addr->sa_data, dev->addr_len);
+
+ /* Deregister old MAC in pHYP */
+ if (port->state == EHEA_PORT_UP) {
+ ret = ehea_broadcast_reg_helper(port, H_DEREG_BCMC);
+ if (ret)
+ goto out_upregs;
+ }
+
+ /* port->mac_addr must still hold the old address for the step above. */
+ port->mac_addr = cb0->port_mac_addr << 16;
+
+ /* Register new MAC in pHYP */
+ if (port->state == EHEA_PORT_UP) {
+ ret = ehea_broadcast_reg_helper(port, H_REG_BCMC);
+ if (ret)
+ goto out_upregs;
+ }
+
+ ret = 0;
+
+ out_upregs:
+ ehea_update_bcmc_registrations();
+ out_free:
+ free_page((unsigned long)cb0);
+ out:
+ return ret;
+ }
+
+ /*
+  * Log why switching promiscuous mode failed. H_AUTHORITY is reported at
+  * info level as a hypervisor denial, anything else as an error. @enable
+  * only selects the wording ("enabling" when 1, "disabling" otherwise).
+  */
+ static void ehea_promiscuous_error(u64 hret, int enable)
+ {
+ 	const char *prefix = (enable == 1) ? "en" : "dis";
+
+ 	if (hret == H_AUTHORITY) {
+ 		pr_info("Hypervisor denied %sabling promiscuous mode\n",
+ 			prefix);
+ 		return;
+ 	}
+
+ 	pr_err("failed %sabling promiscuous mode\n", prefix);
+ }
+
+ /*
+  * Switch promiscuous mode on or off for the port behind @dev by writing
+  * the default unicast QP number in control block CB7. Does nothing if the
+  * port is already in the requested state. port->promisc is only updated
+  * when the hcall succeeds.
+  */
+ static void ehea_promiscuous(struct net_device *dev, int enable)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct hcp_ehea_port_cb7 *cb7;
+ 	u64 hret;
+
+ 	if (port->promisc == enable)
+ 		return;
+
+ 	cb7 = (void *)get_zeroed_page(GFP_ATOMIC);
+ 	if (!cb7) {
+ 		pr_err("no mem for cb7\n");
+ 		goto out;
+ 	}
+
+ 	/* Modify Pxs_DUCQPN in CB7 */
+ 	if (enable == 1)
+ 		cb7->def_uc_qpn = port->port_res[0].qp->fw_handle;
+ 	else
+ 		cb7->def_uc_qpn = 0;
+
+ 	hret = ehea_h_modify_ehea_port(port->adapter->handle,
+ 				       port->logical_port_id,
+ 				       H_PORT_CB7, H_PORT_CB7_DUCQPN, cb7);
+ 	if (!hret)
+ 		port->promisc = enable;
+ 	else
+ 		ehea_promiscuous_error(hret, enable);
+
+ out:
+ 	free_page((unsigned long)cb7);
+ }
+
+ /*
+  * Register or deregister (per @hcallid) the multicast address
+  * @mc_mac_addr on @port, first for untagged frames and then for all VLAN
+  * IDs. Returns the hcall status: the first non-zero one, or the status of
+  * the VLAN call.
+  */
+ static u64 ehea_multicast_reg_helper(struct ehea_port *port, u64 mc_mac_addr,
+ 				     u32 hcallid)
+ {
+ 	u8 untagged_type = EHEA_BCMC_SCOPE_ALL | EHEA_BCMC_MULTICAST
+ 			 | EHEA_BCMC_UNTAGGED;
+ 	u8 vlan_type = EHEA_BCMC_SCOPE_ALL | EHEA_BCMC_MULTICAST
+ 		     | EHEA_BCMC_VLANID_ALL;
+ 	u64 status;
+
+ 	status = ehea_h_reg_dereg_bcmc(port->adapter->handle,
+ 				       port->logical_port_id,
+ 				       untagged_type, mc_mac_addr, 0, hcallid);
+ 	if (status)
+ 		return status;
+
+ 	return ehea_h_reg_dereg_bcmc(port->adapter->handle,
+ 				     port->logical_port_id,
+ 				     vlan_type, mc_mac_addr, 0, hcallid);
+ }
+
+ /*
+  * Deregister and free every entry on the port's multicast list. Entries
+  * are unlinked and freed even when their deregistration fails.
+  *
+  * Returns 0 if every deregistration succeeded, -EIO otherwise.
+  */
+ static int ehea_drop_multicast_list(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct ehea_mc_list *victim;
+ 	struct list_head *cur;
+ 	struct list_head *next;
+ 	int status = 0;
+
+ 	list_for_each_safe(cur, next, &(port->mc_list->list)) {
+ 		victim = list_entry(cur, struct ehea_mc_list, list);
+
+ 		if (ehea_multicast_reg_helper(port, victim->macaddr,
+ 					      H_DEREG_BCMC)) {
+ 			pr_err("failed deregistering mcast MAC\n");
+ 			status = -EIO;
+ 		}
+
+ 		list_del(cur);
+ 		kfree(victim);
+ 	}
+
+ 	return status;
+ }
+
+ /*
+  * Switch all-multicast reception on or off. Nothing happens when the port
+  * is already in the requested state. Enabling first drops the individual
+  * multicast registrations. port->allmulti changes only if the hcall
+  * succeeds.
+  */
+ static void ehea_allmulti(struct net_device *dev, int enable)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	u64 hret;
+
+ 	if (enable && !port->allmulti) {
+ 		/* Enable ALLMULTI */
+ 		ehea_drop_multicast_list(dev);
+ 		hret = ehea_multicast_reg_helper(port, 0, H_REG_BCMC);
+ 		if (hret)
+ 			netdev_err(dev,
+ 				   "failed enabling IFF_ALLMULTI\n");
+ 		else
+ 			port->allmulti = 1;
+ 	} else if (!enable && port->allmulti) {
+ 		/* Disable ALLMULTI */
+ 		hret = ehea_multicast_reg_helper(port, 0, H_DEREG_BCMC);
+ 		if (hret)
+ 			netdev_err(dev,
+ 				   "failed disabling IFF_ALLMULTI\n");
+ 		else
+ 			port->allmulti = 0;
+ 	}
+ }
+
+ /*
+  * Register the multicast address @mc_mac_addr (ETH_ALEN bytes) on @port
+  * and, on success, add it to the port's multicast list. Allocation and
+  * registration failures are logged; in both cases nothing is added.
+  */
+ static void ehea_add_multicast_entry(struct ehea_port *port, u8 *mc_mac_addr)
+ {
+ 	struct ehea_mc_list *entry;
+
+ 	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ 	if (!entry) {
+ 		pr_err("no mem for mcl_entry\n");
+ 		return;
+ 	}
+
+ 	INIT_LIST_HEAD(&entry->list);
+ 	memcpy(&entry->macaddr, mc_mac_addr, ETH_ALEN);
+
+ 	if (ehea_multicast_reg_helper(port, entry->macaddr, H_REG_BCMC)) {
+ 		pr_err("failed registering mcast MAC\n");
+ 		kfree(entry);
+ 		return;
+ 	}
+
+ 	list_add(&entry->list, &port->mc_list->list);
+ }
+
+ /*
+  * Bring the port's receive filtering in line with the state of @dev:
+  * promiscuous mode, all-multicast mode, or the individual multicast list.
+  *
+  * The promiscuous request is read from dev->flags. port->promisc only
+  * records what ehea_promiscuous() last programmed successfully, so it
+  * cannot be used to decide whether promiscuous mode is wanted.
+  */
+ static void ehea_set_multicast_list(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct netdev_hw_addr *ha;
+ 	int ret;
+
+ 	if (dev->flags & IFF_PROMISC) {
+ 		ehea_promiscuous(dev, 1);
+ 		return;
+ 	}
+ 	ehea_promiscuous(dev, 0);
+
+ 	if (dev->flags & IFF_ALLMULTI) {
+ 		ehea_allmulti(dev, 1);
+ 		goto out;
+ 	}
+ 	ehea_allmulti(dev, 0);
+
+ 	if (!netdev_mc_empty(dev)) {
+ 		ret = ehea_drop_multicast_list(dev);
+ 		if (ret) {
+ 			/* Dropping the current multicast list failed.
+ 			 * Enabling ALL_MULTI is the best we can do.
+ 			 */
+ 			ehea_allmulti(dev, 1);
+ 		}
+
+ 		if (netdev_mc_count(dev) > port->adapter->max_mc_mac) {
+ 			pr_info("Mcast registration limit reached (0x%llx). Use ALLMULTI!\n",
+ 				port->adapter->max_mc_mac);
+ 			goto out;
+ 		}
+
+ 		netdev_for_each_mc_addr(ha, dev)
+ 			ehea_add_multicast_entry(port, ha->addr);
+
+ 	}
+ out:
+ 	ehea_update_bcmc_registrations();
+ }
+
+ /*
+  * Set the MTU of @dev. Accepts values from 68 up to EHEA_MAX_PACKET_SIZE
+  * inclusive; returns -EINVAL for anything else and 0 on success.
+  */
+ static int ehea_change_mtu(struct net_device *dev, int new_mtu)
+ {
+ 	if (new_mtu >= 68 && new_mtu <= EHEA_MAX_PACKET_SIZE) {
+ 		dev->mtu = new_mtu;
+ 		return 0;
+ 	}
+
+ 	return -EINVAL;
+ }
+
+ /*
+  * Set the transmit control bits and header offsets shared by both send
+  * WQE formats. Immediate-data and CRC flags are always set. For IPv4
+  * packets the IP header bounds are recorded and, for UDP and TCP, the
+  * offset of the checksum field. The checksum flags are only set when the
+  * skb asks for CHECKSUM_PARTIAL.
+  */
+ static void xmit_common(struct sk_buff *skb, struct ehea_swqe *swqe)
+ {
+ 	size_t csum_field;
+
+ 	swqe->tx_control |= EHEA_SWQE_IMM_DATA_PRESENT | EHEA_SWQE_CRC;
+
+ 	if (skb->protocol != htons(ETH_P_IP))
+ 		return;
+
+ 	if (skb->ip_summed == CHECKSUM_PARTIAL)
+ 		swqe->tx_control |= EHEA_SWQE_IP_CHECKSUM;
+
+ 	swqe->ip_start = skb_network_offset(skb);
+ 	swqe->ip_end = swqe->ip_start + ip_hdrlen(skb) - 1;
+
+ 	/* Only UDP and TCP carry a layer-4 checksum handled here. */
+ 	switch (ip_hdr(skb)->protocol) {
+ 	case IPPROTO_UDP:
+ 		csum_field = offsetof(struct udphdr, check);
+ 		break;
+ 	case IPPROTO_TCP:
+ 		csum_field = offsetof(struct tcphdr, check);
+ 		break;
+ 	default:
+ 		return;
+ 	}
+
+ 	/* The same flag is used for both UDP and TCP. */
+ 	if (skb->ip_summed == CHECKSUM_PARTIAL)
+ 		swqe->tx_control |= EHEA_SWQE_TCP_CHECKSUM;
+
+ 	swqe->tcp_offset = swqe->ip_end + 1 + csum_field;
+ }
+
+ /*
+  * Build a type-2 send WQE (immediate data plus descriptors) for @skb. The
+  * skb is not freed here.
+  */
+ static void ehea_xmit2(struct sk_buff *skb, struct net_device *dev,
+ 		       struct ehea_swqe *swqe, u32 lkey)
+ {
+ 	xmit_common(skb, swqe);
+
+ 	/* xmit_common() only ORs bits into tx_control, so order is free. */
+ 	swqe->tx_control |= EHEA_SWQE_DESCRIPTORS_PRESENT;
+
+ 	write_swqe2_data(skb, dev, swqe, lkey);
+ }
+
+ /*
+  * Build a type-3 send WQE: the whole packet is copied into the WQE's
+  * immediate-data area, and the skb is freed right away.
+  */
+ static void ehea_xmit3(struct sk_buff *skb, struct net_device *dev,
+ 		       struct ehea_swqe *swqe)
+ {
+ 	u8 *dst = &swqe->u.immdata_nodesc.immediate_data[0];
+
+ 	xmit_common(skb, swqe);
+
+ 	/* Non-linear skbs need skb_copy_bits() to gather the fragments. */
+ 	if (skb->data_len)
+ 		skb_copy_bits(skb, 0, dst, skb->len);
+ 	else
+ 		skb_copy_from_linear_data(skb, dst, skb->len);
+
+ 	swqe->immediate_data_length = skb->len;
+ 	dev_kfree_skb(skb);
+ }
+
+ /*
+  * Transmit entry point. Picks the port resource matching the skb's queue
+  * mapping, builds a send WQE and posts it.
+  *
+  * Packets of at most SWQE3_MAX_IMM bytes are copied entirely into the WQE
+  * (type 3) and request a completion only every sig_comp_iv-th packet.
+  * Larger packets use a type-2 WQE, are remembered in sq_skba until
+  * completed, and always request a completion.
+  *
+  * Always returns NETDEV_TX_OK. The TX queue is stopped when at most one
+  * send WQE remains available, or when __EHEA_STOP_XFER is set.
+  */
+ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct ehea_port *port = netdev_priv(dev);
+ struct ehea_swqe *swqe;
+ u32 lkey;
+ int swqe_index;
+ struct ehea_port_res *pr;
+ struct netdev_queue *txq;
+
+ pr = &port->port_res[skb_get_queue_mapping(skb)];
+ txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
+
+ swqe = ehea_get_swqe(pr->qp, &swqe_index);
+ memset(swqe, 0, SWQE_HEADER_SIZE);
+ atomic_dec(&pr->swqe_avail);
+
+ if (vlan_tx_tag_present(skb)) {
+ swqe->tx_control |= EHEA_SWQE_VLAN_INSERT;
+ swqe->vlan_tag = vlan_tx_tag_get(skb);
+ }
+
+ /* Read skb->len before ehea_xmit3() may free the skb. */
+ pr->tx_packets++;
+ pr->tx_bytes += skb->len;
+
+ if (skb->len <= SWQE3_MAX_IMM) {
+ u32 sig_iv = port->sig_comp_iv;
+ u32 swqe_num = pr->swqe_id_counter;
+ ehea_xmit3(skb, dev, swqe);
+ swqe->wr_id = EHEA_BMASK_SET(EHEA_WR_ID_TYPE, EHEA_SWQE3_TYPE)
+ | EHEA_BMASK_SET(EHEA_WR_ID_COUNT, swqe_num);
+ /* Signal once per sig_iv WQEs; the REFILL field carries sig_iv. */
+ if (pr->swqe_ll_count >= (sig_iv - 1)) {
+ swqe->wr_id |= EHEA_BMASK_SET(EHEA_WR_ID_REFILL,
+ sig_iv);
+ swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+ pr->swqe_ll_count = 0;
+ } else
+ pr->swqe_ll_count += 1;
+ } else {
+ swqe->wr_id =
+ EHEA_BMASK_SET(EHEA_WR_ID_TYPE, EHEA_SWQE2_TYPE)
+ | EHEA_BMASK_SET(EHEA_WR_ID_COUNT, pr->swqe_id_counter)
+ | EHEA_BMASK_SET(EHEA_WR_ID_REFILL, 1)
+ | EHEA_BMASK_SET(EHEA_WR_ID_INDEX, pr->sq_skba.index);
+ pr->sq_skba.arr[pr->sq_skba.index] = skb;
+
+ /* NOTE(review): the mask wrap assumes sq_skba.len is a power of two - confirm. */
+ pr->sq_skba.index++;
+ pr->sq_skba.index &= (pr->sq_skba.len - 1);
+
+ lkey = pr->send_mr.lkey;
+ ehea_xmit2(skb, dev, swqe, lkey);
+ swqe->tx_control |= EHEA_SWQE_SIGNALLED_COMPLETION;
+ }
+ pr->swqe_id_counter += 1;
+
+ netif_info(port, tx_queued, dev,
+ "post swqe on QP %d\n", pr->qp->init_attr.qp_nr);
+ if (netif_msg_tx_queued(port))
+ ehea_dump(swqe, 512, "swqe");
+
+ /* With the stop flag set, the WQE is still posted but marked PURGE. */
+ if (unlikely(test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))) {
+ netif_tx_stop_queue(txq);
+ swqe->tx_control |= EHEA_SWQE_PURGE;
+ }
+
+ ehea_post_swqe(pr->qp, swqe);
+
+ if (unlikely(atomic_read(&pr->swqe_avail) <= 1)) {
+ pr->p_stats.queue_stopped++;
+ netif_tx_stop_queue(txq);
+ }
+
+ return NETDEV_TX_OK;
+ }
+
+ /*
+  * Add VLAN ID @vid to the port's VLAN filter: read control block CB1, set
+  * the bit for @vid, and write the block back. Failures are only logged.
+  */
+ static void ehea_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct ehea_adapter *adapter = port->adapter;
+ 	struct hcp_ehea_port_cb1 *cb1;
+ 	u64 vid_bit;
+ 	u64 hret;
+
+ 	cb1 = (void *)get_zeroed_page(GFP_KERNEL);
+ 	if (!cb1) {
+ 		pr_err("no mem for cb1\n");
+ 		goto out;
+ 	}
+
+ 	hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
+ 				      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+ 	if (hret != H_SUCCESS) {
+ 		pr_err("query_ehea_port failed\n");
+ 		goto out;
+ 	}
+
+ 	/* 64 VLAN IDs per filter word, most significant bit first. */
+ 	vid_bit = (u64)(0x8000000000000000 >> (vid & 0x3F));
+ 	cb1->vlan_filter[vid / 64] |= vid_bit;
+
+ 	hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
+ 				       H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+ 	if (hret != H_SUCCESS)
+ 		pr_err("modify_ehea_port failed\n");
+
+ out:
+ 	free_page((unsigned long)cb1);
+ }
+
+ /*
+  * Remove VLAN ID @vid from the port's VLAN filter: read control block
+  * CB1, clear the bit for @vid, and write the block back. Failures are
+  * only logged.
+  */
+ static void ehea_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct ehea_adapter *adapter = port->adapter;
+ 	struct hcp_ehea_port_cb1 *cb1;
+ 	u64 vid_bit;
+ 	u64 hret;
+
+ 	cb1 = (void *)get_zeroed_page(GFP_KERNEL);
+ 	if (!cb1) {
+ 		pr_err("no mem for cb1\n");
+ 		goto out;
+ 	}
+
+ 	hret = ehea_h_query_ehea_port(adapter->handle, port->logical_port_id,
+ 				      H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+ 	if (hret != H_SUCCESS) {
+ 		pr_err("query_ehea_port failed\n");
+ 		goto out;
+ 	}
+
+ 	/* 64 VLAN IDs per filter word, most significant bit first. */
+ 	vid_bit = (u64)(0x8000000000000000 >> (vid & 0x3F));
+ 	cb1->vlan_filter[vid / 64] &= ~vid_bit;
+
+ 	hret = ehea_h_modify_ehea_port(adapter->handle, port->logical_port_id,
+ 				       H_PORT_CB1, H_PORT_CB1_ALL, cb1);
+ 	if (hret != H_SUCCESS)
+ 		pr_err("modify_ehea_port failed\n");
+
+ out:
+ 	free_page((unsigned long)cb1);
+ }
+
+ /*
+  * Activate @qp by stepping its control register through three values:
+  * STATE_INITIALIZED, then ENABLED | STATE_INITIALIZED, then
+  * ENABLED | STATE_RDY2SND. The QP control block is re-queried before every
+  * modification and once more at the end.
+  *
+  * Returns 0 on success, -ENOMEM if no page could be allocated, or -EIO as
+  * soon as any query or modify hcall fails.
+  */
+ int ehea_activate_qp(struct ehea_adapter *adapter, struct ehea_qp *qp)
+ {
+ int ret = -EIO;
+ u64 hret;
+ u16 dummy16 = 0;
+ u64 dummy64 = 0;
+ struct hcp_modify_qp_cb0 *cb0;
+
+ cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cb0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (1)\n");
+ goto out;
+ }
+
+ /* Step 1: initialized */
+ cb0->qp_ctl_reg = H_QP_CR_STATE_INITIALIZED;
+ hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+ &dummy64, &dummy64, &dummy16, &dummy16);
+ if (hret != H_SUCCESS) {
+ pr_err("modify_ehea_qp failed (1)\n");
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (2)\n");
+ goto out;
+ }
+
+ /* Step 2: enabled, still in the initialized state */
+ cb0->qp_ctl_reg = H_QP_CR_ENABLED | H_QP_CR_STATE_INITIALIZED;
+ hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+ &dummy64, &dummy64, &dummy16, &dummy16);
+ if (hret != H_SUCCESS) {
+ pr_err("modify_ehea_qp failed (2)\n");
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (3)\n");
+ goto out;
+ }
+
+ /* Step 3: enabled and ready to send */
+ cb0->qp_ctl_reg = H_QP_CR_ENABLED | H_QP_CR_STATE_RDY2SND;
+ hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG, 1), cb0,
+ &dummy64, &dummy64, &dummy16, &dummy16);
+ if (hret != H_SUCCESS) {
+ pr_err("modify_ehea_qp failed (3)\n");
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF), cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (4)\n");
+ goto out;
+ }
+
+ ret = 0;
+ out:
+ /* Also reached with cb0 == NULL when the allocation failed. */
+ free_page((unsigned long)cb0);
+ return ret;
+ }
+
+ /*
+  * ehea_port_res_setup - create the port's QP event queue and port resources.
+  * @port:    port to set up
+  * @def_qps: number of port resources (port->port_res[0..def_qps-1]) to init
+  *
+  * Creates port->qp_eq and initialises @def_qps port resources using the
+  * module-wide queue sizes (rq1/rq2/rq3/sq_entries).
+  *
+  * The former second loop ("for (i = def_qps; i < def_qps; i++)") could never
+  * execute, so it and its pr_cfg_small_rx configuration have been removed.
+  *
+  * Returns 0 on success, -EINVAL if the event queue cannot be created, or the
+  * error from ehea_init_port_res(); on failure everything set up so far is
+  * torn down again.
+  */
+ static int ehea_port_res_setup(struct ehea_port *port, int def_qps)
+ {
+ 	int ret, i;
+ 	struct port_res_cfg pr_cfg;
+
+ 	port->qp_eq = ehea_create_eq(port->adapter, EHEA_EQ,
+ 				     EHEA_MAX_ENTRIES_EQ, 1);
+ 	if (!port->qp_eq) {
+ 		pr_err("ehea_create_eq failed (qp_eq)\n");
+ 		/* nothing was created, so there is nothing to destroy */
+ 		return -EINVAL;
+ 	}
+
+ 	pr_cfg.max_entries_rcq = rq1_entries + rq2_entries + rq3_entries;
+ 	pr_cfg.max_entries_scq = sq_entries * 2;
+ 	pr_cfg.max_entries_sq = sq_entries;
+ 	pr_cfg.max_entries_rq1 = rq1_entries;
+ 	pr_cfg.max_entries_rq2 = rq2_entries;
+ 	pr_cfg.max_entries_rq3 = rq3_entries;
+
+ 	for (i = 0; i < def_qps; i++) {
+ 		ret = ehea_init_port_res(port, &port->port_res[i], &pr_cfg, i);
+ 		if (ret)
+ 			goto out_clean_pr;
+ 	}
+
+ 	return 0;
+
+ out_clean_pr:
+ 	/* undo only the resources that were initialised successfully */
+ 	while (--i >= 0)
+ 		ehea_clean_portres(port, &port->port_res[i]);
+
+ 	ehea_destroy_eq(port->qp_eq);
+ 	return ret;
+ }
+
+ /*
+  * ehea_clean_all_portres - release every port resource and the QP event queue.
+  *
+  * Returns the bitwise OR of all individual return codes, i.e. 0 only when
+  * every step reported 0.
+  */
+ static int ehea_clean_all_portres(struct ehea_port *port)
+ {
+ 	int idx = 0;
+ 	int status = 0;
+
+ 	while (idx < port->num_def_qps) {
+ 		status |= ehea_clean_portres(port, &port->port_res[idx]);
+ 		idx++;
+ 	}
+
+ 	status |= ehea_destroy_eq(port->qp_eq);
+
+ 	return status;
+ }
+
+ /* Drop the adapter's memory region once no port is active any more. */
+ static void ehea_remove_adapter_mr(struct ehea_adapter *adapter)
+ {
+ 	if (!adapter->active_ports)
+ 		ehea_rem_mr(&adapter->mr);
+ }
+
+ /*
+  * Register the adapter's kernel memory region, unless a port is already
+  * active (in which case nothing is done and 0 is returned).
+  */
+ static int ehea_add_adapter_mr(struct ehea_adapter *adapter)
+ {
+ 	if (!adapter->active_ports)
+ 		return ehea_reg_kernel_mr(adapter, &adapter->mr);
+
+ 	return 0;
+ }
+
+ /*
+  * ehea_up - allocate and activate everything a port needs to pass traffic.
+  *
+  * Does nothing and returns 0 when the port is already EHEA_PORT_UP.
+  * Otherwise sets up the port resources, configures the port, registers the
+  * interrupts, activates and fills every default QP and registers for
+  * broadcast (H_REG_BCMC).  On success the port state becomes EHEA_PORT_UP.
+  *
+  * On failure the steps already done are undone and a negative errno is
+  * returned.  The bcmc registration and firmware handle tables are refreshed
+  * on both the success and the failure path.
+  */
+ static int ehea_up(struct net_device *dev)
+ {
+ 	int ret, i;
+ 	struct ehea_port *port = netdev_priv(dev);
+
+ 	if (port->state == EHEA_PORT_UP)
+ 		return 0;
+
+ 	ret = ehea_port_res_setup(port, port->num_def_qps);
+ 	if (ret) {
+ 		netdev_err(dev, "port_res_failed\n");
+ 		goto out;
+ 	}
+
+ 	/* Set default QP for this port */
+ 	ret = ehea_configure_port(port);
+ 	if (ret) {
+ 		netdev_err(dev, "ehea_configure_port failed. ret:%d\n", ret);
+ 		goto out_clean_pr;
+ 	}
+
+ 	ret = ehea_reg_interrupts(dev);
+ 	if (ret) {
+ 		netdev_err(dev, "reg_interrupts failed. ret:%d\n", ret);
+ 		goto out_clean_pr;
+ 	}
+
+ 	for (i = 0; i < port->num_def_qps; i++) {
+ 		ret = ehea_activate_qp(port->adapter, port->port_res[i].qp);
+ 		if (ret) {
+ 			netdev_err(dev, "activate_qp failed\n");
+ 			goto out_free_irqs;
+ 		}
+ 	}
+
+ 	for (i = 0; i < port->num_def_qps; i++) {
+ 		ret = ehea_fill_port_res(&port->port_res[i]);
+ 		if (ret) {
+ 			/* used to print the goto label name, not the failure */
+ 			netdev_err(dev, "fill_port_res failed\n");
+ 			goto out_free_irqs;
+ 		}
+ 	}
+
+ 	ret = ehea_broadcast_reg_helper(port, H_REG_BCMC);
+ 	if (ret) {
+ 		ret = -EIO;
+ 		goto out_free_irqs;
+ 	}
+
+ 	port->state = EHEA_PORT_UP;
+
+ 	ret = 0;
+ 	goto out;
+
+ out_free_irqs:
+ 	ehea_free_interrupts(dev);
+
+ out_clean_pr:
+ 	ehea_clean_all_portres(port);
+ out:
+ 	if (ret)
+ 		netdev_info(dev, "Failed starting. ret=%i\n", ret);
+
+ 	ehea_update_bcmc_registrations();
+ 	ehea_update_firmware_handles();
+
+ 	return ret;
+ }
+
+ /* Disable NAPI on every default port resource of @port. */
+ static void port_napi_disable(struct ehea_port *port)
+ {
+ 	int qp = 0;
+
+ 	while (qp < port->num_def_qps) {
+ 		napi_disable(&port->port_res[qp].napi);
+ 		qp++;
+ 	}
+ }
+
+ /* Enable NAPI on every default port resource of @port. */
+ static void port_napi_enable(struct ehea_port *port)
+ {
+ 	int qp = 0;
+
+ 	while (qp < port->num_def_qps) {
+ 		napi_enable(&port->port_res[qp].napi);
+ 		qp++;
+ 	}
+ }
+
+ /*
+  * ehea_open - ndo_open handler.
+  *
+  * Brings the port up under port_lock and, if that worked, enables NAPI and
+  * starts all TX queues.  The statistics work is scheduled one second later
+  * regardless of the outcome.  Returns the result of ehea_up().
+  */
+ static int ehea_open(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	int status;
+
+ 	mutex_lock(&port->port_lock);
+
+ 	netif_info(port, ifup, dev, "enabling port\n");
+
+ 	status = ehea_up(dev);
+ 	if (status == 0) {
+ 		port_napi_enable(port);
+ 		netif_tx_start_all_queues(dev);
+ 	}
+
+ 	mutex_unlock(&port->port_lock);
+
+ 	schedule_delayed_work(&port->stats_work, msecs_to_jiffies(1000));
+
+ 	return status;
+ }
+
+ /*
+  * ehea_down - release everything ehea_up() set up for the port.
+  *
+  * Returns 0 immediately if the port is already EHEA_PORT_DOWN.  Otherwise
+  * drops the multicast list, deregisters broadcast (H_DEREG_BCMC), frees the
+  * interrupts, marks the port down and cleans all port resources.
+  *
+  * Returns the result of ehea_clean_all_portres().
+  */
+ static int ehea_down(struct net_device *dev)
+ {
+ int ret;
+ struct ehea_port *port = netdev_priv(dev);
+
+ if (port->state == EHEA_PORT_DOWN)
+ return 0;
+
+ ehea_drop_multicast_list(dev);
+ ehea_broadcast_reg_helper(port, H_DEREG_BCMC);
+
+ ehea_free_interrupts(dev);
+
+ /* state is flipped before the registration tables are refreshed */
+ port->state = EHEA_PORT_DOWN;
+
+ ehea_update_bcmc_registrations();
+
+ ret = ehea_clean_all_portres(port);
+ if (ret)
+ netdev_info(dev, "Failed freeing resources. ret=%i\n", ret);
+
+ ehea_update_firmware_handles();
+
+ return ret;
+ }
+
+ /*
+  * ehea_stop - ndo_stop handler.
+  *
+  * Sets __EHEA_DISABLE_PORT_RESET for the duration of the shutdown, cancels
+  * any pending reset and statistics work, then stops the TX queues, disables
+  * NAPI and takes the port down under port_lock.
+  *
+  * Returns the result of ehea_down().
+  */
+ static int ehea_stop(struct net_device *dev)
+ {
+ int ret;
+ struct ehea_port *port = netdev_priv(dev);
+
+ netif_info(port, ifdown, dev, "disabling port\n");
+
+ /* flag is set before the work items are cancelled and cleared at the end */
+ set_bit(__EHEA_DISABLE_PORT_RESET, &port->flags);
+ cancel_work_sync(&port->reset_task);
+ cancel_delayed_work_sync(&port->stats_work);
+ mutex_lock(&port->port_lock);
+ netif_tx_stop_all_queues(dev);
+ port_napi_disable(port);
+ ret = ehea_down(dev);
+ mutex_unlock(&port->port_lock);
+ clear_bit(__EHEA_DISABLE_PORT_RESET, &port->flags);
+ return ret;
+ }
+
+ /*
+  * ehea_purge_sq - flag every send WQE of a queue pair with EHEA_SWQE_PURGE.
+  *
+  * Works on a local copy of *orig_qp, so whatever ehea_get_swqe() changes in
+  * the qp structure it is handed does not reach the caller's copy.
+  */
+ static void ehea_purge_sq(struct ehea_qp *orig_qp)
+ {
+ 	struct ehea_qp qp_copy = *orig_qp;
+ 	struct ehea_qp_init_attr *attr = &qp_copy.init_attr;
+ 	int wqe_index;
+ 	int n = 0;
+
+ 	while (n < attr->act_nr_send_wqes) {
+ 		struct ehea_swqe *swqe = ehea_get_swqe(&qp_copy, &wqe_index);
+
+ 		swqe->tx_control |= EHEA_SWQE_PURGE;
+ 		n++;
+ 	}
+ }
+
+ /*
+  * ehea_flush_sq - wait for the send queues of a port to drain.
+  *
+  * For each default port resource, waits up to 100 ms on swqe_avail_wq until
+  * swqe_avail reaches sq_skba_size - 2 - swqe_ll_count.  The first timeout
+  * is logged and ends the loop; remaining queues are not waited for.
+  */
+ static void ehea_flush_sq(struct ehea_port *port)
+ {
+ 	int qp;
+
+ 	for (qp = 0; qp < port->num_def_qps; qp++) {
+ 		struct ehea_port_res *pr = &port->port_res[qp];
+ 		int swqe_max = pr->sq_skba_size - 2 - pr->swqe_ll_count;
+ 		int remaining;
+
+ 		remaining = wait_event_timeout(port->swqe_avail_wq,
+ 				atomic_read(&pr->swqe_avail) >= swqe_max,
+ 				msecs_to_jiffies(100));
+ 		if (remaining)
+ 			continue;
+
+ 		pr_err("WARNING: sq not flushed completely\n");
+ 		break;
+ 	}
+ }
+
+ /*
+  * ehea_stop_qps - purge and disable every default queue pair of a port.
+  *
+  * For each port resource: marks all send WQEs as purged, rewrites the QP
+  * control register with H_QP_CR_ENABLED cleared, re-queries the QP and
+  * removes the shared memory regions via ehea_rem_smrs().
+  *
+  * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated, or
+  * -EIO on the first failing step (later queue pairs are then left as is).
+  */
+ int ehea_stop_qps(struct net_device *dev)
+ {
+ struct ehea_port *port = netdev_priv(dev);
+ struct ehea_adapter *adapter = port->adapter;
+ struct hcp_modify_qp_cb0 *cb0;
+ int ret = -EIO;
+ int dret;
+ int i;
+ u64 hret;
+ u64 dummy64 = 0;
+ u16 dummy16 = 0;
+
+ cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cb0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < (port->num_def_qps); i++) {
+ struct ehea_port_res *pr = &port->port_res[i];
+ struct ehea_qp *qp = pr->qp;
+
+ /* Purge send queue */
+ ehea_purge_sq(qp);
+
+ /* Disable queue pair */
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+ cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (1)\n");
+ goto out;
+ }
+
+ /* keep only the H_QP_CR_RES_STATE bits, shifted left by 8, with ENABLED off */
+ cb0->qp_ctl_reg = (cb0->qp_ctl_reg & H_QP_CR_RES_STATE) << 8;
+ cb0->qp_ctl_reg &= ~H_QP_CR_ENABLED;
+
+ hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG,
+ 1), cb0, &dummy64,
+ &dummy64, &dummy16, &dummy16);
+ if (hret != H_SUCCESS) {
+ pr_err("modify_ehea_qp failed (1)\n");
+ goto out;
+ }
+
+ hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+ cb0);
+ if (hret != H_SUCCESS) {
+ pr_err("query_ehea_qp failed (2)\n");
+ goto out;
+ }
+
+ /* deregister shared memory regions */
+ dret = ehea_rem_smrs(pr);
+ if (dret) {
+ pr_err("unreg shared memory region failed\n");
+ goto out;
+ }
+ }
+
+ ret = 0;
+ out:
+ free_page((unsigned long)cb0);
+
+ return ret;
+ }
+
+ /*
+  * ehea_update_rqs - rewrite lkey and buffer address of all RQ2/RQ3 WQEs.
+  *
+  * Walks act_nr_rwqes_rq2 + 1 entries of receive queue 2 and
+  * act_nr_rwqes_rq3 + 1 entries of receive queue 3 on a local copy of
+  * *orig_qp.  Every WQE gets pr->recv_mr.lkey; if an skb is stored for the
+  * WQE's index, its data address is mapped again via ehea_map_vaddr().
+  */
+ void ehea_update_rqs(struct ehea_qp *orig_qp, struct ehea_port_res *pr)
+ {
+ 	struct ehea_qp qp = *orig_qp;
+ 	struct ehea_qp_init_attr *init_attr = &qp.init_attr;
+ 	struct sk_buff **skba_rq2 = pr->rq2_skba.arr;
+ 	struct sk_buff **skba_rq3 = pr->rq3_skba.arr;
+ 	u32 lkey = pr->recv_mr.lkey;
+ 	struct ehea_rwqe *rwqe;
+ 	struct sk_buff *skb;
+ 	int index;
+ 	int n;
+
+ 	n = 0;
+ 	while (n < init_attr->act_nr_rwqes_rq2 + 1) {
+ 		rwqe = ehea_get_next_rwqe(&qp, 2);
+ 		rwqe->sg_list[0].l_key = lkey;
+ 		index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, rwqe->wr_id);
+ 		skb = skba_rq2[index];
+ 		if (skb)
+ 			rwqe->sg_list[0].vaddr = ehea_map_vaddr(skb->data);
+ 		n++;
+ 	}
+
+ 	n = 0;
+ 	while (n < init_attr->act_nr_rwqes_rq3 + 1) {
+ 		rwqe = ehea_get_next_rwqe(&qp, 3);
+ 		rwqe->sg_list[0].l_key = lkey;
+ 		index = EHEA_BMASK_GET(EHEA_WR_ID_INDEX, rwqe->wr_id);
+ 		skb = skba_rq3[index];
+ 		if (skb)
+ 			rwqe->sg_list[0].vaddr = ehea_map_vaddr(skb->data);
+ 		n++;
+ 	}
+ }
+
+ /*
+  * ehea_restart_qps - re-enable every default queue pair of a port.
+  *
+  * For each port resource: regenerates the shared memory regions, rewrites
+  * the receive WQEs, sets H_QP_CR_ENABLED in the QP control register,
+  * re-queries the QP and refills all three receive queues.
+  *
+  * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated,
+  * the error from ehea_gen_smrs(), or -EIO when a hypervisor call fails.
+  * (Hypervisor failures used to fall through with ret still 0, so callers
+  * treated a half-restarted port as healthy.)
+  */
+ int ehea_restart_qps(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+ 	struct ehea_adapter *adapter = port->adapter;
+ 	int ret = 0;
+ 	int i;
+
+ 	struct hcp_modify_qp_cb0 *cb0;
+ 	u64 hret;
+ 	u64 dummy64 = 0;
+ 	u16 dummy16 = 0;
+
+ 	cb0 = (void *)get_zeroed_page(GFP_KERNEL);
+ 	if (!cb0) {
+ 		ret = -ENOMEM;
+ 		goto out;
+ 	}
+
+ 	for (i = 0; i < (port->num_def_qps); i++) {
+ 		struct ehea_port_res *pr = &port->port_res[i];
+ 		struct ehea_qp *qp = pr->qp;
+
+ 		ret = ehea_gen_smrs(pr);
+ 		if (ret) {
+ 			netdev_err(dev, "creation of shared memory regions failed\n");
+ 			goto out;
+ 		}
+
+ 		ehea_update_rqs(qp, pr);
+
+ 		/* Enable queue pair */
+ 		hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+ 					    cb0);
+ 		if (hret != H_SUCCESS) {
+ 			netdev_err(dev, "query_ehea_qp failed (1)\n");
+ 			ret = -EIO;
+ 			goto out;
+ 		}
+
+ 		cb0->qp_ctl_reg = (cb0->qp_ctl_reg & H_QP_CR_RES_STATE) << 8;
+ 		cb0->qp_ctl_reg |= H_QP_CR_ENABLED;
+
+ 		hret = ehea_h_modify_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ 					     EHEA_BMASK_SET(H_QPCB0_QP_CTL_REG,
+ 							    1), cb0, &dummy64,
+ 					     &dummy64, &dummy16, &dummy16);
+ 		if (hret != H_SUCCESS) {
+ 			netdev_err(dev, "modify_ehea_qp failed (1)\n");
+ 			ret = -EIO;
+ 			goto out;
+ 		}
+
+ 		hret = ehea_h_query_ehea_qp(adapter->handle, 0, qp->fw_handle,
+ 					    EHEA_BMASK_SET(H_QPCB0_ALL, 0xFFFF),
+ 					    cb0);
+ 		if (hret != H_SUCCESS) {
+ 			netdev_err(dev, "query_ehea_qp failed (2)\n");
+ 			ret = -EIO;
+ 			goto out;
+ 		}
+
+ 		/* refill entire queue */
+ 		ehea_refill_rq1(pr, pr->rq1_skba.index, 0);
+ 		ehea_refill_rq2(pr, 0);
+ 		ehea_refill_rq3(pr, 0);
+ 	}
+ out:
+ 	free_page((unsigned long)cb0);
+
+ 	return ret;
+ }
+
+ /*
+  * ehea_reset_port - work handler that takes a port down and up again.
+  *
+  * Runs with dlpar_mem_lock and the port's port_lock held.  Counts the reset,
+  * disables TX and NAPI, calls ehea_down() followed by ehea_up(), and on
+  * success restores the multicast list, re-enables NAPI and wakes the TX
+  * queues.  If ehea_up() fails, NAPI and TX stay disabled.
+  */
+ static void ehea_reset_port(struct work_struct *work)
+ {
+ int ret;
+ struct ehea_port *port =
+ container_of(work, struct ehea_port, reset_task);
+ struct net_device *dev = port->netdev;
+
+ /* lock order: dlpar_mem_lock first, then port_lock */
+ mutex_lock(&dlpar_mem_lock);
+ port->resets++;
+ mutex_lock(&port->port_lock);
+ netif_tx_disable(dev);
+
+ port_napi_disable(port);
+
+ ehea_down(dev);
+
+ ret = ehea_up(dev);
+ if (ret)
+ goto out;
+
+ ehea_set_multicast_list(dev);
+
+ netif_info(port, timer, dev, "reset successful\n");
+
+ port_napi_enable(port);
+
+ netif_tx_wake_all_queues(dev);
+ out:
+ mutex_unlock(&port->port_lock);
+ mutex_unlock(&dlpar_mem_lock);
+ }
+
+ /*
+  * ehea_rereg_mrs - re-register the memory regions of all active adapters.
+  *
+  * Pass 1: for every adapter with active ports, quiesce each port that is
+  * IFF_UP (disable TX, flush the send queues, stop the QPs, disable NAPI)
+  * and remove the adapter's memory region.
+  * Then __EHEA_STOP_XFER is cleared.
+  * Pass 2: register a new memory region per adapter and restart the QPs of
+  * every port that is IFF_UP.
+  *
+  * Any failure in pass 1, or a failing MR registration in pass 2, returns
+  * immediately; a pass 1 failure therefore leaves __EHEA_STOP_XFER set.
+  */
+ static void ehea_rereg_mrs(void)
+ {
+ int ret, i;
+ struct ehea_adapter *adapter;
+
+ pr_info("LPAR memory changed - re-initializing driver\n");
+
+ list_for_each_entry(adapter, &adapter_list, list)
+ if (adapter->active_ports) {
+ /* Shutdown all ports */
+ for (i = 0; i < EHEA_MAX_PORTS; i++) {
+ struct ehea_port *port = adapter->port[i];
+ struct net_device *dev;
+
+ if (!port)
+ continue;
+
+ dev = port->netdev;
+
+ if (dev->flags & IFF_UP) {
+ mutex_lock(&port->port_lock);
+ netif_tx_disable(dev);
+ ehea_flush_sq(port);
+ ret = ehea_stop_qps(dev);
+ if (ret) {
+ mutex_unlock(&port->port_lock);
+ goto out;
+ }
+ port_napi_disable(port);
+ mutex_unlock(&port->port_lock);
+ }
+ /* done for every existing port, whether it is up or not */
+ reset_sq_restart_flag(port);
+ }
+
+ /* Unregister old memory region */
+ ret = ehea_rem_mr(&adapter->mr);
+ if (ret) {
+ pr_err("unregister MR failed - driver inoperable!\n");
+ goto out;
+ }
+ }
+
+ clear_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+
+ list_for_each_entry(adapter, &adapter_list, list)
+ if (adapter->active_ports) {
+ /* Register new memory region */
+ ret = ehea_reg_kernel_mr(adapter, &adapter->mr);
+ if (ret) {
+ pr_err("register MR failed - driver inoperable!\n");
+ goto out;
+ }
+
+ /* Restart all ports */
+ for (i = 0; i < EHEA_MAX_PORTS; i++) {
+ struct ehea_port *port = adapter->port[i];
+
+ if (port) {
+ struct net_device *dev = port->netdev;
+
+ if (dev->flags & IFF_UP) {
+ mutex_lock(&port->port_lock);
+ ret = ehea_restart_qps(dev);
+ if (!ret) {
+ check_sqs(port);
+ port_napi_enable(port);
+ netif_tx_wake_all_queues(dev);
+ } else {
+ /* a port that fails to restart is logged and skipped */
+ netdev_err(dev, "Unable to restart QPS\n");
+ }
+ mutex_unlock(&port->port_lock);
+ }
+ }
+ }
+ }
+ pr_info("re-initializing driver complete\n");
+ out:
+ return;
+ }
+
+ /*
+  * ehea_tx_watchdog - ndo_tx_timeout handler.
+  *
+  * Schedules a port reset, but only while the carrier is up and the driver
+  * is not in the __EHEA_STOP_XFER state.
+  */
+ static void ehea_tx_watchdog(struct net_device *dev)
+ {
+ 	struct ehea_port *port = netdev_priv(dev);
+
+ 	if (!netif_carrier_ok(dev))
+ 		return;
+
+ 	if (test_bit(__EHEA_STOP_XFER, &ehea_driver_flags))
+ 		return;
+
+ 	ehea_schedule_port_reset(port);
+ }
+
+ /*
+  * ehea_sense_adapter_attr - query the adapter and record max_mc_mac.
+  *
+  * Stores the queried max_mc_mac minus one in adapter->max_mc_mac.
+  * Returns 0 on success, -ENOMEM if the scratch page cannot be allocated,
+  * or -EIO if the hypervisor query fails.
+  */
+ int ehea_sense_adapter_attr(struct ehea_adapter *adapter)
+ {
+ 	struct hcp_query_ehea *cb = (void *)get_zeroed_page(GFP_KERNEL);
+ 	u64 hret;
+ 	int ret;
+
+ 	if (!cb)
+ 		return -ENOMEM;
+
+ 	hret = ehea_h_query_ehea(adapter->handle, cb);
+ 	if (hret == H_SUCCESS) {
+ 		adapter->max_mc_mac = cb->max_mc_mac - 1;
+ 		ret = 0;
+ 	} else {
+ 		ret = -EIO;
+ 	}
+
+ 	free_page((unsigned long)cb);
+ 	return ret;
+ }
+
+ /*
+  * ehea_get_jumboframe_status - report, and if needed try to enable, jumbo
+  * frames on a port.
+  * @port:  port to query
+  * @jumbo: set to 1 if jumbo frames are (now) enabled, 0 otherwise
+  *
+  * Returns 0 if the port could be queried (even when enabling jumbo frames
+  * then failed), -ENOMEM if the scratch page cannot be allocated, or -EINVAL
+  * if the port query fails.
+  */
+ int ehea_get_jumboframe_status(struct ehea_port *port, int *jumbo)
+ {
+ 	struct hcp_ehea_port_cb4 *cb4;
+ 	u64 hret;
+ 	int ret = 0;
+
+ 	*jumbo = 0;
+
+ 	/* (Try to) enable *jumbo frames */
+ 	cb4 = (void *)get_zeroed_page(GFP_KERNEL);
+ 	if (!cb4) {
+ 		pr_err("no mem for cb4\n");
+ 		return -ENOMEM;
+ 	}
+
+ 	hret = ehea_h_query_ehea_port(port->adapter->handle,
+ 				      port->logical_port_id,
+ 				      H_PORT_CB4, H_PORT_CB4_JUMBO, cb4);
+ 	if (hret != H_SUCCESS) {
+ 		ret = -EINVAL;
+ 		goto out_free;
+ 	}
+
+ 	if (cb4->jumbo_frame) {
+ 		*jumbo = 1;
+ 		goto out_free;
+ 	}
+
+ 	/* not enabled yet: ask the hypervisor to switch it on */
+ 	cb4->jumbo_frame = 1;
+ 	hret = ehea_h_modify_ehea_port(port->adapter->handle,
+ 				       port->logical_port_id,
+ 				       H_PORT_CB4, H_PORT_CB4_JUMBO, cb4);
+ 	if (hret == H_SUCCESS)
+ 		*jumbo = 1;
+
+ out_free:
+ 	free_page((unsigned long)cb4);
+ 	return ret;
+ }
+
+ /* sysfs show handler: prints the port's logical port id as a decimal. */
+ static ssize_t ehea_show_port_id(struct device *dev,
+ 				 struct device_attribute *attr, char *buf)
+ {
+ 	struct ehea_port *owner;
+
+ 	owner = container_of(dev, struct ehea_port, ofdev.dev);
+
+ 	return sprintf(buf, "%d", owner->logical_port_id);
+ }
+
+ static DEVICE_ATTR(log_port_id, S_IRUSR | S_IRGRP | S_IROTH, ehea_show_port_id,
+ NULL);
+
+ /*
+  * logical_port_release - device release callback for a logical port.
+  *
+  * Drops the device-tree node reference taken in ehea_register_port().
+  *
+  * Deliberately not marked __devinit: it is installed as the device's
+  * ->release hook and runs when the device goes away, so it must not live
+  * in a section that may be discarded after initialisation.
+  */
+ static void logical_port_release(struct device *dev)
+ {
+ 	struct ehea_port *port = container_of(dev, struct ehea_port, ofdev.dev);
+
+ 	of_node_put(port->ofdev.dev.of_node);
+ }
+
+ /*
+  * ehea_register_port - register the port's embedded of-device.
+  * @port: port whose ofdev is registered
+  * @dn:   device-tree node of the port; a reference is taken on it
+  *
+  * Names the device "port<N>" from the running port_name_cnt counter,
+  * registers it on the ibmebus bus below the adapter device and creates the
+  * log_port_id sysfs attribute.
+  *
+  * Returns the registered struct device, or NULL on failure.
+  */
+ static struct device *ehea_register_port(struct ehea_port *port,
+ struct device_node *dn)
+ {
+ int ret;
+
+ port->ofdev.dev.of_node = of_node_get(dn);
+ port->ofdev.dev.parent = &port->adapter->ofdev->dev;
+ port->ofdev.dev.bus = &ibmebus_bus_type;
+
+ dev_set_name(&port->ofdev.dev, "port%d", port_name_cnt++);
+ /* the release hook drops the of_node reference taken above */
+ port->ofdev.dev.release = logical_port_release;
+
+ ret = of_device_register(&port->ofdev);
+ if (ret) {
+ pr_err("failed to register device. ret=%d\n", ret);
+ goto out;
+ }
+
+ ret = device_create_file(&port->ofdev.dev, &dev_attr_log_port_id);
+ if (ret) {
+ pr_err("failed to register attributes, ret=%d\n", ret);
+ goto out_unreg_of_dev;
+ }
+
+ return &port->ofdev.dev;
+
+ out_unreg_of_dev:
+ of_device_unregister(&port->ofdev);
+ out:
+ return NULL;
+ }
+
+ /*
+  * ehea_unregister_port - undo ehea_register_port(): remove the log_port_id
+  * attribute and unregister the port's of-device.
+  */
+ static void ehea_unregister_port(struct ehea_port *port)
+ {
+ device_remove_file(&port->ofdev.dev, &dev_attr_log_port_id);
+ of_device_unregister(&port->ofdev);
+ }
+
+ /*
+  * net_device callbacks of an eHEA port; installed on each net_device in
+  * ehea_setup_single_port().
+  */
+ static const struct net_device_ops ehea_netdev_ops = {
+ .ndo_open = ehea_open,
+ .ndo_stop = ehea_stop,
+ .ndo_start_xmit = ehea_start_xmit,
+ /* netpoll hook is only built when the kernel supports it */
+ #ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = ehea_netpoll,
+ #endif
+ .ndo_get_stats64 = ehea_get_stats64,
+ .ndo_set_mac_address = ehea_set_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_rx_mode = ehea_set_multicast_list,
+ .ndo_change_mtu = ehea_change_mtu,
+ .ndo_vlan_rx_add_vid = ehea_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = ehea_vlan_rx_kill_vid,
+ .ndo_tx_timeout = ehea_tx_watchdog,
+ };
+
+ /*
+  * ehea_setup_single_port - allocate, initialise and register one port.
+  * @adapter:         adapter the port belongs to
+  * @logical_port_id: logical port id taken from the device tree
+  * @dn:              device-tree node of the port
+  *
+  * Allocates the multi-queue net_device with the ehea_port as private data,
+  * senses the port attributes, registers the port's of-device and the
+  * net_device, and finally probes the jumbo frame status.  On success
+  * adapter->active_ports is incremented.
+  *
+  * Returns the new port, or NULL on failure (everything done so far is
+  * undone and the reason is logged).
+  */
+ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
+ 					 u32 logical_port_id,
+ 					 struct device_node *dn)
+ {
+ 	int ret;
+ 	struct net_device *dev;
+ 	struct ehea_port *port;
+ 	struct device *port_dev;
+ 	int jumbo;
+
+ 	/* allocate memory for the port structures */
+ 	dev = alloc_etherdev_mq(sizeof(struct ehea_port), EHEA_MAX_PORT_RES);
+
+ 	if (!dev) {
+ 		pr_err("no mem for net_device\n");
+ 		ret = -ENOMEM;
+ 		goto out_err;
+ 	}
+
+ 	port = netdev_priv(dev);
+
+ 	mutex_init(&port->port_lock);
+ 	port->state = EHEA_PORT_DOWN;
+ 	port->sig_comp_iv = sq_entries / 10;
+
+ 	port->adapter = adapter;
+ 	port->netdev = dev;
+ 	port->logical_port_id = logical_port_id;
+
+ 	port->msg_enable = netif_msg_init(msg_level, EHEA_MSG_DEFAULT);
+
+ 	port->mc_list = kzalloc(sizeof(struct ehea_mc_list), GFP_KERNEL);
+ 	if (!port->mc_list) {
+ 		ret = -ENOMEM;
+ 		goto out_free_ethdev;
+ 	}
+
+ 	INIT_LIST_HEAD(&port->mc_list->list);
+
+ 	ret = ehea_sense_port_attr(port);
+ 	if (ret)
+ 		goto out_free_mc_list;
+
+ 	netif_set_real_num_rx_queues(dev, port->num_def_qps);
+ 	netif_set_real_num_tx_queues(dev, port->num_def_qps);
+
+ 	port_dev = ehea_register_port(port, dn);
+ 	if (!port_dev) {
+ 		/*
+ 		 * ret is still 0 from ehea_sense_port_attr(); give the
+ 		 * failure log below a meaningful error code.
+ 		 */
+ 		ret = -EIO;
+ 		goto out_free_mc_list;
+ 	}
+
+ 	SET_NETDEV_DEV(dev, port_dev);
+
+ 	/* initialize net_device structure */
+ 	memcpy(dev->dev_addr, &port->mac_addr, ETH_ALEN);
+
+ 	dev->netdev_ops = &ehea_netdev_ops;
+ 	ehea_set_ethtool_ops(dev);
+
+ 	dev->hw_features = NETIF_F_SG | NETIF_F_TSO
+ 		      | NETIF_F_IP_CSUM | NETIF_F_HW_VLAN_TX | NETIF_F_LRO;
+ 	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO
+ 		      | NETIF_F_HIGHDMA | NETIF_F_IP_CSUM | NETIF_F_HW_VLAN_TX
+ 		      | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER
+ 		      | NETIF_F_RXCSUM;
+ 	dev->vlan_features = NETIF_F_SG | NETIF_F_TSO | NETIF_F_HIGHDMA |
+ 			NETIF_F_IP_CSUM;
+ 	dev->watchdog_timeo = EHEA_WATCH_DOG_TIMEOUT;
+
+ 	INIT_WORK(&port->reset_task, ehea_reset_port);
+ 	INIT_DELAYED_WORK(&port->stats_work, ehea_update_stats);
+
+ 	init_waitqueue_head(&port->swqe_avail_wq);
+ 	init_waitqueue_head(&port->restart_wq);
+
+ 	memset(&port->stats, 0, sizeof(struct net_device_stats));
+ 	ret = register_netdev(dev);
+ 	if (ret) {
+ 		pr_err("register_netdev failed. ret=%d\n", ret);
+ 		goto out_unreg_port;
+ 	}
+
+ 	/* a failing jumbo query is logged but does not fail the setup */
+ 	ret = ehea_get_jumboframe_status(port, &jumbo);
+ 	if (ret)
+ 		netdev_err(dev, "failed determining jumbo frame status\n");
+
+ 	netdev_info(dev, "Jumbo frames are %sabled\n",
+ 		    jumbo == 1 ? "en" : "dis");
+
+ 	adapter->active_ports++;
+
+ 	return port;
+
+ out_unreg_port:
+ 	ehea_unregister_port(port);
+
+ out_free_mc_list:
+ 	kfree(port->mc_list);
+
+ out_free_ethdev:
+ 	free_netdev(dev);
+
+ out_err:
+ 	pr_err("setting up logical port with id=%d failed, ret=%d\n",
+ 	       logical_port_id, ret);
+ 	return NULL;
+ }
+
+ /*
+  * ehea_shutdown_single_port - tear down a port created by
+  * ehea_setup_single_port().
+  *
+  * Cancels the reset and statistics work, unregisters the net_device and the
+  * port's of-device, frees the multicast list and the net_device (which
+  * holds the port structure), and decrements adapter->active_ports.
+  */
+ static void ehea_shutdown_single_port(struct ehea_port *port)
+ {
+ /* adapter is fetched up front because @port is freed by free_netdev() */
+ struct ehea_adapter *adapter = port->adapter;
+
+ cancel_work_sync(&port->reset_task);
+ cancel_delayed_work_sync(&port->stats_work);
+ unregister_netdev(port->netdev);
+ ehea_unregister_port(port);
+ kfree(port->mc_list);
+ free_netdev(port->netdev);
+ adapter->active_ports--;
+ }
+
+ /*
+  * ehea_setup_ports - set up a port for every child node of the adapter.
+  *
+  * Walks the children of the adapter's device-tree node.  Children without
+  * an "ibm,hea-port-no" property are logged and skipped.  For each remaining
+  * child the adapter memory region is ensured and the port is created; a
+  * port that fails to set up leaves a NULL slot in adapter->port[].
+  *
+  * At most EHEA_MAX_PORTS slots are filled; further children are ignored
+  * instead of writing past the end of adapter->port[].
+  *
+  * Returns 0, or -EIO if the memory region cannot be created.
+  */
+ static int ehea_setup_ports(struct ehea_adapter *adapter)
+ {
+ 	struct device_node *lhea_dn;
+ 	struct device_node *eth_dn = NULL;
+
+ 	const u32 *dn_log_port_id;
+ 	int i = 0;
+
+ 	lhea_dn = adapter->ofdev->dev.of_node;
+ 	while ((eth_dn = of_get_next_child(lhea_dn, eth_dn))) {
+
+ 		dn_log_port_id = of_get_property(eth_dn, "ibm,hea-port-no",
+ 						 NULL);
+ 		if (!dn_log_port_id) {
+ 			pr_err("bad device node: eth_dn name=%s\n",
+ 			       eth_dn->full_name);
+ 			continue;
+ 		}
+
+ 		if (i >= EHEA_MAX_PORTS) {
+ 			pr_err("more than %d port nodes found, ignoring the rest\n",
+ 			       EHEA_MAX_PORTS);
+ 			/* leaving the loop early: drop the iterator's reference */
+ 			of_node_put(eth_dn);
+ 			break;
+ 		}
+
+ 		if (ehea_add_adapter_mr(adapter)) {
+ 			pr_err("creating MR failed\n");
+ 			of_node_put(eth_dn);
+ 			return -EIO;
+ 		}
+
+ 		adapter->port[i] = ehea_setup_single_port(adapter,
+ 							  *dn_log_port_id,
+ 							  eth_dn);
+ 		if (adapter->port[i])
+ 			netdev_info(adapter->port[i]->netdev,
+ 				    "logical port id #%d\n", *dn_log_port_id);
+ 		else
+ 			ehea_remove_adapter_mr(adapter);
+
+ 		i++;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * ehea_get_eth_dn - find the adapter child node for a logical port id.
+  *
+  * Returns the child node whose "ibm,hea-port-no" property equals
+  * @logical_port_id, or NULL if there is none.  The node is returned as
+  * handed out by of_get_next_child(); ehea_probe_port() calls of_node_put()
+  * on it when done.
+  */
+ static struct device_node *ehea_get_eth_dn(struct ehea_adapter *adapter,
+ 					   u32 logical_port_id)
+ {
+ 	struct device_node *parent = adapter->ofdev->dev.of_node;
+ 	struct device_node *child = NULL;
+
+ 	for (;;) {
+ 		const u32 *port_no;
+
+ 		child = of_get_next_child(parent, child);
+ 		if (!child)
+ 			return NULL;
+
+ 		port_no = of_get_property(child, "ibm,hea-port-no", NULL);
+ 		if (port_no && *port_no == logical_port_id)
+ 			return child;
+ 	}
+ }
+
+ /*
+  * ehea_probe_port - sysfs store handler that adds a port at runtime.
+  *
+  * @buf holds the decimal logical port id.  The matching device-tree node is
+  * looked up, the adapter memory region is ensured, the port is set up and
+  * stored in the first free adapter->port[] slot.
+  *
+  * Returns @count on success; -EINVAL if the input cannot be parsed, the
+  * port already exists or no such node is found; -EIO if the memory region
+  * or the port cannot be created.
+  */
+ static ssize_t ehea_probe_port(struct device *dev,
+ 			       struct device_attribute *attr,
+ 			       const char *buf, size_t count)
+ {
+ 	struct ehea_adapter *adapter = dev_get_drvdata(dev);
+ 	struct ehea_port *port;
+ 	struct device_node *eth_dn = NULL;
+ 	int i;
+
+ 	u32 logical_port_id;
+
+ 	/* reject unparsable input instead of using an uninitialised id */
+ 	if (sscanf(buf, "%u", &logical_port_id) != 1)
+ 		return -EINVAL;
+
+ 	port = ehea_get_port(adapter, logical_port_id);
+
+ 	if (port) {
+ 		netdev_info(port->netdev, "adding port with logical port id=%d failed: port already configured\n",
+ 			    logical_port_id);
+ 		return -EINVAL;
+ 	}
+
+ 	eth_dn = ehea_get_eth_dn(adapter, logical_port_id);
+
+ 	if (!eth_dn) {
+ 		pr_info("no logical port with id %d found\n", logical_port_id);
+ 		return -EINVAL;
+ 	}
+
+ 	if (ehea_add_adapter_mr(adapter)) {
+ 		pr_err("creating MR failed\n");
+ 		/* drop the node reference held since ehea_get_eth_dn() */
+ 		of_node_put(eth_dn);
+ 		return -EIO;
+ 	}
+
+ 	port = ehea_setup_single_port(adapter, logical_port_id, eth_dn);
+
+ 	of_node_put(eth_dn);
+
+ 	if (port) {
+ 		/*
+ 		 * NOTE(review): if no slot is free the port stays registered
+ 		 * but is not recorded in adapter->port[] - confirm whether
+ 		 * that can happen in practice.
+ 		 */
+ 		for (i = 0; i < EHEA_MAX_PORTS; i++)
+ 			if (!adapter->port[i]) {
+ 				adapter->port[i] = port;
+ 				break;
+ 			}
+
+ 		netdev_info(port->netdev, "added: (logical port id=%d)\n",
+ 			    logical_port_id);
+ 	} else {
+ 		ehea_remove_adapter_mr(adapter);
+ 		return -EIO;
+ 	}
+
+ 	return (ssize_t) count;
+ }
+
+ /*
+  * ehea_remove_port - sysfs store handler that removes a port at runtime.
+  *
+  * @buf holds the decimal logical port id.  The port is shut down, its
+  * adapter->port[] slot is cleared and the adapter memory region is dropped
+  * if this was the last active port.
+  *
+  * Returns @count on success, or -EINVAL if the input cannot be parsed or
+  * no port with that id is configured.
+  */
+ static ssize_t ehea_remove_port(struct device *dev,
+ 				struct device_attribute *attr,
+ 				const char *buf, size_t count)
+ {
+ 	struct ehea_adapter *adapter = dev_get_drvdata(dev);
+ 	struct ehea_port *port;
+ 	int i;
+ 	u32 logical_port_id;
+
+ 	/* reject unparsable input instead of using an uninitialised id */
+ 	if (sscanf(buf, "%u", &logical_port_id) != 1)
+ 		return -EINVAL;
+
+ 	port = ehea_get_port(adapter, logical_port_id);
+
+ 	if (port) {
+ 		netdev_info(port->netdev, "removed: (logical port id=%d)\n",
+ 			    logical_port_id);
+
+ 		ehea_shutdown_single_port(port);
+
+ 		/* port is freed now; it is only compared, never dereferenced */
+ 		for (i = 0; i < EHEA_MAX_PORTS; i++)
+ 			if (adapter->port[i] == port) {
+ 				adapter->port[i] = NULL;
+ 				break;
+ 			}
+ 	} else {
+ 		pr_err("removing port with logical port id=%d failed. port not configured.\n",
+ 		       logical_port_id);
+ 		return -EINVAL;
+ 	}
+
+ 	ehea_remove_adapter_mr(adapter);
+
+ 	return (ssize_t) count;
+ }
+
+ static DEVICE_ATTR(probe_port, S_IWUSR, NULL, ehea_probe_port);
+ static DEVICE_ATTR(remove_port, S_IWUSR, NULL, ehea_remove_port);
+
+ /*
+  * ehea_create_device_sysfs - create the probe_port and remove_port
+  * attributes on the adapter device.
+  *
+  * Either both attributes exist afterwards or none: if the second one
+  * cannot be created, the first is removed again.
+  *
+  * Returns 0 on success or the error from device_create_file().
+  */
+ int ehea_create_device_sysfs(struct platform_device *dev)
+ {
+ 	int ret = device_create_file(&dev->dev, &dev_attr_probe_port);
+
+ 	if (ret)
+ 		return ret;
+
+ 	ret = device_create_file(&dev->dev, &dev_attr_remove_port);
+ 	if (ret)
+ 		device_remove_file(&dev->dev, &dev_attr_probe_port);
+
+ 	return ret;
+ }
+
+ /*
+  * ehea_remove_device_sysfs - remove the probe_port and remove_port
+  * attributes from the adapter device.
+  */
+ void ehea_remove_device_sysfs(struct platform_device *dev)
+ {
+ device_remove_file(&dev->dev, &dev_attr_probe_port);
+ device_remove_file(&dev->dev, &dev_attr_remove_port);
+ }
+
+ /*
+  * ehea_probe_adapter - probe callback for an eHEA adapter on the ibmebus.
+  *
+  * Allocates the adapter, links it into adapter_list, reads the
+  * "ibm,hea-handle" property, senses the adapter attributes, creates the
+  * notification event queue (NEQ) with its tasklet and interrupt, creates
+  * the sysfs attributes and sets up all ports.
+  *
+  * Returns 0 on success or a negative errno; on failure the steps already
+  * done are undone.  ehea_update_firmware_handles() is called on every path
+  * that gets past the initial argument check.
+  */
+ static int __devinit ehea_probe_adapter(struct platform_device *dev,
+ const struct of_device_id *id)
+ {
+ struct ehea_adapter *adapter;
+ const u64 *adapter_handle;
+ int ret;
+
+ if (!dev || !dev->dev.of_node) {
+ pr_err("Invalid ibmebus device probed\n");
+ return -EINVAL;
+ }
+
+ adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
+ if (!adapter) {
+ ret = -ENOMEM;
+ dev_err(&dev->dev, "no mem for ehea_adapter\n");
+ goto out;
+ }
+
+ /* linked into the global list right away; out_free_ad unlinks it again */
+ list_add(&adapter->list, &adapter_list);
+
+ adapter->ofdev = dev;
+
+ adapter_handle = of_get_property(dev->dev.of_node, "ibm,hea-handle",
+ NULL);
+ if (adapter_handle)
+ adapter->handle = *adapter_handle;
+
+ /* handle stays 0 (from kzalloc) when the property is missing */
+ if (!adapter->handle) {
+ dev_err(&dev->dev, "failed getting handle for adapter"
+ " '%s'\n", dev->dev.of_node->full_name);
+ ret = -ENODEV;
+ goto out_free_ad;
+ }
+
+ adapter->pd = EHEA_PD_ID;
+
+ dev_set_drvdata(&dev->dev, adapter);
+
+
+ /* initialize adapter and ports */
+ /* get adapter properties */
+ ret = ehea_sense_adapter_attr(adapter);
+ if (ret) {
+ dev_err(&dev->dev, "sense_adapter_attr failed: %d\n", ret);
+ goto out_free_ad;
+ }
+
+ adapter->neq = ehea_create_eq(adapter,
+ EHEA_NEQ, EHEA_MAX_ENTRIES_EQ, 1);
+ if (!adapter->neq) {
+ ret = -EIO;
+ dev_err(&dev->dev, "NEQ creation failed\n");
+ goto out_free_ad;
+ }
+
+ tasklet_init(&adapter->neq_tasklet, ehea_neq_tasklet,
+ (unsigned long)adapter);
+
+ ret = ibmebus_request_irq(adapter->neq->attr.ist1,
+ ehea_interrupt_neq, IRQF_DISABLED,
+ "ehea_neq", adapter);
+ if (ret) {
+ dev_err(&dev->dev, "requesting NEQ IRQ failed\n");
+ goto out_kill_eq;
+ }
+
+ ret = ehea_create_device_sysfs(dev);
+ if (ret)
+ goto out_free_irq;
+
+ ret = ehea_setup_ports(adapter);
+ if (ret) {
+ dev_err(&dev->dev, "setup_ports failed\n");
+ goto out_rem_dev_sysfs;
+ }
+
+ ret = 0;
+ goto out;
+
+ /* error unwind, in reverse order of setup */
+ out_rem_dev_sysfs:
+ ehea_remove_device_sysfs(dev);
+
+ out_free_irq:
+ ibmebus_free_irq(adapter->neq->attr.ist1, adapter);
+
+ out_kill_eq:
+ ehea_destroy_eq(adapter->neq);
+
+ out_free_ad:
+ list_del(&adapter->list);
+ kfree(adapter);
+
+ out:
+ ehea_update_firmware_handles();
+
+ return ret;
+ }
+
+ /*
+  * ehea_remove - remove callback for an eHEA adapter.
+  *
+  * Shuts down every configured port, removes the sysfs attributes, frees the
+  * NEQ interrupt, kills the NEQ tasklet, destroys the NEQ, drops the adapter
+  * memory region, unlinks and frees the adapter and refreshes the firmware
+  * handle table.  Always returns 0.
+  */
+ static int __devexit ehea_remove(struct platform_device *dev)
+ {
+ struct ehea_adapter *adapter = dev_get_drvdata(&dev->dev);
+ int i;
+
+ for (i = 0; i < EHEA_MAX_PORTS; i++)
+ if (adapter->port[i]) {
+ ehea_shutdown_single_port(adapter->port[i]);
+ adapter->port[i] = NULL;
+ }
+
+ ehea_remove_device_sysfs(dev);
+
+ /* the interrupt is freed before the tasklet is killed */
+ ibmebus_free_irq(adapter->neq->attr.ist1, adapter);
+ tasklet_kill(&adapter->neq_tasklet);
+
+ ehea_destroy_eq(adapter->neq);
+ ehea_remove_adapter_mr(adapter);
+ list_del(&adapter->list);
+ kfree(adapter);
+
+ ehea_update_firmware_handles();
+
+ return 0;
+ }
+
+ /*
+  * ehea_crash_handler - release firmware state when the kernel crashes.
+  *
+  * Force-frees every recorded firmware handle and deregisters every recorded
+  * broadcast/multicast registration.  Either table is skipped when its array
+  * has not been allocated.
+  */
+ void ehea_crash_handler(void)
+ {
+ 	int n;
+
+ 	if (ehea_fw_handles.arr) {
+ 		for (n = 0; n < ehea_fw_handles.num_entries; n++) {
+ 			ehea_h_free_resource(ehea_fw_handles.arr[n].adh,
+ 					     ehea_fw_handles.arr[n].fwh,
+ 					     FORCE_FREE);
+ 		}
+ 	}
+
+ 	if (ehea_bcmc_regs.arr) {
+ 		for (n = 0; n < ehea_bcmc_regs.num_entries; n++) {
+ 			ehea_h_reg_dereg_bcmc(ehea_bcmc_regs.arr[n].adh,
+ 					      ehea_bcmc_regs.arr[n].port_id,
+ 					      ehea_bcmc_regs.arr[n].reg_type,
+ 					      ehea_bcmc_regs.arr[n].macaddr,
+ 					      0, H_DEREG_BCMC);
+ 		}
+ 	}
+ }
+
+ /*
+  * ehea_mem_notifier - memory hotplug notifier.
+  *
+  * On MEM_ONLINE / MEM_CANCEL_OFFLINE the affected section range is added to
+  * the driver's section bitmap, on MEM_GOING_OFFLINE it is removed; in both
+  * cases transfers are stopped (__EHEA_STOP_XFER) and all memory regions are
+  * re-registered.  Other actions are ignored.
+  *
+  * Runs under dlpar_mem_lock.  Returns NOTIFY_OK, or NOTIFY_BAD if the
+  * bitmap update fails.
+  */
+ static int ehea_mem_notifier(struct notifier_block *nb,
+ 			     unsigned long action, void *data)
+ {
+ 	int ret = NOTIFY_BAD;
+ 	struct memory_notify *arg = data;
+
+ 	mutex_lock(&dlpar_mem_lock);
+
+ 	switch (action) {
+ 	case MEM_CANCEL_OFFLINE:
+ 		pr_info("memory offlining canceled\n");
+ 		/* Readd canceled memory block */
+ 		/* fall through */
+ 	case MEM_ONLINE:
+ 		pr_info("memory is going online\n");
+ 		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+ 		if (ehea_add_sect_bmap(arg->start_pfn, arg->nr_pages))
+ 			goto out_unlock;
+ 		ehea_rereg_mrs();
+ 		break;
+ 	case MEM_GOING_OFFLINE:
+ 		pr_info("memory is going offline\n");
+ 		set_bit(__EHEA_STOP_XFER, &ehea_driver_flags);
+ 		if (ehea_rem_sect_bmap(arg->start_pfn, arg->nr_pages))
+ 			goto out_unlock;
+ 		ehea_rereg_mrs();
+ 		break;
+ 	default:
+ 		break;
+ 	}
+
+ 	ehea_update_firmware_handles();
+ 	ret = NOTIFY_OK;
+
+ out_unlock:
+ 	mutex_unlock(&dlpar_mem_lock);
+ 	return ret;
+ }
+
+ /* Memory hotplug notifier block; registered in ehea_module_init(). */
+ static struct notifier_block ehea_mem_nb = {
+ .notifier_call = ehea_mem_notifier,
+ };
+
+ /*
+  * ehea_reboot_notifier - on SYS_RESTART, unregister the driver from the
+  * ibmebus so all eHEA resources are released.  Always returns NOTIFY_DONE.
+  */
+ static int ehea_reboot_notifier(struct notifier_block *nb,
+ 				unsigned long action, void *unused)
+ {
+ 	if (action != SYS_RESTART)
+ 		return NOTIFY_DONE;
+
+ 	pr_info("Reboot: freeing all eHEA resources\n");
+ 	ibmebus_unregister_driver(&ehea_driver);
+
+ 	return NOTIFY_DONE;
+ }
+
+ /* Reboot notifier block; registered in ehea_module_init(). */
+ static struct notifier_block ehea_reboot_nb = {
+ .notifier_call = ehea_reboot_notifier,
+ };
+
+ /*
+  * check_module_parm - validate the queue size module parameters.
+  *
+  * Each of rq1_entries, rq2_entries, rq3_entries and sq_entries must lie
+  * between EHEA_MIN_ENTRIES_QP and its own maximum.  Every offending
+  * parameter is logged.  Returns 0 if all are valid, -EINVAL otherwise.
+  */
+ static int check_module_parm(void)
+ {
+ 	int status = 0;
+
+ 	if (!(rq1_entries >= EHEA_MIN_ENTRIES_QP &&
+ 	      rq1_entries <= EHEA_MAX_ENTRIES_RQ1)) {
+ 		pr_info("Bad parameter: rq1_entries\n");
+ 		status = -EINVAL;
+ 	}
+
+ 	if (!(rq2_entries >= EHEA_MIN_ENTRIES_QP &&
+ 	      rq2_entries <= EHEA_MAX_ENTRIES_RQ2)) {
+ 		pr_info("Bad parameter: rq2_entries\n");
+ 		status = -EINVAL;
+ 	}
+
+ 	if (!(rq3_entries >= EHEA_MIN_ENTRIES_QP &&
+ 	      rq3_entries <= EHEA_MAX_ENTRIES_RQ3)) {
+ 		pr_info("Bad parameter: rq3_entries\n");
+ 		status = -EINVAL;
+ 	}
+
+ 	if (!(sq_entries >= EHEA_MIN_ENTRIES_QP &&
+ 	      sq_entries <= EHEA_MAX_ENTRIES_SQ)) {
+ 		pr_info("Bad parameter: sq_entries\n");
+ 		status = -EINVAL;
+ 	}
+
+ 	return status;
+ }
+
+ /* Driver attribute show handler: prints EHEA_CAPABILITIES as a decimal. */
+ static ssize_t ehea_show_capabilities(struct device_driver *drv,
+ 				      char *buf)
+ {
+ 	ssize_t len = sprintf(buf, "%d", EHEA_CAPABILITIES);
+
+ 	return len;
+ }
+
+ static DRIVER_ATTR(capabilities, S_IRUSR | S_IRGRP | S_IROTH,
+ ehea_show_capabilities, NULL);
+
+ /*
+  * ehea_module_init - module entry point.
+  *
+  * Validates the module parameters, creates the bus map, registers the
+  * reboot, memory and crash notifiers (failures there are only logged),
+  * registers the driver on the ibmebus and creates the "capabilities"
+  * driver attribute.
+  *
+  * Returns 0 on success or a negative errno.  On failure after the bus map
+  * was created, the notifiers are unregistered and the handle tables and the
+  * bus map are released again, mirroring ehea_module_exit().
+  */
+ int __init ehea_module_init(void)
+ {
+ 	int ret;
+
+ 	pr_info("IBM eHEA ethernet device driver (Release %s)\n", DRV_VERSION);
+
+ 	memset(&ehea_fw_handles, 0, sizeof(ehea_fw_handles));
+ 	memset(&ehea_bcmc_regs, 0, sizeof(ehea_bcmc_regs));
+
+ 	mutex_init(&ehea_fw_handles.lock);
+ 	spin_lock_init(&ehea_bcmc_regs.lock);
+
+ 	ret = check_module_parm();
+ 	if (ret)
+ 		goto out;
+
+ 	ret = ehea_create_busmap();
+ 	if (ret)
+ 		goto out;
+
+ 	/* notifier registration failures are not fatal */
+ 	ret = register_reboot_notifier(&ehea_reboot_nb);
+ 	if (ret)
+ 		pr_info("failed registering reboot notifier\n");
+
+ 	ret = register_memory_notifier(&ehea_mem_nb);
+ 	if (ret)
+ 		pr_info("failed registering memory remove notifier\n");
+
+ 	ret = crash_shutdown_register(ehea_crash_handler);
+ 	if (ret)
+ 		pr_info("failed registering crash handler\n");
+
+ 	ret = ibmebus_register_driver(&ehea_driver);
+ 	if (ret) {
+ 		pr_err("failed registering eHEA device driver on ebus\n");
+ 		goto out2;
+ 	}
+
+ 	ret = driver_create_file(&ehea_driver.driver,
+ 				 &driver_attr_capabilities);
+ 	if (ret) {
+ 		pr_err("failed to register capabilities attribute, ret=%d\n",
+ 		       ret);
+ 		goto out3;
+ 	}
+
+ 	return ret;
+
+ out3:
+ 	ibmebus_unregister_driver(&ehea_driver);
+ out2:
+ 	unregister_memory_notifier(&ehea_mem_nb);
+ 	unregister_reboot_notifier(&ehea_reboot_nb);
+ 	crash_shutdown_unregister(ehea_crash_handler);
+ 	/* previously leaked on this path; kfree(NULL) is a no-op */
+ 	kfree(ehea_fw_handles.arr);
+ 	kfree(ehea_bcmc_regs.arr);
+ 	ehea_destroy_busmap();
+ out:
+ 	return ret;
+ }
+
+ /*
+  * ehea_module_exit - module exit point.
+  *
+  * Removes the capabilities attribute, unregisters the driver and all
+  * notifiers, frees the firmware handle and bcmc registration tables and
+  * destroys the bus map.  A failing crash handler unregistration is only
+  * logged.
+  */
+ static void __exit ehea_module_exit(void)
+ {
+ int ret;
+
+ driver_remove_file(&ehea_driver.driver, &driver_attr_capabilities);
+ ibmebus_unregister_driver(&ehea_driver);
+ unregister_reboot_notifier(&ehea_reboot_nb);
+ ret = crash_shutdown_unregister(ehea_crash_handler);
+ if (ret)
+ pr_info("failed unregistering crash handler\n");
+ unregister_memory_notifier(&ehea_mem_nb);
+ /* tables are freed only after the crash handler that reads them is gone */
+ kfree(ehea_fw_handles.arr);
+ kfree(ehea_bcmc_regs.arr);
+ ehea_destroy_busmap();
+ }
+
+ module_init(ehea_module_init);
+ module_exit(ehea_module_exit);
struct device *dev = NULL;
spin_lock_irqsave(q->queue_lock, flags);
- sdev = q->queuedata;
+ sdev = scsi_device_from_queue(q);
- if (sdev && sdev->scsi_dh_data)
+ if (!sdev) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ err = SCSI_DH_NOSYS;
+ if (fn)
+ fn(data, err);
+ return err;
+ }
+
+ if (sdev->scsi_dh_data)
scsi_dh = sdev->scsi_dh_data->scsi_dh;
dev = get_device(&sdev->sdev_gendev);
if (!scsi_dh || !dev ||
#include <linux/notifier.h>
#include <linux/jiffies.h>
+ #include <asm/irq_regs.h>
+
+#include <linux/bootsplash.h>
+
extern void ctrl_alt_del(void);
/*
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
- #include "ext4_extents.h"
#include "truncate.h"
+#include "richacl.h"
#include <trace/events/ext4.h>
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
- inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+#ifdef CONFIG_EXT4_FS_RICHACL
+ ei->i_richacl = EXT4_RICHACL_NOT_CACHED;
+#endif
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
&journal_ioprio, NULL, 0))
goto failed_mount;
+ /*
+ * data=journal is incompatible with delayed allocation and with
+ * dioread_nolock. Warn once, fail the mount if delalloc was requested
+ * explicitly or dioread_nolock is set, and otherwise clear DELALLOC
+ * when it is set.
+ */
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
+ "with data=journal disables delayed "
+ "allocation and O_DIRECT support!\n");
+ if (test_opt2(sb, EXPLICIT_DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and delalloc");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ /* Name the option that actually conflicts, not delalloc. */
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dioread_nolock");
+ goto failed_mount;
+ }
+ if (test_opt(sb, DELALLOC))
+ clear_opt(sb, DELALLOC);
+ }
+
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (test_opt(sb, DIOREAD_NOLOCK)) {
+ if (blocksize < PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "dioread_nolock if block size != PAGE_SIZE");
+ goto failed_mount;
+ }
+ }
+
- sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
- (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
-
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
(EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
if (sb->s_op->remount_fs) {
retval = sb->s_op->remount_fs(sb, &flags, data);
- if (retval)
- return retval;
+ if (retval) {
- if (!force)
++ if (!(rflags & REMOUNT_FORCE))
+ return retval;
+ /* If forced remount, go ahead despite any errors */
+ WARN(1, "forced remount of a %s fs returned %i\n",
+ sb->s_type->name, retval);
+ }
}
sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
#ifdef CONFIG_PRINTK
+ extern int __dev_printk(const char *level, const struct device *dev,
+ struct va_format *vaf);
+ extern __printf(3, 4)
++
+#if defined(KMSG_COMPONENT) && (defined(CONFIG_KMSG_IDS) || defined(__KMSG_CHECKER))
+/* dev_printk_hash for message documentation */
+#if defined(__KMSG_CHECKER) && defined(KMSG_COMPONENT)
+
+/* generate magic string for scripts/kmsg-doc to parse */
+#define dev_printk_hash(level, dev, format, arg...) \
+ __KMSG_DEV(level _FMT_ format _ARGS_ dev, ## arg _END_)
+
+#elif defined(CONFIG_KMSG_IDS) && defined(KMSG_COMPONENT)
+
+int printk_dev_hash(const char *, const char *, const char *, ...);
+#define dev_printk_hash(level, dev, format, arg...) \
+ printk_dev_hash(level "%s.%06x: ", dev_driver_string(dev), \
+ "%s: " format, dev_name(dev), ## arg)
+
+#endif
+
+#define dev_printk(level, dev, format, arg...) \
+ dev_printk_hash(level , dev, format, ## arg)
+#define dev_emerg(dev, format, arg...) \
+ dev_printk_hash(KERN_EMERG , dev , format , ## arg)
+#define dev_alert(dev, format, arg...) \
+ dev_printk_hash(KERN_ALERT , dev , format , ## arg)
+#define dev_crit(dev, format, arg...) \
+ dev_printk_hash(KERN_CRIT , dev , format , ## arg)
+#define dev_err(dev, format, arg...) \
+ dev_printk_hash(KERN_ERR , dev , format , ## arg)
+#define dev_warn(dev, format, arg...) \
+ dev_printk_hash(KERN_WARNING , dev , format , ## arg)
+#define dev_notice(dev, format, arg...) \
+ dev_printk_hash(KERN_NOTICE , dev , format , ## arg)
+#define _dev_info(dev, format, arg...) \
+ dev_printk_hash(KERN_INFO , dev , format , ## arg)
+#else
- extern int dev_printk(const char *level, const struct device *dev,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
- extern int dev_emerg(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int dev_alert(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int dev_crit(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int dev_err(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int dev_warn(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int dev_notice(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- extern int _dev_info(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
+ int dev_printk(const char *level, const struct device *dev,
+ const char *fmt, ...)
+ ;
+ extern __printf(2, 3)
+ int dev_emerg(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_alert(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_crit(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_err(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_warn(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int dev_notice(const struct device *dev, const char *fmt, ...);
+ extern __printf(2, 3)
+ int _dev_info(const struct device *dev, const char *fmt, ...);
-
+#endif
#else
- static inline int dev_printk(const char *level, const struct device *dev,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
- static inline int dev_printk(const char *level, const struct device *dev,
- const char *fmt, ...)
- { return 0; }
-
- static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_emerg(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int dev_crit(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_crit(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int dev_alert(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_alert(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int dev_err(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_err(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int dev_warn(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_warn(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int dev_notice(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int dev_notice(const struct device *dev, const char *fmt, ...)
- { return 0; }
- static inline int _dev_info(const struct device *dev, const char *fmt, ...)
- __attribute__ ((format (printf, 2, 3)));
- static inline int _dev_info(const struct device *dev, const char *fmt, ...)
- { return 0; }
+ static inline int __dev_printk(const char *level, const struct device *dev,
+ struct va_format *vaf)
+ { return 0; }
+ static inline __printf(3, 4)
+ int dev_printk(const char *level, const struct device *dev,
+ const char *fmt, ...)
+ { return 0; }
+
+ static inline __printf(2, 3)
+ int dev_emerg(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_crit(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_alert(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_err(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_warn(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int dev_notice(const struct device *dev, const char *fmt, ...)
+ { return 0; }
+ static inline __printf(2, 3)
+ int _dev_info(const struct device *dev, const char *fmt, ...)
+ { return 0; }
#endif
#define TAINT_WARN 9
#define TAINT_CRAP 10
#define TAINT_FIRMWARE_WORKAROUND 11
+ #define TAINT_OOT_MODULE 12
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+/*
+ * Take the upper bits to hopefully allow them
+ * to stay the same for more than one release.
+ */
+#define TAINT_NO_SUPPORT 30
+#define TAINT_EXTERNAL_SUPPORT 31
+#endif
+
extern const char hex_asc[];
#define hex_asc_lo(x) hex_asc[((x) & 0x0f)]
#define hex_asc_hi(x) hex_asc[((x) & 0xf0) >> 4]
#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */
#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
- #define NFS_INO_SEEN_GETATTR (10) /* flag to track if app is calling
+ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
++#define NFS_INO_SEEN_GETATTR (11) /* flag to track if app is calling
+ * getattr in a directory during
+ * readdir
+ */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
+ { TAINT_OOT_MODULE, 'O', ' ' },
+#ifdef CONFIG_ENTERPRISE_SUPPORT
+ { TAINT_NO_SUPPORT, 'N', ' ' },
+ { TAINT_EXTERNAL_SUPPORT, 'X', ' ' },
+#endif
};
/**
* 'W' - Taint on warning.
* 'C' - modules from drivers/staging are loaded.
* 'I' - Working around severe firmware bug.
+ * 'O' - Out-of-tree module has been loaded.
+ * 'N' - Unsupported modules loaded.
+ * 'X' - Modules with external support loaded.
*
* The string is overwritten by the next call to print_tainted().
*/
depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT
depends on !X86_64
select STACKTRACE
- select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !X86
- select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
++ select FRAME_POINTER if !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
+ select UNWIND_INFO if X86 && !FRAME_POINTER
help
Provide stacktrace filter for fault-injection capabilities
depends on DEBUG_KERNEL
depends on STACKTRACE_SUPPORT
depends on PROC_FS
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86
- select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND
++ select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !X86 && !ARM_UNWIND
+ select UNWIND_INFO if X86 && !FRAME_POINTER
select KALLSYMS
select KALLSYMS_ALL
select STACKTRACE
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ if (!(gfp_mask & __GFP_WAIT)) {
+ pr_info("The following is only an harmless informational message.\n");
+ pr_info("Unless you get a _continuous_flood_ of these messages it means\n");
+ pr_info("everything is working fine. Allocations from irqs cannot be\n");
+ pr_info("perfectly reliable and the kernel is designed to handle that.\n");
+ }
+ pr_info("%s: page allocation failure. order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
+ current->comm, order, gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
static FILE *debugfile;
int cur_line = 1;
- char *cur_filename;
+ char *cur_filename, *source_file;
+ int in_source_file;
static int flag_debug, flag_dump_defs, flag_reference, flag_dump_types,
- flag_preserve, flag_warnings;
+ flag_override, flag_preserve, flag_warnings;
static const char *arch = "";
static const char *mod_prefix = "";
buf.pos = 0;
add_header(&buf, mod);
+ add_intree_flag(&buf, !external_module);
add_staging_flag(&buf, mod->name);
+ add_supported_flag(&buf, mod);
err |= add_versions(&buf, mod);
add_depends(&buf, mod, modules);
add_moddevtable(&buf, mod);