Merge branch 'for-davem' of git://git.kernel.org/pub/scm/linux/kernel/git/linville...
[linux-flexiantxendom0-3.2.10.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #define ROOT_SIZE               VTD_PAGE_SIZE
49 #define CONTEXT_SIZE            VTD_PAGE_SIZE
50
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55 #define IOAPIC_RANGE_START      (0xfee00000)
56 #define IOAPIC_RANGE_END        (0xfeefffff)
57 #define IOVA_START_ADDR         (0x1000)
58
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61 #define MAX_AGAW_WIDTH 64
62
63 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
69                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
75
76 /* page table handling */
77 #define LEVEL_STRIDE            (9)
78 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
79
80 /*
81  * This bitmap is used to advertise the page sizes our hardware support
82  * to the IOMMU core, which will then use this information to split
83  * physically contiguous memory regions it is mapping into page sizes
84  * that we support.
85  *
86  * Traditionally the IOMMU core just handed us the mappings directly,
87  * after making sure the size is an order of a 4KiB page and that the
88  * mapping has natural alignment.
89  *
90  * To retain this behavior, we currently advertise that we support
91  * all page sizes that are an order of 4KiB.
92  *
93  * If at some point we'd like to utilize the IOMMU core's new behavior,
94  * we could change this to advertise the real page sizes we support.
95  */
96 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
97
98 static inline int agaw_to_level(int agaw)
99 {
100         return agaw + 2;
101 }
102
103 static inline int agaw_to_width(int agaw)
104 {
105         return 30 + agaw * LEVEL_STRIDE;
106 }
107
108 static inline int width_to_agaw(int width)
109 {
110         return (width - 30) / LEVEL_STRIDE;
111 }
112
113 static inline unsigned int level_to_offset_bits(int level)
114 {
115         return (level - 1) * LEVEL_STRIDE;
116 }
117
118 static inline int pfn_level_offset(unsigned long pfn, int level)
119 {
120         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
121 }
122
123 static inline unsigned long level_mask(int level)
124 {
125         return -1UL << level_to_offset_bits(level);
126 }
127
128 static inline unsigned long level_size(int level)
129 {
130         return 1UL << level_to_offset_bits(level);
131 }
132
133 static inline unsigned long align_to_level(unsigned long pfn, int level)
134 {
135         return (pfn + level_size(level) - 1) & level_mask(level);
136 }
137
138 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
139 {
140         return  1 << ((lvl - 1) * LEVEL_STRIDE);
141 }
142
143 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
144    are never going to work. */
145 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
146 {
147         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
148 }
149
150 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
151 {
152         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 }
154 static inline unsigned long page_to_dma_pfn(struct page *pg)
155 {
156         return mm_to_dma_pfn(page_to_pfn(pg));
157 }
158 static inline unsigned long virt_to_dma_pfn(void *p)
159 {
160         return page_to_dma_pfn(virt_to_page(p));
161 }
162
163 /* global iommu list, set NULL for ignored DMAR units */
164 static struct intel_iommu **g_iommus;
165
166 static void __init check_tylersburg_isoch(void);
167 static int rwbf_quirk;
168
169 /*
170  * set to 1 to panic kernel if can't successfully enable VT-d
171  * (used when kernel is launched w/ TXT)
172  */
173 static int force_on = 0;
174
175 /*
176  * 0: Present
177  * 1-11: Reserved
178  * 12-63: Context Ptr (12 - (haw-1))
179  * 64-127: Reserved
180  */
181 struct root_entry {
182         u64     val;
183         u64     rsvd1;
184 };
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 static inline bool root_present(struct root_entry *root)
187 {
188         return (root->val & 1);
189 }
190 static inline void set_root_present(struct root_entry *root)
191 {
192         root->val |= 1;
193 }
194 static inline void set_root_value(struct root_entry *root, unsigned long value)
195 {
196         root->val |= value & VTD_PAGE_MASK;
197 }
198
199 static inline struct context_entry *
200 get_context_addr_from_root(struct root_entry *root)
201 {
202         return (struct context_entry *)
203                 (root_present(root)?phys_to_virt(
204                 root->val & VTD_PAGE_MASK) :
205                 NULL);
206 }
207
208 /*
209  * low 64 bits:
210  * 0: present
211  * 1: fault processing disable
212  * 2-3: translation type
213  * 12-63: address space root
214  * high 64 bits:
215  * 0-2: address width
216  * 3-6: aval
217  * 8-23: domain id
218  */
219 struct context_entry {
220         u64 lo;
221         u64 hi;
222 };
223
224 static inline bool context_present(struct context_entry *context)
225 {
226         return (context->lo & 1);
227 }
228 static inline void context_set_present(struct context_entry *context)
229 {
230         context->lo |= 1;
231 }
232
233 static inline void context_set_fault_enable(struct context_entry *context)
234 {
235         context->lo &= (((u64)-1) << 2) | 1;
236 }
237
238 static inline void context_set_translation_type(struct context_entry *context,
239                                                 unsigned long value)
240 {
241         context->lo &= (((u64)-1) << 4) | 3;
242         context->lo |= (value & 3) << 2;
243 }
244
245 static inline void context_set_address_root(struct context_entry *context,
246                                             unsigned long value)
247 {
248         context->lo |= value & VTD_PAGE_MASK;
249 }
250
251 static inline void context_set_address_width(struct context_entry *context,
252                                              unsigned long value)
253 {
254         context->hi |= value & 7;
255 }
256
257 static inline void context_set_domain_id(struct context_entry *context,
258                                          unsigned long value)
259 {
260         context->hi |= (value & ((1 << 16) - 1)) << 8;
261 }
262
263 static inline void context_clear_entry(struct context_entry *context)
264 {
265         context->lo = 0;
266         context->hi = 0;
267 }
268
269 /*
270  * 0: readable
271  * 1: writable
272  * 2-6: reserved
273  * 7: super page
274  * 8-10: available
275  * 11: snoop behavior
276  * 12-63: Host physcial address
277  */
278 struct dma_pte {
279         u64 val;
280 };
281
282 static inline void dma_clear_pte(struct dma_pte *pte)
283 {
284         pte->val = 0;
285 }
286
287 static inline void dma_set_pte_readable(struct dma_pte *pte)
288 {
289         pte->val |= DMA_PTE_READ;
290 }
291
292 static inline void dma_set_pte_writable(struct dma_pte *pte)
293 {
294         pte->val |= DMA_PTE_WRITE;
295 }
296
297 static inline void dma_set_pte_snp(struct dma_pte *pte)
298 {
299         pte->val |= DMA_PTE_SNP;
300 }
301
302 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
303 {
304         pte->val = (pte->val & ~3) | (prot & 3);
305 }
306
307 static inline u64 dma_pte_addr(struct dma_pte *pte)
308 {
309 #ifdef CONFIG_64BIT
310         return pte->val & VTD_PAGE_MASK;
311 #else
312         /* Must have a full atomic 64-bit read */
313         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
314 #endif
315 }
316
317 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
318 {
319         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
320 }
321
322 static inline bool dma_pte_present(struct dma_pte *pte)
323 {
324         return (pte->val & 3) != 0;
325 }
326
327 static inline bool dma_pte_superpage(struct dma_pte *pte)
328 {
329         return (pte->val & (1 << 7));
330 }
331
332 static inline int first_pte_in_page(struct dma_pte *pte)
333 {
334         return !((unsigned long)pte & ~VTD_PAGE_MASK);
335 }
336
337 /*
338  * This domain is a statically identity mapping domain.
339  *      1. This domain creats a static 1:1 mapping to all usable memory.
340  *      2. It maps to each iommu if successful.
341  *      3. Each iommu mapps to this domain if successful.
342  */
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
345
346 /* devices under the same p2p bridge are owned in one domain */
347 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
348
349 /* domain represents a virtual machine, more than one devices
350  * across iommus may be owned in one domain, e.g. kvm guest.
351  */
352 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
353
354 /* si_domain contains mulitple devices */
355 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
356
357 /* define the limit of IOMMUs supported in each domain */
358 #ifdef  CONFIG_X86
359 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
360 #else
361 # define        IOMMU_UNITS_SUPPORTED   64
362 #endif
363
364 struct dmar_domain {
365         int     id;                     /* domain id */
366         int     nid;                    /* node id */
367         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
368                                         /* bitmap of iommus this domain uses*/
369
370         struct list_head devices;       /* all devices' list */
371         struct iova_domain iovad;       /* iova's that belong to this domain */
372
373         struct dma_pte  *pgd;           /* virtual address */
374         int             gaw;            /* max guest address width */
375
376         /* adjusted guest address width, 0 is level 2 30-bit */
377         int             agaw;
378
379         int             flags;          /* flags to find out type of domain */
380
381         int             iommu_coherency;/* indicate coherency of iommu access */
382         int             iommu_snooping; /* indicate snooping control feature*/
383         int             iommu_count;    /* reference count of iommu */
384         int             iommu_superpage;/* Level of superpages supported:
385                                            0 == 4KiB (no superpages), 1 == 2MiB,
386                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
387         spinlock_t      iommu_lock;     /* protect iommu set in domain */
388         u64             max_addr;       /* maximum mapped address */
389 };
390
391 /* PCI domain-device relationship */
392 struct device_domain_info {
393         struct list_head link;  /* link to domain siblings */
394         struct list_head global; /* link to global list */
395         int segment;            /* PCI domain */
396         u8 bus;                 /* PCI bus number */
397         u8 devfn;               /* PCI devfn number */
398         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
399         struct intel_iommu *iommu; /* IOMMU used by this device */
400         struct dmar_domain *domain; /* pointer to domain */
401 };
402
403 static void flush_unmaps_timeout(unsigned long data);
404
405 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
406
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
409         int next;
410         struct iova *iova[HIGH_WATER_MARK];
411         struct dmar_domain *domain[HIGH_WATER_MARK];
412 };
413
414 static struct deferred_flush_tables *deferred_flush;
415
416 /* bitmap for indexing intel_iommus */
417 static int g_num_of_iommus;
418
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
421
422 static int timer_on;
423 static long list_size;
424
425 static void domain_remove_dev_info(struct dmar_domain *domain);
426
427 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
428 int dmar_disabled = 0;
429 #else
430 int dmar_disabled = 1;
431 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432
433 int intel_iommu_enabled = 0;
434 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435
436 static int dmar_map_gfx = 1;
437 static int dmar_forcedac;
438 static int intel_iommu_strict;
439 static int intel_iommu_superpage = 1;
440
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
447
448 static struct iommu_ops intel_iommu_ops;
449
450 static int __init intel_iommu_setup(char *str)
451 {
452         if (!str)
453                 return -EINVAL;
454         while (*str) {
455                 if (!strncmp(str, "on", 2)) {
456                         dmar_disabled = 0;
457                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
458                 } else if (!strncmp(str, "off", 3)) {
459                         dmar_disabled = 1;
460                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
461                 } else if (!strncmp(str, "igfx_off", 8)) {
462                         dmar_map_gfx = 0;
463                         printk(KERN_INFO
464                                 "Intel-IOMMU: disable GFX device mapping\n");
465                 } else if (!strncmp(str, "forcedac", 8)) {
466                         printk(KERN_INFO
467                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
468                         dmar_forcedac = 1;
469                 } else if (!strncmp(str, "strict", 6)) {
470                         printk(KERN_INFO
471                                 "Intel-IOMMU: disable batched IOTLB flush\n");
472                         intel_iommu_strict = 1;
473                 } else if (!strncmp(str, "sp_off", 6)) {
474                         printk(KERN_INFO
475                                 "Intel-IOMMU: disable supported super page\n");
476                         intel_iommu_superpage = 0;
477                 }
478
479                 str += strcspn(str, ",");
480                 while (*str == ',')
481                         str++;
482         }
483         return 0;
484 }
485 __setup("intel_iommu=", intel_iommu_setup);
486
487 static struct kmem_cache *iommu_domain_cache;
488 static struct kmem_cache *iommu_devinfo_cache;
489 static struct kmem_cache *iommu_iova_cache;
490
491 static inline void *alloc_pgtable_page(int node)
492 {
493         struct page *page;
494         void *vaddr = NULL;
495
496         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497         if (page)
498                 vaddr = page_address(page);
499         return vaddr;
500 }
501
502 static inline void free_pgtable_page(void *vaddr)
503 {
504         free_page((unsigned long)vaddr);
505 }
506
507 static inline void *alloc_domain_mem(void)
508 {
509         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
510 }
511
512 static void free_domain_mem(void *vaddr)
513 {
514         kmem_cache_free(iommu_domain_cache, vaddr);
515 }
516
517 static inline void * alloc_devinfo_mem(void)
518 {
519         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
520 }
521
522 static inline void free_devinfo_mem(void *vaddr)
523 {
524         kmem_cache_free(iommu_devinfo_cache, vaddr);
525 }
526
527 struct iova *alloc_iova_mem(void)
528 {
529         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
530 }
531
532 void free_iova_mem(struct iova *iova)
533 {
534         kmem_cache_free(iommu_iova_cache, iova);
535 }
536
537
538 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
539 {
540         unsigned long sagaw;
541         int agaw = -1;
542
543         sagaw = cap_sagaw(iommu->cap);
544         for (agaw = width_to_agaw(max_gaw);
545              agaw >= 0; agaw--) {
546                 if (test_bit(agaw, &sagaw))
547                         break;
548         }
549
550         return agaw;
551 }
552
553 /*
554  * Calculate max SAGAW for each iommu.
555  */
556 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
557 {
558         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
559 }
560
561 /*
562  * calculate agaw for each iommu.
563  * "SAGAW" may be different across iommus, use a default agaw, and
564  * get a supported less agaw for iommus that don't support the default agaw.
565  */
566 int iommu_calculate_agaw(struct intel_iommu *iommu)
567 {
568         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
569 }
570
571 /* This functionin only returns single iommu in a domain */
572 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
573 {
574         int iommu_id;
575
576         /* si_domain and vm domain should not get here. */
577         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
579
580         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582                 return NULL;
583
584         return g_iommus[iommu_id];
585 }
586
587 static void domain_update_iommu_coherency(struct dmar_domain *domain)
588 {
589         int i;
590
591         domain->iommu_coherency = 1;
592
593         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
594                 if (!ecap_coherent(g_iommus[i]->ecap)) {
595                         domain->iommu_coherency = 0;
596                         break;
597                 }
598         }
599 }
600
601 static void domain_update_iommu_snooping(struct dmar_domain *domain)
602 {
603         int i;
604
605         domain->iommu_snooping = 1;
606
607         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
608                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
609                         domain->iommu_snooping = 0;
610                         break;
611                 }
612         }
613 }
614
615 static void domain_update_iommu_superpage(struct dmar_domain *domain)
616 {
617         struct dmar_drhd_unit *drhd;
618         struct intel_iommu *iommu = NULL;
619         int mask = 0xf;
620
621         if (!intel_iommu_superpage) {
622                 domain->iommu_superpage = 0;
623                 return;
624         }
625
626         /* set iommu_superpage to the smallest common denominator */
627         for_each_active_iommu(iommu, drhd) {
628                 mask &= cap_super_page_val(iommu->cap);
629                 if (!mask) {
630                         break;
631                 }
632         }
633         domain->iommu_superpage = fls(mask);
634 }
635
636 /* Some capabilities may be different across iommus */
637 static void domain_update_iommu_cap(struct dmar_domain *domain)
638 {
639         domain_update_iommu_coherency(domain);
640         domain_update_iommu_snooping(domain);
641         domain_update_iommu_superpage(domain);
642 }
643
644 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
645 {
646         struct dmar_drhd_unit *drhd = NULL;
647         int i;
648
649         for_each_drhd_unit(drhd) {
650                 if (drhd->ignored)
651                         continue;
652                 if (segment != drhd->segment)
653                         continue;
654
655                 for (i = 0; i < drhd->devices_cnt; i++) {
656                         if (drhd->devices[i] &&
657                             drhd->devices[i]->bus->number == bus &&
658                             drhd->devices[i]->devfn == devfn)
659                                 return drhd->iommu;
660                         if (drhd->devices[i] &&
661                             drhd->devices[i]->subordinate &&
662                             drhd->devices[i]->subordinate->number <= bus &&
663                             drhd->devices[i]->subordinate->subordinate >= bus)
664                                 return drhd->iommu;
665                 }
666
667                 if (drhd->include_all)
668                         return drhd->iommu;
669         }
670
671         return NULL;
672 }
673
674 static void domain_flush_cache(struct dmar_domain *domain,
675                                void *addr, int size)
676 {
677         if (!domain->iommu_coherency)
678                 clflush_cache_range(addr, size);
679 }
680
681 /* Gets context entry for a given bus and devfn */
682 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
683                 u8 bus, u8 devfn)
684 {
685         struct root_entry *root;
686         struct context_entry *context;
687         unsigned long phy_addr;
688         unsigned long flags;
689
690         spin_lock_irqsave(&iommu->lock, flags);
691         root = &iommu->root_entry[bus];
692         context = get_context_addr_from_root(root);
693         if (!context) {
694                 context = (struct context_entry *)
695                                 alloc_pgtable_page(iommu->node);
696                 if (!context) {
697                         spin_unlock_irqrestore(&iommu->lock, flags);
698                         return NULL;
699                 }
700                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
701                 phy_addr = virt_to_phys((void *)context);
702                 set_root_value(root, phy_addr);
703                 set_root_present(root);
704                 __iommu_flush_cache(iommu, root, sizeof(*root));
705         }
706         spin_unlock_irqrestore(&iommu->lock, flags);
707         return &context[devfn];
708 }
709
710 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
711 {
712         struct root_entry *root;
713         struct context_entry *context;
714         int ret;
715         unsigned long flags;
716
717         spin_lock_irqsave(&iommu->lock, flags);
718         root = &iommu->root_entry[bus];
719         context = get_context_addr_from_root(root);
720         if (!context) {
721                 ret = 0;
722                 goto out;
723         }
724         ret = context_present(&context[devfn]);
725 out:
726         spin_unlock_irqrestore(&iommu->lock, flags);
727         return ret;
728 }
729
730 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
731 {
732         struct root_entry *root;
733         struct context_entry *context;
734         unsigned long flags;
735
736         spin_lock_irqsave(&iommu->lock, flags);
737         root = &iommu->root_entry[bus];
738         context = get_context_addr_from_root(root);
739         if (context) {
740                 context_clear_entry(&context[devfn]);
741                 __iommu_flush_cache(iommu, &context[devfn], \
742                         sizeof(*context));
743         }
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
747 static void free_context_table(struct intel_iommu *iommu)
748 {
749         struct root_entry *root;
750         int i;
751         unsigned long flags;
752         struct context_entry *context;
753
754         spin_lock_irqsave(&iommu->lock, flags);
755         if (!iommu->root_entry) {
756                 goto out;
757         }
758         for (i = 0; i < ROOT_ENTRY_NR; i++) {
759                 root = &iommu->root_entry[i];
760                 context = get_context_addr_from_root(root);
761                 if (context)
762                         free_pgtable_page(context);
763         }
764         free_pgtable_page(iommu->root_entry);
765         iommu->root_entry = NULL;
766 out:
767         spin_unlock_irqrestore(&iommu->lock, flags);
768 }
769
770 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
771                                       unsigned long pfn, int target_level)
772 {
773         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
774         struct dma_pte *parent, *pte = NULL;
775         int level = agaw_to_level(domain->agaw);
776         int offset;
777
778         BUG_ON(!domain->pgd);
779         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
780         parent = domain->pgd;
781
782         while (level > 0) {
783                 void *tmp_page;
784
785                 offset = pfn_level_offset(pfn, level);
786                 pte = &parent[offset];
787                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
788                         break;
789                 if (level == target_level)
790                         break;
791
792                 if (!dma_pte_present(pte)) {
793                         uint64_t pteval;
794
795                         tmp_page = alloc_pgtable_page(domain->nid);
796
797                         if (!tmp_page)
798                                 return NULL;
799
800                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
801                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
802                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
803                                 /* Someone else set it while we were thinking; use theirs. */
804                                 free_pgtable_page(tmp_page);
805                         } else {
806                                 dma_pte_addr(pte);
807                                 domain_flush_cache(domain, pte, sizeof(*pte));
808                         }
809                 }
810                 parent = phys_to_virt(dma_pte_addr(pte));
811                 level--;
812         }
813
814         return pte;
815 }
816
817
818 /* return address's pte at specific level */
819 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
820                                          unsigned long pfn,
821                                          int level, int *large_page)
822 {
823         struct dma_pte *parent, *pte = NULL;
824         int total = agaw_to_level(domain->agaw);
825         int offset;
826
827         parent = domain->pgd;
828         while (level <= total) {
829                 offset = pfn_level_offset(pfn, total);
830                 pte = &parent[offset];
831                 if (level == total)
832                         return pte;
833
834                 if (!dma_pte_present(pte)) {
835                         *large_page = total;
836                         break;
837                 }
838
839                 if (pte->val & DMA_PTE_LARGE_PAGE) {
840                         *large_page = total;
841                         return pte;
842                 }
843
844                 parent = phys_to_virt(dma_pte_addr(pte));
845                 total--;
846         }
847         return NULL;
848 }
849
850 /* clear last level pte, a tlb flush should be followed */
851 static int dma_pte_clear_range(struct dmar_domain *domain,
852                                 unsigned long start_pfn,
853                                 unsigned long last_pfn)
854 {
855         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
856         unsigned int large_page = 1;
857         struct dma_pte *first_pte, *pte;
858         int order;
859
860         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
861         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
862         BUG_ON(start_pfn > last_pfn);
863
864         /* we don't need lock here; nobody else touches the iova range */
865         do {
866                 large_page = 1;
867                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
868                 if (!pte) {
869                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
870                         continue;
871                 }
872                 do {
873                         dma_clear_pte(pte);
874                         start_pfn += lvl_to_nr_pages(large_page);
875                         pte++;
876                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
877
878                 domain_flush_cache(domain, first_pte,
879                                    (void *)pte - (void *)first_pte);
880
881         } while (start_pfn && start_pfn <= last_pfn);
882
883         order = (large_page - 1) * 9;
884         return order;
885 }
886
887 /* free page table pages. last level pte should already be cleared */
888 static void dma_pte_free_pagetable(struct dmar_domain *domain,
889                                    unsigned long start_pfn,
890                                    unsigned long last_pfn)
891 {
892         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
893         struct dma_pte *first_pte, *pte;
894         int total = agaw_to_level(domain->agaw);
895         int level;
896         unsigned long tmp;
897         int large_page = 2;
898
899         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
900         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
901         BUG_ON(start_pfn > last_pfn);
902
903         /* We don't need lock here; nobody else touches the iova range */
904         level = 2;
905         while (level <= total) {
906                 tmp = align_to_level(start_pfn, level);
907
908                 /* If we can't even clear one PTE at this level, we're done */
909                 if (tmp + level_size(level) - 1 > last_pfn)
910                         return;
911
912                 do {
913                         large_page = level;
914                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
915                         if (large_page > level)
916                                 level = large_page + 1;
917                         if (!pte) {
918                                 tmp = align_to_level(tmp + 1, level + 1);
919                                 continue;
920                         }
921                         do {
922                                 if (dma_pte_present(pte)) {
923                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
924                                         dma_clear_pte(pte);
925                                 }
926                                 pte++;
927                                 tmp += level_size(level);
928                         } while (!first_pte_in_page(pte) &&
929                                  tmp + level_size(level) - 1 <= last_pfn);
930
931                         domain_flush_cache(domain, first_pte,
932                                            (void *)pte - (void *)first_pte);
933                         
934                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
935                 level++;
936         }
937         /* free pgd */
938         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
939                 free_pgtable_page(domain->pgd);
940                 domain->pgd = NULL;
941         }
942 }
943
944 /* iommu handling */
945 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
946 {
947         struct root_entry *root;
948         unsigned long flags;
949
950         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
951         if (!root)
952                 return -ENOMEM;
953
954         __iommu_flush_cache(iommu, root, ROOT_SIZE);
955
956         spin_lock_irqsave(&iommu->lock, flags);
957         iommu->root_entry = root;
958         spin_unlock_irqrestore(&iommu->lock, flags);
959
960         return 0;
961 }
962
963 static void iommu_set_root_entry(struct intel_iommu *iommu)
964 {
965         void *addr;
966         u32 sts;
967         unsigned long flag;
968
969         addr = iommu->root_entry;
970
971         raw_spin_lock_irqsave(&iommu->register_lock, flag);
972         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
973
974         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
975
976         /* Make sure hardware complete it */
977         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
978                       readl, (sts & DMA_GSTS_RTPS), sts);
979
980         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
981 }
982
983 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
984 {
985         u32 val;
986         unsigned long flag;
987
988         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
989                 return;
990
991         raw_spin_lock_irqsave(&iommu->register_lock, flag);
992         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
993
994         /* Make sure hardware complete it */
995         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
996                       readl, (!(val & DMA_GSTS_WBFS)), val);
997
998         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
999 }
1000
1001 /* return value determine if we need a write buffer flush */
1002 static void __iommu_flush_context(struct intel_iommu *iommu,
1003                                   u16 did, u16 source_id, u8 function_mask,
1004                                   u64 type)
1005 {
1006         u64 val = 0;
1007         unsigned long flag;
1008
1009         switch (type) {
1010         case DMA_CCMD_GLOBAL_INVL:
1011                 val = DMA_CCMD_GLOBAL_INVL;
1012                 break;
1013         case DMA_CCMD_DOMAIN_INVL:
1014                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1015                 break;
1016         case DMA_CCMD_DEVICE_INVL:
1017                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1018                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1019                 break;
1020         default:
1021                 BUG();
1022         }
1023         val |= DMA_CCMD_ICC;
1024
1025         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1026         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1027
1028         /* Make sure hardware complete it */
1029         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1030                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1031
1032         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1033 }
1034
1035 /* return value determine if we need a write buffer flush */
1036 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1037                                 u64 addr, unsigned int size_order, u64 type)
1038 {
1039         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1040         u64 val = 0, val_iva = 0;
1041         unsigned long flag;
1042
1043         switch (type) {
1044         case DMA_TLB_GLOBAL_FLUSH:
1045                 /* global flush doesn't need set IVA_REG */
1046                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1047                 break;
1048         case DMA_TLB_DSI_FLUSH:
1049                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1050                 break;
1051         case DMA_TLB_PSI_FLUSH:
1052                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1053                 /* Note: always flush non-leaf currently */
1054                 val_iva = size_order | addr;
1055                 break;
1056         default:
1057                 BUG();
1058         }
1059         /* Note: set drain read/write */
1060 #if 0
1061         /*
1062          * This is probably to be super secure.. Looks like we can
1063          * ignore it without any impact.
1064          */
1065         if (cap_read_drain(iommu->cap))
1066                 val |= DMA_TLB_READ_DRAIN;
1067 #endif
1068         if (cap_write_drain(iommu->cap))
1069                 val |= DMA_TLB_WRITE_DRAIN;
1070
1071         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1072         /* Note: Only uses first TLB reg currently */
1073         if (val_iva)
1074                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1075         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1076
1077         /* Make sure hardware complete it */
1078         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1079                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1080
1081         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1082
1083         /* check IOTLB invalidation granularity */
1084         if (DMA_TLB_IAIG(val) == 0)
1085                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1086         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1087                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1088                         (unsigned long long)DMA_TLB_IIRG(type),
1089                         (unsigned long long)DMA_TLB_IAIG(val));
1090 }
1091
1092 static struct device_domain_info *iommu_support_dev_iotlb(
1093         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1094 {
1095         int found = 0;
1096         unsigned long flags;
1097         struct device_domain_info *info;
1098         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1099
1100         if (!ecap_dev_iotlb_support(iommu->ecap))
1101                 return NULL;
1102
1103         if (!iommu->qi)
1104                 return NULL;
1105
1106         spin_lock_irqsave(&device_domain_lock, flags);
1107         list_for_each_entry(info, &domain->devices, link)
1108                 if (info->bus == bus && info->devfn == devfn) {
1109                         found = 1;
1110                         break;
1111                 }
1112         spin_unlock_irqrestore(&device_domain_lock, flags);
1113
1114         if (!found || !info->dev)
1115                 return NULL;
1116
1117         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1118                 return NULL;
1119
1120         if (!dmar_find_matched_atsr_unit(info->dev))
1121                 return NULL;
1122
1123         info->iommu = iommu;
1124
1125         return info;
1126 }
1127
1128 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1129 {
1130         if (!info)
1131                 return;
1132
1133         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1134 }
1135
1136 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1137 {
1138         if (!info->dev || !pci_ats_enabled(info->dev))
1139                 return;
1140
1141         pci_disable_ats(info->dev);
1142 }
1143
1144 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1145                                   u64 addr, unsigned mask)
1146 {
1147         u16 sid, qdep;
1148         unsigned long flags;
1149         struct device_domain_info *info;
1150
1151         spin_lock_irqsave(&device_domain_lock, flags);
1152         list_for_each_entry(info, &domain->devices, link) {
1153                 if (!info->dev || !pci_ats_enabled(info->dev))
1154                         continue;
1155
1156                 sid = info->bus << 8 | info->devfn;
1157                 qdep = pci_ats_queue_depth(info->dev);
1158                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1159         }
1160         spin_unlock_irqrestore(&device_domain_lock, flags);
1161 }
1162
1163 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1164                                   unsigned long pfn, unsigned int pages, int map)
1165 {
1166         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1167         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1168
1169         BUG_ON(pages == 0);
1170
1171         /*
1172          * Fallback to domain selective flush if no PSI support or the size is
1173          * too big.
1174          * PSI requires page size to be 2 ^ x, and the base address is naturally
1175          * aligned to the size
1176          */
1177         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1178                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1179                                                 DMA_TLB_DSI_FLUSH);
1180         else
1181                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1182                                                 DMA_TLB_PSI_FLUSH);
1183
1184         /*
1185          * In caching mode, changes of pages from non-present to present require
1186          * flush. However, device IOTLB doesn't need to be flushed in this case.
1187          */
1188         if (!cap_caching_mode(iommu->cap) || !map)
1189                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1190 }
1191
1192 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1193 {
1194         u32 pmen;
1195         unsigned long flags;
1196
1197         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1198         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1199         pmen &= ~DMA_PMEN_EPM;
1200         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1201
1202         /* wait for the protected region status bit to clear */
1203         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1204                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1205
1206         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1207 }
1208
1209 static int iommu_enable_translation(struct intel_iommu *iommu)
1210 {
1211         u32 sts;
1212         unsigned long flags;
1213
1214         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1215         iommu->gcmd |= DMA_GCMD_TE;
1216         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1217
1218         /* Make sure hardware complete it */
1219         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1220                       readl, (sts & DMA_GSTS_TES), sts);
1221
1222         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1223         return 0;
1224 }
1225
1226 static int iommu_disable_translation(struct intel_iommu *iommu)
1227 {
1228         u32 sts;
1229         unsigned long flag;
1230
1231         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1232         iommu->gcmd &= ~DMA_GCMD_TE;
1233         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1234
1235         /* Make sure hardware complete it */
1236         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1237                       readl, (!(sts & DMA_GSTS_TES)), sts);
1238
1239         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1240         return 0;
1241 }
1242
1243
1244 static int iommu_init_domains(struct intel_iommu *iommu)
1245 {
1246         unsigned long ndomains;
1247         unsigned long nlongs;
1248
1249         ndomains = cap_ndoms(iommu->cap);
1250         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1251                         ndomains);
1252         nlongs = BITS_TO_LONGS(ndomains);
1253
1254         spin_lock_init(&iommu->lock);
1255
1256         /* TBD: there might be 64K domains,
1257          * consider other allocation for future chip
1258          */
1259         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1260         if (!iommu->domain_ids) {
1261                 printk(KERN_ERR "Allocating domain id array failed\n");
1262                 return -ENOMEM;
1263         }
1264         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1265                         GFP_KERNEL);
1266         if (!iommu->domains) {
1267                 printk(KERN_ERR "Allocating domain array failed\n");
1268                 return -ENOMEM;
1269         }
1270
1271         /*
1272          * if Caching mode is set, then invalid translations are tagged
1273          * with domainid 0. Hence we need to pre-allocate it.
1274          */
1275         if (cap_caching_mode(iommu->cap))
1276                 set_bit(0, iommu->domain_ids);
1277         return 0;
1278 }
1279
1280
1281 static void domain_exit(struct dmar_domain *domain);
1282 static void vm_domain_exit(struct dmar_domain *domain);
1283
1284 void free_dmar_iommu(struct intel_iommu *iommu)
1285 {
1286         struct dmar_domain *domain;
1287         int i;
1288         unsigned long flags;
1289
1290         if ((iommu->domains) && (iommu->domain_ids)) {
1291                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1292                         domain = iommu->domains[i];
1293                         clear_bit(i, iommu->domain_ids);
1294
1295                         spin_lock_irqsave(&domain->iommu_lock, flags);
1296                         if (--domain->iommu_count == 0) {
1297                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1298                                         vm_domain_exit(domain);
1299                                 else
1300                                         domain_exit(domain);
1301                         }
1302                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1303                 }
1304         }
1305
1306         if (iommu->gcmd & DMA_GCMD_TE)
1307                 iommu_disable_translation(iommu);
1308
1309         if (iommu->irq) {
1310                 irq_set_handler_data(iommu->irq, NULL);
1311                 /* This will mask the irq */
1312                 free_irq(iommu->irq, iommu);
1313                 destroy_irq(iommu->irq);
1314         }
1315
1316         kfree(iommu->domains);
1317         kfree(iommu->domain_ids);
1318
1319         g_iommus[iommu->seq_id] = NULL;
1320
1321         /* if all iommus are freed, free g_iommus */
1322         for (i = 0; i < g_num_of_iommus; i++) {
1323                 if (g_iommus[i])
1324                         break;
1325         }
1326
1327         if (i == g_num_of_iommus)
1328                 kfree(g_iommus);
1329
1330         /* free context mapping */
1331         free_context_table(iommu);
1332 }
1333
1334 static struct dmar_domain *alloc_domain(void)
1335 {
1336         struct dmar_domain *domain;
1337
1338         domain = alloc_domain_mem();
1339         if (!domain)
1340                 return NULL;
1341
1342         domain->nid = -1;
1343         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1344         domain->flags = 0;
1345
1346         return domain;
1347 }
1348
1349 static int iommu_attach_domain(struct dmar_domain *domain,
1350                                struct intel_iommu *iommu)
1351 {
1352         int num;
1353         unsigned long ndomains;
1354         unsigned long flags;
1355
1356         ndomains = cap_ndoms(iommu->cap);
1357
1358         spin_lock_irqsave(&iommu->lock, flags);
1359
1360         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1361         if (num >= ndomains) {
1362                 spin_unlock_irqrestore(&iommu->lock, flags);
1363                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1364                 return -ENOMEM;
1365         }
1366
1367         domain->id = num;
1368         set_bit(num, iommu->domain_ids);
1369         set_bit(iommu->seq_id, domain->iommu_bmp);
1370         iommu->domains[num] = domain;
1371         spin_unlock_irqrestore(&iommu->lock, flags);
1372
1373         return 0;
1374 }
1375
1376 static void iommu_detach_domain(struct dmar_domain *domain,
1377                                 struct intel_iommu *iommu)
1378 {
1379         unsigned long flags;
1380         int num, ndomains;
1381         int found = 0;
1382
1383         spin_lock_irqsave(&iommu->lock, flags);
1384         ndomains = cap_ndoms(iommu->cap);
1385         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1386                 if (iommu->domains[num] == domain) {
1387                         found = 1;
1388                         break;
1389                 }
1390         }
1391
1392         if (found) {
1393                 clear_bit(num, iommu->domain_ids);
1394                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1395                 iommu->domains[num] = NULL;
1396         }
1397         spin_unlock_irqrestore(&iommu->lock, flags);
1398 }
1399
1400 static struct iova_domain reserved_iova_list;
1401 static struct lock_class_key reserved_rbtree_key;
1402
1403 static int dmar_init_reserved_ranges(void)
1404 {
1405         struct pci_dev *pdev = NULL;
1406         struct iova *iova;
1407         int i;
1408
1409         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1410
1411         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1412                 &reserved_rbtree_key);
1413
1414         /* IOAPIC ranges shouldn't be accessed by DMA */
1415         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1416                 IOVA_PFN(IOAPIC_RANGE_END));
1417         if (!iova) {
1418                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1419                 return -ENODEV;
1420         }
1421
1422         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1423         for_each_pci_dev(pdev) {
1424                 struct resource *r;
1425
1426                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1427                         r = &pdev->resource[i];
1428                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1429                                 continue;
1430                         iova = reserve_iova(&reserved_iova_list,
1431                                             IOVA_PFN(r->start),
1432                                             IOVA_PFN(r->end));
1433                         if (!iova) {
1434                                 printk(KERN_ERR "Reserve iova failed\n");
1435                                 return -ENODEV;
1436                         }
1437                 }
1438         }
1439         return 0;
1440 }
1441
1442 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1443 {
1444         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1445 }
1446
1447 static inline int guestwidth_to_adjustwidth(int gaw)
1448 {
1449         int agaw;
1450         int r = (gaw - 12) % 9;
1451
1452         if (r == 0)
1453                 agaw = gaw;
1454         else
1455                 agaw = gaw + 9 - r;
1456         if (agaw > 64)
1457                 agaw = 64;
1458         return agaw;
1459 }
1460
1461 static int domain_init(struct dmar_domain *domain, int guest_width)
1462 {
1463         struct intel_iommu *iommu;
1464         int adjust_width, agaw;
1465         unsigned long sagaw;
1466
1467         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1468         spin_lock_init(&domain->iommu_lock);
1469
1470         domain_reserve_special_ranges(domain);
1471
1472         /* calculate AGAW */
1473         iommu = domain_get_iommu(domain);
1474         if (guest_width > cap_mgaw(iommu->cap))
1475                 guest_width = cap_mgaw(iommu->cap);
1476         domain->gaw = guest_width;
1477         adjust_width = guestwidth_to_adjustwidth(guest_width);
1478         agaw = width_to_agaw(adjust_width);
1479         sagaw = cap_sagaw(iommu->cap);
1480         if (!test_bit(agaw, &sagaw)) {
1481                 /* hardware doesn't support it, choose a bigger one */
1482                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1483                 agaw = find_next_bit(&sagaw, 5, agaw);
1484                 if (agaw >= 5)
1485                         return -ENODEV;
1486         }
1487         domain->agaw = agaw;
1488         INIT_LIST_HEAD(&domain->devices);
1489
1490         if (ecap_coherent(iommu->ecap))
1491                 domain->iommu_coherency = 1;
1492         else
1493                 domain->iommu_coherency = 0;
1494
1495         if (ecap_sc_support(iommu->ecap))
1496                 domain->iommu_snooping = 1;
1497         else
1498                 domain->iommu_snooping = 0;
1499
1500         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1501         domain->iommu_count = 1;
1502         domain->nid = iommu->node;
1503
1504         /* always allocate the top pgd */
1505         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1506         if (!domain->pgd)
1507                 return -ENOMEM;
1508         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1509         return 0;
1510 }
1511
1512 static void domain_exit(struct dmar_domain *domain)
1513 {
1514         struct dmar_drhd_unit *drhd;
1515         struct intel_iommu *iommu;
1516
1517         /* Domain 0 is reserved, so dont process it */
1518         if (!domain)
1519                 return;
1520
1521         /* Flush any lazy unmaps that may reference this domain */
1522         if (!intel_iommu_strict)
1523                 flush_unmaps_timeout(0);
1524
1525         domain_remove_dev_info(domain);
1526         /* destroy iovas */
1527         put_iova_domain(&domain->iovad);
1528
1529         /* clear ptes */
1530         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1531
1532         /* free page tables */
1533         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1534
1535         for_each_active_iommu(iommu, drhd)
1536                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1537                         iommu_detach_domain(domain, iommu);
1538
1539         free_domain_mem(domain);
1540 }
1541
1542 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1543                                  u8 bus, u8 devfn, int translation)
1544 {
1545         struct context_entry *context;
1546         unsigned long flags;
1547         struct intel_iommu *iommu;
1548         struct dma_pte *pgd;
1549         unsigned long num;
1550         unsigned long ndomains;
1551         int id;
1552         int agaw;
1553         struct device_domain_info *info = NULL;
1554
1555         pr_debug("Set context mapping for %02x:%02x.%d\n",
1556                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1557
1558         BUG_ON(!domain->pgd);
1559         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1560                translation != CONTEXT_TT_MULTI_LEVEL);
1561
1562         iommu = device_to_iommu(segment, bus, devfn);
1563         if (!iommu)
1564                 return -ENODEV;
1565
1566         context = device_to_context_entry(iommu, bus, devfn);
1567         if (!context)
1568                 return -ENOMEM;
1569         spin_lock_irqsave(&iommu->lock, flags);
1570         if (context_present(context)) {
1571                 spin_unlock_irqrestore(&iommu->lock, flags);
1572                 return 0;
1573         }
1574
1575         id = domain->id;
1576         pgd = domain->pgd;
1577
1578         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1579             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1580                 int found = 0;
1581
1582                 /* find an available domain id for this device in iommu */
1583                 ndomains = cap_ndoms(iommu->cap);
1584                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1585                         if (iommu->domains[num] == domain) {
1586                                 id = num;
1587                                 found = 1;
1588                                 break;
1589                         }
1590                 }
1591
1592                 if (found == 0) {
1593                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1594                         if (num >= ndomains) {
1595                                 spin_unlock_irqrestore(&iommu->lock, flags);
1596                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1597                                 return -EFAULT;
1598                         }
1599
1600                         set_bit(num, iommu->domain_ids);
1601                         iommu->domains[num] = domain;
1602                         id = num;
1603                 }
1604
1605                 /* Skip top levels of page tables for
1606                  * iommu which has less agaw than default.
1607                  * Unnecessary for PT mode.
1608                  */
1609                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1610                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1611                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1612                                 if (!dma_pte_present(pgd)) {
1613                                         spin_unlock_irqrestore(&iommu->lock, flags);
1614                                         return -ENOMEM;
1615                                 }
1616                         }
1617                 }
1618         }
1619
1620         context_set_domain_id(context, id);
1621
1622         if (translation != CONTEXT_TT_PASS_THROUGH) {
1623                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1624                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1625                                      CONTEXT_TT_MULTI_LEVEL;
1626         }
1627         /*
1628          * In pass through mode, AW must be programmed to indicate the largest
1629          * AGAW value supported by hardware. And ASR is ignored by hardware.
1630          */
1631         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1632                 context_set_address_width(context, iommu->msagaw);
1633         else {
1634                 context_set_address_root(context, virt_to_phys(pgd));
1635                 context_set_address_width(context, iommu->agaw);
1636         }
1637
1638         context_set_translation_type(context, translation);
1639         context_set_fault_enable(context);
1640         context_set_present(context);
1641         domain_flush_cache(domain, context, sizeof(*context));
1642
1643         /*
1644          * It's a non-present to present mapping. If hardware doesn't cache
1645          * non-present entry we only need to flush the write-buffer. If the
1646          * _does_ cache non-present entries, then it does so in the special
1647          * domain #0, which we have to flush:
1648          */
1649         if (cap_caching_mode(iommu->cap)) {
1650                 iommu->flush.flush_context(iommu, 0,
1651                                            (((u16)bus) << 8) | devfn,
1652                                            DMA_CCMD_MASK_NOBIT,
1653                                            DMA_CCMD_DEVICE_INVL);
1654                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1655         } else {
1656                 iommu_flush_write_buffer(iommu);
1657         }
1658         iommu_enable_dev_iotlb(info);
1659         spin_unlock_irqrestore(&iommu->lock, flags);
1660
1661         spin_lock_irqsave(&domain->iommu_lock, flags);
1662         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1663                 domain->iommu_count++;
1664                 if (domain->iommu_count == 1)
1665                         domain->nid = iommu->node;
1666                 domain_update_iommu_cap(domain);
1667         }
1668         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1669         return 0;
1670 }
1671
1672 static int
1673 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1674                         int translation)
1675 {
1676         int ret;
1677         struct pci_dev *tmp, *parent;
1678
1679         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1680                                          pdev->bus->number, pdev->devfn,
1681                                          translation);
1682         if (ret)
1683                 return ret;
1684
1685         /* dependent device mapping */
1686         tmp = pci_find_upstream_pcie_bridge(pdev);
1687         if (!tmp)
1688                 return 0;
1689         /* Secondary interface's bus number and devfn 0 */
1690         parent = pdev->bus->self;
1691         while (parent != tmp) {
1692                 ret = domain_context_mapping_one(domain,
1693                                                  pci_domain_nr(parent->bus),
1694                                                  parent->bus->number,
1695                                                  parent->devfn, translation);
1696                 if (ret)
1697                         return ret;
1698                 parent = parent->bus->self;
1699         }
1700         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1701                 return domain_context_mapping_one(domain,
1702                                         pci_domain_nr(tmp->subordinate),
1703                                         tmp->subordinate->number, 0,
1704                                         translation);
1705         else /* this is a legacy PCI bridge */
1706                 return domain_context_mapping_one(domain,
1707                                                   pci_domain_nr(tmp->bus),
1708                                                   tmp->bus->number,
1709                                                   tmp->devfn,
1710                                                   translation);
1711 }
1712
1713 static int domain_context_mapped(struct pci_dev *pdev)
1714 {
1715         int ret;
1716         struct pci_dev *tmp, *parent;
1717         struct intel_iommu *iommu;
1718
1719         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1720                                 pdev->devfn);
1721         if (!iommu)
1722                 return -ENODEV;
1723
1724         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1725         if (!ret)
1726                 return ret;
1727         /* dependent device mapping */
1728         tmp = pci_find_upstream_pcie_bridge(pdev);
1729         if (!tmp)
1730                 return ret;
1731         /* Secondary interface's bus number and devfn 0 */
1732         parent = pdev->bus->self;
1733         while (parent != tmp) {
1734                 ret = device_context_mapped(iommu, parent->bus->number,
1735                                             parent->devfn);
1736                 if (!ret)
1737                         return ret;
1738                 parent = parent->bus->self;
1739         }
1740         if (pci_is_pcie(tmp))
1741                 return device_context_mapped(iommu, tmp->subordinate->number,
1742                                              0);
1743         else
1744                 return device_context_mapped(iommu, tmp->bus->number,
1745                                              tmp->devfn);
1746 }
1747
1748 /* Returns a number of VTD pages, but aligned to MM page size */
1749 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1750                                             size_t size)
1751 {
1752         host_addr &= ~PAGE_MASK;
1753         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1754 }
1755
1756 /* Return largest possible superpage level for a given mapping */
1757 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1758                                           unsigned long iov_pfn,
1759                                           unsigned long phy_pfn,
1760                                           unsigned long pages)
1761 {
1762         int support, level = 1;
1763         unsigned long pfnmerge;
1764
1765         support = domain->iommu_superpage;
1766
1767         /* To use a large page, the virtual *and* physical addresses
1768            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1769            of them will mean we have to use smaller pages. So just
1770            merge them and check both at once. */
1771         pfnmerge = iov_pfn | phy_pfn;
1772
1773         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1774                 pages >>= VTD_STRIDE_SHIFT;
1775                 if (!pages)
1776                         break;
1777                 pfnmerge >>= VTD_STRIDE_SHIFT;
1778                 level++;
1779                 support--;
1780         }
1781         return level;
1782 }
1783
1784 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1785                             struct scatterlist *sg, unsigned long phys_pfn,
1786                             unsigned long nr_pages, int prot)
1787 {
1788         struct dma_pte *first_pte = NULL, *pte = NULL;
1789         phys_addr_t uninitialized_var(pteval);
1790         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1791         unsigned long sg_res;
1792         unsigned int largepage_lvl = 0;
1793         unsigned long lvl_pages = 0;
1794
1795         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1796
1797         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1798                 return -EINVAL;
1799
1800         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1801
1802         if (sg)
1803                 sg_res = 0;
1804         else {
1805                 sg_res = nr_pages + 1;
1806                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1807         }
1808
1809         while (nr_pages > 0) {
1810                 uint64_t tmp;
1811
1812                 if (!sg_res) {
1813                         sg_res = aligned_nrpages(sg->offset, sg->length);
1814                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1815                         sg->dma_length = sg->length;
1816                         pteval = page_to_phys(sg_page(sg)) | prot;
1817                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1818                 }
1819
1820                 if (!pte) {
1821                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1822
1823                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1824                         if (!pte)
1825                                 return -ENOMEM;
1826                         /* It is large page*/
1827                         if (largepage_lvl > 1)
1828                                 pteval |= DMA_PTE_LARGE_PAGE;
1829                         else
1830                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1831
1832                 }
1833                 /* We don't need lock here, nobody else
1834                  * touches the iova range
1835                  */
1836                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1837                 if (tmp) {
1838                         static int dumps = 5;
1839                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1840                                iov_pfn, tmp, (unsigned long long)pteval);
1841                         if (dumps) {
1842                                 dumps--;
1843                                 debug_dma_dump_mappings(NULL);
1844                         }
1845                         WARN_ON(1);
1846                 }
1847
1848                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1849
1850                 BUG_ON(nr_pages < lvl_pages);
1851                 BUG_ON(sg_res < lvl_pages);
1852
1853                 nr_pages -= lvl_pages;
1854                 iov_pfn += lvl_pages;
1855                 phys_pfn += lvl_pages;
1856                 pteval += lvl_pages * VTD_PAGE_SIZE;
1857                 sg_res -= lvl_pages;
1858
1859                 /* If the next PTE would be the first in a new page, then we
1860                    need to flush the cache on the entries we've just written.
1861                    And then we'll need to recalculate 'pte', so clear it and
1862                    let it get set again in the if (!pte) block above.
1863
1864                    If we're done (!nr_pages) we need to flush the cache too.
1865
1866                    Also if we've been setting superpages, we may need to
1867                    recalculate 'pte' and switch back to smaller pages for the
1868                    end of the mapping, if the trailing size is not enough to
1869                    use another superpage (i.e. sg_res < lvl_pages). */
1870                 pte++;
1871                 if (!nr_pages || first_pte_in_page(pte) ||
1872                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1873                         domain_flush_cache(domain, first_pte,
1874                                            (void *)pte - (void *)first_pte);
1875                         pte = NULL;
1876                 }
1877
1878                 if (!sg_res && nr_pages)
1879                         sg = sg_next(sg);
1880         }
1881         return 0;
1882 }
1883
1884 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1885                                     struct scatterlist *sg, unsigned long nr_pages,
1886                                     int prot)
1887 {
1888         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1889 }
1890
1891 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1892                                      unsigned long phys_pfn, unsigned long nr_pages,
1893                                      int prot)
1894 {
1895         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1896 }
1897
1898 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1899 {
1900         if (!iommu)
1901                 return;
1902
1903         clear_context_table(iommu, bus, devfn);
1904         iommu->flush.flush_context(iommu, 0, 0, 0,
1905                                            DMA_CCMD_GLOBAL_INVL);
1906         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1907 }
1908
1909 static void domain_remove_dev_info(struct dmar_domain *domain)
1910 {
1911         struct device_domain_info *info;
1912         unsigned long flags;
1913         struct intel_iommu *iommu;
1914
1915         spin_lock_irqsave(&device_domain_lock, flags);
1916         while (!list_empty(&domain->devices)) {
1917                 info = list_entry(domain->devices.next,
1918                         struct device_domain_info, link);
1919                 list_del(&info->link);
1920                 list_del(&info->global);
1921                 if (info->dev)
1922                         info->dev->dev.archdata.iommu = NULL;
1923                 spin_unlock_irqrestore(&device_domain_lock, flags);
1924
1925                 iommu_disable_dev_iotlb(info);
1926                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1927                 iommu_detach_dev(iommu, info->bus, info->devfn);
1928                 free_devinfo_mem(info);
1929
1930                 spin_lock_irqsave(&device_domain_lock, flags);
1931         }
1932         spin_unlock_irqrestore(&device_domain_lock, flags);
1933 }
1934
1935 /*
1936  * find_domain
1937  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1938  */
1939 static struct dmar_domain *
1940 find_domain(struct pci_dev *pdev)
1941 {
1942         struct device_domain_info *info;
1943
1944         /* No lock here, assumes no domain exit in normal case */
1945         info = pdev->dev.archdata.iommu;
1946         if (info)
1947                 return info->domain;
1948         return NULL;
1949 }
1950
1951 /* domain is initialized */
1952 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1953 {
1954         struct dmar_domain *domain, *found = NULL;
1955         struct intel_iommu *iommu;
1956         struct dmar_drhd_unit *drhd;
1957         struct device_domain_info *info, *tmp;
1958         struct pci_dev *dev_tmp;
1959         unsigned long flags;
1960         int bus = 0, devfn = 0;
1961         int segment;
1962         int ret;
1963
1964         domain = find_domain(pdev);
1965         if (domain)
1966                 return domain;
1967
1968         segment = pci_domain_nr(pdev->bus);
1969
1970         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1971         if (dev_tmp) {
1972                 if (pci_is_pcie(dev_tmp)) {
1973                         bus = dev_tmp->subordinate->number;
1974                         devfn = 0;
1975                 } else {
1976                         bus = dev_tmp->bus->number;
1977                         devfn = dev_tmp->devfn;
1978                 }
1979                 spin_lock_irqsave(&device_domain_lock, flags);
1980                 list_for_each_entry(info, &device_domain_list, global) {
1981                         if (info->segment == segment &&
1982                             info->bus == bus && info->devfn == devfn) {
1983                                 found = info->domain;
1984                                 break;
1985                         }
1986                 }
1987                 spin_unlock_irqrestore(&device_domain_lock, flags);
1988                 /* pcie-pci bridge already has a domain, uses it */
1989                 if (found) {
1990                         domain = found;
1991                         goto found_domain;
1992                 }
1993         }
1994
1995         domain = alloc_domain();
1996         if (!domain)
1997                 goto error;
1998
1999         /* Allocate new domain for the device */
2000         drhd = dmar_find_matched_drhd_unit(pdev);
2001         if (!drhd) {
2002                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2003                         pci_name(pdev));
2004                 return NULL;
2005         }
2006         iommu = drhd->iommu;
2007
2008         ret = iommu_attach_domain(domain, iommu);
2009         if (ret) {
2010                 free_domain_mem(domain);
2011                 goto error;
2012         }
2013
2014         if (domain_init(domain, gaw)) {
2015                 domain_exit(domain);
2016                 goto error;
2017         }
2018
2019         /* register pcie-to-pci device */
2020         if (dev_tmp) {
2021                 info = alloc_devinfo_mem();
2022                 if (!info) {
2023                         domain_exit(domain);
2024                         goto error;
2025                 }
2026                 info->segment = segment;
2027                 info->bus = bus;
2028                 info->devfn = devfn;
2029                 info->dev = NULL;
2030                 info->domain = domain;
2031                 /* This domain is shared by devices under p2p bridge */
2032                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2033
2034                 /* pcie-to-pci bridge already has a domain, uses it */
2035                 found = NULL;
2036                 spin_lock_irqsave(&device_domain_lock, flags);
2037                 list_for_each_entry(tmp, &device_domain_list, global) {
2038                         if (tmp->segment == segment &&
2039                             tmp->bus == bus && tmp->devfn == devfn) {
2040                                 found = tmp->domain;
2041                                 break;
2042                         }
2043                 }
2044                 if (found) {
2045                         spin_unlock_irqrestore(&device_domain_lock, flags);
2046                         free_devinfo_mem(info);
2047                         domain_exit(domain);
2048                         domain = found;
2049                 } else {
2050                         list_add(&info->link, &domain->devices);
2051                         list_add(&info->global, &device_domain_list);
2052                         spin_unlock_irqrestore(&device_domain_lock, flags);
2053                 }
2054         }
2055
2056 found_domain:
2057         info = alloc_devinfo_mem();
2058         if (!info)
2059                 goto error;
2060         info->segment = segment;
2061         info->bus = pdev->bus->number;
2062         info->devfn = pdev->devfn;
2063         info->dev = pdev;
2064         info->domain = domain;
2065         spin_lock_irqsave(&device_domain_lock, flags);
2066         /* somebody is fast */
2067         found = find_domain(pdev);
2068         if (found != NULL) {
2069                 spin_unlock_irqrestore(&device_domain_lock, flags);
2070                 if (found != domain) {
2071                         domain_exit(domain);
2072                         domain = found;
2073                 }
2074                 free_devinfo_mem(info);
2075                 return domain;
2076         }
2077         list_add(&info->link, &domain->devices);
2078         list_add(&info->global, &device_domain_list);
2079         pdev->dev.archdata.iommu = info;
2080         spin_unlock_irqrestore(&device_domain_lock, flags);
2081         return domain;
2082 error:
2083         /* recheck it here, maybe others set it */
2084         return find_domain(pdev);
2085 }
2086
2087 static int iommu_identity_mapping;
2088 #define IDENTMAP_ALL            1
2089 #define IDENTMAP_GFX            2
2090 #define IDENTMAP_AZALIA         4
2091
2092 static int iommu_domain_identity_map(struct dmar_domain *domain,
2093                                      unsigned long long start,
2094                                      unsigned long long end)
2095 {
2096         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2097         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2098
2099         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2100                           dma_to_mm_pfn(last_vpfn))) {
2101                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2102                 return -ENOMEM;
2103         }
2104
2105         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2106                  start, end, domain->id);
2107         /*
2108          * RMRR range might have overlap with physical memory range,
2109          * clear it first
2110          */
2111         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2112
2113         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2114                                   last_vpfn - first_vpfn + 1,
2115                                   DMA_PTE_READ|DMA_PTE_WRITE);
2116 }
2117
2118 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2119                                       unsigned long long start,
2120                                       unsigned long long end)
2121 {
2122         struct dmar_domain *domain;
2123         int ret;
2124
2125         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2126         if (!domain)
2127                 return -ENOMEM;
2128
2129         /* For _hardware_ passthrough, don't bother. But for software
2130            passthrough, we do it anyway -- it may indicate a memory
2131            range which is reserved in E820, so which didn't get set
2132            up to start with in si_domain */
2133         if (domain == si_domain && hw_pass_through) {
2134                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2135                        pci_name(pdev), start, end);
2136                 return 0;
2137         }
2138
2139         printk(KERN_INFO
2140                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2141                pci_name(pdev), start, end);
2142         
2143         if (end < start) {
2144                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2145                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2146                         dmi_get_system_info(DMI_BIOS_VENDOR),
2147                         dmi_get_system_info(DMI_BIOS_VERSION),
2148                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2149                 ret = -EIO;
2150                 goto error;
2151         }
2152
2153         if (end >> agaw_to_width(domain->agaw)) {
2154                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2155                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156                      agaw_to_width(domain->agaw),
2157                      dmi_get_system_info(DMI_BIOS_VENDOR),
2158                      dmi_get_system_info(DMI_BIOS_VERSION),
2159                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2160                 ret = -EIO;
2161                 goto error;
2162         }
2163
2164         ret = iommu_domain_identity_map(domain, start, end);
2165         if (ret)
2166                 goto error;
2167
2168         /* context entry init */
2169         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2170         if (ret)
2171                 goto error;
2172
2173         return 0;
2174
2175  error:
2176         domain_exit(domain);
2177         return ret;
2178 }
2179
2180 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2181         struct pci_dev *pdev)
2182 {
2183         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2184                 return 0;
2185         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2186                 rmrr->end_address);
2187 }
2188
2189 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2190 static inline void iommu_prepare_isa(void)
2191 {
2192         struct pci_dev *pdev;
2193         int ret;
2194
2195         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2196         if (!pdev)
2197                 return;
2198
2199         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2200         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2201
2202         if (ret)
2203                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2204                        "floppy might not work\n");
2205
2206 }
2207 #else
2208 static inline void iommu_prepare_isa(void)
2209 {
2210         return;
2211 }
2212 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
2213
2214 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2215
2216 static int __init si_domain_init(int hw)
2217 {
2218         struct dmar_drhd_unit *drhd;
2219         struct intel_iommu *iommu;
2220         int nid, ret = 0;
2221
2222         si_domain = alloc_domain();
2223         if (!si_domain)
2224                 return -EFAULT;
2225
2226         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2227
2228         for_each_active_iommu(iommu, drhd) {
2229                 ret = iommu_attach_domain(si_domain, iommu);
2230                 if (ret) {
2231                         domain_exit(si_domain);
2232                         return -EFAULT;
2233                 }
2234         }
2235
2236         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2237                 domain_exit(si_domain);
2238                 return -EFAULT;
2239         }
2240
2241         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2242
2243         if (hw)
2244                 return 0;
2245
2246         for_each_online_node(nid) {
2247                 unsigned long start_pfn, end_pfn;
2248                 int i;
2249
2250                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2251                         ret = iommu_domain_identity_map(si_domain,
2252                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2253                         if (ret)
2254                                 return ret;
2255                 }
2256         }
2257
2258         return 0;
2259 }
2260
2261 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2262                                           struct pci_dev *pdev);
2263 static int identity_mapping(struct pci_dev *pdev)
2264 {
2265         struct device_domain_info *info;
2266
2267         if (likely(!iommu_identity_mapping))
2268                 return 0;
2269
2270         info = pdev->dev.archdata.iommu;
2271         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2272                 return (info->domain == si_domain);
2273
2274         return 0;
2275 }
2276
2277 static int domain_add_dev_info(struct dmar_domain *domain,
2278                                struct pci_dev *pdev,
2279                                int translation)
2280 {
2281         struct device_domain_info *info;
2282         unsigned long flags;
2283         int ret;
2284
2285         info = alloc_devinfo_mem();
2286         if (!info)
2287                 return -ENOMEM;
2288
2289         ret = domain_context_mapping(domain, pdev, translation);
2290         if (ret) {
2291                 free_devinfo_mem(info);
2292                 return ret;
2293         }
2294
2295         info->segment = pci_domain_nr(pdev->bus);
2296         info->bus = pdev->bus->number;
2297         info->devfn = pdev->devfn;
2298         info->dev = pdev;
2299         info->domain = domain;
2300
2301         spin_lock_irqsave(&device_domain_lock, flags);
2302         list_add(&info->link, &domain->devices);
2303         list_add(&info->global, &device_domain_list);
2304         pdev->dev.archdata.iommu = info;
2305         spin_unlock_irqrestore(&device_domain_lock, flags);
2306
2307         return 0;
2308 }
2309
2310 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2311 {
2312         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2313                 return 1;
2314
2315         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2316                 return 1;
2317
2318         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2319                 return 0;
2320
2321         /*
2322          * We want to start off with all devices in the 1:1 domain, and
2323          * take them out later if we find they can't access all of memory.
2324          *
2325          * However, we can't do this for PCI devices behind bridges,
2326          * because all PCI devices behind the same bridge will end up
2327          * with the same source-id on their transactions.
2328          *
2329          * Practically speaking, we can't change things around for these
2330          * devices at run-time, because we can't be sure there'll be no
2331          * DMA transactions in flight for any of their siblings.
2332          * 
2333          * So PCI devices (unless they're on the root bus) as well as
2334          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2335          * the 1:1 domain, just in _case_ one of their siblings turns out
2336          * not to be able to map all of memory.
2337          */
2338         if (!pci_is_pcie(pdev)) {
2339                 if (!pci_is_root_bus(pdev->bus))
2340                         return 0;
2341                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2342                         return 0;
2343         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2344                 return 0;
2345
2346         /* 
2347          * At boot time, we don't yet know if devices will be 64-bit capable.
2348          * Assume that they will -- if they turn out not to be, then we can 
2349          * take them out of the 1:1 domain later.
2350          */
2351         if (!startup) {
2352                 /*
2353                  * If the device's dma_mask is less than the system's memory
2354                  * size then this is not a candidate for identity mapping.
2355                  */
2356                 u64 dma_mask = pdev->dma_mask;
2357
2358                 if (pdev->dev.coherent_dma_mask &&
2359                     pdev->dev.coherent_dma_mask < dma_mask)
2360                         dma_mask = pdev->dev.coherent_dma_mask;
2361
2362                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2363         }
2364
2365         return 1;
2366 }
2367
2368 static int __init iommu_prepare_static_identity_mapping(int hw)
2369 {
2370         struct pci_dev *pdev = NULL;
2371         int ret;
2372
2373         ret = si_domain_init(hw);
2374         if (ret)
2375                 return -EFAULT;
2376
2377         for_each_pci_dev(pdev) {
2378                 if (iommu_should_identity_map(pdev, 1)) {
2379                         ret = domain_add_dev_info(si_domain, pdev,
2380                                              hw ? CONTEXT_TT_PASS_THROUGH :
2381                                                   CONTEXT_TT_MULTI_LEVEL);
2382                         if (ret) {
2383                                 /* device not associated with an iommu */
2384                                 if (ret == -ENODEV)
2385                                         continue;
2386                                 return ret;
2387                         }
2388                         pr_info("IOMMU: %s identity mapping for device %s\n",
2389                                 hw ? "hardware" : "software", pci_name(pdev));
2390                 }
2391         }
2392
2393         return 0;
2394 }
2395
2396 static int __init init_dmars(void)
2397 {
2398         struct dmar_drhd_unit *drhd;
2399         struct dmar_rmrr_unit *rmrr;
2400         struct pci_dev *pdev;
2401         struct intel_iommu *iommu;
2402         int i, ret;
2403
2404         /*
2405          * for each drhd
2406          *    allocate root
2407          *    initialize and program root entry to not present
2408          * endfor
2409          */
2410         for_each_drhd_unit(drhd) {
2411                 /*
2412                  * lock not needed as this is only incremented in the single
2413                  * threaded kernel __init code path all other access are read
2414                  * only
2415                  */
2416                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2417                         g_num_of_iommus++;
2418                         continue;
2419                 }
2420                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2421                           IOMMU_UNITS_SUPPORTED);
2422         }
2423
2424         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2425                         GFP_KERNEL);
2426         if (!g_iommus) {
2427                 printk(KERN_ERR "Allocating global iommu array failed\n");
2428                 ret = -ENOMEM;
2429                 goto error;
2430         }
2431
2432         deferred_flush = kzalloc(g_num_of_iommus *
2433                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2434         if (!deferred_flush) {
2435                 ret = -ENOMEM;
2436                 goto error;
2437         }
2438
2439         for_each_drhd_unit(drhd) {
2440                 if (drhd->ignored)
2441                         continue;
2442
2443                 iommu = drhd->iommu;
2444                 g_iommus[iommu->seq_id] = iommu;
2445
2446                 ret = iommu_init_domains(iommu);
2447                 if (ret)
2448                         goto error;
2449
2450                 /*
2451                  * TBD:
2452                  * we could share the same root & context tables
2453                  * among all IOMMU's. Need to Split it later.
2454                  */
2455                 ret = iommu_alloc_root_entry(iommu);
2456                 if (ret) {
2457                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2458                         goto error;
2459                 }
2460                 if (!ecap_pass_through(iommu->ecap))
2461                         hw_pass_through = 0;
2462         }
2463
2464         /*
2465          * Start from the sane iommu hardware state.
2466          */
2467         for_each_drhd_unit(drhd) {
2468                 if (drhd->ignored)
2469                         continue;
2470
2471                 iommu = drhd->iommu;
2472
2473                 /*
2474                  * If the queued invalidation is already initialized by us
2475                  * (for example, while enabling interrupt-remapping) then
2476                  * we got the things already rolling from a sane state.
2477                  */
2478                 if (iommu->qi)
2479                         continue;
2480
2481                 /*
2482                  * Clear any previous faults.
2483                  */
2484                 dmar_fault(-1, iommu);
2485                 /*
2486                  * Disable queued invalidation if supported and already enabled
2487                  * before OS handover.
2488                  */
2489                 dmar_disable_qi(iommu);
2490         }
2491
2492         for_each_drhd_unit(drhd) {
2493                 if (drhd->ignored)
2494                         continue;
2495
2496                 iommu = drhd->iommu;
2497
2498                 if (dmar_enable_qi(iommu)) {
2499                         /*
2500                          * Queued Invalidate not enabled, use Register Based
2501                          * Invalidate
2502                          */
2503                         iommu->flush.flush_context = __iommu_flush_context;
2504                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2505                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2506                                "invalidation\n",
2507                                 iommu->seq_id,
2508                                (unsigned long long)drhd->reg_base_addr);
2509                 } else {
2510                         iommu->flush.flush_context = qi_flush_context;
2511                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2512                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2513                                "invalidation\n",
2514                                 iommu->seq_id,
2515                                (unsigned long long)drhd->reg_base_addr);
2516                 }
2517         }
2518
2519         if (iommu_pass_through)
2520                 iommu_identity_mapping |= IDENTMAP_ALL;
2521
2522 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2523         iommu_identity_mapping |= IDENTMAP_GFX;
2524 #endif
2525
2526         check_tylersburg_isoch();
2527
2528         /*
2529          * If pass through is not set or not enabled, setup context entries for
2530          * identity mappings for rmrr, gfx, and isa and may fall back to static
2531          * identity mapping if iommu_identity_mapping is set.
2532          */
2533         if (iommu_identity_mapping) {
2534                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2535                 if (ret) {
2536                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2537                         goto error;
2538                 }
2539         }
2540         /*
2541          * For each rmrr
2542          *   for each dev attached to rmrr
2543          *   do
2544          *     locate drhd for dev, alloc domain for dev
2545          *     allocate free domain
2546          *     allocate page table entries for rmrr
2547          *     if context not allocated for bus
2548          *           allocate and init context
2549          *           set present in root table for this bus
2550          *     init context with domain, translation etc
2551          *    endfor
2552          * endfor
2553          */
2554         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2555         for_each_rmrr_units(rmrr) {
2556                 for (i = 0; i < rmrr->devices_cnt; i++) {
2557                         pdev = rmrr->devices[i];
2558                         /*
2559                          * some BIOS lists non-exist devices in DMAR
2560                          * table.
2561                          */
2562                         if (!pdev)
2563                                 continue;
2564                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2565                         if (ret)
2566                                 printk(KERN_ERR
2567                                        "IOMMU: mapping reserved region failed\n");
2568                 }
2569         }
2570
2571         iommu_prepare_isa();
2572
2573         /*
2574          * for each drhd
2575          *   enable fault log
2576          *   global invalidate context cache
2577          *   global invalidate iotlb
2578          *   enable translation
2579          */
2580         for_each_drhd_unit(drhd) {
2581                 if (drhd->ignored) {
2582                         /*
2583                          * we always have to disable PMRs or DMA may fail on
2584                          * this device
2585                          */
2586                         if (force_on)
2587                                 iommu_disable_protect_mem_regions(drhd->iommu);
2588                         continue;
2589                 }
2590                 iommu = drhd->iommu;
2591
2592                 iommu_flush_write_buffer(iommu);
2593
2594                 ret = dmar_set_interrupt(iommu);
2595                 if (ret)
2596                         goto error;
2597
2598                 iommu_set_root_entry(iommu);
2599
2600                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2601                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2602
2603                 ret = iommu_enable_translation(iommu);
2604                 if (ret)
2605                         goto error;
2606
2607                 iommu_disable_protect_mem_regions(iommu);
2608         }
2609
2610         return 0;
2611 error:
2612         for_each_drhd_unit(drhd) {
2613                 if (drhd->ignored)
2614                         continue;
2615                 iommu = drhd->iommu;
2616                 free_iommu(iommu);
2617         }
2618         kfree(g_iommus);
2619         return ret;
2620 }
2621
2622 /* This takes a number of _MM_ pages, not VTD pages */
2623 static struct iova *intel_alloc_iova(struct device *dev,
2624                                      struct dmar_domain *domain,
2625                                      unsigned long nrpages, uint64_t dma_mask)
2626 {
2627         struct pci_dev *pdev = to_pci_dev(dev);
2628         struct iova *iova = NULL;
2629
2630         /* Restrict dma_mask to the width that the iommu can handle */
2631         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2632
2633         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2634                 /*
2635                  * First try to allocate an io virtual address in
2636                  * DMA_BIT_MASK(32) and if that fails then try allocating
2637                  * from higher range
2638                  */
2639                 iova = alloc_iova(&domain->iovad, nrpages,
2640                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2641                 if (iova)
2642                         return iova;
2643         }
2644         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2645         if (unlikely(!iova)) {
2646                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2647                        nrpages, pci_name(pdev));
2648                 return NULL;
2649         }
2650
2651         return iova;
2652 }
2653
2654 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2655 {
2656         struct dmar_domain *domain;
2657         int ret;
2658
2659         domain = get_domain_for_dev(pdev,
2660                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2661         if (!domain) {
2662                 printk(KERN_ERR
2663                         "Allocating domain for %s failed", pci_name(pdev));
2664                 return NULL;
2665         }
2666
2667         /* make sure context mapping is ok */
2668         if (unlikely(!domain_context_mapped(pdev))) {
2669                 ret = domain_context_mapping(domain, pdev,
2670                                              CONTEXT_TT_MULTI_LEVEL);
2671                 if (ret) {
2672                         printk(KERN_ERR
2673                                 "Domain context map for %s failed",
2674                                 pci_name(pdev));
2675                         return NULL;
2676                 }
2677         }
2678
2679         return domain;
2680 }
2681
2682 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2683 {
2684         struct device_domain_info *info;
2685
2686         /* No lock here, assumes no domain exit in normal case */
2687         info = dev->dev.archdata.iommu;
2688         if (likely(info))
2689                 return info->domain;
2690
2691         return __get_valid_domain_for_dev(dev);
2692 }
2693
2694 static int iommu_dummy(struct pci_dev *pdev)
2695 {
2696         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2697 }
2698
2699 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2700 static int iommu_no_mapping(struct device *dev)
2701 {
2702         struct pci_dev *pdev;
2703         int found;
2704
2705         if (unlikely(dev->bus != &pci_bus_type))
2706                 return 1;
2707
2708         pdev = to_pci_dev(dev);
2709         if (iommu_dummy(pdev))
2710                 return 1;
2711
2712         if (!iommu_identity_mapping)
2713                 return 0;
2714
2715         found = identity_mapping(pdev);
2716         if (found) {
2717                 if (iommu_should_identity_map(pdev, 0))
2718                         return 1;
2719                 else {
2720                         /*
2721                          * 32 bit DMA is removed from si_domain and fall back
2722                          * to non-identity mapping.
2723                          */
2724                         domain_remove_one_dev_info(si_domain, pdev);
2725                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2726                                pci_name(pdev));
2727                         return 0;
2728                 }
2729         } else {
2730                 /*
2731                  * In case of a detached 64 bit DMA device from vm, the device
2732                  * is put into si_domain for identity mapping.
2733                  */
2734                 if (iommu_should_identity_map(pdev, 0)) {
2735                         int ret;
2736                         ret = domain_add_dev_info(si_domain, pdev,
2737                                                   hw_pass_through ?
2738                                                   CONTEXT_TT_PASS_THROUGH :
2739                                                   CONTEXT_TT_MULTI_LEVEL);
2740                         if (!ret) {
2741                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2742                                        pci_name(pdev));
2743                                 return 1;
2744                         }
2745                 }
2746         }
2747
2748         return 0;
2749 }
2750
2751 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2752                                      size_t size, int dir, u64 dma_mask)
2753 {
2754         struct pci_dev *pdev = to_pci_dev(hwdev);
2755         struct dmar_domain *domain;
2756         phys_addr_t start_paddr;
2757         struct iova *iova;
2758         int prot = 0;
2759         int ret;
2760         struct intel_iommu *iommu;
2761         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2762
2763         BUG_ON(dir == DMA_NONE);
2764
2765         if (iommu_no_mapping(hwdev))
2766                 return paddr;
2767
2768         domain = get_valid_domain_for_dev(pdev);
2769         if (!domain)
2770                 return 0;
2771
2772         iommu = domain_get_iommu(domain);
2773         size = aligned_nrpages(paddr, size);
2774
2775         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2776         if (!iova)
2777                 goto error;
2778
2779         /*
2780          * Check if DMAR supports zero-length reads on write only
2781          * mappings..
2782          */
2783         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2784                         !cap_zlr(iommu->cap))
2785                 prot |= DMA_PTE_READ;
2786         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2787                 prot |= DMA_PTE_WRITE;
2788         /*
2789          * paddr - (paddr + size) might be partial page, we should map the whole
2790          * page.  Note: if two part of one page are separately mapped, we
2791          * might have two guest_addr mapping to the same host paddr, but this
2792          * is not a big problem
2793          */
2794         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2795                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2796         if (ret)
2797                 goto error;
2798
2799         /* it's a non-present to present mapping. Only flush if caching mode */
2800         if (cap_caching_mode(iommu->cap))
2801                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2802         else
2803                 iommu_flush_write_buffer(iommu);
2804
2805         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2806         start_paddr += paddr & ~PAGE_MASK;
2807         return start_paddr;
2808
2809 error:
2810         if (iova)
2811                 __free_iova(&domain->iovad, iova);
2812         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2813                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2814         return 0;
2815 }
2816
2817 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2818                                  unsigned long offset, size_t size,
2819                                  enum dma_data_direction dir,
2820                                  struct dma_attrs *attrs)
2821 {
2822         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2823                                   dir, to_pci_dev(dev)->dma_mask);
2824 }
2825
2826 static void flush_unmaps(void)
2827 {
2828         int i, j;
2829
2830         timer_on = 0;
2831
2832         /* just flush them all */
2833         for (i = 0; i < g_num_of_iommus; i++) {
2834                 struct intel_iommu *iommu = g_iommus[i];
2835                 if (!iommu)
2836                         continue;
2837
2838                 if (!deferred_flush[i].next)
2839                         continue;
2840
2841                 /* In caching mode, global flushes turn emulation expensive */
2842                 if (!cap_caching_mode(iommu->cap))
2843                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2844                                          DMA_TLB_GLOBAL_FLUSH);
2845                 for (j = 0; j < deferred_flush[i].next; j++) {
2846                         unsigned long mask;
2847                         struct iova *iova = deferred_flush[i].iova[j];
2848                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2849
2850                         /* On real hardware multiple invalidations are expensive */
2851                         if (cap_caching_mode(iommu->cap))
2852                                 iommu_flush_iotlb_psi(iommu, domain->id,
2853                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2854                         else {
2855                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2856                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2857                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2858                         }
2859                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2860                 }
2861                 deferred_flush[i].next = 0;
2862         }
2863
2864         list_size = 0;
2865 }
2866
2867 static void flush_unmaps_timeout(unsigned long data)
2868 {
2869         unsigned long flags;
2870
2871         spin_lock_irqsave(&async_umap_flush_lock, flags);
2872         flush_unmaps();
2873         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2874 }
2875
2876 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2877 {
2878         unsigned long flags;
2879         int next, iommu_id;
2880         struct intel_iommu *iommu;
2881
2882         spin_lock_irqsave(&async_umap_flush_lock, flags);
2883         if (list_size == HIGH_WATER_MARK)
2884                 flush_unmaps();
2885
2886         iommu = domain_get_iommu(dom);
2887         iommu_id = iommu->seq_id;
2888
2889         next = deferred_flush[iommu_id].next;
2890         deferred_flush[iommu_id].domain[next] = dom;
2891         deferred_flush[iommu_id].iova[next] = iova;
2892         deferred_flush[iommu_id].next++;
2893
2894         if (!timer_on) {
2895                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2896                 timer_on = 1;
2897         }
2898         list_size++;
2899         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2900 }
2901
2902 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2903                              size_t size, enum dma_data_direction dir,
2904                              struct dma_attrs *attrs)
2905 {
2906         struct pci_dev *pdev = to_pci_dev(dev);
2907         struct dmar_domain *domain;
2908         unsigned long start_pfn, last_pfn;
2909         struct iova *iova;
2910         struct intel_iommu *iommu;
2911
2912         if (iommu_no_mapping(dev))
2913                 return;
2914
2915         domain = find_domain(pdev);
2916         BUG_ON(!domain);
2917
2918         iommu = domain_get_iommu(domain);
2919
2920         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2921         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2922                       (unsigned long long)dev_addr))
2923                 return;
2924
2925         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2926         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2927
2928         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2929                  pci_name(pdev), start_pfn, last_pfn);
2930
2931         /*  clear the whole page */
2932         dma_pte_clear_range(domain, start_pfn, last_pfn);
2933
2934         /* free page tables */
2935         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2936
2937         if (intel_iommu_strict) {
2938                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2939                                       last_pfn - start_pfn + 1, 0);
2940                 /* free iova */
2941                 __free_iova(&domain->iovad, iova);
2942         } else {
2943                 add_unmap(domain, iova);
2944                 /*
2945                  * queue up the release of the unmap to save the 1/6th of the
2946                  * cpu used up by the iotlb flush operation...
2947                  */
2948         }
2949 }
2950
2951 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2952                                   dma_addr_t *dma_handle, gfp_t flags,
2953                                   struct dma_attrs *attrs)
2954 {
2955         void *vaddr;
2956         int order;
2957
2958         size = PAGE_ALIGN(size);
2959         order = get_order(size);
2960
2961         if (!iommu_no_mapping(hwdev))
2962                 flags &= ~(GFP_DMA | GFP_DMA32);
2963         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2964                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2965                         flags |= GFP_DMA;
2966                 else
2967                         flags |= GFP_DMA32;
2968         }
2969
2970         vaddr = (void *)__get_free_pages(flags, order);
2971         if (!vaddr)
2972                 return NULL;
2973         memset(vaddr, 0, size);
2974
2975         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2976                                          DMA_BIDIRECTIONAL,
2977                                          hwdev->coherent_dma_mask);
2978         if (*dma_handle)
2979                 return vaddr;
2980         free_pages((unsigned long)vaddr, order);
2981         return NULL;
2982 }
2983
2984 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2985                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2986 {
2987         int order;
2988
2989         size = PAGE_ALIGN(size);
2990         order = get_order(size);
2991
2992         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2993         free_pages((unsigned long)vaddr, order);
2994 }
2995
2996 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2997                            int nelems, enum dma_data_direction dir,
2998                            struct dma_attrs *attrs)
2999 {
3000         struct pci_dev *pdev = to_pci_dev(hwdev);
3001         struct dmar_domain *domain;
3002         unsigned long start_pfn, last_pfn;
3003         struct iova *iova;
3004         struct intel_iommu *iommu;
3005
3006         if (iommu_no_mapping(hwdev))
3007                 return;
3008
3009         domain = find_domain(pdev);
3010         BUG_ON(!domain);
3011
3012         iommu = domain_get_iommu(domain);
3013
3014         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3015         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3016                       (unsigned long long)sglist[0].dma_address))
3017                 return;
3018
3019         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3020         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3021
3022         /*  clear the whole page */
3023         dma_pte_clear_range(domain, start_pfn, last_pfn);
3024
3025         /* free page tables */
3026         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3027
3028         if (intel_iommu_strict) {
3029                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3030                                       last_pfn - start_pfn + 1, 0);
3031                 /* free iova */
3032                 __free_iova(&domain->iovad, iova);
3033         } else {
3034                 add_unmap(domain, iova);
3035                 /*
3036                  * queue up the release of the unmap to save the 1/6th of the
3037                  * cpu used up by the iotlb flush operation...
3038                  */
3039         }
3040 }
3041
3042 static int intel_nontranslate_map_sg(struct device *hddev,
3043         struct scatterlist *sglist, int nelems, int dir)
3044 {
3045         int i;
3046         struct scatterlist *sg;
3047
3048         for_each_sg(sglist, sg, nelems, i) {
3049                 BUG_ON(!sg_page(sg));
3050                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3051                 sg->dma_length = sg->length;
3052         }
3053         return nelems;
3054 }
3055
3056 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3057                         enum dma_data_direction dir, struct dma_attrs *attrs)
3058 {
3059         int i;
3060         struct pci_dev *pdev = to_pci_dev(hwdev);
3061         struct dmar_domain *domain;
3062         size_t size = 0;
3063         int prot = 0;
3064         struct iova *iova = NULL;
3065         int ret;
3066         struct scatterlist *sg;
3067         unsigned long start_vpfn;
3068         struct intel_iommu *iommu;
3069
3070         BUG_ON(dir == DMA_NONE);
3071         if (iommu_no_mapping(hwdev))
3072                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3073
3074         domain = get_valid_domain_for_dev(pdev);
3075         if (!domain)
3076                 return 0;
3077
3078         iommu = domain_get_iommu(domain);
3079
3080         for_each_sg(sglist, sg, nelems, i)
3081                 size += aligned_nrpages(sg->offset, sg->length);
3082
3083         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3084                                 pdev->dma_mask);
3085         if (!iova) {
3086                 sglist->dma_length = 0;
3087                 return 0;
3088         }
3089
3090         /*
3091          * Check if DMAR supports zero-length reads on write only
3092          * mappings..
3093          */
3094         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3095                         !cap_zlr(iommu->cap))
3096                 prot |= DMA_PTE_READ;
3097         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3098                 prot |= DMA_PTE_WRITE;
3099
3100         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3101
3102         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3103         if (unlikely(ret)) {
3104                 /*  clear the page */
3105                 dma_pte_clear_range(domain, start_vpfn,
3106                                     start_vpfn + size - 1);
3107                 /* free page tables */
3108                 dma_pte_free_pagetable(domain, start_vpfn,
3109                                        start_vpfn + size - 1);
3110                 /* free iova */
3111                 __free_iova(&domain->iovad, iova);
3112                 return 0;
3113         }
3114
3115         /* it's a non-present to present mapping. Only flush if caching mode */
3116         if (cap_caching_mode(iommu->cap))
3117                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3118         else
3119                 iommu_flush_write_buffer(iommu);
3120
3121         return nelems;
3122 }
3123
3124 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3125 {
3126         return !dma_addr;
3127 }
3128
3129 struct dma_map_ops intel_dma_ops = {
3130         .alloc = intel_alloc_coherent,
3131         .free = intel_free_coherent,
3132         .map_sg = intel_map_sg,
3133         .unmap_sg = intel_unmap_sg,
3134         .map_page = intel_map_page,
3135         .unmap_page = intel_unmap_page,
3136         .mapping_error = intel_mapping_error,
3137 };
3138
3139 static inline int iommu_domain_cache_init(void)
3140 {
3141         int ret = 0;
3142
3143         iommu_domain_cache = kmem_cache_create("iommu_domain",
3144                                          sizeof(struct dmar_domain),
3145                                          0,
3146                                          SLAB_HWCACHE_ALIGN,
3147
3148                                          NULL);
3149         if (!iommu_domain_cache) {
3150                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3151                 ret = -ENOMEM;
3152         }
3153
3154         return ret;
3155 }
3156
3157 static inline int iommu_devinfo_cache_init(void)
3158 {
3159         int ret = 0;
3160
3161         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3162                                          sizeof(struct device_domain_info),
3163                                          0,
3164                                          SLAB_HWCACHE_ALIGN,
3165                                          NULL);
3166         if (!iommu_devinfo_cache) {
3167                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3168                 ret = -ENOMEM;
3169         }
3170
3171         return ret;
3172 }
3173
3174 static inline int iommu_iova_cache_init(void)
3175 {
3176         int ret = 0;
3177
3178         iommu_iova_cache = kmem_cache_create("iommu_iova",
3179                                          sizeof(struct iova),
3180                                          0,
3181                                          SLAB_HWCACHE_ALIGN,
3182                                          NULL);
3183         if (!iommu_iova_cache) {
3184                 printk(KERN_ERR "Couldn't create iova cache\n");
3185                 ret = -ENOMEM;
3186         }
3187
3188         return ret;
3189 }
3190
3191 static int __init iommu_init_mempool(void)
3192 {
3193         int ret;
3194         ret = iommu_iova_cache_init();
3195         if (ret)
3196                 return ret;
3197
3198         ret = iommu_domain_cache_init();
3199         if (ret)
3200                 goto domain_error;
3201
3202         ret = iommu_devinfo_cache_init();
3203         if (!ret)
3204                 return ret;
3205
3206         kmem_cache_destroy(iommu_domain_cache);
3207 domain_error:
3208         kmem_cache_destroy(iommu_iova_cache);
3209
3210         return -ENOMEM;
3211 }
3212
3213 static void __init iommu_exit_mempool(void)
3214 {
3215         kmem_cache_destroy(iommu_devinfo_cache);
3216         kmem_cache_destroy(iommu_domain_cache);
3217         kmem_cache_destroy(iommu_iova_cache);
3218
3219 }
3220
3221 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3222 {
3223         struct dmar_drhd_unit *drhd;
3224         u32 vtbar;
3225         int rc;
3226
3227         /* We know that this device on this chipset has its own IOMMU.
3228          * If we find it under a different IOMMU, then the BIOS is lying
3229          * to us. Hope that the IOMMU for this device is actually
3230          * disabled, and it needs no translation...
3231          */
3232         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3233         if (rc) {
3234                 /* "can't" happen */
3235                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3236                 return;
3237         }
3238         vtbar &= 0xffff0000;
3239
3240         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3241         drhd = dmar_find_matched_drhd_unit(pdev);
3242         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3243                             TAINT_FIRMWARE_WORKAROUND,
3244                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3245                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3246 }
3247 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3248
3249 static void __init init_no_remapping_devices(void)
3250 {
3251         struct dmar_drhd_unit *drhd;
3252
3253         for_each_drhd_unit(drhd) {
3254                 if (!drhd->include_all) {
3255                         int i;
3256                         for (i = 0; i < drhd->devices_cnt; i++)
3257                                 if (drhd->devices[i] != NULL)
3258                                         break;
3259                         /* ignore DMAR unit if no pci devices exist */
3260                         if (i == drhd->devices_cnt)
3261                                 drhd->ignored = 1;
3262                 }
3263         }
3264
3265         for_each_drhd_unit(drhd) {
3266                 int i;
3267                 if (drhd->ignored || drhd->include_all)
3268                         continue;
3269
3270                 for (i = 0; i < drhd->devices_cnt; i++)
3271                         if (drhd->devices[i] &&
3272                             !IS_GFX_DEVICE(drhd->devices[i]))
3273                                 break;
3274
3275                 if (i < drhd->devices_cnt)
3276                         continue;
3277
3278                 /* This IOMMU has *only* gfx devices. Either bypass it or
3279                    set the gfx_mapped flag, as appropriate */
3280                 if (dmar_map_gfx) {
3281                         intel_iommu_gfx_mapped = 1;
3282                 } else {
3283                         drhd->ignored = 1;
3284                         for (i = 0; i < drhd->devices_cnt; i++) {
3285                                 if (!drhd->devices[i])
3286                                         continue;
3287                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3288                         }
3289                 }
3290         }
3291 }
3292
3293 #ifdef CONFIG_SUSPEND
3294 static int init_iommu_hw(void)
3295 {
3296         struct dmar_drhd_unit *drhd;
3297         struct intel_iommu *iommu = NULL;
3298
3299         for_each_active_iommu(iommu, drhd)
3300                 if (iommu->qi)
3301                         dmar_reenable_qi(iommu);
3302
3303         for_each_iommu(iommu, drhd) {
3304                 if (drhd->ignored) {
3305                         /*
3306                          * we always have to disable PMRs or DMA may fail on
3307                          * this device
3308                          */
3309                         if (force_on)
3310                                 iommu_disable_protect_mem_regions(iommu);
3311                         continue;
3312                 }
3313         
3314                 iommu_flush_write_buffer(iommu);
3315
3316                 iommu_set_root_entry(iommu);
3317
3318                 iommu->flush.flush_context(iommu, 0, 0, 0,
3319                                            DMA_CCMD_GLOBAL_INVL);
3320                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3321                                          DMA_TLB_GLOBAL_FLUSH);
3322                 if (iommu_enable_translation(iommu))
3323                         return 1;
3324                 iommu_disable_protect_mem_regions(iommu);
3325         }
3326
3327         return 0;
3328 }
3329
3330 static void iommu_flush_all(void)
3331 {
3332         struct dmar_drhd_unit *drhd;
3333         struct intel_iommu *iommu;
3334
3335         for_each_active_iommu(iommu, drhd) {
3336                 iommu->flush.flush_context(iommu, 0, 0, 0,
3337                                            DMA_CCMD_GLOBAL_INVL);
3338                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3339                                          DMA_TLB_GLOBAL_FLUSH);
3340         }
3341 }
3342
3343 static int iommu_suspend(void)
3344 {
3345         struct dmar_drhd_unit *drhd;
3346         struct intel_iommu *iommu = NULL;
3347         unsigned long flag;
3348
3349         for_each_active_iommu(iommu, drhd) {
3350                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3351                                                  GFP_ATOMIC);
3352                 if (!iommu->iommu_state)
3353                         goto nomem;
3354         }
3355
3356         iommu_flush_all();
3357
3358         for_each_active_iommu(iommu, drhd) {
3359                 iommu_disable_translation(iommu);
3360
3361                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3362
3363                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3364                         readl(iommu->reg + DMAR_FECTL_REG);
3365                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3366                         readl(iommu->reg + DMAR_FEDATA_REG);
3367                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3368                         readl(iommu->reg + DMAR_FEADDR_REG);
3369                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3370                         readl(iommu->reg + DMAR_FEUADDR_REG);
3371
3372                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3373         }
3374         return 0;
3375
3376 nomem:
3377         for_each_active_iommu(iommu, drhd)
3378                 kfree(iommu->iommu_state);
3379
3380         return -ENOMEM;
3381 }
3382
3383 static void iommu_resume(void)
3384 {
3385         struct dmar_drhd_unit *drhd;
3386         struct intel_iommu *iommu = NULL;
3387         unsigned long flag;
3388
3389         if (init_iommu_hw()) {
3390                 if (force_on)
3391                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3392                 else
3393                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3394                 return;
3395         }
3396
3397         for_each_active_iommu(iommu, drhd) {
3398
3399                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3400
3401                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3402                         iommu->reg + DMAR_FECTL_REG);
3403                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3404                         iommu->reg + DMAR_FEDATA_REG);
3405                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3406                         iommu->reg + DMAR_FEADDR_REG);
3407                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3408                         iommu->reg + DMAR_FEUADDR_REG);
3409
3410                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3411         }
3412
3413         for_each_active_iommu(iommu, drhd)
3414                 kfree(iommu->iommu_state);
3415 }
3416
3417 static struct syscore_ops iommu_syscore_ops = {
3418         .resume         = iommu_resume,
3419         .suspend        = iommu_suspend,
3420 };
3421
3422 static void __init init_iommu_pm_ops(void)
3423 {
3424         register_syscore_ops(&iommu_syscore_ops);
3425 }
3426
3427 #else
3428 static inline void init_iommu_pm_ops(void) {}
3429 #endif  /* CONFIG_PM */
3430
3431 LIST_HEAD(dmar_rmrr_units);
3432
3433 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3434 {
3435         list_add(&rmrr->list, &dmar_rmrr_units);
3436 }
3437
3438
3439 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3440 {
3441         struct acpi_dmar_reserved_memory *rmrr;
3442         struct dmar_rmrr_unit *rmrru;
3443
3444         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3445         if (!rmrru)
3446                 return -ENOMEM;
3447
3448         rmrru->hdr = header;
3449         rmrr = (struct acpi_dmar_reserved_memory *)header;
3450         rmrru->base_address = rmrr->base_address;
3451         rmrru->end_address = rmrr->end_address;
3452
3453         dmar_register_rmrr_unit(rmrru);
3454         return 0;
3455 }
3456
3457 static int __init
3458 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3459 {
3460         struct acpi_dmar_reserved_memory *rmrr;
3461         int ret;
3462
3463         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3464         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3465                 ((void *)rmrr) + rmrr->header.length,
3466                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3467
3468         if (ret || (rmrru->devices_cnt == 0)) {
3469                 list_del(&rmrru->list);
3470                 kfree(rmrru);
3471         }
3472         return ret;
3473 }
3474
3475 static LIST_HEAD(dmar_atsr_units);
3476
3477 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3478 {
3479         struct acpi_dmar_atsr *atsr;
3480         struct dmar_atsr_unit *atsru;
3481
3482         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3483         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3484         if (!atsru)
3485                 return -ENOMEM;
3486
3487         atsru->hdr = hdr;
3488         atsru->include_all = atsr->flags & 0x1;
3489
3490         list_add(&atsru->list, &dmar_atsr_units);
3491
3492         return 0;
3493 }
3494
3495 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3496 {
3497         int rc;
3498         struct acpi_dmar_atsr *atsr;
3499
3500         if (atsru->include_all)
3501                 return 0;
3502
3503         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3505                                 (void *)atsr + atsr->header.length,
3506                                 &atsru->devices_cnt, &atsru->devices,
3507                                 atsr->segment);
3508         if (rc || !atsru->devices_cnt) {
3509                 list_del(&atsru->list);
3510                 kfree(atsru);
3511         }
3512
3513         return rc;
3514 }
3515
3516 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3517 {
3518         int i;
3519         struct pci_bus *bus;
3520         struct acpi_dmar_atsr *atsr;
3521         struct dmar_atsr_unit *atsru;
3522
3523         dev = pci_physfn(dev);
3524
3525         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3526                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3527                 if (atsr->segment == pci_domain_nr(dev->bus))
3528                         goto found;
3529         }
3530
3531         return 0;
3532
3533 found:
3534         for (bus = dev->bus; bus; bus = bus->parent) {
3535                 struct pci_dev *bridge = bus->self;
3536
3537                 if (!bridge || !pci_is_pcie(bridge) ||
3538                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3539                         return 0;
3540
3541                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3542                         for (i = 0; i < atsru->devices_cnt; i++)
3543                                 if (atsru->devices[i] == bridge)
3544                                         return 1;
3545                         break;
3546                 }
3547         }
3548
3549         if (atsru->include_all)
3550                 return 1;
3551
3552         return 0;
3553 }
3554
3555 int __init dmar_parse_rmrr_atsr_dev(void)
3556 {
3557         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3558         struct dmar_atsr_unit *atsr, *atsr_n;
3559         int ret = 0;
3560
3561         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3562                 ret = rmrr_parse_dev(rmrr);
3563                 if (ret)
3564                         return ret;
3565         }
3566
3567         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3568                 ret = atsr_parse_dev(atsr);
3569                 if (ret)
3570                         return ret;
3571         }
3572
3573         return ret;
3574 }
3575
3576 /*
3577  * Here we only respond to action of unbound device from driver.
3578  *
3579  * Added device is not attached to its DMAR domain here yet. That will happen
3580  * when mapping the device to iova.
3581  */
3582 static int device_notifier(struct notifier_block *nb,
3583                                   unsigned long action, void *data)
3584 {
3585         struct device *dev = data;
3586         struct pci_dev *pdev = to_pci_dev(dev);
3587         struct dmar_domain *domain;
3588
3589         if (iommu_no_mapping(dev))
3590                 return 0;
3591
3592         domain = find_domain(pdev);
3593         if (!domain)
3594                 return 0;
3595
3596         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3597                 domain_remove_one_dev_info(domain, pdev);
3598
3599                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3600                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3601                     list_empty(&domain->devices))
3602                         domain_exit(domain);
3603         }
3604
3605         return 0;
3606 }
3607
3608 static struct notifier_block device_nb = {
3609         .notifier_call = device_notifier,
3610 };
3611
3612 int __init intel_iommu_init(void)
3613 {
3614         int ret = 0;
3615
3616         /* VT-d is required for a TXT/tboot launch, so enforce that */
3617         force_on = tboot_force_iommu();
3618
3619         if (dmar_table_init()) {
3620                 if (force_on)
3621                         panic("tboot: Failed to initialize DMAR table\n");
3622                 return  -ENODEV;
3623         }
3624
3625         if (dmar_dev_scope_init() < 0) {
3626                 if (force_on)
3627                         panic("tboot: Failed to initialize DMAR device scope\n");
3628                 return  -ENODEV;
3629         }
3630
3631         if (no_iommu || dmar_disabled)
3632                 return -ENODEV;
3633
3634         if (iommu_init_mempool()) {
3635                 if (force_on)
3636                         panic("tboot: Failed to initialize iommu memory\n");
3637                 return  -ENODEV;
3638         }
3639
3640         if (list_empty(&dmar_rmrr_units))
3641                 printk(KERN_INFO "DMAR: No RMRR found\n");
3642
3643         if (list_empty(&dmar_atsr_units))
3644                 printk(KERN_INFO "DMAR: No ATSR found\n");
3645
3646         if (dmar_init_reserved_ranges()) {
3647                 if (force_on)
3648                         panic("tboot: Failed to reserve iommu ranges\n");
3649                 return  -ENODEV;
3650         }
3651
3652         init_no_remapping_devices();
3653
3654         ret = init_dmars();
3655         if (ret) {
3656                 if (force_on)
3657                         panic("tboot: Failed to initialize DMARs\n");
3658                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3659                 put_iova_domain(&reserved_iova_list);
3660                 iommu_exit_mempool();
3661                 return ret;
3662         }
3663         printk(KERN_INFO
3664         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3665
3666         init_timer(&unmap_timer);
3667 #ifdef CONFIG_SWIOTLB
3668         swiotlb = 0;
3669 #endif
3670         dma_ops = &intel_dma_ops;
3671
3672         init_iommu_pm_ops();
3673
3674         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3675
3676         bus_register_notifier(&pci_bus_type, &device_nb);
3677
3678         intel_iommu_enabled = 1;
3679
3680         return 0;
3681 }
3682
3683 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3684                                            struct pci_dev *pdev)
3685 {
3686         struct pci_dev *tmp, *parent;
3687
3688         if (!iommu || !pdev)
3689                 return;
3690
3691         /* dependent device detach */
3692         tmp = pci_find_upstream_pcie_bridge(pdev);
3693         /* Secondary interface's bus number and devfn 0 */
3694         if (tmp) {
3695                 parent = pdev->bus->self;
3696                 while (parent != tmp) {
3697                         iommu_detach_dev(iommu, parent->bus->number,
3698                                          parent->devfn);
3699                         parent = parent->bus->self;
3700                 }
3701                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3702                         iommu_detach_dev(iommu,
3703                                 tmp->subordinate->number, 0);
3704                 else /* this is a legacy PCI bridge */
3705                         iommu_detach_dev(iommu, tmp->bus->number,
3706                                          tmp->devfn);
3707         }
3708 }
3709
3710 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3711                                           struct pci_dev *pdev)
3712 {
3713         struct device_domain_info *info;
3714         struct intel_iommu *iommu;
3715         unsigned long flags;
3716         int found = 0;
3717         struct list_head *entry, *tmp;
3718
3719         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3720                                 pdev->devfn);
3721         if (!iommu)
3722                 return;
3723
3724         spin_lock_irqsave(&device_domain_lock, flags);
3725         list_for_each_safe(entry, tmp, &domain->devices) {
3726                 info = list_entry(entry, struct device_domain_info, link);
3727                 if (info->segment == pci_domain_nr(pdev->bus) &&
3728                     info->bus == pdev->bus->number &&
3729                     info->devfn == pdev->devfn) {
3730                         list_del(&info->link);
3731                         list_del(&info->global);
3732                         if (info->dev)
3733                                 info->dev->dev.archdata.iommu = NULL;
3734                         spin_unlock_irqrestore(&device_domain_lock, flags);
3735
3736                         iommu_disable_dev_iotlb(info);
3737                         iommu_detach_dev(iommu, info->bus, info->devfn);
3738                         iommu_detach_dependent_devices(iommu, pdev);
3739                         free_devinfo_mem(info);
3740
3741                         spin_lock_irqsave(&device_domain_lock, flags);
3742
3743                         if (found)
3744                                 break;
3745                         else
3746                                 continue;
3747                 }
3748
3749                 /* if there is no other devices under the same iommu
3750                  * owned by this domain, clear this iommu in iommu_bmp
3751                  * update iommu count and coherency
3752                  */
3753                 if (iommu == device_to_iommu(info->segment, info->bus,
3754                                             info->devfn))
3755                         found = 1;
3756         }
3757
3758         spin_unlock_irqrestore(&device_domain_lock, flags);
3759
3760         if (found == 0) {
3761                 unsigned long tmp_flags;
3762                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3763                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3764                 domain->iommu_count--;
3765                 domain_update_iommu_cap(domain);
3766                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3767
3768                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3769                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3770                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3771                         clear_bit(domain->id, iommu->domain_ids);
3772                         iommu->domains[domain->id] = NULL;
3773                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3774                 }
3775         }
3776 }
3777
3778 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3779 {
3780         struct device_domain_info *info;
3781         struct intel_iommu *iommu;
3782         unsigned long flags1, flags2;
3783
3784         spin_lock_irqsave(&device_domain_lock, flags1);
3785         while (!list_empty(&domain->devices)) {
3786                 info = list_entry(domain->devices.next,
3787                         struct device_domain_info, link);
3788                 list_del(&info->link);
3789                 list_del(&info->global);
3790                 if (info->dev)
3791                         info->dev->dev.archdata.iommu = NULL;
3792
3793                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3794
3795                 iommu_disable_dev_iotlb(info);
3796                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3797                 iommu_detach_dev(iommu, info->bus, info->devfn);
3798                 iommu_detach_dependent_devices(iommu, info->dev);
3799
3800                 /* clear this iommu in iommu_bmp, update iommu count
3801                  * and capabilities
3802                  */
3803                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3804                 if (test_and_clear_bit(iommu->seq_id,
3805                                        domain->iommu_bmp)) {
3806                         domain->iommu_count--;
3807                         domain_update_iommu_cap(domain);
3808                 }
3809                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3810
3811                 free_devinfo_mem(info);
3812                 spin_lock_irqsave(&device_domain_lock, flags1);
3813         }
3814         spin_unlock_irqrestore(&device_domain_lock, flags1);
3815 }
3816
3817 /* domain id for virtual machine, it won't be set in context */
3818 static unsigned long vm_domid;
3819
3820 static struct dmar_domain *iommu_alloc_vm_domain(void)
3821 {
3822         struct dmar_domain *domain;
3823
3824         domain = alloc_domain_mem();
3825         if (!domain)
3826                 return NULL;
3827
3828         domain->id = vm_domid++;
3829         domain->nid = -1;
3830         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3831         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3832
3833         return domain;
3834 }
3835
3836 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3837 {
3838         int adjust_width;
3839
3840         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3841         spin_lock_init(&domain->iommu_lock);
3842
3843         domain_reserve_special_ranges(domain);
3844
3845         /* calculate AGAW */
3846         domain->gaw = guest_width;
3847         adjust_width = guestwidth_to_adjustwidth(guest_width);
3848         domain->agaw = width_to_agaw(adjust_width);
3849
3850         INIT_LIST_HEAD(&domain->devices);
3851
3852         domain->iommu_count = 0;
3853         domain->iommu_coherency = 0;
3854         domain->iommu_snooping = 0;
3855         domain->iommu_superpage = 0;
3856         domain->max_addr = 0;
3857         domain->nid = -1;
3858
3859         /* always allocate the top pgd */
3860         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3861         if (!domain->pgd)
3862                 return -ENOMEM;
3863         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3864         return 0;
3865 }
3866
3867 static void iommu_free_vm_domain(struct dmar_domain *domain)
3868 {
3869         unsigned long flags;
3870         struct dmar_drhd_unit *drhd;
3871         struct intel_iommu *iommu;
3872         unsigned long i;
3873         unsigned long ndomains;
3874
3875         for_each_drhd_unit(drhd) {
3876                 if (drhd->ignored)
3877                         continue;
3878                 iommu = drhd->iommu;
3879
3880                 ndomains = cap_ndoms(iommu->cap);
3881                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3882                         if (iommu->domains[i] == domain) {
3883                                 spin_lock_irqsave(&iommu->lock, flags);
3884                                 clear_bit(i, iommu->domain_ids);
3885                                 iommu->domains[i] = NULL;
3886                                 spin_unlock_irqrestore(&iommu->lock, flags);
3887                                 break;
3888                         }
3889                 }
3890         }
3891 }
3892
3893 static void vm_domain_exit(struct dmar_domain *domain)
3894 {
3895         /* Domain 0 is reserved, so dont process it */
3896         if (!domain)
3897                 return;
3898
3899         vm_domain_remove_all_dev_info(domain);
3900         /* destroy iovas */
3901         put_iova_domain(&domain->iovad);
3902
3903         /* clear ptes */
3904         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3905
3906         /* free page tables */
3907         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3908
3909         iommu_free_vm_domain(domain);
3910         free_domain_mem(domain);
3911 }
3912
3913 static int intel_iommu_domain_init(struct iommu_domain *domain)
3914 {
3915         struct dmar_domain *dmar_domain;
3916
3917         dmar_domain = iommu_alloc_vm_domain();
3918         if (!dmar_domain) {
3919                 printk(KERN_ERR
3920                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3921                 return -ENOMEM;
3922         }
3923         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3924                 printk(KERN_ERR
3925                         "intel_iommu_domain_init() failed\n");
3926                 vm_domain_exit(dmar_domain);
3927                 return -ENOMEM;
3928         }
3929         domain_update_iommu_cap(dmar_domain);
3930         domain->priv = dmar_domain;
3931
3932         return 0;
3933 }
3934
3935 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3936 {
3937         struct dmar_domain *dmar_domain = domain->priv;
3938
3939         domain->priv = NULL;
3940         vm_domain_exit(dmar_domain);
3941 }
3942
3943 static int intel_iommu_attach_device(struct iommu_domain *domain,
3944                                      struct device *dev)
3945 {
3946         struct dmar_domain *dmar_domain = domain->priv;
3947         struct pci_dev *pdev = to_pci_dev(dev);
3948         struct intel_iommu *iommu;
3949         int addr_width;
3950
3951         /* normally pdev is not mapped */
3952         if (unlikely(domain_context_mapped(pdev))) {
3953                 struct dmar_domain *old_domain;
3954
3955                 old_domain = find_domain(pdev);
3956                 if (old_domain) {
3957                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3958                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3959                                 domain_remove_one_dev_info(old_domain, pdev);
3960                         else
3961                                 domain_remove_dev_info(old_domain);
3962                 }
3963         }
3964
3965         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3966                                 pdev->devfn);
3967         if (!iommu)
3968                 return -ENODEV;
3969
3970         /* check if this iommu agaw is sufficient for max mapped address */
3971         addr_width = agaw_to_width(iommu->agaw);
3972         if (addr_width > cap_mgaw(iommu->cap))
3973                 addr_width = cap_mgaw(iommu->cap);
3974
3975         if (dmar_domain->max_addr > (1LL << addr_width)) {
3976                 printk(KERN_ERR "%s: iommu width (%d) is not "
3977                        "sufficient for the mapped address (%llx)\n",
3978                        __func__, addr_width, dmar_domain->max_addr);
3979                 return -EFAULT;
3980         }
3981         dmar_domain->gaw = addr_width;
3982
3983         /*
3984          * Knock out extra levels of page tables if necessary
3985          */
3986         while (iommu->agaw < dmar_domain->agaw) {
3987                 struct dma_pte *pte;
3988
3989                 pte = dmar_domain->pgd;
3990                 if (dma_pte_present(pte)) {
3991                         dmar_domain->pgd = (struct dma_pte *)
3992                                 phys_to_virt(dma_pte_addr(pte));
3993                         free_pgtable_page(pte);
3994                 }
3995                 dmar_domain->agaw--;
3996         }
3997
3998         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3999 }
4000
4001 static void intel_iommu_detach_device(struct iommu_domain *domain,
4002                                       struct device *dev)
4003 {
4004         struct dmar_domain *dmar_domain = domain->priv;
4005         struct pci_dev *pdev = to_pci_dev(dev);
4006
4007         domain_remove_one_dev_info(dmar_domain, pdev);
4008 }
4009
4010 static int intel_iommu_map(struct iommu_domain *domain,
4011                            unsigned long iova, phys_addr_t hpa,
4012                            size_t size, int iommu_prot)
4013 {
4014         struct dmar_domain *dmar_domain = domain->priv;
4015         u64 max_addr;
4016         int prot = 0;
4017         int ret;
4018
4019         if (iommu_prot & IOMMU_READ)
4020                 prot |= DMA_PTE_READ;
4021         if (iommu_prot & IOMMU_WRITE)
4022                 prot |= DMA_PTE_WRITE;
4023         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4024                 prot |= DMA_PTE_SNP;
4025
4026         max_addr = iova + size;
4027         if (dmar_domain->max_addr < max_addr) {
4028                 u64 end;
4029
4030                 /* check if minimum agaw is sufficient for mapped address */
4031                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4032                 if (end < max_addr) {
4033                         printk(KERN_ERR "%s: iommu width (%d) is not "
4034                                "sufficient for the mapped address (%llx)\n",
4035                                __func__, dmar_domain->gaw, max_addr);
4036                         return -EFAULT;
4037                 }
4038                 dmar_domain->max_addr = max_addr;
4039         }
4040         /* Round up size to next multiple of PAGE_SIZE, if it and
4041            the low bits of hpa would take us onto the next page */
4042         size = aligned_nrpages(hpa, size);
4043         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4044                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4045         return ret;
4046 }
4047
4048 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4049                              unsigned long iova, size_t size)
4050 {
4051         struct dmar_domain *dmar_domain = domain->priv;
4052         int order;
4053
4054         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4055                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4056
4057         if (dmar_domain->max_addr == iova + size)
4058                 dmar_domain->max_addr = iova;
4059
4060         return PAGE_SIZE << order;
4061 }
4062
4063 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4064                                             unsigned long iova)
4065 {
4066         struct dmar_domain *dmar_domain = domain->priv;
4067         struct dma_pte *pte;
4068         u64 phys = 0;
4069
4070         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4071         if (pte)
4072                 phys = dma_pte_addr(pte);
4073
4074         return phys;
4075 }
4076
4077 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4078                                       unsigned long cap)
4079 {
4080         struct dmar_domain *dmar_domain = domain->priv;
4081
4082         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4083                 return dmar_domain->iommu_snooping;
4084         if (cap == IOMMU_CAP_INTR_REMAP)
4085                 return intr_remapping_enabled;
4086
4087         return 0;
4088 }
4089
4090 /*
4091  * Group numbers are arbitrary.  Device with the same group number
4092  * indicate the iommu cannot differentiate between them.  To avoid
4093  * tracking used groups we just use the seg|bus|devfn of the lowest
4094  * level we're able to differentiate devices
4095  */
4096 static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4097 {
4098         struct pci_dev *pdev = to_pci_dev(dev);
4099         struct pci_dev *bridge;
4100         union {
4101                 struct {
4102                         u8 devfn;
4103                         u8 bus;
4104                         u16 segment;
4105                 } pci;
4106                 u32 group;
4107         } id;
4108
4109         if (iommu_no_mapping(dev))
4110                 return -ENODEV;
4111
4112         id.pci.segment = pci_domain_nr(pdev->bus);
4113         id.pci.bus = pdev->bus->number;
4114         id.pci.devfn = pdev->devfn;
4115
4116         if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4117                 return -ENODEV;
4118
4119         bridge = pci_find_upstream_pcie_bridge(pdev);
4120         if (bridge) {
4121                 if (pci_is_pcie(bridge)) {
4122                         id.pci.bus = bridge->subordinate->number;
4123                         id.pci.devfn = 0;
4124                 } else {
4125                         id.pci.bus = bridge->bus->number;
4126                         id.pci.devfn = bridge->devfn;
4127                 }
4128         }
4129
4130         if (!pdev->is_virtfn && iommu_group_mf)
4131                 id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4132
4133         *groupid = id.group;
4134
4135         return 0;
4136 }
4137
4138 static struct iommu_ops intel_iommu_ops = {
4139         .domain_init    = intel_iommu_domain_init,
4140         .domain_destroy = intel_iommu_domain_destroy,
4141         .attach_dev     = intel_iommu_attach_device,
4142         .detach_dev     = intel_iommu_detach_device,
4143         .map            = intel_iommu_map,
4144         .unmap          = intel_iommu_unmap,
4145         .iova_to_phys   = intel_iommu_iova_to_phys,
4146         .domain_has_cap = intel_iommu_domain_has_cap,
4147         .device_group   = intel_iommu_device_group,
4148         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4149 };
4150
4151 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4152 {
4153         /*
4154          * Mobile 4 Series Chipset neglects to set RWBF capability,
4155          * but needs it:
4156          */
4157         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4158         rwbf_quirk = 1;
4159
4160         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4161         if (dev->revision == 0x07) {
4162                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4163                 dmar_map_gfx = 0;
4164         }
4165 }
4166
4167 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4168
4169 #define GGC 0x52
4170 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4171 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4172 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4173 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4174 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4175 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4176 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4177 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4178
4179 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4180 {
4181         unsigned short ggc;
4182
4183         if (pci_read_config_word(dev, GGC, &ggc))
4184                 return;
4185
4186         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4187                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4188                 dmar_map_gfx = 0;
4189         } else if (dmar_map_gfx) {
4190                 /* we have to ensure the gfx device is idle before we flush */
4191                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4192                 intel_iommu_strict = 1;
4193        }
4194 }
4195 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4199
4200 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4201    ISOCH DMAR unit for the Azalia sound device, but not give it any
4202    TLB entries, which causes it to deadlock. Check for that.  We do
4203    this in a function called from init_dmars(), instead of in a PCI
4204    quirk, because we don't want to print the obnoxious "BIOS broken"
4205    message if VT-d is actually disabled.
4206 */
4207 static void __init check_tylersburg_isoch(void)
4208 {
4209         struct pci_dev *pdev;
4210         uint32_t vtisochctrl;
4211
4212         /* If there's no Azalia in the system anyway, forget it. */
4213         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4214         if (!pdev)
4215                 return;
4216         pci_dev_put(pdev);
4217
4218         /* System Management Registers. Might be hidden, in which case
4219            we can't do the sanity check. But that's OK, because the
4220            known-broken BIOSes _don't_ actually hide it, so far. */
4221         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4222         if (!pdev)
4223                 return;
4224
4225         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4226                 pci_dev_put(pdev);
4227                 return;
4228         }
4229
4230         pci_dev_put(pdev);
4231
4232         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4233         if (vtisochctrl & 1)
4234                 return;
4235
4236         /* Drop all bits other than the number of TLB entries */
4237         vtisochctrl &= 0x1c;
4238
4239         /* If we have the recommended number of TLB entries (16), fine. */
4240         if (vtisochctrl == 0x10)
4241                 return;
4242
4243         /* Zero TLB entries? You get to ride the short bus to school. */
4244         if (!vtisochctrl) {
4245                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4246                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4247                      dmi_get_system_info(DMI_BIOS_VENDOR),
4248                      dmi_get_system_info(DMI_BIOS_VERSION),
4249                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4250                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4251                 return;
4252         }
4253         
4254         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4255                vtisochctrl);
4256 }