/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */
#include <linux/mm.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
static mempool_t *page_pool, *isa_page_pool;

static void *page_pool_alloc(int gfp_mask, void *data)
	int gfp = gfp_mask | (int) (long) data;

	return alloc_page(gfp);

static void page_pool_free(void *page, void *data)
/*
 * Virtual_count is not a pure "count".
 *  0 means that it is not mapped, and has not been mapped
 *    since a TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
static int pkmap_count[LAST_PKMAP];
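
/*
 * Worked example: pkmap_count[i] == 3 means two callers currently have
 * the corresponding page mapped via kmap_high(), plus the single
 * "mapped since the last TLB flush" reference that only a later
 * flush_all_zero_pkmaps() pass can drop.
 */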
static unsigned int last_pkmap_nr;
static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
static void flush_all_zero_pkmaps(void)

	for (i = 0; i < LAST_PKMAP; i++) {

		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;

		if (pte_none(pkmap_page_table[i]))
			BUG();

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&pkmap_page_table[i]);
static inline unsigned long map_new_virtual(struct page *page)

	/* Find an empty entry */
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();

		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */

		/*
		 * Sleep for somebody else to unmap their entries
		 */
			DECLARE_WAITQUEUE(wait, current);

			current->state = TASK_UNINTERRUPTIBLE;
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
				return (unsigned long) page->virtual;

	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	page->virtual = (void *) vaddr;
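
/*
 * map_new_virtual() hands back the new entry with pkmap_count == 1
 * (mapped, but no users yet); kmap_high() below immediately bumps the
 * count to 2 for the first user.
 */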
void *kmap_high(struct page *page)

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
		BUG();
	spin_unlock(&kmap_lock);
	return (void*) vaddr;
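
/*
 * Illustrative usage sketch (not taken from this file): callers normally
 * reach kmap_high() through the generic kmap()/kunmap() wrappers, e.g.
 *
 *	char *addr = kmap(page);
 *	memcpy(addr + offset, buf, len);
 *	kunmap(page);
 *
 * Both calls may sleep (map_new_virtual() can wait for a free pkmap
 * entry), so this path must not be used from interrupt context;
 * kmap_atomic() is the tool for that.
 */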
void kunmap_high(struct page *page)

	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	switch (--pkmap_count[nr]) {

		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock. As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock. Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);

	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
		wake_up(&pkmap_map_wait);
static __init int init_emergency_pool(void)

	page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);

	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);

__initcall(init_emergency_pool);
/*
 * highmem version, map in to vec
 */
static inline void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)

	local_irq_save(flags);
	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
	kunmap_atomic(vto, KM_BOUNCE_READ);
	local_irq_restore(flags);
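
/*
 * Interrupts are kept off around the atomic kmap above because this copy
 * also runs from the block I/O completion path: KM_BOUNCE_READ is a
 * per-CPU atomic-kmap slot, so it must not be re-entered from an
 * interrupt on the same CPU while it is in use.
 */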
#else /* CONFIG_HIGHMEM */

#define bounce_copy_vec(to, vfrom) \
	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
#define ISA_POOL_SIZE	16

/*
 * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
 * as the max address, so check if the pool has already been created.
 */
int init_emergency_isa_pool(void)

	isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);

	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 */
static inline void copy_to_high_bio_irq(struct bio *to, struct bio *from)

	unsigned char *vfrom;
	struct bio_vec *tovec, *fromvec;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = from->bi_io_vec + i;

		if (tovec->bv_page == fromvec->bv_page)
			continue;

		vfrom = page_address(fromvec->bv_page) + fromvec->bv_offset;

		bounce_copy_vec(tovec, vfrom);
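
/*
 * Note: copy_to_high_bio_irq() is invoked from the read-completion path,
 * where sleeping is not allowed, which is why it copies through
 * bounce_copy_vec() (kmap_atomic) instead of kmap() for a possibly
 * highmem destination page.
 */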
static inline void bounce_end_io(struct bio *bio, mempool_t *pool)

	struct bio *bio_orig = bio->bi_private;
	struct bio_vec *bvec, *org_vec;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))

	set_bit(BIO_UPTODATE, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = bio_orig->bi_io_vec + i;
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		mempool_free(bvec->bv_page, pool);

	bio_orig->bi_end_io(bio_orig);
static void bounce_end_io_write(struct bio *bio)
	bounce_end_io(bio, page_pool);

static void bounce_end_io_write_isa(struct bio *bio)
	bounce_end_io(bio, isa_page_pool);

static inline void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
	struct bio *bio_orig = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	bounce_end_io(bio, pool);

static void bounce_end_io_read(struct bio *bio)
	__bounce_end_io_read(bio, page_pool);

static void bounce_end_io_read_isa(struct bio *bio)
	__bounce_end_io_read(bio, isa_page_pool);
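
/*
 * Usage note (a sketch based on the contemporaneous block layer, not on
 * code in this file): drivers do not normally call create_bounce()
 * directly; blk_queue_bounce() invokes it with the queue's bounce_pfn
 * and bounce_gfp whenever a bio may contain pages the device cannot
 * address directly.
 */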
void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig)

	struct bio *bio = NULL;
	int i, rw = bio_data_dir(*bio_orig), bio_gfp;
	struct bio_vec *to, *from;

	BUG_ON((*bio_orig)->bi_idx);

	/*
	 * for non-isa bounce case, just check if the bounce pfn is equal
	 * to or bigger than the highest pfn in the system -- in that case,
	 * don't waste time iterating over bio segments
	 */
	if (!(gfp & GFP_DMA)) {
		if (pfn >= blk_max_pfn)
			return;

		bio_gfp = GFP_NOHIGHIO;

		BUG_ON(!isa_page_pool);

		pool = isa_page_pool;
	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;

		/*
		 * is destination page below bounce pfn?
		 */
		if ((page - page->zone->zone_mem_map) + (page->zone->zone_start_paddr >> PAGE_SHIFT) < pfn)
			continue;

			bio = bio_alloc(bio_gfp, (*bio_orig)->bi_vcnt);

		to = bio->bi_io_vec + i;

		to->bv_page = mempool_alloc(pool, gfp);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;

			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
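
		/*
		 * For a WRITE, the data is copied from the (possibly
		 * highmem) source into the low-memory bounce page here;
		 * for a READ, nothing is copied now and
		 * __bounce_end_io_read() copies the data back the other
		 * way once the I/O completes.
		 */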
	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	bio_for_each_segment(from, *bio_orig, i) {
		to = &bio->bi_io_vec[i];
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
	bio->bi_dev = (*bio_orig)->bi_dev;
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;

	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_size = (*bio_orig)->bi_size;
	if (pool == page_pool) {
		bio->bi_end_io = bounce_end_io_write;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read;
	} else {
		bio->bi_end_io = bounce_end_io_write_isa;
		if (rw == READ)
			bio->bi_end_io = bounce_end_io_read_isa;
	}

	bio->bi_private = *bio_orig;