/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <asm/pgalloc.h>
static mempool_t *page_pool, *isa_page_pool;

static void *page_pool_alloc(int gfp_mask, void *data)
{
	int gfp = gfp_mask | (int) (long) data;

	return alloc_page(gfp);
}

static void page_pool_free(void *page, void *data)
{
	__free_page(page);
}
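/*
 * Illustrative sketch (hypothetical helper name) of how the two pool
 * callbacks above are used: a mempool built on them hands out single
 * pages, falling back to its pre-allocated reserve when alloc_page()
 * fails, and refills the reserve when pages are returned.
 */
static void example_bounce_page_cycle(mempool_t *pool)
{
	/* may sleep; dips into the reserve if alloc_page() fails */
	struct page *page = mempool_alloc(pool, GFP_NOIO);

	/* ... the page would be used as a bounce buffer here ... */

	/* hand it back: refills the reserve, or frees it if the reserve is full */
	mempool_free(page, pool);
}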
/*
 * Virtual_count is not a pure "count".
 *  0 means that it is not mapped, and has not been mapped
 *    since a TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
#ifdef CONFIG_HIGHMEM
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static spinlock_t kmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
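/*
 * Illustrative sketch of the counting convention above (hypothetical
 * helper, for illustration only): only a count above 1 carries active
 * users; 0 and 1 both mean "free", differing only in whether a TLB
 * flush is still needed before the entry can be reused.
 */
static inline int example_pkmap_users(unsigned int nr)
{
	int count = pkmap_count[nr];	/* caller is assumed to hold kmap_lock */

	return count > 1 ? count - 1 : 0;
}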
static void flush_all_zero_pkmaps(void)
{
	int i;

	for (i = 0; i < LAST_PKMAP; i++) {
		struct page *page;

		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;
		pkmap_count[i] = 0;
		if (pte_none(pkmap_page_table[i]))
			BUG();

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&pkmap_page_table[i]);
		page->virtual = NULL;
	}
	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;

start:
	count = LAST_PKMAP;
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();
			count = LAST_PKMAP;
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		/*
		 * Sleep for somebody else to unmap their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);

			current->state = TASK_UNINTERRUPTIBLE;
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
			if (page->virtual)
				return (unsigned long) page->virtual;
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
	pkmap_count[last_pkmap_nr] = 1;
	page->virtual = (void *) vaddr;

	return vaddr;
}
void *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr)
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
		BUG();
	spin_unlock(&kmap_lock);
	return (void*) vaddr;
}
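/*
 * Illustrative usage sketch (hypothetical function): kmap() resolves to
 * kmap_high() for highmem pages, so this pairing may sleep and must only
 * be used from process context; lowmem pages simply get their fixed
 * kernel address back.
 */
static void example_zero_highmem_page(struct page *page)
{
	char *vaddr = kmap(page);	/* takes a pkmap_count reference */

	memset(vaddr, 0, PAGE_SIZE);	/* page is now kernel-addressable */
	kunmap(page);			/* drops the reference; the entry is
					 * recycled lazily by flush_all_zero_pkmaps() */
}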
void kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr)
		BUG();
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock. As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock. Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}
#define POOL_SIZE	64

static __init int init_emergency_pool(void)
{
	struct sysinfo i;

	si_meminfo(&i);
	if (!i.totalhigh)
		return 0;

	page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
	if (!page_pool)
		BUG();
	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);

	return 0;
}

__initcall(init_emergency_pool);
/*
 * highmem version, map into vec
 */
static inline void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
	unsigned long flags;
	unsigned char *vto;

	local_irq_save(flags);
	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
	kunmap_atomic(vto, KM_BOUNCE_READ);
	local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */

#define bounce_copy_vec(to, vfrom)	\
	memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)

#endif

#define ISA_POOL_SIZE	16
/*
 * gets called "every" time someone inits a queue with BLK_BOUNCE_ISA
 * as the max address, so check if the pool has already been created.
 */
int init_emergency_isa_pool(void)
{
	if (isa_page_pool)
		return 0;

	isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA);
	if (!isa_page_pool)
		BUG();

	printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
	return 0;
}
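/*
 * Illustrative driver-side sketch (hypothetical function): a device that
 * can only DMA below the 16MB ISA window advertises BLK_BOUNCE_ISA as its
 * bounce limit, and the block layer helper is then expected to call
 * init_emergency_isa_pool() before any request needs an ISA bounce page.
 */
static void example_restrict_queue_to_isa_dma(request_queue_t *q)
{
	blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
}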
/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. kmap it
 * always, it will do the Right Thing
 */
static inline void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
	unsigned char *vfrom;
	struct bio_vec *tovec, *fromvec;
	int i;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = from->bi_io_vec + i;

		/* not bounced */
		if (tovec->bv_page == fromvec->bv_page)
			continue;

		vfrom = page_address(fromvec->bv_page) + fromvec->bv_offset;
		bounce_copy_vec(tovec, vfrom);
	}
}
static inline void bounce_end_io(struct bio *bio, mempool_t *pool)
{
	struct bio *bio_orig = bio->bi_private;
	struct bio_vec *bvec, *org_vec;
	int i;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		goto out_eio;

	set_bit(BIO_UPTODATE, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = bio_orig->bi_io_vec + i;
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		mempool_free(bvec->bv_page, pool);
	}

out_eio:
	bio_orig->bi_end_io(bio_orig);
	bio_put(bio);
}
static void bounce_end_io_write(struct bio *bio)
{
	bounce_end_io(bio, page_pool);
}

static void bounce_end_io_write_isa(struct bio *bio)
{
	bounce_end_io(bio, isa_page_pool);
}

static inline void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
{
	struct bio *bio_orig = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	bounce_end_io(bio, pool);
}

static void bounce_end_io_read(struct bio *bio)
{
	__bounce_end_io_read(bio, page_pool);
}

static void bounce_end_io_read_isa(struct bio *bio)
{
	__bounce_end_io_read(bio, isa_page_pool);
}
void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig)
{
	struct page *page;
	struct bio *bio = NULL;
	int i, rw = bio_data_dir(*bio_orig), bio_gfp;
	struct bio_vec *to, *from;
	mempool_t *pool;

	BUG_ON((*bio_orig)->bi_idx);

	/*
	 * for non-isa bounce case, just check if the bounce pfn is equal
	 * to or bigger than the highest pfn in the system -- in that case,
	 * don't waste time iterating over bio segments
	 */
	if (!(gfp & GFP_DMA)) {
		if (pfn >= blk_max_pfn)
			return;
		bio_gfp = GFP_NOHIGHIO;
		pool = page_pool;
	} else {
		BUG_ON(!isa_page_pool);
		bio_gfp = GFP_NOIO;
		pool = isa_page_pool;
	}

	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;

		/*
		 * is destination page below bounce pfn?
		 */
		if ((page - page_zone(page)->zone_mem_map) + (page_zone(page)->zone_start_paddr >> PAGE_SHIFT) < pfn)
			continue;

		/*
		 * page is above the bounce pfn, bounce it
		 */
		if (!bio)
			bio = bio_alloc(bio_gfp, (*bio_orig)->bi_vcnt);

		to = bio->bi_io_vec + i;

		to->bv_page = mempool_alloc(pool, gfp);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;

		if (rw & WRITE) {
			char *vto, *vfrom;

			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
		}
	}

	/*
	 * no pages bounced, we're done
	 */
	if (!bio)
		return;

	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	bio_for_each_segment(from, *bio_orig, i) {
		to = &bio->bi_io_vec[i];
		if (!to->bv_page) {
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
		}
	}

	bio->bi_bdev = (*bio_orig)->bi_bdev;
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;

	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_idx = 0;
	bio->bi_size = (*bio_orig)->bi_size;

	if (pool == page_pool) {
		if (rw & WRITE)
			bio->bi_end_io = bounce_end_io_write;
		else
			bio->bi_end_io = bounce_end_io_read;
	} else {
		if (rw & WRITE)
			bio->bi_end_io = bounce_end_io_write_isa;
		else
			bio->bi_end_io = bounce_end_io_read_isa;
	}

	bio->bi_private = *bio_orig;
	*bio_orig = bio;
}
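/*
 * Illustrative call-site sketch (hypothetical function; bounce_pfn and
 * bounce_gfp are assumed per-queue limits): the block layer bounces a bio
 * against its queue's limits before handing it to the driver, along the
 * lines of the blk_queue_bounce() wrapper.
 */
static void example_bounce_before_submit(request_queue_t *q, struct bio **bio)
{
	/*
	 * may replace *bio with a bounced clone; the original bio is kept
	 * in bi_private and completed from the bounce_end_io_* handlers
	 */
	create_bounce(q->bounce_pfn, q->bounce_gfp, bio);
}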
#if CONFIG_DEBUG_HIGHMEM
void check_highmem_ptes(void)
{
	int idx, type;

	for (type = 0; type < KM_TYPE_NR; type++) {
		idx = type + KM_TYPE_NR*smp_processor_id();
		if (!pte_none(*(kmap_pte-idx))) {
			printk("scheduling with KM_TYPE %d held!\n", type);
			BUG();
		}
	}
}
#endif
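/*
 * Illustrative sketch of the rule this check enforces (hypothetical
 * function): atomic kmaps are per-CPU slots, so nothing may sleep between
 * kmap_atomic() and kunmap_atomic(), otherwise check_highmem_ptes() finds
 * the slot still mapped at schedule time.
 */
static void example_atomic_copy(struct page *dst, const char *src, size_t len)
{
	char *vaddr = kmap_atomic(dst, KM_USER0);	/* grab this CPU's KM_USER0 slot */

	memcpy(vaddr, src, len);		/* must not fault or sleep here */
	kunmap_atomic(vaddr, KM_USER0);		/* release before any possible sleep */
}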