/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes of physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/bio.h>
#include <linux/kernel_stat.h>
/*
 * Virtual_count is not a pure "count".
 *   0 means that it is not mapped, and has not been mapped
 *     since a TLB flush - it is usable.
 *   1 means that there are no users, but it has been mapped
 *     since the last TLB flush - so we can't use it.
 *   n means that there are (n-1) current users of it.
 */
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static spinlock_t kmap_lock = SPIN_LOCK_UNLOCKED;

pte_t *pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
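/*
 * Illustrative sketch (not part of the original file): the pkmap_count
 * transitions for one slot i across a single kmap/kunmap cycle, assuming
 * a single user of the page:
 *
 *	pkmap_count[i] == 0	slot unmapped since the last TLB flush, usable
 *	kmap_high(page)		map_new_virtual() installs the PTE and sets
 *				the count to 1, kmap_high() raises it to 2
 *	kunmap_high(page)	count drops back to 1: no users left, but the
 *				mapping may still be cached in some TLB
 *	flush_all_zero_pkmaps()	clears the PTE and resets the count to 0; the
 *				slot is usable again after flush_tlb_all()
 */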
static void flush_all_zero_pkmaps(void)
{
	int i;

	flush_cache_all();

	for (i = 0; i < LAST_PKMAP; i++) {
		struct page *page;
		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;
		pkmap_count[i] = 0;

		/* sanity check */
		if (pte_none(pkmap_page_table[i]))
			BUG();

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&pkmap_page_table[i]);
		page->virtual = NULL;
	}
	flush_tlb_all();
}
static inline unsigned long map_new_virtual(struct page *page)
{
	unsigned long vaddr;
	int count;

start:
	count = LAST_PKMAP;
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();
			count = LAST_PKMAP;
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;
		/*
		 * Sleep for somebody else to unmap their entries
		 * and wake us up again.
		 */
		{
			DECLARE_WAITQUEUE(wait, current);

			current->state = TASK_UNINTERRUPTIBLE;
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
			if (page->virtual)
				return (unsigned long) page->virtual;

			/* Re-start the scan */
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	page->virtual = (void *) vaddr;

	return vaddr;
}
void *kmap_high(struct page *page)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr)
		vaddr = map_new_virtual(page);
	pkmap_count[PKMAP_NR(vaddr)]++;
	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
		BUG();
	spin_unlock(&kmap_lock);
	return (void *) vaddr;
}
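/*
 * Usage sketch (illustrative, not part of the original file): callers
 * normally go through the arch-level kmap()/kunmap() wrappers instead of
 * calling kmap_high() directly. Zeroing a highmem page from process
 * context would look roughly like this (the helper name is made up):
 *
 *	static void clear_highmem_page_example(struct page *page)
 *	{
 *		void *addr = kmap(page);
 *
 *		memset(addr, 0, PAGE_SIZE);
 *		kunmap(page);
 *	}
 *
 * kmap() may sleep while waiting for a free pkmap slot, so this must never
 * be used from interrupt context; interrupt paths use kmap_atomic() instead.
 */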
void kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr)
		BUG();
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock. As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock. Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}
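/*
 * Dispatch sketch (an assumption about the arch layer, not code from this
 * file): on i386 the kmap()/kunmap() wrappers only take the slow path in
 * kmap_high()/kunmap_high() for real highmem pages; lowmem pages already
 * have a permanent kernel virtual address. Roughly:
 *
 *	static inline void *kmap(struct page *page)
 *	{
 *		if (page < highmem_start_page)
 *			return page_address(page);
 *		return kmap_high(page);
 *	}
 *
 * kunmap() mirrors this and only calls kunmap_high() for highmem pages.
 */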
/*
 * This lock gets no contention at all, normally.
 */
static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;

int nr_emergency_pages;
static LIST_HEAD(emergency_pages);

int nr_emergency_bhs;
static LIST_HEAD(emergency_bhs);
/*
 * Simple bounce buffer support for highmem pages. Depending on the
 * queue gfp mask set, *to may or may not be a highmem page. Always
 * kmap it; that will do the Right Thing either way.
 */
static inline void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
	unsigned char *vto, *vfrom;
	unsigned long flags;
	struct bio_vec *tovec, *fromvec;
	int i;

	__bio_for_each_segment(tovec, to, i, 0) {
		fromvec = &from->bi_io_vec[i];

		/*
		 * not bounced
		 */
		if (tovec->bv_page == fromvec->bv_page)
			continue;

		vfrom = page_address(fromvec->bv_page) + fromvec->bv_offset;

		local_irq_save(flags);
		vto = kmap_atomic(tovec->bv_page, KM_BOUNCE_READ);
		memcpy(vto + tovec->bv_offset, vfrom, tovec->bv_len);
		kunmap_atomic(vto, KM_BOUNCE_READ);
		local_irq_restore(flags);
	}
}
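/*
 * Atomic kmap sketch (illustrative, not part of the original file):
 * kmap_atomic() provides a per-CPU mapping slot that never sleeps, which is
 * why it is safe above under local_irq_save(). The same pattern for a single
 * page looks like this (the helper name and the KM_USER0 slot are assumed):
 *
 *	static void copy_buf_to_page_example(struct page *page, void *buf,
 *					     unsigned int offset, unsigned int len)
 *	{
 *		char *vto = kmap_atomic(page, KM_USER0);
 *
 *		memcpy(vto + offset, buf, len);
 *		kunmap_atomic(vto, KM_USER0);
 *	}
 *
 * Nothing may sleep between kmap_atomic() and kunmap_atomic().
 */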
static __init int init_emergency_pool(void)
{
	spin_lock_irq(&emergency_lock);
	while (nr_emergency_pages < POOL_SIZE) {
		struct page *page = alloc_page(GFP_ATOMIC);

		if (!page) {
			printk("couldn't refill highmem emergency pages");
			break;
		}
		list_add(&page->list, &emergency_pages);
		nr_emergency_pages++;
	}
	spin_unlock_irq(&emergency_lock);
	printk("allocated %d pages reserved for the highmem bounces\n", nr_emergency_pages);

	return 0;
}

__initcall(init_emergency_pool);
static inline int bounce_end_io(struct bio *bio, int nr_sectors)
{
	struct bio *bio_orig = bio->bi_private;
	struct bio_vec *bvec, *org_vec;
	unsigned long flags;
	int ret, i;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		goto out_eio;

	set_bit(BIO_UPTODATE, &bio_orig->bi_flags);

	/*
	 * free up bounce indirect pages used
	 */
	spin_lock_irqsave(&emergency_lock, flags);
	__bio_for_each_segment(bvec, bio, i, 0) {
		org_vec = &bio_orig->bi_io_vec[i];
		if (bvec->bv_page == org_vec->bv_page)
			continue;

		if (nr_emergency_pages >= POOL_SIZE)
			__free_page(bvec->bv_page);
		else {
			/*
			 * We are abusing page->list to manage
			 * the highmem emergency pool:
			 */
			list_add(&bvec->bv_page->list, &emergency_pages);
			nr_emergency_pages++;
		}
	}
	spin_unlock_irqrestore(&emergency_lock, flags);

out_eio:
	ret = bio_orig->bi_end_io(bio_orig, nr_sectors);

	bio_put(bio);
	return ret;
}
static int bounce_end_io_write(struct bio *bio, int nr_sectors)
{
	return bounce_end_io(bio, nr_sectors);
}

static int bounce_end_io_read(struct bio *bio, int nr_sectors)
{
	struct bio *bio_orig = bio->bi_private;

	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
		copy_to_high_bio_irq(bio_orig, bio);

	return bounce_end_io(bio, nr_sectors);
}
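/*
 * Completion-flow sketch (illustrative, derived from the code above): the
 * driver completes the bounce bio, whose bi_end_io points at one of the two
 * helpers above. For reads the data is first copied back into the original,
 * possibly highmem, pages before the bounce pages are released:
 *
 *	driver completes bounce bio
 *	  -> bounce_end_io_read(bio, nr_sectors)
 *	       -> copy_to_high_bio_irq(bio_orig, bio)
 *	       -> bounce_end_io(bio, nr_sectors)
 *	            -> bio_orig->bi_end_io(bio_orig, nr_sectors)
 */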
struct page *alloc_bounce_page(int gfp_mask)
{
	struct list_head *tmp;
	struct page *page;

repeat_alloc:
	page = alloc_page(gfp_mask);
	if (page)
		return page;
	/*
	 * No luck. First, kick the VM so it doesn't idle around while
	 * we are using up our emergency rations.
	 */
	wakeup_bdflush();

	/*
	 * Try to allocate from the emergency pool.
	 */
	tmp = &emergency_pages;
	spin_lock_irq(&emergency_lock);
	if (!list_empty(tmp)) {
		page = list_entry(tmp->next, struct page, list);
		list_del(tmp->next);
		nr_emergency_pages--;
	}
	spin_unlock_irq(&emergency_lock);
	if (page)
		return page;

	/* we need to wait for I/O completion */
	run_task_queue(&tq_disk);

	current->policy |= SCHED_YIELD;
	__set_current_state(TASK_RUNNING);
	schedule();
	goto repeat_alloc;
}
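/*
 * Usage sketch (illustrative, not part of the original file): bounce pages
 * must come from lowmem with a gfp mask that cannot recurse into highmem
 * I/O, which is why create_bounce() below passes GFP_NOHIGHIO:
 *
 *	struct page *bounce = alloc_bounce_page(GFP_NOHIGHIO);
 *
 * Because the function falls back to the emergency pool and then retries
 * after kicking the disk queues, it never returns NULL.
 */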
void create_bounce(unsigned long pfn, struct bio **bio_orig)
{
	struct page *page;
	struct bio *bio = NULL;
	int i, rw = bio_data_dir(*bio_orig);
	struct bio_vec *to, *from;

	BUG_ON((*bio_orig)->bi_idx);

	bio_for_each_segment(from, *bio_orig, i) {
		page = from->bv_page;
		/*
		 * is destination page below bounce pfn?
		 */
		if ((page - page->zone->zone_mem_map) + (page->zone->zone_start_paddr >> PAGE_SHIFT) < pfn)
			continue;
		/*
		 * it is not, so bounce this segment
		 */
		if (!bio)
			bio = bio_alloc(GFP_NOHIGHIO, (*bio_orig)->bi_vcnt);

		to = &bio->bi_io_vec[i];
		to->bv_page = alloc_bounce_page(GFP_NOHIGHIO);
		to->bv_len = from->bv_len;
		to->bv_offset = from->bv_offset;

		if (rw & WRITE) {
			char *vto, *vfrom;

			vto = page_address(to->bv_page) + to->bv_offset;
			vfrom = kmap(from->bv_page) + from->bv_offset;
			memcpy(vto, vfrom, to->bv_len);
			kunmap(from->bv_page);
		}
	}

	/*
	 * no pages were bounced, nothing to do
	 */
	if (!bio)
		return;

	/*
	 * at least one page was bounced, fill in possible non-highmem
	 * pages
	 */
	bio_for_each_segment(from, *bio_orig, i) {
		to = &bio->bi_io_vec[i];
		if (!to->bv_page) {
			to->bv_page = from->bv_page;
			to->bv_len = from->bv_len;
			to->bv_offset = from->bv_offset;
		}
	}

	bio->bi_dev = (*bio_orig)->bi_dev;
	bio->bi_sector = (*bio_orig)->bi_sector;
	bio->bi_rw = (*bio_orig)->bi_rw;
	bio->bi_vcnt = (*bio_orig)->bi_vcnt;
	bio->bi_size = (*bio_orig)->bi_size;

	if (rw & WRITE)
		bio->bi_end_io = bounce_end_io_write;
	else
		bio->bi_end_io = bounce_end_io_read;

	bio->bi_private = *bio_orig;
	*bio_orig = bio;
}
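/*
 * Caller sketch (an assumption about the block layer, not code from this
 * file): the block layer hands each outgoing bio to create_bounce() with the
 * queue's bounce limit, roughly along the lines of:
 *
 *	void blk_queue_bounce(request_queue_t *q, struct bio **bio)
 *	{
 *		create_bounce(q->bounce_pfn, bio);
 *	}
 *
 * If any segment lies above q->bounce_pfn, *bio is replaced by the bounce
 * bio built above; otherwise it is left untouched.
 */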