ARM: 7379/1: DT: fix atags_to_fdt() second call site
diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 82f5fcf..9c18ebd 100644
 
 #if defined(CONFIG_DEBUG_ICEDCC)
 
-#ifdef CONFIG_CPU_V6
-               .macro  loadsp, rb
+#if defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K) || defined(CONFIG_CPU_V7)
+               .macro  loadsp, rb, tmp
                .endm
                .macro  writeb, ch, rb
                mcr     p14, 0, \ch, c0, c5, 0
                .endm
 #elif defined(CONFIG_CPU_XSCALE)
-               .macro  loadsp, rb
+               .macro  loadsp, rb, tmp
                .endm
                .macro  writeb, ch, rb
                mcr     p14, 0, \ch, c8, c0, 0
                .endm
 #else
-               .macro  loadsp, rb
+               .macro  loadsp, rb, tmp
                .endm
                .macro  writeb, ch, rb
                mcr     p14, 0, \ch, c1, c0, 0
@@ -50,7 +50,7 @@
                .endm
 
 #if defined(CONFIG_ARCH_SA1100)
-               .macro  loadsp, rb
+               .macro  loadsp, rb, tmp
                mov     \rb, #0x80000000        @ physical base address
 #ifdef CONFIG_DEBUG_LL_SER3
                add     \rb, \rb, #0x00050000   @ Ser3
 #endif
                .endm
 #elif defined(CONFIG_ARCH_S3C2410)
-               .macro loadsp, rb
+               .macro loadsp, rb, tmp
                mov     \rb, #0x50000000
                add     \rb, \rb, #0x4000 * CONFIG_S3C_LOWLEVEL_UART_PORT
                .endm
 #else
-               .macro  loadsp, rb
-               addruart \rb
+               .macro  loadsp, rb, tmp
+               addruart \rb, \tmp
                .endm
 #endif
 #endif
  * sort out different calling conventions
  */
                .align
+               .arm                            @ Always enter in ARM state
 start:
                .type   start,#function
-               .rept   8
+               .rept   7
                mov     r0, r0
                .endr
+ ARM(          mov     r0, r0          )
+ ARM(          b       1f              )
+ THUMB(                adr     r12, BSYM(1f)   )
+ THUMB(                bx      r12             )
 
-               b       1f
                .word   0x016f2818              @ Magic numbers to help the loader
                .word   start                   @ absolute load/run zImage address
                .word   _edata                  @ zImage end address
+ THUMB(                .thumb                  )
 1:             mov     r7, r1                  @ save architecture ID
                mov     r8, r2                  @ save atags pointer
 
@@ -140,7 +145,8 @@ start:
                tst     r2, #3                  @ not user?
                bne     not_angel
                mov     r0, #0x17               @ angel_SWIreason_EnterSVC
-               swi     0x123456                @ angel_SWI_ARM
+ ARM(          swi     0x123456        )       @ angel_SWI_ARM
+ THUMB(                svc     0xab            )       @ angel_SWI_THUMB
 not_angel:
                mrs     r2, cpsr                @ turn off interrupts to
                orr     r2, r2, #0xc0           @ prevent angel from running
@@ -160,57 +166,273 @@ not_angel:
                 */
 
                .text
-               adr     r0, LC0
-               ldmia   r0, {r1, r2, r3, r4, r5, r6, ip, sp}
-               subs    r0, r0, r1              @ calculate the delta offset
 
-                                               @ if delta is zero, we are
-               beq     not_relocated           @ running at the address we
-                                               @ were linked at.
+#ifdef CONFIG_AUTO_ZRELADDR
+               @ determine final kernel image address
+               mov     r4, pc
+               and     r4, r4, #0xf8000000
+               add     r4, r4, #TEXT_OFFSET
+#else
+               ldr     r4, =zreladdr
+#endif
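
The CONFIG_AUTO_ZRELADDR path above derives the final kernel address from the current PC: the low 27 bits are masked off (assuming the zImage runs somewhere within the first 128MB of its RAM bank) and TEXT_OFFSET is added back. A small C sketch of the arithmetic, with an illustrative PC value that is not taken from this patch:

    #include <stdint.h>

    /* Sketch of the AUTO_ZRELADDR computation: align the current PC down
     * to a 128MB boundary, then add TEXT_OFFSET. */
    static uint32_t auto_zreladdr(uint32_t pc, uint32_t text_offset)
    {
            return (pc & 0xf8000000u) + text_offset;
    }

    /* e.g. pc = 0x80408000 with TEXT_OFFSET = 0x8000 gives 0x80008000 */
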
+
+               bl      cache_on
+
+restart:       adr     r0, LC0
+               ldmia   r0, {r1, r2, r3, r6, r10, r11, r12}
+               ldr     sp, [r0, #28]
 
                /*
-                * We're running at a different address.  We need to fix
-                * up various pointers:
-                *   r5 - zImage base address
-                *   r6 - GOT start
-                *   ip - GOT end
+                * We might be running at a different address.  We need
+                * to fix up various pointers.
                 */
-               add     r5, r5, r0
-               add     r6, r6, r0
-               add     ip, ip, r0
+               sub     r0, r0, r1              @ calculate the delta offset
+               add     r6, r6, r0              @ _edata
+               add     r10, r10, r0            @ inflated kernel size location
+
+               /*
+                * The kernel build system appends the size of the
+                * decompressed kernel at the end of the compressed data
+                * in little-endian form.
+                */
+               ldrb    r9, [r10, #0]
+               ldrb    lr, [r10, #1]
+               orr     r9, r9, lr, lsl #8
+               ldrb    lr, [r10, #2]
+               ldrb    r10, [r10, #3]
+               orr     r9, r9, lr, lsl #16
+               orr     r9, r9, r10, lsl #24
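
The four byte loads above reassemble the 32-bit little-endian value stored at input_data_end - 4 (the decompressed kernel size); byte accesses keep the read independent of the CPU's byte order and of the alignment of that location. Equivalent C, as a sketch:

    #include <stdint.h>

    /* Read a 32-bit little-endian value one byte at a time, so the result
     * does not depend on CPU endianness or on pointer alignment. */
    static uint32_t read_le32(const uint8_t *p)
    {
            return (uint32_t)p[0]
                 | (uint32_t)p[1] << 8
                 | (uint32_t)p[2] << 16
                 | (uint32_t)p[3] << 24;
    }
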
+
+#ifndef CONFIG_ZBOOT_ROM
+               /* malloc space is above the relocated stack (64k max) */
+               add     sp, sp, r0
+               add     r10, sp, #0x10000
+#else
+               /*
+                * With ZBOOT_ROM the bss/stack is non relocatable,
+                * but someone could still run this code from RAM,
+                * in which case our reference is _edata.
+                */
+               mov     r10, r6
+#endif
+
+               mov     r5, #0                  @ init dtb size to 0
+#ifdef CONFIG_ARM_APPENDED_DTB
+/*
+ *   r0  = delta
+ *   r2  = BSS start
+ *   r3  = BSS end
+ *   r4  = final kernel address
+ *   r5  = appended dtb size (still unknown)
+ *   r6  = _edata
+ *   r7  = architecture ID
+ *   r8  = atags/device tree pointer
+ *   r9  = size of decompressed image
+ *   r10 = end of this image, including bss/stack/malloc space if non-XIP
+ *   r11 = GOT start
+ *   r12 = GOT end
+ *   sp  = stack pointer
+ *
+ * if there are device trees (dtb) appended to zImage, advance r10 so that the
+ * dtb data will get relocated along with the kernel if necessary.
+ */
+
+               ldr     lr, [r6, #0]
+#ifndef __ARMEB__
+               ldr     r1, =0xedfe0dd0         @ sig is 0xd00dfeed big endian
+#else
+               ldr     r1, =0xd00dfeed
+#endif
+               cmp     lr, r1
+               bne     dtb_check_done          @ not found
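
An appended DTB is detected by loading the word at _edata and comparing it with the FDT magic. The magic is 0xd00dfeed stored big-endian in the blob, so a little-endian CPU that loads it as a plain word sees 0xedfe0dd0, which is why the constant is swapped in the !__ARMEB__ case. The same test as a C sketch:

    #include <stdint.h>

    /* Sketch of the appended-DTB probe: compare the raw word at the end of
     * the image against the FDT magic as it appears in native byte order. */
    static int has_appended_dtb(const uint32_t *edata, int cpu_is_big_endian)
    {
            uint32_t expect = cpu_is_big_endian ? 0xd00dfeedu : 0xedfe0dd0u;
            return *edata == expect;
    }
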
+
+#ifdef CONFIG_ARM_ATAG_DTB_COMPAT
+               /*
+                * OK... Let's do some funky business here.
+                * If we do have a DTB appended to zImage, and we do have
+                * an ATAG list around, we want the latter to be translated
+                * and folded into the former here.  To be on the safe side,
+                * let's temporarily move the stack away into the malloc
+                * area.  No GOT fixup has occurred yet, but none of the
+                * code we're about to call uses any global variable.
+                */
+               add     sp, sp, #0x10000
+               stmfd   sp!, {r0-r3, ip, lr}
+               mov     r0, r8
+               mov     r1, r6
+               sub     r2, sp, r6
+               bl      atags_to_fdt
+
+               /*
+                * If returned value is 1, there is no ATAG at the location
+                * pointed by r8.  Try the typical 0x100 offset from start
+                * pointed to by r8.  Try the typical 0x100 offset from the
+                * start of RAM and hope for the best.
+               cmp     r0, #1
+               sub     r0, r4, #TEXT_OFFSET
+               add     r0, r0, #0x100
+               mov     r1, r6
+               sub     r2, sp, r6
+               bleq    atags_to_fdt
+
+               ldmfd   sp!, {r0-r3, ip, lr}
+               sub     sp, sp, #0x10000
+#endif
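
This is the call pattern the commit in the title fixes: atags_to_fdt() (the C helper in arch/arm/boot/compressed/atags_to_fdt.c) is first tried with the pointer handed over in r8, and only when it returns 1, meaning no valid ATAG list was found there, is it retried at the conventional 0x100 offset from the start of RAM; the bleq makes the retry conditional on that failure. Roughly, in C (the prototype is assumed from the register setup here, not quoted from the C file):

    /* Assumed prototype of the helper called above. */
    extern int atags_to_fdt(void *atag_list, void *fdt, int total_space);

    /* Sketch of the two call sites: retry at ram_start + 0x100 only if the
     * first attempt reports that r8 did not point at a valid ATAG list. */
    static void fold_atags_into_dtb(void *atags, void *dtb, int space,
                                    unsigned long ram_start)
    {
            if (atags_to_fdt(atags, dtb, space) == 1)
                    atags_to_fdt((void *)(ram_start + 0x100), dtb, space);
    }

Here ram_start corresponds to r4 - TEXT_OFFSET and space to the distance between the DTB and the temporarily relocated stack.
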
+
+               mov     r8, r6                  @ use the appended device tree
+
+               /*
+                * Make sure that the DTB doesn't end up in the final
+                * kernel's .bss area. To do so, we adjust the decompressed
+                * kernel size to compensate if that .bss size is larger
+                * than the relocated code.
+                */
+               ldr     r5, =_kernel_bss_size
+               adr     r1, wont_overwrite
+               sub     r1, r6, r1
+               subs    r1, r5, r1
+               addhi   r9, r9, r1
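
The subtraction above keeps the appended DTB clear of the final kernel's .bss: if the kernel's .bss is larger than the part of this image between wont_overwrite and _edata (the code and data relocated together with the DTB), the decompressed-size value used by the overlap check below is grown by the difference. As arithmetic, in a C sketch:

    /* Sketch: enlarge the effective image size so the DTB, relocated along
     * with this code, also ends up beyond the kernel's .bss. */
    static unsigned long adjust_for_bss(unsigned long image_size,
                                        unsigned long kernel_bss_size,
                                        unsigned long relocated_code_size)
    {
            if (kernel_bss_size > relocated_code_size)
                    image_size += kernel_bss_size - relocated_code_size;
            return image_size;
    }
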
+
+               /* Get the dtb's size */
+               ldr     r5, [r6, #4]
+#ifndef __ARMEB__
+               /* convert r5 (dtb size) to little endian */
+               eor     r1, r5, r5, ror #16
+               bic     r1, r1, #0x00ff0000
+               mov     r5, r5, ror #8
+               eor     r5, r5, r1, lsr #8
+#endif
+
+               /* preserve 64-bit alignment */
+               add     r5, r5, #7
+               bic     r5, r5, #7
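
The DTB's totalsize field at offset 4 is stored big-endian, so on a little-endian CPU it is byte-swapped with the classic three-operation eor/bic/ror sequence, then rounded up to a multiple of 8 so everything placed after the blob stays 64-bit aligned. The same two steps in C, as a sketch:

    #include <stdint.h>

    /* 32-bit byte swap: the C equivalent of the eor/bic/ror sequence above. */
    static uint32_t swab32(uint32_t x)
    {
            return  (x >> 24)               | ((x >> 8) & 0x0000ff00u) |
                   ((x << 8) & 0x00ff0000u) |  (x << 24);
    }

    /* Round a size up to the next multiple of 8 (64-bit alignment). */
    static uint32_t align8(uint32_t size)
    {
            return (size + 7) & ~7u;
    }
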
+
+               /* relocate some pointers past the appended dtb */
+               add     r6, r6, r5
+               add     r10, r10, r5
+               add     sp, sp, r5
+dtb_check_done:
+#endif
+
+/*
+ * Check to see if we will overwrite ourselves.
+ *   r4  = final kernel address
+ *   r9  = size of decompressed image
+ *   r10 = end of this image, including bss/stack/malloc space if non-XIP
+ * We basically want:
+ *   r4 - 16k page directory >= r10 -> OK
+ *   r4 + image length <= address of wont_overwrite -> OK
+ */
+               add     r10, r10, #16384
+               cmp     r4, r10
+               bhs     wont_overwrite
+               add     r10, r4, r9
+               adr     r9, wont_overwrite
+               cmp     r10, r9
+               bls     wont_overwrite
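
The two comparisons implement exactly the conditions listed in the comment: no relocation is needed if the final kernel (plus the 16KB page directory carved out just below it) starts at or above the end of this image, or if the decompressed kernel ends at or below wont_overwrite. A C sketch of the test:

    /* Sketch of the self-overwrite check: relocation is only required when
     * the decompressed kernel would overlap the running zImage. */
    static int needs_relocation(unsigned long kernel_start,      /* r4  */
                                unsigned long kernel_size,       /* r9  */
                                unsigned long image_end,         /* r10 */
                                unsigned long wont_overwrite_addr)
    {
            if (kernel_start >= image_end + 16384)   /* room for page dir */
                    return 0;
            if (kernel_start + kernel_size <= wont_overwrite_addr)
                    return 0;
            return 1;
    }
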
+
+/*
+ * Relocate ourselves past the end of the decompressed kernel.
+ *   r6  = _edata
+ *   r10 = end of the decompressed kernel
+ * Because we always copy ahead, we need to do it from the end and go
+ * backward in case the source and destination overlap.
+ */
+               /*
+                * Bump to the next 256-byte boundary with the size of
+                * the relocation code added. This avoids overwriting
+                * ourselves when the offset is small.
+                */
+               add     r10, r10, #((reloc_code_end - restart + 256) & ~255)
+               bic     r10, r10, #255
+
+               /* Get start of code we want to copy and align it down. */
+               adr     r5, restart
+               bic     r5, r5, #31
+
+               sub     r9, r6, r5              @ size to copy
+               add     r9, r9, #31             @ rounded up to a multiple
+               bic     r9, r9, #31             @ ... of 32 bytes
+               add     r6, r9, r5
+               add     r9, r9, r10
+
+1:             ldmdb   r6!, {r0 - r3, r10 - r12, lr}
+               cmp     r6, r5
+               stmdb   r9!, {r0 - r3, r10 - r12, lr}
+               bhi     1b
+
+               /* Preserve offset to relocated code. */
+               sub     r6, r9, r6
+
+#ifndef CONFIG_ZBOOT_ROM
+               /* cache_clean_flush may use the stack, so relocate it */
+               add     sp, sp, r6
+#endif
+
+               bl      cache_clean_flush
+
+               adr     r0, BSYM(restart)
+               add     r0, r0, r6
+               mov     pc, r0
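
Since the destination is always above the source, the loop above copies 32-byte blocks starting from the end of the image and working backwards, which is safe even though source and destination may overlap; the stack is then moved by the same offset and execution continues at the relocated restart. The copy is effectively a backwards memmove, sketched in C:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the relocation copy: move len bytes from src up to a higher,
     * possibly overlapping dst, walking backwards so nothing is clobbered
     * before it has been copied. */
    static void copy_up_backwards(uint8_t *dst, const uint8_t *src, size_t len)
    {
            while (len) {
                    len--;
                    dst[len] = src[len];
            }
    }
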
+
+wont_overwrite:
+/*
+ * If delta is zero, we are running at the address we were linked at.
+ *   r0  = delta
+ *   r2  = BSS start
+ *   r3  = BSS end
+ *   r4  = kernel execution address
+ *   r5  = appended dtb size (0 if not present)
+ *   r7  = architecture ID
+ *   r8  = atags pointer
+ *   r11 = GOT start
+ *   r12 = GOT end
+ *   sp  = stack pointer
+ */
+               orrs    r1, r0, r5
+               beq     not_relocated
+
+               add     r11, r11, r0
+               add     r12, r12, r0
 
 #ifndef CONFIG_ZBOOT_ROM
                /*
                 * If we're running fully PIC === CONFIG_ZBOOT_ROM = n,
                 * we need to fix up pointers into the BSS region.
-                *   r2 - BSS start
-                *   r3 - BSS end
-                *   sp - stack pointer
+                * Note that the stack pointer has already been fixed up.
                 */
                add     r2, r2, r0
                add     r3, r3, r0
-               add     sp, sp, r0
 
                /*
                 * Relocate all entries in the GOT table.
+                * Bump bss entries to _edata + dtb size
                 */
-1:             ldr     r1, [r6, #0]            @ relocate entries in the GOT
-               add     r1, r1, r0              @ table.  This fixes up the
-               str     r1, [r6], #4            @ C references.
-               cmp     r6, ip
+1:             ldr     r1, [r11, #0]           @ relocate entries in the GOT
+               add     r1, r1, r0              @ This fixes up C references
+               cmp     r1, r2                  @ if entry >= bss_start &&
+               cmphs   r3, r1                  @       bss_end > entry
+               addhi   r1, r1, r5              @    entry += dtb size
+               str     r1, [r11], #4           @ next entry
+               cmp     r11, r12
                blo     1b
+
+               /* bump our bss pointers too */
+               add     r2, r2, r5
+               add     r3, r3, r5
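
In the fully-PIC (!CONFIG_ZBOOT_ROM) case every GOT entry is moved by the load-time delta, and entries that point into the .bss get the appended-DTB size added on top, because the .bss itself has just been pushed up by that amount. The loop in C terms, as a sketch:

    #include <stdint.h>

    /* Sketch of the GOT fixup above: add the load-time delta to every entry,
     * plus the DTB size for entries that land inside the (moved) .bss. */
    static void relocate_got(uint32_t *got, uint32_t *got_end, uint32_t delta,
                             uint32_t dtb_size, uint32_t bss_start,
                             uint32_t bss_end)
    {
            for (; got < got_end; got++) {
                    uint32_t entry = *got + delta;

                    if (entry >= bss_start && entry < bss_end)
                            entry += dtb_size;
                    *got = entry;
            }
    }
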
+
 #else
 
                /*
                 * Relocate entries in the GOT table.  We only relocate
                 * the entries that are outside the (relocated) BSS region.
                 */
-1:             ldr     r1, [r6, #0]            @ relocate entries in the GOT
+1:             ldr     r1, [r11, #0]           @ relocate entries in the GOT
                cmp     r1, r2                  @ entry < bss_start ||
                cmphs   r3, r1                  @ _end < entry
                addlo   r1, r1, r0              @ table.  This fixes up the
-               str     r1, [r6], #4            @ C references.
-               cmp     r6, ip
+               str     r1, [r11], #4           @ C references.
+               cmp     r11, r12
                blo     1b
 #endif
 
@@ -222,91 +444,41 @@ not_relocated:    mov     r0, #0
                cmp     r2, r3
                blo     1b
 
-               /*
-                * The C runtime environment should now be setup
-                * sufficiently.  Turn the cache on, set up some
-                * pointers, and start decompressing.
-                */
-               bl      cache_on
-
-               mov     r1, sp                  @ malloc space above stack
-               add     r2, sp, #0x10000        @ 64k max
-
 /*
- * Check to see if we will overwrite ourselves.
- *   r4 = final kernel address
- *   r5 = start of this image
- *   r2 = end of malloc space (and therefore this image)
- * We basically want:
- *   r4 >= r2 -> OK
- *   r4 + image length <= r5 -> OK
+ * The C runtime environment should now be set up sufficiently.
+ * Set up some pointers, and start decompressing.
+ *   r4  = kernel execution address
+ *   r7  = architecture ID
+ *   r8  = atags pointer
  */
-               cmp     r4, r2
-               bhs     wont_overwrite
-               sub     r3, sp, r5              @ > compressed kernel size
-               add     r0, r4, r3, lsl #2      @ allow for 4x expansion
-               cmp     r0, r5
-               bls     wont_overwrite
-
-               mov     r5, r2                  @ decompress after malloc space
-               mov     r0, r5
+               mov     r0, r4
+               mov     r1, sp                  @ malloc space above stack
+               add     r2, sp, #0x10000        @ 64k max
                mov     r3, r7
                bl      decompress_kernel
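
decompress_kernel() is the C entry point in arch/arm/boot/compressed/misc.c; the registers set up above are its arguments: where to place the decompressed kernel, the bounds of the 64KB malloc arena just above the stack, and the architecture ID. A declaration reconstructed from this calling sequence (an assumption, not quoted from misc.c; any return value is unused by the code above):

    /* Assumed shape of the C decompressor entry point:
     *   r0 = output address (final kernel address)
     *   r1 = start of malloc arena (just above the relocated stack)
     *   r2 = end of malloc arena (r1 + 64KB)
     *   r3 = architecture ID
     */
    void decompress_kernel(unsigned long output_start,
                           unsigned long free_mem_ptr,
                           unsigned long free_mem_end,
                           int arch_id);
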
-
-               add     r0, r0, #127 + 128      @ alignment + stack
-               bic     r0, r0, #127            @ align the kernel length
-/*
- * r0     = decompressed kernel length
- * r1-r3  = unused
- * r4     = kernel execution address
- * r5     = decompressed kernel start
- * r6     = processor ID
- * r7     = architecture ID
- * r8     = atags pointer
- * r9-r14 = corrupted
- */
-               add     r1, r5, r0              @ end of decompressed kernel
-               adr     r2, reloc_start
-               ldr     r3, LC1
-               add     r3, r2, r3
-1:             ldmia   r2!, {r9 - r14}         @ copy relocation code
-               stmia   r1!, {r9 - r14}
-               ldmia   r2!, {r9 - r14}
-               stmia   r1!, {r9 - r14}
-               cmp     r2, r3
-               blo     1b
-               add     sp, r1, #128            @ relocate the stack
-
                bl      cache_clean_flush
-               add     pc, r5, r0              @ call relocation code
-
-/*
- * We're not in danger of overwriting ourselves.  Do this the simple way.
- *
- * r4     = kernel execution address
- * r7     = architecture ID
- */
-wont_overwrite:        mov     r0, r4
-               mov     r3, r7
-               bl      decompress_kernel
-               b       call_kernel
+               bl      cache_off
+               mov     r0, #0                  @ must be zero
+               mov     r1, r7                  @ restore architecture number
+               mov     r2, r8                  @ restore atags pointer
+ ARM(          mov     pc, r4  )               @ call kernel
+ THUMB(                bx      r4      )               @ entry point is always ARM
 
                .align  2
                .type   LC0, #object
 LC0:           .word   LC0                     @ r1
                .word   __bss_start             @ r2
                .word   _end                    @ r3
-               .word   zreladdr                @ r4
-               .word   _start                  @ r5
-               .word   _got_start              @ r6
+               .word   _edata                  @ r6
+               .word   input_data_end - 4      @ r10 (inflated size location)
+               .word   _got_start              @ r11
                .word   _got_end                @ ip
-               .word   user_stack+4096         @ sp
-LC1:           .word   reloc_end - reloc_start
+               .word   .L_user_stack_end       @ sp
                .size   LC0, . - LC0
 
 #ifdef CONFIG_ARCH_RPC
                .globl  params
-params:                ldr     r0, =params_phys
+params:                ldr     r0, =0x10000100         @ params_phys for RPC
                mov     pc, lr
                .ltorg
                .align
@@ -322,14 +494,12 @@ params:           ldr     r0, =params_phys
  *
  * On entry,
  *  r4 = kernel execution address
- *  r6 = processor ID
  *  r7 = architecture number
  *  r8 = atags pointer
- *  r9 = run-time address of "start"  (???)
  * On exit,
- *  r1, r2, r3, r9, r10, r12 corrupted
+ *  r0, r1, r2, r3, r9, r10, r12 corrupted
  * This routine must preserve:
- *  r4, r5, r6, r7, r8
+ *  r4, r7, r8
  */
                .align  5
 cache_on:      mov     r3, #8                  @ cache_on function
@@ -382,12 +552,18 @@ __armv3_mpu_cache_on:
 
                mov     r0, #0
                mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
+               /*
+                * ?? ARMv3 MMU does not allow reading the control register,
+                * does this really work on ARMv3 MPU?
+                */
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                                                @ .... .... .... WC.M
                orr     r0, r0, #0x000d         @ .... .... .... 11.1
+               /* ?? this overwrites the value constructed above? */
                mov     r0, #0
                mcr     p15, 0, r0, c1, c0, 0   @ write control reg
 
+               /* ?? invalidate for the second time? */
                mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
                mov     pc, lr
 
@@ -406,7 +582,11 @@ __setup_mmu:       sub     r3, r4, #16384          @ Page directory size
                orr     r1, r1, #3 << 10
                add     r2, r3, #16384
 1:             cmp     r1, r9                  @ if virt > start of RAM
+#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
+               orrhs   r1, r1, #0x08           @ set cacheable
+#else
                orrhs   r1, r1, #0x0c           @ set cacheable, bufferable
+#endif
                cmp     r1, r10                 @ if virt > end of RAM
                bichs   r1, r1, #0x0c           @ clear cacheable, bufferable
                str     r1, [r0], #4            @ 1:1 mapping
@@ -421,7 +601,8 @@ __setup_mmu:        sub     r3, r4, #16384          @ Page directory size
  */
                mov     r1, #0x1e
                orr     r1, r1, #3 << 10
-               mov     r2, pc, lsr #20
+               mov     r2, pc
+               mov     r2, r2, lsr #20
                orr     r1, r1, r2, lsl #20
                add     r0, r3, r2, lsl #2
                str     r1, [r0], #4
@@ -430,8 +611,15 @@ __setup_mmu:       sub     r3, r4, #16384          @ Page directory size
                mov     pc, lr
 ENDPROC(__setup_mmu)
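
__setup_mmu builds a 16KB first-level page table just below the final kernel address and fills it with 1MB section descriptors giving a flat 1:1 mapping; sections covering RAM are made cacheable, and bufferable too unless CONFIG_CPU_DCACHE_WRITETHROUGH is set. A C sketch of the fill loop (the base descriptor bits come from the part of the routine not shown in this hunk and are an assumption here):

    #include <stdint.h>

    /* Sketch of the flat section mapping: one descriptor per 1MB of address
     * space, cache bits only for the sections that cover RAM. */
    static void setup_flat_map(uint32_t *pgdir,        /* 16KB: 4096 entries */
                               uint32_t ram_start, uint32_t ram_end,
                               int writethrough)
    {
            uint32_t base = 0x12u | (3u << 10);  /* section + AP bits (assumed) */
            uint32_t i;

            for (i = 0; i < 4096; i++) {
                    uint32_t virt = i << 20;
                    uint32_t desc = virt | base;

                    if (virt >= ram_start && virt < ram_end)
                            desc |= writethrough ? 0x08 : 0x0c;  /* C (+B) */
                    pgdir[i] = desc;
            }
    }
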
 
+__arm926ejs_mmu_cache_on:
+#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
+               mov     r0, #4                  @ put dcache in WT mode
+               mcr     p15, 7, r0, c15, c0, 0
+#endif
+
 __armv4_mmu_cache_on:
                mov     r12, lr
+#ifdef CONFIG_MMU
                bl      __setup_mmu
                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
@@ -445,10 +633,12 @@ __armv4_mmu_cache_on:
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
+#endif
                mov     pc, r12
 
 __armv7_mmu_cache_on:
                mov     r12, lr
+#ifdef CONFIG_MMU
                mrc     p15, 0, r11, c0, c1, 4  @ read ID_MMFR0
                tst     r11, #0xf               @ VMSA
                blne    __setup_mmu
@@ -456,9 +646,11 @@ __armv7_mmu_cache_on:
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
                tst     r11, #0xf               @ VMSA
                mcrne   p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
+#endif
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x003c         @ write buffer
+#ifdef CONFIG_MMU
 #ifdef CONFIG_CPU_ENDIAN_BE8
                orr     r0, r0, #1 << 25        @ big-endian page tables
 #endif
@@ -466,6 +658,7 @@ __armv7_mmu_cache_on:
                movne   r1, #-1
                mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
                mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
+#endif
                mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back
                mov     r0, #0
@@ -499,6 +692,7 @@ __arm6_mmu_cache_on:
                mov     pc, r12
 
 __common_mmu_cache_on:
+#ifndef CONFIG_THUMB2_KERNEL
 #ifndef DEBUG
                orr     r0, r0, #0x000d         @ Write buffer, mmu
 #endif
@@ -510,43 +704,9 @@ __common_mmu_cache_on:
 1:             mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back to
                sub     pc, lr, r0, lsr #32     @ properly flush pipeline
+#endif
 
-/*
- * All code following this line is relocatable.  It is relocated by
- * the above code to the end of the decompressed kernel image and
- * executed there.  During this time, we have no stacks.
- *
- * r0     = decompressed kernel length
- * r1-r3  = unused
- * r4     = kernel execution address
- * r5     = decompressed kernel start
- * r6     = processor ID
- * r7     = architecture ID
- * r8     = atags pointer
- * r9-r14 = corrupted
- */
-               .align  5
-reloc_start:   add     r9, r5, r0
-               sub     r9, r9, #128            @ do not copy the stack
-               debug_reloc_start
-               mov     r1, r4
-1:
-               .rept   4
-               ldmia   r5!, {r0, r2, r3, r10 - r14}    @ relocate kernel
-               stmia   r1!, {r0, r2, r3, r10 - r14}
-               .endr
-
-               cmp     r5, r9
-               blo     1b
-               add     sp, r1, #128            @ relocate the stack
-               debug_reloc_end
-
-call_kernel:   bl      cache_clean_flush
-               bl      cache_off
-               mov     r0, #0                  @ must be zero
-               mov     r1, r7                  @ restore architecture number
-               mov     r2, r8                  @ restore atags pointer
-               mov     pc, r4                  @ call kernel
+#define PROC_ENTRY_SIZE (4*5)
 
 /*
  * Here follow the relocatable cache support functions for the
@@ -558,22 +718,24 @@ call_kernel:      bl      cache_clean_flush
  *  r1  = corrupted
  *  r2  = corrupted
  *  r3  = block offset
- *  r6  = corrupted
+ *  r9  = corrupted
  *  r12 = corrupted
  */
 
 call_cache_fn: adr     r12, proc_types
 #ifdef CONFIG_CPU_CP15
-               mrc     p15, 0, r6, c0, c0      @ get processor ID
+               mrc     p15, 0, r9, c0, c0      @ get processor ID
 #else
-               ldr     r6, =CONFIG_PROCESSOR_ID
+               ldr     r9, =CONFIG_PROCESSOR_ID
 #endif
 1:             ldr     r1, [r12, #0]           @ get value
                ldr     r2, [r12, #4]           @ get mask
-               eor     r1, r1, r6              @ (real ^ match)
+               eor     r1, r1, r9              @ (real ^ match)
                tst     r1, r2                  @       & mask
-               addeq   pc, r12, r3             @ call cache function
-               add     r12, r12, #4*5
+ ARM(          addeq   pc, r12, r3             ) @ call cache function
+ THUMB(                addeq   r12, r3                 )
+ THUMB(                moveq   pc, r12                 ) @ call cache function
+               add     r12, r12, #PROC_ENTRY_SIZE
                b       1b
 
 /*
@@ -595,9 +757,10 @@ call_cache_fn:     adr     r12, proc_types
 proc_types:
                .word   0x41560600              @ ARM6/610
                .word   0xffffffe0
-               b       __arm6_mmu_cache_off    @ works, but slow
-               b       __arm6_mmu_cache_off
+               W(b)    __arm6_mmu_cache_off    @ works, but slow
+               W(b)    __arm6_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 @              b       __arm6_mmu_cache_on             @ untested
 @              b       __arm6_mmu_cache_off
 @              b       __armv3_mmu_cache_flush
@@ -605,76 +768,84 @@ proc_types:
                .word   0x00000000              @ old ARM ID
                .word   0x0000f000
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41007000              @ ARM7/710
                .word   0xfff8fe00
-               b       __arm7_mmu_cache_off
-               b       __arm7_mmu_cache_off
+               W(b)    __arm7_mmu_cache_off
+               W(b)    __arm7_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41807200              @ ARM720T (writethrough)
                .word   0xffffff00
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41007400              @ ARM74x
                .word   0xff00ff00
-               b       __armv3_mpu_cache_on
-               b       __armv3_mpu_cache_off
-               b       __armv3_mpu_cache_flush
+               W(b)    __armv3_mpu_cache_on
+               W(b)    __armv3_mpu_cache_off
+               W(b)    __armv3_mpu_cache_flush
                
                .word   0x41009400              @ ARM94x
                .word   0xff00ff00
-               b       __armv4_mpu_cache_on
-               b       __armv4_mpu_cache_off
-               b       __armv4_mpu_cache_flush
+               W(b)    __armv4_mpu_cache_on
+               W(b)    __armv4_mpu_cache_off
+               W(b)    __armv4_mpu_cache_flush
+
+               .word   0x41069260              @ ARM926EJ-S (v5TEJ)
+               .word   0xff0ffff0
+               W(b)    __arm926ejs_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
 
                .word   0x00007000              @ ARM7 IDs
                .word   0x0000f000
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                @ Everything from here on will be the new ID system.
 
                .word   0x4401a100              @ sa110 / sa1100
                .word   0xffffffe0
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x6901b110              @ sa1110
                .word   0xfffffff0
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
-               .word   0x56056930
-               .word   0xff0ffff0              @ PXA935
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               .word   0x56056900
+               .word   0xffffff00              @ PXA9xx
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x56158000              @ PXA168
                .word   0xfffff000
-               b __armv4_mmu_cache_on
-               b __armv4_mmu_cache_off
-               b __armv5tej_mmu_cache_flush
-
-               .word   0x56056930
-               .word   0xff0ffff0              @ PXA935
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
 
                .word   0x56050000              @ Feroceon
                .word   0xff0f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv5tej_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
 
 #ifdef CONFIG_CPU_FEROCEON_OLD_ID
                /* this conflicts with the standard ARMv5TE entry */
@@ -687,57 +858,71 @@ proc_types:
 
                .word   0x66015261              @ FA526
                .word   0xff01fff1
-               b       __fa526_cache_on
-               b       __armv4_mmu_cache_off
-               b       __fa526_cache_flush
+               W(b)    __fa526_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __fa526_cache_flush
 
                @ These match on the architecture ID
 
                .word   0x00020000              @ ARMv4T
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x00050000              @ ARMv5TE
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x00060000              @ ARMv5TEJ
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv5tej_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
 
                .word   0x0007b000              @ ARMv6
                .word   0x000ff000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv6_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv6_mmu_cache_flush
 
                .word   0x000f0000              @ new CPU Id
                .word   0x000f0000
-               b       __armv7_mmu_cache_on
-               b       __armv7_mmu_cache_off
-               b       __armv7_mmu_cache_flush
+               W(b)    __armv7_mmu_cache_on
+               W(b)    __armv7_mmu_cache_off
+               W(b)    __armv7_mmu_cache_flush
 
                .word   0                       @ unrecognised type
                .word   0
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                .size   proc_types, . - proc_types
 
+               /*
+                * If you get a "non-constant expression in ".if" statement"
+                * error from the assembler on this line, check that you have
+                * not accidentally written a "b" instruction where you should
+                * have written W(b).
+                */
+               .if (. - proc_types) % PROC_ENTRY_SIZE != 0
+               .error "The size of one or more proc_types entries is wrong."
+               .endif
+
 /*
  * Turn off the Cache and MMU.  ARMv3 does not support
  * reading the control register, but ARMv4 does.
  *
- * On entry,  r6 = processor ID
- * On exit,   r0, r1, r2, r3, r12 corrupted
- * This routine must preserve: r4, r6, r7
+ * On exit,
+ *  r0, r1, r2, r3, r9, r12 corrupted
+ * This routine must preserve:
+ *  r4, r7, r8
  */
                .align  5
 cache_off:     mov     r3, #12                 @ cache_off function
@@ -762,22 +947,30 @@ __armv3_mpu_cache_off:
                mov     pc, lr
 
 __armv4_mmu_cache_off:
+#ifdef CONFIG_MMU
                mrc     p15, 0, r0, c1, c0
                bic     r0, r0, #0x000d
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c7      @ invalidate whole cache v4
                mcr     p15, 0, r0, c8, c7      @ invalidate whole TLB v4
+#endif
                mov     pc, lr
 
 __armv7_mmu_cache_off:
                mrc     p15, 0, r0, c1, c0
+#ifdef CONFIG_MMU
                bic     r0, r0, #0x000d
+#else
+               bic     r0, r0, #0x000c
+#endif
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r12, lr
                bl      __armv7_mmu_cache_flush
                mov     r0, #0
+#ifdef CONFIG_MMU
                mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
+#endif
                mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
                mcr     p15, 0, r0, c7, c10, 4  @ DSB
                mcr     p15, 0, r0, c7, c5, 4   @ ISB
@@ -801,12 +994,10 @@ __armv3_mmu_cache_off:
 /*
  * Clean and flush the cache to maintain consistency.
  *
- * On entry,
- *  r6 = processor ID
  * On exit,
- *  r1, r2, r3, r11, r12 corrupted
+ *  r1, r2, r3, r9, r10, r11, r12 corrupted
  * This routine must preserve:
- *  r0, r4, r5, r6, r7
+ *  r4, r6, r7, r8
  */
                .align  5
 cache_clean_flush:
@@ -854,7 +1045,7 @@ __armv7_mmu_cache_flush:
                b       iflush
 hierarchical:
                mcr     p15, 0, r10, c7, c10, 5 @ DMB
-               stmfd   sp!, {r0-r5, r7, r9, r11}
+               stmfd   sp!, {r0-r7, r9-r11}
                mrc     p15, 1, r0, c0, c0, 1   @ read clidr
                ands    r3, r0, #0x7000000      @ extract loc from clidr
                mov     r3, r3, lsr #23         @ left align loc bit field
@@ -879,8 +1070,12 @@ loop1:
 loop2:
                mov     r9, r4                  @ create working copy of max way size
 loop3:
-               orr     r11, r10, r9, lsl r5    @ factor way and cache number into r11
-               orr     r11, r11, r7, lsl r2    @ factor index number into r11
+ ARM(          orr     r11, r10, r9, lsl r5    ) @ factor way and cache number into r11
+ ARM(          orr     r11, r11, r7, lsl r2    ) @ factor index number into r11
+ THUMB(                lsl     r6, r9, r5              )
+ THUMB(                orr     r11, r10, r6            ) @ factor way and cache number into r11
+ THUMB(                lsl     r6, r7, r2              )
+ THUMB(                orr     r11, r11, r6            ) @ factor index number into r11
                mcr     p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
                subs    r9, r9, #1              @ decrement the way
                bge     loop3
@@ -891,7 +1086,7 @@ skip:
                cmp     r3, r10
                bgt     loop1
 finished:
-               ldmfd   sp!, {r0-r5, r7, r9, r11}
+               ldmfd   sp!, {r0-r7, r9-r11}
                mov     r10, #0                 @ switch back to cache level 0
                mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
 iflush:
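
The hierarchical path of __armv7_mmu_cache_flush walks the cache levels reported by CLIDR and, for every level holding a data or unified cache, cleans and invalidates each set and way through the c7, c14, 2 (DCCISW) operation, composing the operand from the way index (left-aligned with clz), the set index (shifted by the line-size field) and the level. An outline in C, with the CP15 accesses left as hypothetical helpers:

    #include <stdint.h>

    /* Hypothetical CP15 accessors, standing in for the mrc/mcr instructions. */
    extern uint32_t read_clidr(void);
    extern uint32_t read_ccsidr(uint32_t csselr);   /* after selecting a level */
    extern void dccisw(uint32_t setway);            /* clean+invalidate by set/way */

    /* Sketch of the set/way loops above. */
    static void flush_all_cache_levels(void)
    {
            uint32_t clidr = read_clidr();
            uint32_t loc = (clidr >> 24) & 0x7;     /* level of coherency */
            uint32_t level, way, set;

            for (level = 0; level < loc; level++) {
                    uint32_t ctype = (clidr >> (level * 3)) & 0x7;
                    uint32_t ccsidr, line_shift, ways, sets, way_shift;

                    if (ctype < 2)                  /* no data/unified cache */
                            continue;
                    ccsidr = read_ccsidr(level << 1);
                    line_shift = (ccsidr & 0x7) + 4;      /* log2(line size) */
                    ways = (ccsidr >> 3) & 0x3ff;         /* max way number  */
                    sets = (ccsidr >> 13) & 0x7fff;       /* max set number  */
                    way_shift = ways ? __builtin_clz(ways) : 0;  /* mirrors clz */

                    for (way = 0; way <= ways; way++)
                            for (set = 0; set <= sets; set++)
                                    dccisw((way << way_shift) |
                                           (set << line_shift) | (level << 1));
            }
    }
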
@@ -912,7 +1107,7 @@ __armv4_mmu_cache_flush:
                mov     r2, #64*1024            @ default: 32K dcache size (*2)
                mov     r11, #32                @ default: 32 byte line size
                mrc     p15, 0, r3, c0, c0, 1   @ read cache type
-               teq     r3, r6                  @ cache ID register present?
+               teq     r3, r9                  @ cache ID register present?
                beq     no_cache_id
                mov     r1, r3, lsr #18
                and     r1, r1, #7
@@ -925,9 +1120,13 @@ __armv4_mmu_cache_flush:
                mov     r11, #8
                mov     r11, r11, lsl r3        @ cache line size in bytes
 no_cache_id:
-               bic     r1, pc, #63             @ align to longest cache line
+               mov     r1, pc
+               bic     r1, r1, #63             @ align to longest cache line
                add     r2, r1, r2
-1:             ldr     r3, [r1], r11           @ s/w flush D cache
+1:
+ ARM(          ldr     r3, [r1], r11           ) @ s/w flush D cache
+ THUMB(                ldr     r3, [r1]                ) @ s/w flush D cache
+ THUMB(                add     r1, r1, r11             )
                teq     r1, r2
                bne     1b
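
__armv4_mmu_cache_flush has no clean-all operation to use, so it flushes the writeback D-cache indirectly: it reads one word per cache line across a region large enough to displace the whole cache (twice its size in the default case), starting at an address aligned down to the largest supported line size, then invalidates the caches and drains the write buffer. The read loop in C, as a sketch:

    #include <stdint.h>

    /* Sketch of the "flush by reading" trick: touching a region twice the
     * D-cache size, one line at a time, evicts every dirty line.  volatile
     * keeps the otherwise-unused loads from being optimised away. */
    static void flush_dcache_by_reading(uintptr_t start, uint32_t dcache_size,
                                        uint32_t line_size)
    {
            uintptr_t p   = start & ~(uintptr_t)63;   /* align to longest line */
            uintptr_t end = p + 2 * dcache_size;

            for (; p < end; p += line_size)
                    (void)*(volatile uint32_t *)p;    /* load evicts this line */
    }
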
 
@@ -939,7 +1138,7 @@ no_cache_id:
 __armv3_mmu_cache_flush:
 __armv3_mpu_cache_flush:
                mov     r1, #0
-               mcr     p15, 0, r0, c7, c0, 0   @ invalidate whole cache v3
+               mcr     p15, 0, r1, c7, c0, 0   @ invalidate whole cache v3
                mov     pc, lr
 
 /*
@@ -952,6 +1151,7 @@ __armv3_mpu_cache_flush:
 phexbuf:       .space  12
                .size   phexbuf, . - phexbuf
 
+@ phex corrupts {r0, r1, r2, r3}
 phex:          adr     r3, phexbuf
                mov     r2, #0
                strb    r2, [r3, r1]
@@ -966,7 +1166,8 @@ phex:              adr     r3, phexbuf
                strb    r2, [r3, r1]
                b       1b
 
-puts:          loadsp  r3
+@ puts corrupts {r0, r1, r2, r3}
+puts:          loadsp  r3, r1
 1:             ldrb    r2, [r0], #1
                teq     r2, #0
                moveq   pc, lr
@@ -980,12 +1181,14 @@ puts:            loadsp  r3
                teq     r0, #0
                bne     1b
                mov     pc, lr
+@ putc corrupts {r0, r1, r2, r3}
 putc:
                mov     r2, r0
                mov     r0, #0
-               loadsp  r3
+               loadsp  r3, r1
                b       2b
 
+@ memdump corrupts {r0, r1, r2, r3, r10, r11, r12, lr}
 memdump:       mov     r12, r0
                mov     r10, lr
                mov     r11, #0
@@ -1016,8 +1219,9 @@ memdump:  mov     r12, r0
 #endif
 
                .ltorg
-reloc_end:
+reloc_code_end:
 
                .align
-               .section ".stack", "w"
-user_stack:    .space  4096
+               .section ".stack", "aw", %nobits
+.L_user_stack: .space  4096
+.L_user_stack_end: