cpuswitch.S revision 1.42.10.1       1 /*	$NetBSD: cpuswitch.S,v 1.42.10.1 2006/05/11 23:26:18 elad Exp $	*/
      2 
      3 /*
      4  * Copyright 2003 Wasabi Systems, Inc.
      5  * All rights reserved.
      6  *
      7  * Written by Steve C. Woodford for Wasabi Systems, Inc.
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  * 1. Redistributions of source code must retain the above copyright
     13  *    notice, this list of conditions and the following disclaimer.
     14  * 2. Redistributions in binary form must reproduce the above copyright
     15  *    notice, this list of conditions and the following disclaimer in the
     16  *    documentation and/or other materials provided with the distribution.
     17  * 3. All advertising materials mentioning features or use of this software
     18  *    must display the following acknowledgement:
     19  *      This product includes software developed for the NetBSD Project by
     20  *      Wasabi Systems, Inc.
     21  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
     22  *    or promote products derived from this software without specific prior
     23  *    written permission.
     24  *
     25  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
     26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
     27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
     29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     35  * POSSIBILITY OF SUCH DAMAGE.
     36  */
     37 /*
     38  * Copyright (c) 1994-1998 Mark Brinicombe.
     39  * Copyright (c) 1994 Brini.
     40  * All rights reserved.
     41  *
     42  * This code is derived from software written for Brini by Mark Brinicombe
     43  *
     44  * Redistribution and use in source and binary forms, with or without
     45  * modification, are permitted provided that the following conditions
     46  * are met:
     47  * 1. Redistributions of source code must retain the above copyright
     48  *    notice, this list of conditions and the following disclaimer.
     49  * 2. Redistributions in binary form must reproduce the above copyright
     50  *    notice, this list of conditions and the following disclaimer in the
     51  *    documentation and/or other materials provided with the distribution.
     52  * 3. All advertising materials mentioning features or use of this software
     53  *    must display the following acknowledgement:
     54  *	This product includes software developed by Brini.
     55  * 4. The name of the company nor the name of the author may be used to
     56  *    endorse or promote products derived from this software without specific
     57  *    prior written permission.
     58  *
     59  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
     60  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
     61  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     62  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
     63  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     64  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
     65  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     66  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     67  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     68  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     69  * SUCH DAMAGE.
     70  *
     71  * RiscBSD kernel project
     72  *
     73  * cpuswitch.S
     74  *
     75  * cpu switching functions
     76  *
     77  * Created      : 15/10/94
     78  */
     79 
     80 #include "opt_armfpe.h"
     81 #include "opt_arm32_pmap.h"
     82 #include "opt_multiprocessor.h"
     83 #include "opt_lockdebug.h"
     84 
     85 #include "assym.h"
     86 #include <machine/param.h>
     87 #include <machine/cpu.h>
     88 #include <machine/frame.h>
     89 #include <machine/asm.h>
     90 
     91 /* LINTSTUB: include <sys/param.h> */
     92 
     93 #undef IRQdisable
     94 #undef IRQenable
     95 
     96 /*
     97  * New experimental definitions of IRQdisable and IRQenable.
     98  * These keep FIQs enabled since FIQs are special.
 *
 * Both macros read-modify-write the CPSR through r14, so r14 (lr) is
 * clobbered wherever they are expanded.
     99  */
    100 
    101 #define IRQdisable \
    102 	mrs	r14, cpsr ; \
    103 	orr	r14, r14, #(I32_bit) ; \
    104 	msr	cpsr_c, r14 ; \
    105 
    106 #define IRQenable \
    107 	mrs	r14, cpsr ; \
    108 	bic	r14, r14, #(I32_bit) ; \
    109 	msr	cpsr_c, r14 ; \
    110 
    111 /*
    112  * These are used for switching the translation table/DACR.
    113  * Since the vector page can be invalid for a short time, we must
    114  * disable both regular IRQs *and* FIQs.
    115  *
    116  * XXX: This is not necessary if the vector table is relocated.
 *
 * Like the macros above, these use r14 as scratch and so clobber lr.
    117  */
    118 #define IRQdisableALL \
    119 	mrs	r14, cpsr ; \
    120 	orr	r14, r14, #(I32_bit | F32_bit) ; \
    121 	msr	cpsr_c, r14
    122 
    123 #define IRQenableALL \
    124 	mrs	r14, cpsr ; \
    125 	bic	r14, r14, #(I32_bit | F32_bit) ; \
    126 	msr	cpsr_c, r14
    127 
    128 	.text
    129 
/*
 * Literal pool: each .word below holds the address of a kernel symbol so
 * that the routines in this file can fetch it with a pc-relative ldr.
 */
    130 .Lwhichqs:
    131 	.word	_C_LABEL(sched_whichqs)
    132 
    133 .Lqs:
    134 	.word	_C_LABEL(sched_qs)
    135 
    136 /*
    137  * cpuswitch()
    138  *
    139  * performs a process context switch.
    140  * This function has several entry points
    141  */
    142 
    143 #ifdef MULTIPROCESSOR
    144 .Lcpu_info_store:
    145 	.word	_C_LABEL(cpu_info_store)
    146 .Lcurlwp:
    147 	/* FIXME: This is bogus in the general case. */
    148 	.word	_C_LABEL(cpu_info_store) + CI_CURLWP
    149 
    150 .Lcurpcb:
    151 	.word	_C_LABEL(cpu_info_store) + CI_CURPCB
    152 #else
    153 .Lcurlwp:
    154 	.word	_C_LABEL(curlwp)
    155 
    156 .Lcurpcb:
    157 	.word	_C_LABEL(curpcb)
    158 #endif
    159 
    160 .Lwant_resched:
    161 	.word	_C_LABEL(want_resched)
    162 
    163 .Lcpufuncs:
    164 	.word	_C_LABEL(cpufuncs)
    165 
/*
 * In the non-MULTIPROCESSOR case the curpcb variable itself is defined
 * here, as one zero-initialised word in the data section.
 */
    166 #ifndef MULTIPROCESSOR
    167 	.data
    168 	.global	_C_LABEL(curpcb)
    169 _C_LABEL(curpcb):
    170 	.word	0x00000000
    171 	.text
    172 #endif
    173 
    174 .Lblock_userspace_access:
    175 	.word	_C_LABEL(block_userspace_access)
    176 
    177 .Lcpu_do_powersave:
    178 	.word	_C_LABEL(cpu_do_powersave)
    179 
	/* Address of the cache-state member inside kernel_pmap_store. */
    180 .Lpmap_kernel_cstate:
    181 	.word	(kernel_pmap_store + PMAP_CSTATE)
    182 
    183 .Llast_cache_state_ptr:
    184 	.word	_C_LABEL(pmap_cache_state)
    185 
    186 /*
    187  * Idle loop, exercised while waiting for a process to wake up.
    188  *
    189  * NOTE: When we jump back to .Lswitch_search, we must have a
    190  * pointer to whichqs in r7, which is what it is when we arrive
    191  * here.
 *
 * Register usage inside the loop:
 *	r4 = value returned by the spl-lowering call (the old level)
 *	r6 = power-save idle function taken from cpufuncs, or 0 to spin
 *	r7 = &sched_whichqs (never written here)
 * The IRQ macros use r14 as scratch.
    192  */
    193 /* LINTSTUB: Ignore */
    194 ASENTRY_NP(idle)
    195 	ldr	r6, .Lcpu_do_powersave
    196 	IRQenable			/* Enable interrupts */
    197 	ldr	r6, [r6]		/* r6 = cpu_do_powersave */
    198 
    199 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
    200 	bl	_C_LABEL(sched_unlock_idle)
    201 #endif
    202 
    203 	/* Drop to spl0 (returns the current spl level in r0). */
    204 #ifdef __NEWINTR
    205 	mov	r0, #(IPL_NONE)
    206 	bl	_C_LABEL(_spllower)
    207 #else /* ! __NEWINTR */
    208 	mov	r0, #(_SPL_0)
    209 	bl	_C_LABEL(splx)
    210 #endif /* __NEWINTR */
    211 
	/*
	 * If power-save idling is enabled, replace the flag in r6 with the
	 * CF_SLEEP entry of cpufuncs. Otherwise r6 stays zero.
	 */
    212 	teq	r6, #0			/* cpu_do_powersave non zero? */
    213 	ldrne	r6, .Lcpufuncs
    214 	mov	r4, r0			/* Old interrupt level to r4 */
    215 	ldrne	r6, [r6, #(CF_SLEEP)]
    216 
    217 	/*
    218 	 * Main idle loop.
    219 	 * r6 points to power-save idle function if required, else NULL.
    220 	 */
    221 1:	ldr	r3, [r7]		/* r3 = sched_whichqs */
    222 	teq	r3, #0
    223 	bne	2f			/* We have work to do */
    224 	teq	r6, #0			/* Powersave idle? */
    225 	beq	1b			/* Nope. Just sit-n-spin. */
    226 
    227 	/*
    228 	 * Before going into powersave idle mode, disable interrupts
    229 	 * and check sched_whichqs one more time.
    230 	 */
    231 	IRQdisableALL
    232 	ldr	r3, [r7]
    233 	mov	r0, #0			/* r0 = 0 on entry to the idle function */
    234 	teq	r3, #0			/* sched_whichqs still zero? */
    235 	moveq	lr, pc			/* lr = the IRQenableALL below */
    236 	moveq	pc, r6			/* If so, do powersave idle */
    237 	IRQenableALL
    238 	b	1b			/* Back around */
    239 
    240 	/*
    241 	 * sched_whichqs indicates that at least one lwp is ready to run.
    242 	 * Restore the original interrupt priority level, grab the
    243 	 * scheduler lock if necessary, and jump back into cpu_switch.
	 *
	 * The last call is made with a plain branch after pointing lr at
	 * .Lswitch_search, so the callee returns straight into cpu_switch.
    244 	 */
    245 2:	mov	r0, r4
    246 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
    247 	bl	_C_LABEL(splx)
    248 	adr	lr, .Lswitch_search
    249 	b	_C_LABEL(sched_lock_idle)
    250 #else
    251 	adr	lr, .Lswitch_search
    252 	b	_C_LABEL(splx)
    253 #endif
    254 
    255 
    256 /*
    257  * Find a new lwp to run, save the current context and
    258  * load the new context
    259  *
    260  * Arguments:
    261  *	r0	'struct lwp *' of the current LWP
 *
 * Returns:
 *	r0	1 if a different lwp was switched to, 0 if the lwp taken
 *		from the run queue is the caller itself (see .Lswitch_return)
    262  */
    263 
    264 ENTRY(cpu_switch)
    265 /*
    266  * Local register usage. Some of these registers are out of date.
    267  * r1 = oldlwp
    268  * r2 = spl level
    269  * r3 = whichqs
    270  * r4 = queue
    271  * r5 = &qs[queue]
    272  * r6 = newlwp
    273  * r7 = scratch
    274  */
	/* This frame is popped again at .Lswitch_return. */
    275 	stmfd	sp!, {r4-r7, lr}
    276 
    277 	/*
    278 	 * Indicate that there is no longer a valid process (curlwp = 0).
    279 	 * Zero the current PCB pointer while we're at it.
    280 	 */
    281 	ldr	r7, .Lcurlwp
    282 	ldr	r6, .Lcurpcb
    283 	mov	r2, #0x00000000
    284 	str	r2, [r7]		/* curlwp = NULL */
    285 	str	r2, [r6]		/* curpcb = NULL */
    286 
    287 	/* stash the old lwp while we call functions */
    288 	mov	r5, r0
    289 
    290 	/* First phase : find a new lwp */
    291 	ldr	r7, .Lwhichqs
    292 
    293 	/* rem: r5 = old lwp */
    294 	/* rem: r7 = &whichqs */
    295 
    296 .Lswitch_search:
	/*
	 * Entered by falling through from cpu_switch, and by branches from
	 * idle and switch_exit, with r5 = old lwp (or NULL) and
	 * r7 = &whichqs.
	 */
    297 	IRQdisable
    298 
    299 	/* Do we have any active queues  */
    300 	ldr	r3, [r7]
    301 
    302 	/* If not we must idle until we do. */
    303 	teq	r3, #0x00000000
    304 	beq	_ASM_LABEL(idle)
    305 
    306 	/* put old lwp back in r1 */
    307 	mov	r1, r5
    308 
    309 	/* rem: r1 = old lwp */
    310 	/* rem: r3 = whichqs */
    311 	/* rem: interrupts are disabled */
    312 
    313 	/* used further down, saves SA stall */
    314 	ldr	r6, .Lqs
    315 
    316 	/*
    317 	 * We have found an active queue. Currently we do not know which queue
    318 	 * is active just that one of them is.
    319 	 */
    320 	/* Non-Xscale version of the ffs algorithm devised by d.seal and
    321 	 * posted to comp.sys.arm on 16 Feb 1994.
    322 	 */
	/* r0 = r3 & -r3, i.e. only the lowest set bit of whichqs. */
    323  	rsb	r5, r3, #0
    324  	ands	r0, r3, r5
    325 
    326 #ifndef __XSCALE__
    327 	adr	r5, .Lcpu_switch_ffs_table
    328 
    329 				    /* X = R0 */
    330 	orr	r4, r0, r0, lsl #4  /* r4 = X * 0x11 */
    331 	orr	r4, r4, r4, lsl #6  /* r4 = X * 0x451 */
    332 	rsb	r4, r4, r4, lsl #16 /* r4 = X * 0x0450fbaf */
    333 
    334 	/* now lookup in table indexed on top 6 bits of r4 */
    335 	ldrb	r4, [ r5, r4, lsr #26 ]
    336 
    337 #else	/* __XSCALE__ */
	/* With a single bit set in r0, 31 - clz(r0) is its bit number. */
    338 	clz	r4, r0
    339 	rsb	r4, r4, #31
    340 #endif	/* __XSCALE__ */
    341 
    342 	/* rem: r0 = bit mask of chosen queue (1 << r4) */
    343 	/* rem: r1 = old lwp */
    344 	/* rem: r3 = whichqs */
    345 	/* rem: r4 = queue number */
    346 	/* rem: interrupts are disabled */
    347 
    348 	/* Get the address of the queue (&qs[queue]); entries are 8 bytes */
    349 	add	r5, r6, r4, lsl #3
    350 
    351 	/*
    352 	 * Get the lwp from the queue and place the next process in
    353 	 * the queue at the head. This basically unlinks the lwp at
    354 	 * the head of the queue.
    355 	 */
    356 	ldr	r6, [r5, #(L_FORW)]
    357 
    358 #ifdef DIAGNOSTIC
	/* A head that points at itself means the queue is really empty. */
    359 	cmp	r6, r5
    360 	beq	.Lswitch_bogons
    361 #endif
    362 
    363 	/* rem: r6 = new lwp */
    364 	ldr	r7, [r6, #(L_FORW)]
    365 	str	r7, [r5, #(L_FORW)]
    366 
    367 	/*
    368 	 * Test to see if the queue is now empty. If the head of the queue
    369 	 * points to the queue itself then there are no more lwps in
    370 	 * the queue. We can therefore clear the queue not empty flag held
    371 	 * in r3.
    372 	 */
    373 
    374 	teq	r5, r7
    375 	biceq	r3, r3, r0
    376 
    377 	/* rem: r0 = bit mask of chosen queue (1 << r4) - NOT NEEDED ANY MORE */
    378 
    379 	/* Fix the back pointer for the lwp now at the head of the queue. */
    380 	ldr	r0, [r6, #(L_BACK)]
    381 	str	r0, [r7, #(L_BACK)]
    382 
    383 	/* Update the RAM copy of the queue not empty flags word. */
	/* Still conditional on the teq above: only if the queue emptied. */
    384 	ldreq	r7, .Lwhichqs
    385 	streq	r3, [r7]
    386 
    387 	/* rem: r1 = old lwp */
    388 	/* rem: r3 = whichqs - NOT NEEDED ANY MORE */
    389 	/* rem: r4 = queue number - NOT NEEDED ANY MORE */
    390 	/* rem: r6 = new lwp */
    391 	/* rem: interrupts are disabled */
    392 
    393 	/* Clear the want_resched flag */
    394 	ldr	r7, .Lwant_resched
    395 	mov	r0, #0x00000000
    396 	str	r0, [r7]
    397 
    398 	/*
    399 	 * Clear the back pointer of the lwp we have removed from
    400 	 * the head of the queue. The new lwp is isolated now.
    401 	 */
    402 	str	r0, [r6, #(L_BACK)]
    403 
    404 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
    405 	/*
    406 	 * unlock the sched_lock, but leave interrupts off, for now.
	 * r1 (old lwp) is parked in r7 across the call.
    407 	 */
    408 	mov	r7, r1
    409 	bl	_C_LABEL(sched_unlock_idle)
    410 	mov	r1, r7
    411 #endif
    412 
    413 
    414 .Lswitch_resume:
	/*
	 * Common path for cpu_switch (falls in from above) and
	 * cpu_switchto (branches here).
	 */
    415 	/* rem: r1 = old lwp */
    416 	/* rem: r4 = return value [not used if came from cpu_switchto()] */
    417 	/* rem: r6 = new lwp */
    418 	/* rem: interrupts are disabled */
    419 
    420 #ifdef MULTIPROCESSOR
    421 	/* XXX use curcpu() */
    422 	ldr	r0, .Lcpu_info_store
    423 	str	r0, [r6, #(L_CPU)]
    424 #else
    425 	/* l->l_cpu initialized in fork1() for single-processor */
    426 #endif
    427 
    428 	/* Process is now on a processor. */
    429 	mov	r0, #LSONPROC			/* l->l_stat = LSONPROC */
    430 	str	r0, [r6, #(L_STAT)]
    431 
    432 	/* We have a new curlwp now so make a note it */
    433 	ldr	r7, .Lcurlwp
    434 	str	r6, [r7]
    435 
    436 	/* Hook in a new pcb */
    437 	ldr	r7, .Lcurpcb
    438 	ldr	r0, [r6, #(L_ADDR)]
    439 	str	r0, [r7]
    440 
    441 	/* At this point we can allow IRQ's again. */
    442 	IRQenable
    443 
    444 	/* rem: r1 = old lwp */
    445 	/* rem: r4 = return value */
    446 	/* rem: r6 = new lwp */
    447 	/* rem: interrupts are enabled */
    448 
    449 	/*
    450 	 * If the new lwp is the same as the lwp that called
    451 	 * cpu_switch() then we do not need to save and restore any
    452 	 * contexts. This means we can make a quick exit.
    453 	 * The test is simple if curlwp on entry (now in r1) is the
    454 	 * same as the lwp removed from the queue we can jump to the exit.
    455 	 */
    456 	teq	r1, r6
    457 	moveq	r4, #0x00000000		/* default to "didn't switch" */
    458 	beq	.Lswitch_return
    459 
    460 	/*
    461 	 * At this point, we are guaranteed to be switching to
    462 	 * a new lwp.
    463 	 */
    464 	mov	r4, #0x00000001
    465 
    466 	/* Remember the old lwp in r0 */
    467 	mov	r0, r1
    468 
    469 	/*
    470 	 * If the old lwp on entry to cpu_switch was zero then the
    471 	 * process that called it was exiting. This means that we do
    472 	 * not need to save the current context. Instead we can jump
    473 	 * straight to restoring the context for the new process.
    474 	 */
    475 	teq	r0, #0x00000000
    476 	beq	.Lswitch_exited
    477 
    478 	/* rem: r0 = old lwp */
    479 	/* rem: r4 = return value */
    480 	/* rem: r6 = new lwp */
    481 	/* rem: interrupts are enabled */
    482 
    483 	/* Stage two : Save old context */
    484 
    485 	/* Get the user structure for the old lwp. */
    486 	ldr	r1, [r0, #(L_ADDR)]
    487 
    488 	/* Save all the registers in the old lwp's pcb */
	/* r13 (sp) goes out with them in both variants. */
    489 #ifndef __XSCALE__
    490 	add	r7, r1, #(PCB_R8)
    491 	stmia	r7, {r8-r13}
    492 #else
    493 	strd	r8, [r1, #(PCB_R8)]
    494 	strd	r10, [r1, #(PCB_R10)]
    495 	strd	r12, [r1, #(PCB_R12)]
    496 #endif
    497 
    498 	/*
    499 	 * NOTE: We can now use r8-r13 until it is time to restore
    500 	 * them for the new process.
    501 	 */
    502 
    503 	/* Remember the old PCB. */
    504 	mov	r8, r1
    505 
    506 	/* r1 now free! */
    507 
    508 	/* Get the user structure for the new process in r9 */
    509 	ldr	r9, [r6, #(L_ADDR)]
    510 
    511 	/*
    512 	 * This can be optimised... We know we want to go from SVC32
    513 	 * mode to UND32 mode
	 *
	 * sp is banked per mode, so the undefined-mode stack pointer can
	 * only be read while in UND32 mode. IRQs are masked (I32_bit) for
	 * the short time spent there.
    514 	 */
    515         mrs	r3, cpsr
    516 	bic	r2, r3, #(PSR_MODE)
    517 	orr	r2, r2, #(PSR_UND32_MODE | I32_bit)
    518         msr	cpsr_c, r2
    519 
    520 	str	sp, [r8, #(PCB_UND_SP)]
    521 
    522         msr	cpsr_c, r3		/* Restore the old mode */
    523 
    524 	/* rem: r0 = old lwp */
    525 	/* rem: r4 = return value */
    526 	/* rem: r6 = new lwp */
    527 	/* rem: r8 = old PCB */
    528 	/* rem: r9 = new PCB */
    529 	/* rem: interrupts are enabled */
    530 
    531 	/* What else needs to be saved  Only FPA stuff when that is supported */
    532 
    533 	/* Third phase : restore saved context */
    534 
    535 	/* rem: r0 = old lwp */
    536 	/* rem: r4 = return value */
    537 	/* rem: r6 = new lwp */
    538 	/* rem: r8 = old PCB */
    539 	/* rem: r9 = new PCB */
    540 	/* rem: interrupts are enabled */
    541 
    542 	/*
    543 	 * Get the new L1 table pointer into r11.  If we're switching to
    544 	 * an LWP with the same address space as the outgoing one, we can
    545 	 * skip the cache purge and the TTB load.
    546 	 *
    547 	 * To avoid data dep stalls that would happen anyway, we try
    548 	 * and get some useful work done in the mean time.
    549 	 */
    550 	ldr	r10, [r8, #(PCB_PAGEDIR)]	/* r10 = old L1 */
    551 	ldr	r11, [r9, #(PCB_PAGEDIR)]	/* r11 = new L1 */
    552 
    553 	ldr	r0, [r8, #(PCB_DACR)]		/* r0 = old DACR */
    554 	ldr	r1, [r9, #(PCB_DACR)]		/* r1 = new DACR */
    555 	ldr	r8, [r9, #(PCB_CSTATE)]		/* r8 = &new_pmap->pm_cstate */
    556 	ldr	r5, .Llast_cache_state_ptr	/* Previous thread's cstate */
    557 
    558 	teq	r10, r11			/* Same L1? */
    559 	ldr	r5, [r5]
    560 	cmpeq	r0, r1				/* Same DACR? */
    561 	beq	.Lcs_context_switched		/* yes! */
    562 
    563 	ldr	r3, .Lblock_userspace_access
    564 	mov	r12, #0				/* Default: nothing to purge */
    565 	cmp	r5, #0				/* No last vm? (switch_exit) */
    566 	beq	.Lcs_cache_purge_skipped	/* None, so skip the cache flush */
    567 
    568 	mov	r2, #DOMAIN_CLIENT
    569 	cmp	r1, r2, lsl #(PMAP_DOMAIN_KERNEL * 2) /* Sw to kernel thread? */
    570 	beq	.Lcs_cache_purge_skipped	/* Yup. Don't flush cache */
    571 
	/* If the VM space is unchanged r12 stays 0 and the purge is skipped */
    572 	cmp	r5, r8				/* Same userland VM space? */
    573 	ldrneb	r12, [r5, #(CS_CACHE_ID)]	/* Last VM space cache state */
    574 
    575 	/*
    576 	 * We're definitely switching to a new userland VM space,
    577 	 * and the previous userland VM space has yet to be flushed
    578 	 * from the cache/tlb.
    579 	 *
    580 	 * r12 holds the previous VM space's cs_cache_id state
    581 	 */
    582 	tst	r12, #0xff			/* Test cs_cache_id */
    583 	beq	.Lcs_cache_purge_skipped	/* VM space is not in cache */
    584 
    585 	/*
    586 	 * Definitely need to flush the cache.
    587 	 * Mark the old VM space as NOT being resident in the cache.
    588 	 */
    589 	mov	r2, #0x00000000
    590 	strb	r2, [r5, #(CS_CACHE_ID)]
    591 	strb	r2, [r5, #(CS_CACHE_D)]
    592 
    593 	/*
    594 	 * Don't allow user space access between the purge and the switch.
    595 	 */
    596 	mov	r2, #0x00000001
    597 	str	r2, [r3]
    598 
	/* r0-r3 are saved around the indirect call through cpufuncs. */
    599 	stmfd	sp!, {r0-r3}
    600 	ldr	r1, .Lcpufuncs
    601 	mov	lr, pc
    602 	ldr	pc, [r1, #CF_IDCACHE_WBINV_ALL]
    603 	ldmfd	sp!, {r0-r3}
    604 
    605 .Lcs_cache_purge_skipped:
	/* Also the entry point for .Lswitch_exited, which presets these. */
    606 	/* rem: r1 = new DACR */
    607 	/* rem: r3 = &block_userspace_access */
    608 	/* rem: r4 = return value */
    609 	/* rem: r5 = &old_pmap->pm_cstate (or NULL) */
    610 	/* rem: r6 = new lwp */
    611 	/* rem: r8 = &new_pmap->pm_cstate */
    612 	/* rem: r9 = new PCB */
    613 	/* rem: r10 = old L1 */
    614 	/* rem: r11 = new L1 */
    615 
    616 	mov	r2, #0x00000000
    617 	ldr	r7, [r9, #(PCB_PL1VEC)]
    618 
    619 	/*
    620 	 * At this point we need to kill IRQ's again.
    621 	 *
    622 	 * XXXSCW: Don't need to block FIQs if vectors have been relocated
    623 	 */
    624 	IRQdisableALL
    625 
    626 	/*
    627 	 * Interrupts are disabled so we can allow user space accesses again
    628 	 * as none will occur until interrupts are re-enabled after the
    629 	 * switch.
    630 	 */
    631 	str	r2, [r3]
    632 
    633 	/*
    634 	 * Ensure the vector table is accessible by fixing up the L1
	 *
	 * r7 is the address of the L1 slot to patch, or NULL. The
	 * conditional instructions below depend on the flags set by the
	 * cmp against r7. The mcr in between does not change them.
    635 	 */
    636 	cmp	r7, #0			/* No need to fixup vector table? */
    637 	ldrne	r2, [r7]		/* But if yes, fetch current value */
    638 	ldrne	r0, [r9, #(PCB_L1VEC)]	/* Fetch new vector_page value */
    639 	mcr	p15, 0, r1, c3, c0, 0	/* Update DACR for new context */
    640 	cmpne	r2, r0			/* Stuffing the same value? */
    641 #ifndef PMAP_INCLUDE_PTE_SYNC
    642 	strne	r0, [r7]		/* Nope, update it */
    643 #else
    644 	beq	.Lcs_same_vector
    645 	str	r0, [r7]		/* Otherwise, update it */
    646 
    647 	/*
    648 	 * Need to sync the cache to make sure that last store is
    649 	 * visible to the MMU.
	 * Called with r0 = address of the slot and r1 = 4 (bytes).
    650 	 */
    651 	ldr	r2, .Lcpufuncs
    652 	mov	r0, r7
    653 	mov	r1, #4
    654 	mov	lr, pc
    655 	ldr	pc, [r2, #CF_DCACHE_WB_RANGE]
    656 
    657 .Lcs_same_vector:
    658 #endif /* PMAP_INCLUDE_PTE_SYNC */
    659 
    660 	cmp	r10, r11		/* Switching to the same L1? */
    661 	ldr	r10, .Lcpufuncs
    662 	beq	.Lcs_same_l1		/* Yup. */
    663 
    664 	/*
    665 	 * Do a full context switch, including full TLB flush.
    666 	 */
    667 	mov	r0, r11
    668 	mov	lr, pc
    669 	ldr	pc, [r10, #CF_CONTEXT_SWITCH]
    670 
    671 	/*
    672 	 * Mark the old VM space as NOT being resident in the TLB
    673 	 */
    674 	mov	r2, #0x00000000
    675 	cmp	r5, #0
    676 	strneh	r2, [r5, #(CS_TLB_ID)]
    677 	b	.Lcs_context_switched
    678 
    679 	/*
    680 	 * We're switching to a different process in the same L1.
    681 	 * In this situation, we only need to flush the TLB for the
    682 	 * vector_page mapping, and even then only if r7 is non-NULL.
    683 	 */
    684 .Lcs_same_l1:
    685 	cmp	r7, #0
    686 	movne	r0, #0			/* We *know* vector_page's VA is 0x0 */
    687 	movne	lr, pc
    688 	ldrne	pc, [r10, #CF_TLB_FLUSHID_SE]
    689 
    690 .Lcs_context_switched:
    691 	/* rem: r8 = &new_pmap->pm_cstate */
    692 
    693 	/* XXXSCW: Safe to re-enable FIQs here */
    694 
    695 	/*
    696 	 * The new VM space is live in the cache and TLB.
    697 	 * Update its cache/tlb state, and if it's not the kernel
    698 	 * pmap, update the 'last cache state' pointer.
    699 	 */
    700 	mov	r2, #-1
    701 	ldr	r5, .Lpmap_kernel_cstate
    702 	ldr	r0, .Llast_cache_state_ptr
    703 	str	r2, [r8, #(CS_ALL)]
    704 	cmp	r5, r8
    705 	strne	r8, [r0]
    706 
    707 	/* rem: r4 = return value */
    708 	/* rem: r6 = new lwp */
    709 	/* rem: r9 = new PCB */
    710 
    711 	/*
    712 	 * This can be optimised... We know we want to go from SVC32
    713 	 * mode to UND32 mode
	 *
	 * NOTE(review): unlike the save side, I32_bit is not set here.
	 * On the paths through .Lcs_cache_purge_skipped interrupts are
	 * already off (IRQdisableALL), but the same-L1/same-DACR shortcut
	 * arrives here with IRQs enabled. Confirm that is intended.
    714 	 */
    715         mrs	r3, cpsr
    716 	bic	r2, r3, #(PSR_MODE)
    717 	orr	r2, r2, #(PSR_UND32_MODE)
    718         msr	cpsr_c, r2
    719 
    720 	ldr	sp, [r9, #(PCB_UND_SP)]
    721 
    722         msr	cpsr_c, r3		/* Restore the old mode */
    723 
    724 	/* Restore all the save registers */
	/* r9 is overwritten below, so the PCB pointer moves to r7. */
    725 #ifndef __XSCALE__
    726 	add	r7, r9, #PCB_R8
    727 	ldmia	r7, {r8-r13}
    728 
    729 	sub	r7, r7, #PCB_R8		/* restore PCB pointer */
    730 #else
    731 	mov	r7, r9
    732 	ldr	r8, [r7, #(PCB_R8)]
    733 	ldr	r9, [r7, #(PCB_R9)]
    734 	ldr	r10, [r7, #(PCB_R10)]
    735 	ldr	r11, [r7, #(PCB_R11)]
    736 	ldr	r12, [r7, #(PCB_R12)]
    737 	ldr	r13, [r7, #(PCB_SP)]
    738 #endif
    739 
    740 	ldr	r5, [r6, #(L_PROC)]	/* fetch the proc for below */
    741 
    742 	/* rem: r4 = return value */
    743 	/* rem: r5 = new lwp's proc */
    744 	/* rem: r6 = new lwp */
    745 	/* rem: r7 = new pcb */
    746 
    747 #ifdef ARMFPE
	/* r0 = r7 + USER_SIZE, added in two steps of one byte each. */
    748 	add	r0, r7, #(USER_SIZE) & 0x00ff
    749 	add	r0, r0, #(USER_SIZE) & 0xff00
    750 	bl	_C_LABEL(arm_fpe_core_changecontext)
    751 #endif
    752 
    753 	/* We can enable interrupts again */
    754 	IRQenableALL
    755 
    756 	/* rem: r4 = return value */
    757 	/* rem: r5 = new lwp's proc */
    758 	/* rem: r6 = new lwp */
    759 	/* rem: r7 = new PCB */
    760 
    761 	/*
    762 	 * Check for restartable atomic sequences (RAS).
    763 	 */
    764 
    765 	ldr	r2, [r5, #(P_RASLIST)]
    766 	ldr	r1, [r7, #(PCB_TF)]	/* r1 = trapframe (used below) */
    767 	teq	r2, #0			/* RAS list word zero (empty)? */
    768 	bne	.Lswitch_do_ras		/* no, check for one */
    769 
    770 .Lswitch_return:
    771 	/* cpu_switch returns 1 == switched, 0 == didn't switch */
    772 	mov	r0, r4
    773 
    774 	/*
    775 	 * Pull the registers that got pushed when either savectx() or
    776 	 * cpu_switch() was called and return.
    777 	 */
    778 	ldmfd	sp!, {r4-r7, pc}
    779 
	/*
	 * The new lwp's proc has a RAS list. Call ras_lookup with the proc
	 * and the trapframe pc. Unless it returns -1, the returned value
	 * replaces the pc saved in the trapframe.
	 *
	 * On entry: r1 = trapframe, r5 = proc, r7 = new PCB.
	 */
    780 .Lswitch_do_ras:
    781 	ldr	r1, [r1, #(TF_PC)]	/* second ras_lookup() arg */
    782 	mov	r0, r5			/* first ras_lookup() arg */
    783 	bl	_C_LABEL(ras_lookup)
    784 	cmn	r0, #1			/* -1 means "not in a RAS" */
    785 	ldrne	r1, [r7, #(PCB_TF)]	/* reload: r1 is not kept across the call */
    786 	strne	r0, [r1, #(TF_PC)]
    787 	b	.Lswitch_return
    788 
    789 .Lswitch_exited:
    790 	/*
    791 	 * We skip the cache purge because switch_exit() already did it.
    792 	 * Load up registers the way .Lcs_cache_purge_skipped expects.
    793 	 * Userspace access already blocked by switch_exit().
	 *
	 * There is no old PCB on this path, so the "old L1" is read back
	 * from CP15 register c2 instead.
    794 	 */
    795 	ldr	r9, [r6, #(L_ADDR)]		/* r9 = new PCB */
    796 	ldr	r3, .Lblock_userspace_access
    797 	mrc	p15, 0, r10, c2, c0, 0		/* r10 = old L1 */
    798 	mov	r5, #0				/* No previous cache state */
    799 	ldr	r1, [r9, #(PCB_DACR)]		/* r1 = new DACR */
    800 	ldr	r8, [r9, #(PCB_CSTATE)]		/* r8 = new cache state */
    801 	ldr	r11, [r9, #(PCB_PAGEDIR)]	/* r11 = new L1 */
    802 	b	.Lcs_cache_purge_skipped
    803 
    804 
    805 #ifdef DIAGNOSTIC
	/*
	 * Reached from .Lswitch_search when the selected queue head points
	 * at itself even though its bit in sched_whichqs was set.
	 */
    806 .Lswitch_bogons:
    807 	adr	r0, .Lswitch_panic_str
    808 	bl	_C_LABEL(panic)
	/* Spin forever should panic() ever return. */
    809 1:	nop
    810 	b	1b
    811 
    812 .Lswitch_panic_str:
    813 	.asciz	"cpu_switch: sched_qs empty with non-zero sched_whichqs!\n"
    814 #endif
    815 
    816 /*
    817  * cpu_switchto(struct lwp *current, struct lwp *next)
    818  * Switch to the specified next LWP
    819  * Arguments:
    820  *
    821  *	r0	'struct lwp *' of the current LWP
    822  *	r1	'struct lwp *' of the LWP to switch to
 *
 * No run-queue search is done: the registers are arranged as
 * .Lswitch_resume expects and the rest of the work is shared with
 * cpu_switch, including the final pop of the frame pushed here.
    823  */
    824 ENTRY(cpu_switchto)
    825 	stmfd	sp!, {r4-r7, lr}
    826 
    827 	mov	r6, r1		/* save new lwp */
    828 
	/*
	 * NOTE(review): this unlock is guarded by LOCKDEBUG only, while the
	 * other sched_lock_idle/sched_unlock_idle calls in this file are
	 * guarded by MULTIPROCESSOR || LOCKDEBUG. Confirm that is intended.
	 */
    829 #if defined(LOCKDEBUG)
    830 	mov	r5, r0		/* save old lwp */
    831 	bl	_C_LABEL(sched_unlock_idle)
    832 	mov	r1, r5
    833 #else
    834 	mov	r1, r0
    835 #endif
    836 
    837 	IRQdisable
    838 
    839 	/*
    840 	 * Okay, set up registers the way cpu_switch() wants them,
    841 	 * and jump into the middle of it (where we bring up the
    842 	 * new process).
    843 	 *
    844 	 * r1 = old lwp (r6 = new lwp)
    845 	 */
    846 	b	.Lswitch_resume
    847 
    848 /*
    849  * void switch_exit(struct lwp *l, struct lwp *l0, void (*exit)(struct lwp *));
    850  * Switch to lwp0's saved context and deallocate the address space and kernel
    851  * stack for l.  Then jump into cpu_switch(), as if we were in lwp0 all along.
    852  */
    853 
    854 /* LINTSTUB: Func: void switch_exit(struct lwp *l, struct lwp *l0, void (*func)(struct lwp *)) */
    855 ENTRY(switch_exit)
    856 	/*
    857 	 * The process is going away, so we can use callee-saved
    858 	 * registers here without having to save them.
    859 	 */
    860 
    861 	mov	r4, r0
    862 	ldr	r0, .Lcurlwp
    863 
    864 	mov	r5, r1
    865 	ldr	r1, .Lblock_userspace_access
    866 
    867 	mov	r6, r2
    868 
    869 	/*
    870 	 * r4 = lwp
    871 	 * r5 = lwp0
    872 	 * r6 = exit func
    873 	 */
    874 
    875 	mov	r2, #0x00000000		/* curlwp = NULL */
    876 	str	r2, [r0]
    877 
    878 	/*
    879 	 * We're about to clear both the cache and the TLB.
    880 	 * Make sure to zap the 'last cache state' pointer since the
    881 	 * pmap might be about to go away. Also ensure the outgoing
    882 	 * VM space's cache state is marked as NOT resident in the
    883 	 * cache, and that lwp0's cache state IS resident.
    884 	 */
    885 	ldr	r7, [r4, #(L_ADDR)]		/* r7 = old lwp's PCB */
    886 	ldr	r0, .Llast_cache_state_ptr	/* Last userland cache state */
    887 	ldr	r9, [r7, #(PCB_CSTATE)]		/* Fetch cache state pointer */
    888 	ldr	r3, [r5, #(L_ADDR)]		/* r3 = lwp0's PCB */
    889 	str	r2, [r0]			/* No previous cache state */
    890 	str	r2, [r9, #(CS_ALL)]		/* Zap old lwp's cache state */
    891 	ldr	r3, [r3, #(PCB_CSTATE)]		/* lwp0's cache state */
    892 	mov	r2, #-1
    893 	str	r2, [r3, #(CS_ALL)]		/* lwp0 is in da cache! */
    894 
    895 	/*
    896 	 * Don't allow user space access between the purge and the switch.
    897 	 */
    898 	mov	r2, #0x00000001
    899 	str	r2, [r1]
    900 
    901 	/* Switch to lwp0 context */
    902 
    903 	ldr	r9, .Lcpufuncs
    904 	mov	lr, pc
    905 	ldr	pc, [r9, #CF_IDCACHE_WBINV_ALL]
    906 
    907 	ldr	r0, [r7, #(PCB_PL1VEC)]
    908 	ldr	r1, [r7, #(PCB_DACR)]
    909 
    910 	/*
    911 	 * r0 = Pointer to L1 slot for vector_page (or NULL)
    912 	 * r1 = lwp0's DACR
    913 	 * r4 = lwp we're switching from
    914 	 * r5 = lwp0
    915 	 * r6 = exit func
    916 	 * r7 = lwp0's PCB
    917 	 * r9 = cpufuncs
    918 	 */
    919 
    920 	IRQdisableALL
    921 
    922 	/*
    923 	 * Ensure the vector table is accessible by fixing up lwp0's L1
    924 	 */
    925 	cmp	r0, #0			/* No need to fixup vector table? */
    926 	ldrne	r3, [r0]		/* But if yes, fetch current value */
    927 	ldrne	r2, [r7, #(PCB_L1VEC)]	/* Fetch new vector_page value */
    928 	mcr	p15, 0, r1, c3, c0, 0	/* Update DACR for lwp0's context */
    929 	cmpne	r3, r2			/* Stuffing the same value? */
    930 	strne	r2, [r0]		/* Store if not. */
    931 
    932 #ifdef PMAP_INCLUDE_PTE_SYNC
    933 	/*
    934 	 * Need to sync the cache to make sure that last store is
    935 	 * visible to the MMU.
    936 	 */
    937 	movne	r1, #4
    938 	movne	lr, pc
    939 	ldrne	pc, [r9, #CF_DCACHE_WB_RANGE]
    940 #endif /* PMAP_INCLUDE_PTE_SYNC */
    941 
    942 	/*
    943 	 * Note: We don't do the same optimisation as cpu_switch() with
    944 	 * respect to avoiding flushing the TLB if we're switching to
    945 	 * the same L1 since this process' VM space may be about to go
    946 	 * away, so we don't want *any* turds left in the TLB.
    947 	 */
    948 
    949 	/* Switch the memory to the new process */
    950 	ldr	r0, [r7, #(PCB_PAGEDIR)]
    951 	mov	lr, pc
    952 	ldr	pc, [r9, #CF_CONTEXT_SWITCH]
    953 
    954 	ldr	r0, .Lcurpcb
    955 
    956 	/* Restore all the save registers */
    957 #ifndef __XSCALE__
    958 	add	r1, r7, #PCB_R8
    959 	ldmia	r1, {r8-r13}
    960 #else
    961 	ldr	r8, [r7, #(PCB_R8)]
    962 	ldr	r9, [r7, #(PCB_R9)]
    963 	ldr	r10, [r7, #(PCB_R10)]
    964 	ldr	r11, [r7, #(PCB_R11)]
    965 	ldr	r12, [r7, #(PCB_R12)]
    966 	ldr	r13, [r7, #(PCB_SP)]
    967 #endif
    968 	str	r7, [r0]	/* curpcb = lwp0's PCB */
    969 
    970 	IRQenableALL
    971 
    972 	/*
    973 	 * Schedule the vmspace and stack to be freed.
    974 	 */
    975 	mov	r0, r4			/* {lwp_}exit2(l) */
    976 	mov	lr, pc
    977 	mov	pc, r6
    978 
    979 #if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
    980 	bl	_C_LABEL(sched_lock_idle)
    981 #endif
    982 
    983 	ldr	r7, .Lwhichqs		/* r7 = &whichqs */
    984 	mov	r5, #0x00000000		/* r5 = old lwp = NULL */
    985 	b	.Lswitch_search
    986 
    987 /* LINTSTUB: Func: void savectx(struct pcb *pcb) */
/*
 * Store r8-r13 into the PCB passed in r0.
 *
 * r4-r7 and lr are pushed first and popped again unchanged, so the sp
 * value written to the PCB is the one that points at that five-word
 * frame. NOTE(review): presumably this lets a context resumed from the
 * PCB leave through .Lswitch_return, which pops the same frame layout.
 * Confirm against the callers.
 */
    988 ENTRY(savectx)
    989 	/*
    990 	 * r0 = pcb
    991 	 */
    992 
    993 	/* Push registers.*/
    994 	stmfd	sp!, {r4-r7, lr}
    995 
    996 	/* Store all the registers in the process's pcb */
    997 #ifndef __XSCALE__
    998 	add	r2, r0, #(PCB_R8)
    999 	stmia	r2, {r8-r13}
   1000 #else
   1001 	strd	r8, [r0, #(PCB_R8)]
   1002 	strd	r10, [r0, #(PCB_R10)]
   1003 	strd	r12, [r0, #(PCB_R12)]
   1004 #endif
   1005 
   1006 	/* Pull the regs off the stack */
   1007 	ldmfd	sp!, {r4-r7, pc}
   1008 
   1009 ENTRY(proc_trampoline)
	/*
	 * Lower the spl, then call the function held in r4 with r5 as its
	 * first argument and the current sp as its second. When it returns,
	 * mask IRQs, unwind with PULLFRAME and leave with movs pc, lr.
	 *
	 * NOTE(review): r4 and r5 are presumably preloaded by whoever built
	 * the new lwp's initial context. Confirm where they are set.
	 */
   1010 #ifdef __NEWINTR
   1011 	mov	r0, #(IPL_NONE)
   1012 	bl	_C_LABEL(_spllower)
   1013 #else /* ! __NEWINTR */
   1014 	mov	r0, #(_SPL_0)
   1015 	bl	_C_LABEL(splx)
   1016 #endif /* __NEWINTR */
   1017 
   1018 #ifdef MULTIPROCESSOR
   1019 	bl	_C_LABEL(proc_trampoline_mp)
   1020 #endif
   1021 	mov	r0, r5			/* first argument */
   1022 	mov	r1, sp			/* second argument = current sp */
   1023 	mov	lr, pc			/* return to the instruction after the call */
   1024 	mov	pc, r4			/* call the function in r4 */
   1025 
   1026 	/* Kill irq's */
   1027         mrs     r0, cpsr
   1028         orr     r0, r0, #(I32_bit)
   1029         msr     cpsr_c, r0
   1030 
   1031 	PULLFRAME
   1032 
   1033 	movs	pc, lr			/* Exit */
   1034 
   1035 #ifndef __XSCALE__
/*
 * Lookup table for the find-first-set sequence in .Lswitch_search.
 * It is indexed by the top six bits of X * 0x0450fbaf, where X is the
 * isolated lowest set bit of sched_whichqs. The byte read is used as
 * the queue number.
 */
   1036 	.type .Lcpu_switch_ffs_table, _ASM_TYPE_OBJECT;
   1037 .Lcpu_switch_ffs_table:
   1038 /* same as ffs table but all nums are -1 from that */
   1039 /*               0   1   2   3   4   5   6   7           */
   1040 	.byte	 0,  0,  1, 12,  2,  6,  0, 13  /*  0- 7 */
   1041 	.byte	 3,  0,  7,  0,  0,  0,  0, 14  /*  8-15 */
   1042 	.byte	10,  4,  0,  0,  8,  0,  0, 25  /* 16-23 */
   1043 	.byte	 0,  0,  0,  0,  0, 21, 27, 15  /* 24-31 */
   1044 	.byte	31, 11,  5,  0,  0,  0,  0,  0	/* 32-39 */
   1045 	.byte	 9,  0,  0, 24,  0,  0, 20, 26  /* 40-47 */
   1046 	.byte	30,  0,  0,  0,  0, 23,  0, 19  /* 48-55 */
   1047 	.byte   29,  0, 22, 18, 28, 17, 16,  0  /* 56-63 */
   1048 #endif	/* !__XSCALE__ */
   1049